aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
authorAlex MacLean <amaclean@nvidia.com>2024-02-22 17:27:28 -0800
committerGitHub <noreply@github.com>2024-02-22 17:27:28 -0800
commit590c968e7943e51bb00ff75d312435f24d983b2a (patch)
treee49fd62479030788c7079c24fbebfe5b7465e7fe /llvm
parent7a5c01dbca3ddfc6dd87775ec90346783c8e2c73 (diff)
downloadllvm-590c968e7943e51bb00ff75d312435f24d983b2a.zip
llvm-590c968e7943e51bb00ff75d312435f24d983b2a.tar.gz
llvm-590c968e7943e51bb00ff75d312435f24d983b2a.tar.bz2
[NVPTX] fixup support for unaligned parameters and returns (#82562)
Add support for unaligned parameters and return values. These must be loaded and stored one byte at a time and then bit manipulation is used to assemble the correct final result.
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp30
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp257
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td4
-rw-r--r--llvm/test/CodeGen/NVPTX/param-load-store.ll93
-rw-r--r--llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll385
5 files changed, 730 insertions, 39 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ded2f25..3ff8994 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
+ if (Opcode == NVPTX::StoreRetvalI8) {
+ // Fine tune the opcode depending on the size of the operand.
+ // This helps to avoid creating redundant COPY instructions in
+ // InstrEmitter::AddRegisterOperand().
+ switch (Ops[0].getSimpleValueType().SimpleTy) {
+ default:
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreRetvalI8TruncI32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreRetvalI8TruncI64;
+ break;
+ }
+ }
break;
case 2:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
@@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
NVPTX::StoreParamI8, NVPTX::StoreParamI16,
NVPTX::StoreParamI32, NVPTX::StoreParamI64,
NVPTX::StoreParamF32, NVPTX::StoreParamF64);
+ if (Opcode == NVPTX::StoreParamI8) {
+ // Fine tune the opcode depending on the size of the operand.
+ // This helps to avoid creating redundant COPY instructions in
+ // InstrEmitter::AddRegisterOperand().
+ switch (Ops[0].getSimpleValueType().SimpleTy) {
+ default:
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreParamI8TruncI32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreParamI8TruncI64;
+ break;
+ }
+ }
break;
case 2:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7d2fe78..66a1010 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -47,6 +47,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -59,6 +60,7 @@
#include <cmath>
#include <cstdint>
#include <iterator>
+#include <optional>
#include <sstream>
#include <string>
#include <utility>
@@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
return DL.getABITypeAlign(Ty);
}
+static bool adjustElementType(EVT &ElementType) {
+ switch (ElementType.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::f16:
+ case MVT::bf16:
+ ElementType = MVT::i16;
+ return true;
+ case MVT::f32:
+ case MVT::v2f16:
+ case MVT::v2bf16:
+ ElementType = MVT::i32;
+ return true;
+ case MVT::f64:
+ ElementType = MVT::i64;
+ return true;
+ }
+}
+
+// Use byte-store when the param address of the argument value is unaligned.
+// This may happen when the return value is a field of a packed structure.
+//
+// This is called in LowerCall() when passing the param values.
+static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
+ uint64_t Offset, EVT ElementType,
+ SDValue StVal, SDValue &InGlue,
+ unsigned ArgID, const SDLoc &dl) {
+ // Bit logic only works on integer types
+ if (adjustElementType(ElementType))
+ StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
+
+ // Store each byte
+ SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+ // Shift the byte to the last byte position
+ SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
+ DAG.getConstant(i * 8, dl, MVT::i32));
+ SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
+ DAG.getConstant(Offset + i, dl, MVT::i32),
+ ShiftVal, InGlue};
+ // Trunc store only the last byte by using
+ // st.param.b8
+ // The register type can be larger than b8.
+ Chain = DAG.getMemIntrinsicNode(
+ NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
+ MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
+ InGlue = Chain.getValue(1);
+ }
+ return Chain;
+}
+
+// Use byte-load when the param address of the returned value is unaligned.
+// This may happen when the returned value is a field of a packed structure.
+static SDValue
+LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
+ EVT ElementType, SDValue &InGlue,
+ SmallVectorImpl<SDValue> &TempProxyRegOps,
+ const SDLoc &dl) {
+ // Bit logic only works on integer types
+ EVT MergedType = ElementType;
+ adjustElementType(MergedType);
+
+ // Load each byte and construct the whole value. Initial value to 0
+ SDValue RetVal = DAG.getConstant(0, dl, MergedType);
+ // LoadParamMemI8 loads into i16 register only
+ SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
+ for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+ SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(Offset + i, dl, MVT::i32),
+ InGlue};
+ // This will be selected to LoadParamMemI8
+ SDValue LdVal =
+ DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
+ MVT::i8, MachinePointerInfo(), Align(1));
+ SDValue TmpLdVal = LdVal.getValue(0);
+ Chain = LdVal.getValue(1);
+ InGlue = LdVal.getValue(2);
+
+ TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
+ TmpLdVal.getSimpleValueType(), TmpLdVal);
+ TempProxyRegOps.push_back(TmpLdVal);
+
+ SDValue CMask = DAG.getConstant(255, dl, MergedType);
+ SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
+ // Need to extend the i16 register to the whole width.
+ TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
+ // Mask off the high bits. Leave only the lower 8bits.
+ // Do this because we are using loadparam.b8.
+ TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
+ // Shift and merge
+ TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
+ RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
+ }
+ if (ElementType != MergedType)
+ RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+ return RetVal;
+}
+
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (NeedAlign)
PartAlign = commonAlignment(ArgAlign, CurOffset);
- // New store.
- if (VectorInfo[j] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Unfinished preceding store.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(
- DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
- StoreOperands.push_back(DAG.getConstant(
- IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
- dl, MVT::i32));
- }
-
SDValue StVal = OutVals[OIdx];
MVT PromotedVT;
@@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
}
+ // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+ // scalar store. In such cases, fall back to byte stores.
+ if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
+ PartAlign.value() <
+ DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
+ assert(StoreOperands.empty() && "Unfinished preceeding store.");
+ Chain = LowerUnalignedStoreParam(
+ DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
+ StVal, InGlue, ParamCount, dl);
+
+ // LowerUnalignedStoreParam took care of inserting the necessary nodes
+ // into the SDAG, so just move on to the next element.
+ if (!IsByVal)
+ ++OIdx;
+ continue;
+ }
+
+ // New store.
+ if (VectorInfo[j] & PVF_FIRST) {
+ assert(StoreOperands.empty() && "Unfinished preceding store.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(
+ DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
+
+ StoreOperands.push_back(DAG.getConstant(
+ IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
+ dl, MVT::i32));
+ }
+
// Record the value to store.
StoreOperands.push_back(StVal);
@@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 16> ProxyRegOps;
SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
+ // An item of the vector is filled if the element does not need a ProxyReg
+ // operation on it and should be added to InVals as is. ProxyRegOps and
+ // ProxyRegTruncates contain empty/none items at the same index.
+ SmallVector<SDValue, 16> RetElts;
+ // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
+ // to use the values of `LoadParam`s; they are replaced later, when
+ // `CALLSEQ_END` is added.
+ SmallVector<SDValue, 16> TempProxyRegOps;
// Generate loads from param memory/moves from registers for result
if (Ins.size() > 0) {
@@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
EltType = MVT::i16;
}
+ // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
+ // scalar load. In such cases, fall back to byte loads.
+ if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
+ EltAlign < DL.getABITypeAlign(
+ TheLoadType.getTypeForEVT(*DAG.getContext()))) {
+ assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
+ SDValue Ret = LowerUnalignedLoadRetParam(
+ DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
+ ProxyRegOps.push_back(SDValue());
+ ProxyRegTruncates.push_back(std::optional<MVT>());
+ RetElts.resize(i);
+ RetElts.push_back(Ret);
+
+ continue;
+ }
+
// Record index of the very first element of the vector.
if (VectorInfo[i] & PVF_FIRST) {
assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
@@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// will not get lost. Otherwise, during libcalls expansion, the nodes can become
// dangling.
for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
+ if (i < RetElts.size() && RetElts[i]) {
+ InVals.push_back(RetElts[i]);
+ continue;
+ }
+
SDValue Ret = DAG.getNode(
NVPTXISD::ProxyReg, dl,
DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
@@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InVals.push_back(Ret);
}
+ for (SDValue &T : TempProxyRegOps) {
+ SDValue Repl = DAG.getNode(
+ NVPTXISD::ProxyReg, dl,
+ DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
+ {Chain, T.getOperand(0), InGlue});
+ DAG.ReplaceAllUsesWith(T, Repl);
+ DAG.RemoveDeadNode(T.getNode());
+
+ Chain = Repl.getValue(1);
+ InGlue = Repl.getValue(2);
+ }
+
// set isTailCall to false for now, until we figure out how to express
// tail call optimization in PTX
isTailCall = false;
@@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
Value *srcValue = Constant::getNullValue(PointerType::get(
EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
+
+ const MaybeAlign PartAlign = [&]() -> MaybeAlign {
+ if (aggregateIsPacked)
+ return Align(1);
+ if (NumElts != 1)
+ return std::nullopt;
+ Align PartAlign =
+ (Offsets[parti] == 0 && PAL.getParamAlignment(i))
+ ? PAL.getParamAlignment(i).value()
+ : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
+ return commonAlignment(PartAlign, Offsets[parti]);
+ }();
SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
- MachinePointerInfo(srcValue),
- MaybeAlign(aggregateIsPacked ? 1 : 0),
+ MachinePointerInfo(srcValue), PartAlign,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
if (P.getNode())
@@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
return Chain;
}
+// Use byte-store when the param address of the return value is unaligned.
+// This may happen when the return value is a field of a packed structure.
+static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
+ uint64_t Offset, EVT ElementType,
+ SDValue RetVal, const SDLoc &dl) {
+ // Bit logic only works on integer types
+ if (adjustElementType(ElementType))
+ RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
+
+ // Store each byte
+ for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
+ // Shift the byte to the last byte position
+ SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
+ DAG.getConstant(i * 8, dl, MVT::i32));
+ SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
+ ShiftVal};
+ // Trunc store only the last byte by using
+ // st.param.b8
+ // The register type can be larger than b8.
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+ DAG.getVTList(MVT::Other), StoreOperands,
+ MVT::i8, MachinePointerInfo(), std::nullopt,
+ MachineMemOperand::MOStore);
+ }
+ return Chain;
+}
+
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<SDValue, 6> StoreOperands;
for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
- // New load/store. Record chain and offset operands.
- if (VectorInfo[i] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Orphaned operand list.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
- }
-
SDValue OutVal = OutVals[i];
SDValue RetVal = PromotedOutVals[i];
@@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
}
+ // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
+ // for a scalar store. In such cases, fall back to byte stores.
+ if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
+ EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
+ Align ElementTypeAlign =
+ DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
+ Align ElementAlign =
+ commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
+ if (ElementAlign < ElementTypeAlign) {
+ assert(StoreOperands.empty() && "Orphaned operand list.");
+ Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
+ RetVal, dl);
+
+ // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
+ // into the graph, so just move on to the next element.
+ continue;
+ }
+ }
+
+ // New load/store. Record chain and offset operands.
+ if (VectorInfo[i] & PVF_FIRST) {
+ assert(StoreOperands.empty() && "Orphaned operand list.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
+ }
+
// Record the value to return.
StoreOperands.push_back(RetVal);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 55a1955..b3517ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2738,6 +2738,8 @@ def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">;
+def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">;
def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
@@ -2757,6 +2759,8 @@ def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">;
+def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">;
def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index c14dc88..a29d4e1 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -1135,31 +1135,86 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0];
-; --- TODO
-; --- Unaligned parameter store/ return value load is broken in both nvcc
-; --- and llvm and needs to be fixed.
; CHECK: .param .align 1 .b8 param0[25];
-; CHECK-DAG: st.param.b32 [param0+0],
-; CHECK-DAG: st.param.b32 [param0+4],
+; CHECK-DAG: st.param.b8 [param0+0],
+; CHECK-DAG: st.param.b8 [param0+1],
+; CHECK-DAG: st.param.b8 [param0+2],
+; CHECK-DAG: st.param.b8 [param0+3],
+; CHECK-DAG: st.param.b8 [param0+4],
+; CHECK-DAG: st.param.b8 [param0+5],
+; CHECK-DAG: st.param.b8 [param0+6],
+; CHECK-DAG: st.param.b8 [param0+7],
; CHECK-DAG: st.param.b8 [param0+8],
-; CHECK-DAG: st.param.b32 [param0+9],
-; CHECK-DAG: st.param.b32 [param0+13],
-; CHECK-DAG: st.param.b64 [param0+17],
+; CHECK-DAG: st.param.b8 [param0+9],
+; CHECK-DAG: st.param.b8 [param0+10],
+; CHECK-DAG: st.param.b8 [param0+11],
+; CHECK-DAG: st.param.b8 [param0+12],
+; CHECK-DAG: st.param.b8 [param0+13],
+; CHECK-DAG: st.param.b8 [param0+14],
+; CHECK-DAG: st.param.b8 [param0+15],
+; CHECK-DAG: st.param.b8 [param0+16],
+; CHECK-DAG: st.param.b8 [param0+17],
+; CHECK-DAG: st.param.b8 [param0+18],
+; CHECK-DAG: st.param.b8 [param0+19],
+; CHECK-DAG: st.param.b8 [param0+20],
+; CHECK-DAG: st.param.b8 [param0+21],
+; CHECK-DAG: st.param.b8 [param0+22],
+; CHECK-DAG: st.param.b8 [param0+23],
+; CHECK-DAG: st.param.b8 [param0+24],
; CHECK: .param .align 1 .b8 retval0[25];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_s_i1i32x4p,
-; CHECK-DAG: ld.param.b32 %r41, [retval0+0];
-; CHECK-DAG: ld.param.b32 %r42, [retval0+4];
-; CHECK-DAG: ld.param.b8 %rs2, [retval0+8];
-; CHECK-DAG: ld.param.b32 %r43, [retval0+9];
-; CHECK-DAG: ld.param.b32 %r44, [retval0+13];
-; CHECK-DAG: ld.param.b64 %rd23, [retval0+17];
-; CHECK-DAG: st.param.b32 [func_retval0+0],
-; CHECK-DAG: st.param.b32 [func_retval0+4],
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+0];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+4];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+5];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+6];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+7];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+9];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+10];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+11];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+12];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+13];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+14];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+15];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+16];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+17];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+18];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+19];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+20];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+21];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+22];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b8 [func_retval0+0],
+; CHECK-DAG: st.param.b8 [func_retval0+1],
+; CHECK-DAG: st.param.b8 [func_retval0+2],
+; CHECK-DAG: st.param.b8 [func_retval0+3],
+; CHECK-DAG: st.param.b8 [func_retval0+4],
+; CHECK-DAG: st.param.b8 [func_retval0+5],
+; CHECK-DAG: st.param.b8 [func_retval0+6],
+; CHECK-DAG: st.param.b8 [func_retval0+7],
; CHECK-DAG: st.param.b8 [func_retval0+8],
-; CHECK-DAG: st.param.b32 [func_retval0+9],
-; CHECK-DAG: st.param.b32 [func_retval0+13],
-; CHECK-DAG: st.param.b64 [func_retval0+17],
+; CHECK-DAG: st.param.b8 [func_retval0+9],
+; CHECK-DAG: st.param.b8 [func_retval0+10],
+; CHECK-DAG: st.param.b8 [func_retval0+11],
+; CHECK-DAG: st.param.b8 [func_retval0+12],
+; CHECK-DAG: st.param.b8 [func_retval0+13],
+; CHECK-DAG: st.param.b8 [func_retval0+14],
+; CHECK-DAG: st.param.b8 [func_retval0+15],
+; CHECK-DAG: st.param.b8 [func_retval0+16],
+; CHECK-DAG: st.param.b8 [func_retval0+17],
+; CHECK-DAG: st.param.b8 [func_retval0+18],
+; CHECK-DAG: st.param.b8 [func_retval0+19],
+; CHECK-DAG: st.param.b8 [func_retval0+20],
+; CHECK-DAG: st.param.b8 [func_retval0+21],
+; CHECK-DAG: st.param.b8 [func_retval0+22],
+; CHECK-DAG: st.param.b8 [func_retval0+23],
+; CHECK-DAG: st.param.b8 [func_retval0+24],
define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
%r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
new file mode 100644
index 0000000..40a3e9e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -0,0 +1,385 @@
+; Verifies correctness of load/store of parameters and return values.
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %}
+
+%s_i8i16p = type { <{ i16, i8, i16 }>, i64 }
+%s_i8i32p = type { <{ i32, i8, i32 }>, i64 }
+%s_i8i64p = type { <{ i64, i8, i64 }>, i64 }
+%s_i8f16p = type { <{ half, i8, half }>, i64 }
+%s_i8f16x2p = type { <{ <2 x half>, i8, <2 x half> }>, i64 }
+%s_i8f32p = type { <{ float, i8, float }>, i64 }
+%s_i8f64p = type { <{ double, i8, double }>, i64 }
+
+; -- All loads/stores from parameters aligned by one must be done one
+; byte at a time.
+; -- Notes:
+; -- There are two fields of interest in the packed part of the struct, one
+; with a proper offset and one without. The former should be loaded or
+; stored as a whole, and the latter by bytes.
+; -- Only loading and storing the said fields are checked in the following
+; series of tests so that they are more concise.
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16])
+; CHECK-LABEL: test_s_i8i16p(
+; CHECK: .param .align 8 .b8 test_s_i8i16p_param_0[16]
+; CHECK-DAG: ld.param.u16 [[P0:%rs[0-9]+]], [test_s_i8i16p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8i16p_param_0+3];
+; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8i16p_param_0+4];
+; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK: { // callseq
+; CHECK: .param .align 8 .b8 param0[16];
+; CHECK-DAG: st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]];
+; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]];
+; CHECK: .param .align 8 .b8 retval0[16];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8i16p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+3];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+4];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG: shl.b16 [[R2_1_shl:%rs[0-9]+]], [[R2_1]], 8;
+; CHECK-DAG: and.b16 [[R2_0_and:%rs[0-9]+]], [[R2_0]], 255;
+; CHECK-DAG: or.b16 [[R2:%rs[0-9]+]], [[R2_0_and]], [[R2_1_shl]];
+; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2]];
+; CHECK-DAG: and.b16 [[R2_1_and:%rs[0-9]+]], [[R2_1]], 255;
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2_1_and]];
+; CHECK: ret;
+
+define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
+ %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a)
+ ret %s_i8i16p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8i32p(
+; CHECK: .param .align 8 .b8 test_s_i8i32p_param_0[24]
+; CHECK-DAG: ld.param.u32 [[P0:%r[0-9]+]], [test_s_i8i32p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8i32p_param_0+5];
+; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8i32p_param_0+6];
+; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8i32p_param_0+7];
+; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8i32p_param_0+8];
+; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK: { // callseq
+; CHECK-DAG: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+5], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]];
+; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]];
+; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8i32p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+5],
+; CHECK-DAG: st.param.b8 [func_retval0+6],
+; CHECK-DAG: st.param.b8 [func_retval0+7],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK: ret;
+
+define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
+ %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
+ ret %s_i8i32p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i8i64p(
+; CHECK: .param .align 8 .b8 test_s_i8i64p_param_0[32]
+; CHECK-DAG: ld.param.u64 [[P0:%rd[0-9]+]], [test_s_i8i64p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8i64p_param_0+9];
+; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8i64p_param_0+10];
+; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8i64p_param_0+11];
+; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8i64p_param_0+12];
+; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8i64p_param_0+13];
+; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8i64p_param_0+14];
+; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8i64p_param_0+15];
+; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8i64p_param_0+16];
+; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]];
+; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8;
+; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16;
+; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24;
+; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]];
+; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]];
+; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]];
+; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32;
+; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]];
+; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16;
+; CHECK-DAG: shr.u64 [[P2_shr_3:%rd[0-9]+]], [[P2]], 24;
+; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24;
+; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16;
+; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
+; CHECK: { // callseq
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK-DAG: st.param.b64 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+9], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]];
+; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]];
+; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]];
+; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]];
+; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]];
+; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]];
+; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]];
+; CHECK: .param .align 8 .b8 retval0[32];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8i64p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b64 [[R0:%rd[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13];
+; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], [retval0+14];
+; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15];
+; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b64 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+9],
+; CHECK-DAG: st.param.b8 [func_retval0+10],
+; CHECK-DAG: st.param.b8 [func_retval0+11],
+; CHECK-DAG: st.param.b8 [func_retval0+12],
+; CHECK-DAG: st.param.b8 [func_retval0+13],
+; CHECK-DAG: st.param.b8 [func_retval0+14],
+; CHECK-DAG: st.param.b8 [func_retval0+15],
+; CHECK-DAG: st.param.b8 [func_retval0+16],
+; CHECK: ret;
+
+define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
+ %r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a)
+ ret %s_i8i64p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16])
+; CHECK-LABEL: test_s_i8f16p(
+; CHECK: .param .align 8 .b8 test_s_i8f16p_param_0[16]
+; A half member at unaligned offset 3 is loaded as two bytes (+3, +4) and
+; reassembled into a 16-bit register with shl/or.
+; CHECK-DAG: ld.param.b16 [[P0:%rs[0-9]+]], [test_s_i8f16p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8f16p_param_0+3];
+; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8f16p_param_0+4];
+; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK: { // callseq
+; CHECK: .param .align 8 .b8 param0[16];
+; Outgoing stores: the low byte comes from the reassembled value, the high
+; byte directly from the second loaded byte.
+; CHECK-DAG: st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]];
+; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]];
+; CHECK: .param .align 8 .b8 retval0[16];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8f16p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2I_0:%rs[0-9]+]], [retval0+3];
+; CHECK-DAG: ld.param.b8 [[R2I_1:%rs[0-9]+]], [retval0+4];
+; CHECK: } // callseq
+; On the return path the two bytes are masked/shifted back together before
+; being stored byte-wise into func_retval0.
+; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG: shl.b16 [[R2I_1_shl:%rs[0-9]+]], [[R2I_1]], 8;
+; CHECK-DAG: and.b16 [[R2I_0_and:%rs[0-9]+]], [[R2I_0]], 255;
+; CHECK-DAG: or.b16 [[R2I:%rs[0-9]+]], [[R2I_0_and]], [[R2I_1_shl]];
+; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2I]];
+; CHECK-DAG: and.b16 [[R2I_1_and:%rs[0-9]+]], [[R2I_1]], 255;
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2I_1_and]];
+; CHECK: ret;
+
+; Self-call round-trip; %s_i8f16p is defined earlier in the file.
+define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
+  %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a)
+  ret %s_i8f16p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8f16x2p(
+; CHECK: .param .align 8 .b8 test_s_i8f16x2p_param_0[24]
+; A 32-bit <2 x half> member at unaligned offset 5 is loaded as four bytes
+; (+5..+8) and assembled into a 32-bit register with shl/or; shr then peels
+; the middle bytes back out for the byte-wise outgoing stores.
+; CHECK-DAG: ld.param.b32 [[P0:%r[0-9]+]], [test_s_i8f16x2p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f16x2p_param_0+5];
+; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f16x2p_param_0+6];
+; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f16x2p_param_0+7];
+; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f16x2p_param_0+8];
+; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK: { // callseq
+; CHECK-DAG: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+5], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]];
+; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]];
+; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8f16x2p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK: } // callseq
+; Values stored to our own return slots are unconstrained here (no captured
+; operand) — only the byte offsets are checked.
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+5],
+; CHECK-DAG: st.param.b8 [func_retval0+6],
+; CHECK-DAG: st.param.b8 [func_retval0+7],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK: ret;
+
+; Self-call round-trip; %s_i8f16x2p is defined earlier in the file.
+define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
+  %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
+  ret %s_i8f16x2p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8f32p(
+; CHECK: .param .align 8 .b8 test_s_i8f32p_param_0[24]
+; Same shape as the f16x2 case but with a float member: the 32-bit value at
+; unaligned offset 5 is assembled from four single-byte loads via shl/or,
+; then split back out with shr for the outgoing byte stores.
+; CHECK-DAG: ld.param.f32 [[P0:%f[0-9]+]], [test_s_i8f32p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f32p_param_0+5];
+; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f32p_param_0+6];
+; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f32p_param_0+7];
+; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f32p_param_0+8];
+; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK: { // callseq
+; CHECK-DAG: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.f32 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+5], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]];
+; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]];
+; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8f32p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.f32 [[R0:%f[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.f32 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+5],
+; CHECK-DAG: st.param.b8 [func_retval0+6],
+; CHECK-DAG: st.param.b8 [func_retval0+7],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK: ret;
+
+; Self-call round-trip; %s_i8f32p is defined earlier in the file.
+define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
+  %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
+  ret %s_i8f32p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i8f64p(
+; CHECK: .param .align 8 .b8 test_s_i8f64p_param_0[32]
+; 64-bit case: a double-sized member at unaligned offset 9 is loaded as eight
+; bytes (+9..+16). The bytes are combined into two 32-bit halves with shl/or,
+; the halves merged with a 32-bit shift, and individual bytes are extracted
+; again with shr and bfe for the outgoing byte-wise stores.
+; CHECK-DAG: ld.param.f64 [[P0:%fd[0-9]+]], [test_s_i8f64p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8f64p_param_0+9];
+; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8f64p_param_0+10];
+; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8f64p_param_0+11];
+; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8f64p_param_0+12];
+; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8f64p_param_0+13];
+; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8f64p_param_0+14];
+; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8f64p_param_0+15];
+; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8f64p_param_0+16];
+; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]];
+; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8;
+; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16;
+; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24;
+; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]];
+; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]];
+; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]];
+; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32;
+; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]];
+; Bytes 1-3 of the low half come from right shifts of the full value; bytes
+; 5-7 come from bit-field extracts of the (unshifted) high half.
+; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16;
+; CHECK-DAG: shr.u64 [[P2_shr_3:%rd[0-9]+]], [[P2]], 24;
+; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24;
+; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16;
+; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
+; CHECK: { // callseq
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK-DAG: st.param.f64 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+9], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]];
+; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]];
+; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]];
+; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]];
+; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]];
+; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]];
+; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]];
+; CHECK: .param .align 8 .b8 retval0[32];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8f64p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.f64 [[R0:%fd[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13];
+; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], [retval0+14];
+; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15];
+; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.f64 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+9],
+; CHECK-DAG: st.param.b8 [func_retval0+10],
+; CHECK-DAG: st.param.b8 [func_retval0+11],
+; CHECK-DAG: st.param.b8 [func_retval0+12],
+; CHECK-DAG: st.param.b8 [func_retval0+13],
+; CHECK-DAG: st.param.b8 [func_retval0+14],
+; CHECK-DAG: st.param.b8 [func_retval0+15],
+; CHECK-DAG: st.param.b8 [func_retval0+16],
+; CHECK: ret;
+
+; Self-call round-trip; %s_i8f64p is defined earlier in the file.
+define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
+  %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
+  ret %s_i8f64p %r
+}