diff options
author | Alex MacLean <amaclean@nvidia.com> | 2024-02-22 17:27:28 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-22 17:27:28 -0800 |
commit | 590c968e7943e51bb00ff75d312435f24d983b2a (patch) | |
tree | e49fd62479030788c7079c24fbebfe5b7465e7fe /llvm | |
parent | 7a5c01dbca3ddfc6dd87775ec90346783c8e2c73 (diff) | |
download | llvm-590c968e7943e51bb00ff75d312435f24d983b2a.zip llvm-590c968e7943e51bb00ff75d312435f24d983b2a.tar.gz llvm-590c968e7943e51bb00ff75d312435f24d983b2a.tar.bz2 |
[NVPTX] fixup support for unaligned parameters and returns (#82562)
Add support for unaligned parameters and return values. These must be
loaded and stored one byte at a time and then bit manipulation is used
to assemble the correct final result.
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 30 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 257 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/param-load-store.ll | 93 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll | 385 |
5 files changed, 730 insertions, 39 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ded2f25..3ff8994 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16, NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64, NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64); + if (Opcode == NVPTX::StoreRetvalI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). + switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreRetvalI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreRetvalI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, @@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { NVPTX::StoreParamI8, NVPTX::StoreParamI16, NVPTX::StoreParamI32, NVPTX::StoreParamI64, NVPTX::StoreParamF32, NVPTX::StoreParamF64); + if (Opcode == NVPTX::StoreParamI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). 
+ switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreParamI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreParamI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7d2fe78..66a1010 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -59,6 +60,7 @@ #include <cmath> #include <cstdint> #include <iterator> +#include <optional> #include <sstream> #include <string> #include <utility> @@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } +static bool adjustElementType(EVT &ElementType) { + switch (ElementType.getSimpleVT().SimpleTy) { + default: + return false; + case MVT::f16: + case MVT::bf16: + ElementType = MVT::i16; + return true; + case MVT::f32: + case MVT::v2f16: + case MVT::v2bf16: + ElementType = MVT::i32; + return true; + case MVT::f64: + ElementType = MVT::i64; + return true; + } +} + +// Use byte-store when the param address of the argument value is unaligned. +// This may happen when the argument value is a field of a packed structure. +// +// This is called in LowerCall() when passing the param values.
+static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue StVal, SDValue &InGlue, + unsigned ArgID, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); + + // Store each byte + SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal, InGlue}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8. + Chain = DAG.getMemIntrinsicNode( + NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, + MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); + InGlue = Chain.getValue(1); + } + return Chain; +} + +// Use byte-load when the param address of the returned value is unaligned. +// This may happen when the returned value is a field of a packed structure. +static SDValue +LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, + EVT ElementType, SDValue &InGlue, + SmallVectorImpl<SDValue> &TempProxyRegOps, + const SDLoc &dl) { + // Bit logic only works on integer types + EVT MergedType = ElementType; + adjustElementType(MergedType); + + // Load each byte and construct the whole value.
Initial value to 0 + SDValue RetVal = DAG.getConstant(0, dl, MergedType); + // LoadParamMemI8 loads into i16 register only + SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + InGlue}; + // This will be selected to LoadParamMemI8 + SDValue LdVal = + DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, + MVT::i8, MachinePointerInfo(), Align(1)); + SDValue TmpLdVal = LdVal.getValue(0); + Chain = LdVal.getValue(1); + InGlue = LdVal.getValue(2); + + TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, + TmpLdVal.getSimpleValueType(), TmpLdVal); + TempProxyRegOps.push_back(TmpLdVal); + + SDValue CMask = DAG.getConstant(255, dl, MergedType); + SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); + // Need to extend the i16 register to the whole width. + TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); + // Mask off the high bits. Leave only the lower 8bits. + // Do this because we are using loadparam.b8. + TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); + // Shift and merge + TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); + RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); + } + if (ElementType != MergedType) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + return RetVal; +} + SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (NeedAlign) PartAlign = commonAlignment(ArgAlign, CurOffset); - // New store. - if (VectorInfo[j] & PVF_FIRST) { - assert(StoreOperands.empty() && "Unfinished preceding store."); - StoreOperands.push_back(Chain); - StoreOperands.push_back( - DAG.getConstant(IsVAArg ? 
FirstVAArg : ParamCount, dl, MVT::i32)); - StoreOperands.push_back(DAG.getConstant( - IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), - dl, MVT::i32)); - } - SDValue StVal = OutVals[OIdx]; MVT PromotedVT; @@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar store. In such cases, fall back to byte stores. + if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() && + PartAlign.value() < + DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) { + assert(StoreOperands.empty() && "Unfinished preceeding store."); + Chain = LowerUnalignedStoreParam( + DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT, + StVal, InGlue, ParamCount, dl); + + // LowerUnalignedStoreParam took care of inserting the necessary nodes + // into the SDAG, so just move on to the next element. + if (!IsByVal) + ++OIdx; + continue; + } + + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back( + DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); + + StoreOperands.push_back(DAG.getConstant( + IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), + dl, MVT::i32)); + } + // Record the value to store. StoreOperands.push_back(StVal); @@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 16> ProxyRegOps; SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; + // An item of the vector is filled if the element does not need a ProxyReg + // operation on it and should be added to InVals as is. ProxyRegOps and + // ProxyRegTruncates contain empty/none items at the same index. 
+ SmallVector<SDValue, 16> RetElts; + // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` + // to use the values of `LoadParam`s and to be replaced later when + // `CALLSEQ_END` is added. + SmallVector<SDValue, 16> TempProxyRegOps; // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { @@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EltType = MVT::i16; } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar load. In such cases, fall back to byte loads. + if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() && + EltAlign < DL.getABITypeAlign( + TheLoadType.getTypeForEVT(*DAG.getContext()))) { + assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); + SDValue Ret = LowerUnalignedLoadRetParam( + DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl); + ProxyRegOps.push_back(SDValue()); + ProxyRegTruncates.push_back(std::optional<MVT>()); + RetElts.resize(i); + RetElts.push_back(Ret); + + continue; + } + // Record index of the very first element of the vector. if (VectorInfo[i] & PVF_FIRST) { assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); @@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling.
for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { + if (i < RetElts.size() && RetElts[i]) { + InVals.push_back(RetElts[i]); + continue; + } + SDValue Ret = DAG.getNode( NVPTXISD::ProxyReg, dl, DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), @@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InVals.push_back(Ret); } + for (SDValue &T : TempProxyRegOps) { + SDValue Repl = DAG.getNode( + NVPTXISD::ProxyReg, dl, + DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue), + {Chain, T.getOperand(0), InGlue}); + DAG.ReplaceAllUsesWith(T, Repl); + DAG.RemoveDeadNode(T.getNode()); + + Chain = Repl.getValue(1); + InGlue = Repl.getValue(2); + } + // set isTailCall to false for now, until we figure out how to express // tail call optimization in PTX isTailCall = false; @@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); Value *srcValue = Constant::getNullValue(PointerType::get( EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); + + const MaybeAlign PartAlign = [&]() -> MaybeAlign { + if (aggregateIsPacked) + return Align(1); + if (NumElts != 1) + return std::nullopt; + Align PartAlign = + (Offsets[parti] == 0 && PAL.getParamAlignment(i)) + ? PAL.getParamAlignment(i).value() + : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext())); + return commonAlignment(PartAlign, Offsets[parti]); + }(); SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, - MachinePointerInfo(srcValue), - MaybeAlign(aggregateIsPacked ? 1 : 0), + MachinePointerInfo(srcValue), PartAlign, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); if (P.getNode()) @@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( return Chain; } +// Use byte-store when the param address of the return value is unaligned. +// This may happen when the return value is a field of a packed structure.
+static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue RetVal, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + // Store each byte + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8. + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, + DAG.getVTList(MVT::Other), StoreOperands, + MVT::i8, MachinePointerInfo(), std::nullopt, + MachineMemOperand::MOStore); + } + return Chain; +} + SDValue NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector<SDValue, 6> StoreOperands; for (unsigned i = 0, e = VTs.size(); i != e; ++i) { - // New load/store. Record chain and offset operands. - if (VectorInfo[i] & PVF_FIRST) { - assert(StoreOperands.empty() && "Orphaned operand list."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); - } - SDValue OutVal = OutVals[i]; SDValue RetVal = PromotedOutVals[i]; @@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); } + // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned + // for a scalar store. In such cases, fall back to byte stores. + if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) { + EVT ElementType = ExtendIntegerRetVal ? 
MVT::i32 : VTs[i]; + Align ElementTypeAlign = + DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext())); + Align ElementAlign = + commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]); + if (ElementAlign < ElementTypeAlign) { + assert(StoreOperands.empty() && "Orphaned operand list."); + Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType, + RetVal, dl); + + // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes + // into the graph, so just move on to the next element. + continue; + } + } + + // New load/store. Record chain and offset operands. + if (VectorInfo[i] & PVF_FIRST) { + assert(StoreOperands.empty() && "Orphaned operand list."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); + } + // Record the value to return. StoreOperands.push_back(RetVal); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 55a1955..b3517ce 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2738,6 +2738,8 @@ def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">; def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">; def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">; +def StoreParamI8TruncI32 : StoreParamInst<Int32Regs, ".b8">; +def StoreParamI8TruncI64 : StoreParamInst<Int64Regs, ".b8">; def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">; def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">; def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">; @@ -2757,6 +2759,8 @@ def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">; +def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">; +def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">; def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, 
".b64">; def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">; def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">; diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index c14dc88..a29d4e1 100644 --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -1135,31 +1135,86 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; -; --- TODO -; --- Unaligned parameter store/ return value load is broken in both nvcc -; --- and llvm and needs to be fixed. ; CHECK: .param .align 1 .b8 param0[25]; -; CHECK-DAG: st.param.b32 [param0+0], -; CHECK-DAG: st.param.b32 [param0+4], +; CHECK-DAG: st.param.b8 [param0+0], +; CHECK-DAG: st.param.b8 [param0+1], +; CHECK-DAG: st.param.b8 [param0+2], +; CHECK-DAG: st.param.b8 [param0+3], +; CHECK-DAG: st.param.b8 [param0+4], +; CHECK-DAG: st.param.b8 [param0+5], +; CHECK-DAG: st.param.b8 [param0+6], +; CHECK-DAG: st.param.b8 [param0+7], ; CHECK-DAG: st.param.b8 [param0+8], -; CHECK-DAG: st.param.b32 [param0+9], -; CHECK-DAG: st.param.b32 [param0+13], -; CHECK-DAG: st.param.b64 [param0+17], +; CHECK-DAG: st.param.b8 [param0+9], +; CHECK-DAG: st.param.b8 [param0+10], +; CHECK-DAG: st.param.b8 [param0+11], +; CHECK-DAG: st.param.b8 [param0+12], +; CHECK-DAG: st.param.b8 [param0+13], +; CHECK-DAG: st.param.b8 [param0+14], +; CHECK-DAG: st.param.b8 [param0+15], +; CHECK-DAG: st.param.b8 [param0+16], +; CHECK-DAG: st.param.b8 [param0+17], +; CHECK-DAG: st.param.b8 [param0+18], +; CHECK-DAG: st.param.b8 [param0+19], +; CHECK-DAG: st.param.b8 [param0+20], +; CHECK-DAG: st.param.b8 [param0+21], +; CHECK-DAG: st.param.b8 [param0+22], +; CHECK-DAG: st.param.b8 [param0+23], +; CHECK-DAG: st.param.b8 [param0+24], ; CHECK: .param .align 1 .b8 retval0[25]; 
; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_i1i32x4p, -; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; -; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; -; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; -; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; -; CHECK-DAG: ld.param.b32 %r44, [retval0+13]; -; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; -; CHECK-DAG: st.param.b32 [func_retval0+0], -; CHECK-DAG: st.param.b32 [func_retval0+4], +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+0]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+4]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+5]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+6]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+7]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+9]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+10]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+11]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+12]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+13]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+14]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+15]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+16]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+17]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+18]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+19]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+20]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+21]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+22]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b8 [func_retval0+0], +; CHECK-DAG: st.param.b8 [func_retval0+1], +; CHECK-DAG: st.param.b8 [func_retval0+2], +; CHECK-DAG: st.param.b8 [func_retval0+3], +; 
CHECK-DAG: st.param.b8 [func_retval0+4], +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], ; CHECK-DAG: st.param.b8 [func_retval0+8], -; CHECK-DAG: st.param.b32 [func_retval0+9], -; CHECK-DAG: st.param.b32 [func_retval0+13], -; CHECK-DAG: st.param.b64 [func_retval0+17], +; CHECK-DAG: st.param.b8 [func_retval0+9], +; CHECK-DAG: st.param.b8 [func_retval0+10], +; CHECK-DAG: st.param.b8 [func_retval0+11], +; CHECK-DAG: st.param.b8 [func_retval0+12], +; CHECK-DAG: st.param.b8 [func_retval0+13], +; CHECK-DAG: st.param.b8 [func_retval0+14], +; CHECK-DAG: st.param.b8 [func_retval0+15], +; CHECK-DAG: st.param.b8 [func_retval0+16], +; CHECK-DAG: st.param.b8 [func_retval0+17], +; CHECK-DAG: st.param.b8 [func_retval0+18], +; CHECK-DAG: st.param.b8 [func_retval0+19], +; CHECK-DAG: st.param.b8 [func_retval0+20], +; CHECK-DAG: st.param.b8 [func_retval0+21], +; CHECK-DAG: st.param.b8 [func_retval0+22], +; CHECK-DAG: st.param.b8 [func_retval0+23], +; CHECK-DAG: st.param.b8 [func_retval0+24], define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll new file mode 100644 index 0000000..40a3e9e --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll @@ -0,0 +1,385 @@ +; Verifies correctness of load/store of parameters and return values. 
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %} + +%s_i8i16p = type { <{ i16, i8, i16 }>, i64 } +%s_i8i32p = type { <{ i32, i8, i32 }>, i64 } +%s_i8i64p = type { <{ i64, i8, i64 }>, i64 } +%s_i8f16p = type { <{ half, i8, half }>, i64 } +%s_i8f16x2p = type { <{ <2 x half>, i8, <2 x half> }>, i64 } +%s_i8f32p = type { <{ float, i8, float }>, i64 } +%s_i8f64p = type { <{ double, i8, double }>, i64 } + +; -- All loads/stores from parameters aligned by one must be done one +; byte at a time. +; -- Notes: +; -- There are two fields of interest in the packed part of the struct, one +; with a proper offset and one without. The former should be loaded or +; stored as a whole, and the latter by bytes. +; -- Only loading and storing the said fields are checked in the following +; series of tests so that they are more concise. + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16]) +; CHECK-LABEL: test_s_i8i16p( +; CHECK: .param .align 8 .b8 test_s_i8i16p_param_0[16] +; CHECK-DAG: ld.param.u16 [[P0:%rs[0-9]+]], [test_s_i8i16p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8i16p_param_0+3]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8i16p_param_0+4]; +; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[16]; +; CHECK-DAG: st.param.b16 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]]; +; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]]; +; CHECK: .param .align 8 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8i16p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+3]; +; CHECK-DAG: 
ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+4]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]]; +; CHECK-DAG: shl.b16 [[R2_1_shl:%rs[0-9]+]], [[R2_1]], 8; +; CHECK-DAG: and.b16 [[R2_0_and:%rs[0-9]+]], [[R2_0]], 255; +; CHECK-DAG: or.b16 [[R2:%rs[0-9]+]], [[R2_0_and]], [[R2_1_shl]]; +; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2]]; +; CHECK-DAG: and.b16 [[R2_1_and:%rs[0-9]+]], [[R2_1]], 255; +; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2_1_and]]; +; CHECK: ret; + +define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { + %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) + ret %s_i8i16p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i8i32p( +; CHECK: .param .align 8 .b8 test_s_i8i32p_param_0[24] +; CHECK-DAG: ld.param.u32 [[P0:%r[0-9]+]], [test_s_i8i32p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8i32p_param_0+5]; +; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8i32p_param_0+6]; +; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8i32p_param_0+7]; +; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8i32p_param_0+8]; +; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]]; +; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; +; CHECK: { // callseq +; CHECK-DAG: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.b32 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; +; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; +; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: 
call.uni (retval0), +; CHECK-NEXT: test_s_i8i32p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK: ret; + +define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { + %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) + ret %s_i8i32p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32]) +; CHECK-LABEL: test_s_i8i64p( +; CHECK: .param .align 8 .b8 test_s_i8i64p_param_0[32] +; CHECK-DAG: ld.param.u64 [[P0:%rd[0-9]+]], [test_s_i8i64p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8i64p_param_0+9]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8i64p_param_0+10]; +; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8i64p_param_0+11]; +; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8i64p_param_0+12]; +; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8i64p_param_0+13]; +; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8i64p_param_0+14]; +; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8i64p_param_0+15]; +; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8i64p_param_0+16]; +; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]]; 
+; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8; +; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16; +; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24; +; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]]; +; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]]; +; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]]; +; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32; +; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]]; +; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16; +; CHECK-DAG: shr.u64 [[P2_shr_3:%rd[0-9]+]], [[P2]], 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16; +; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[32]; +; CHECK-DAG: st.param.b64 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+9], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]]; +; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]]; +; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]]; +; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]]; +; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]]; +; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]]; +; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]]; +; CHECK: .param .align 8 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8i64p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b64 [[R0:%rd[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12]; +; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13]; +; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], 
[retval0+14]; +; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15]; +; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b64 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+9], +; CHECK-DAG: st.param.b8 [func_retval0+10], +; CHECK-DAG: st.param.b8 [func_retval0+11], +; CHECK-DAG: st.param.b8 [func_retval0+12], +; CHECK-DAG: st.param.b8 [func_retval0+13], +; CHECK-DAG: st.param.b8 [func_retval0+14], +; CHECK-DAG: st.param.b8 [func_retval0+15], +; CHECK-DAG: st.param.b8 [func_retval0+16], +; CHECK: ret; + +define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { + %r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) + ret %s_i8i64p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16]) +; CHECK-LABEL: test_s_i8f16p( +; CHECK: .param .align 8 .b8 test_s_i8f16p_param_0[16] +; CHECK-DAG: ld.param.b16 [[P0:%rs[0-9]+]], [test_s_i8f16p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8f16p_param_0+3]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8f16p_param_0+4]; +; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[16]; +; CHECK-DAG: st.param.b16 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]]; +; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]]; +; CHECK: .param .align 8 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f16p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2I_0:%rs[0-9]+]], [retval0+3]; +; CHECK-DAG: ld.param.b8 [[R2I_1:%rs[0-9]+]], [retval0+4]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]]; +; CHECK-DAG: shl.b16 [[R2I_1_shl:%rs[0-9]+]], [[R2I_1]], 8; +; CHECK-DAG: and.b16 [[R2I_0_and:%rs[0-9]+]], [[R2I_0]], 255; +; CHECK-DAG: or.b16 
[[R2I:%rs[0-9]+]], [[R2I_0_and]], [[R2I_1_shl]]; +; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2I]]; +; CHECK-DAG: and.b16 [[R2I_1_and:%rs[0-9]+]], [[R2I_1]], 255; +; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2I_1_and]]; +; CHECK: ret; + +define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { + %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) + ret %s_i8f16p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i8f16x2p( +; CHECK: .param .align 8 .b8 test_s_i8f16x2p_param_0[24] +; CHECK-DAG: ld.param.b32 [[P0:%r[0-9]+]], [test_s_i8f16x2p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f16x2p_param_0+5]; +; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f16x2p_param_0+6]; +; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f16x2p_param_0+7]; +; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f16x2p_param_0+8]; +; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]]; +; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; +; CHECK: { // callseq +; CHECK-DAG: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.b32 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; +; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; +; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f16x2p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; +; CHECK-DAG: 
ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK: ret; + +define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { + %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) + ret %s_i8f16x2p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i8f32p( +; CHECK: .param .align 8 .b8 test_s_i8f32p_param_0[24] +; CHECK-DAG: ld.param.f32 [[P0:%f[0-9]+]], [test_s_i8f32p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f32p_param_0+5]; +; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f32p_param_0+6]; +; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f32p_param_0+7]; +; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f32p_param_0+8]; +; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]]; +; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; +; CHECK: { // callseq +; CHECK-DAG: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.f32 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; +; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; +; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f32p, +; 
CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.f32 [[R0:%f[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; +; CHECK: } // callseq +; CHECK-DAG: st.param.f32 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK: ret; + +define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { + %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) + ret %s_i8f32p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32]) +; CHECK-LABEL: test_s_i8f64p( +; CHECK: .param .align 8 .b8 test_s_i8f64p_param_0[32] +; CHECK-DAG: ld.param.f64 [[P0:%fd[0-9]+]], [test_s_i8f64p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8f64p_param_0+9]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8f64p_param_0+10]; +; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8f64p_param_0+11]; +; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8f64p_param_0+12]; +; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8f64p_param_0+13]; +; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8f64p_param_0+14]; +; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8f64p_param_0+15]; +; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8f64p_param_0+16]; +; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]]; +; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], 
[[P2_5]], 8; +; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16; +; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24; +; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]]; +; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]]; +; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]]; +; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32; +; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]]; +; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16; +; CHECK-DAG: shr.u64 [[P2_shr_3:%rd[0-9]+]], [[P2]], 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16; +; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[32]; +; CHECK-DAG: st.param.f64 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+9], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]]; +; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]]; +; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]]; +; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]]; +; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]]; +; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]]; +; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]]; +; CHECK: .param .align 8 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f64p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.f64 [[R0:%fd[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12]; +; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13]; +; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], [retval0+14]; +; CHECK-DAG: ld.param.b8 
[[R2_6:%rs[0-9]+]], [retval0+15]; +; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16]; +; CHECK: } // callseq +; CHECK-DAG: st.param.f64 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+9], +; CHECK-DAG: st.param.b8 [func_retval0+10], +; CHECK-DAG: st.param.b8 [func_retval0+11], +; CHECK-DAG: st.param.b8 [func_retval0+12], +; CHECK-DAG: st.param.b8 [func_retval0+13], +; CHECK-DAG: st.param.b8 [func_retval0+14], +; CHECK-DAG: st.param.b8 [func_retval0+15], +; CHECK-DAG: st.param.b8 [func_retval0+16], +; CHECK: ret; + +define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) { + %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) + ret %s_i8f64p %r +} |