Diffstat (limited to 'llvm/lib')
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 124
 llvm/lib/Target/PowerPC/PPCInstrVSX.td      | 133
 2 files changed, 172 insertions(+), 85 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index c62fedf..37e1568 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9562,7 +9562,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// which is strictly wider than the loaded value by 8 bytes. So we need to
// adjust the splat index to point to the correct address in memory.
if (IsPermutedLoad) {
- assert(isLittleEndian && "Unexpected permuted load on big endian target");
+ assert((isLittleEndian || IsFourByte) &&
+ "Unexpected size for permuted load on big endian target");
SplatIdx += IsFourByte ? 2 : 1;
assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
"Splat of a value outside of the loaded memory");
@@ -9577,6 +9578,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
else
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
+ // If the width of the load is the same as the width of the splat,
+ // loading with an offset would load the wrong memory.
+ if (LD->getValueType(0).getSizeInBits() == (IsFourByte ? 32 : 64))
+ Offset = 0;
+
SDValue BasePtr = LD->getBasePtr();
if (Offset != 0)
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
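For reference, a minimal standalone sketch of the offset arithmetic the two hunks above implement. The four-byte branch of the Offset computation is reconstructed from surrounding context, and the free function itself is hypothetical:

#include <cassert>
#include <cstdint>

// Distills the splat-offset logic: given the splat index into the loaded
// value, compute the byte offset at which to re-load the splatted element.
// IsFourByte selects 4-byte vs. 8-byte elements; LoadWidthInBits is the
// width of the original scalar load.
uint64_t splatLoadOffset(unsigned SplatIdx, bool IsFourByte,
                         bool IsLittleEndian, bool IsPermutedLoad,
                         unsigned LoadWidthInBits) {
  if (IsPermutedLoad) {
    assert((IsLittleEndian || IsFourByte) &&
           "Unexpected size for permuted load on big endian target");
    // The permuted load sees memory 8 bytes to the left of the value.
    SplatIdx += IsFourByte ? 2 : 1;
    assert(SplatIdx < (IsFourByte ? 4u : 2u) &&
           "Splat of a value outside of the loaded memory");
  }
  uint64_t Offset = IsFourByte ? (IsLittleEndian ? (3 - SplatIdx) * 4
                                                 : SplatIdx * 4)
                               : (IsLittleEndian ? (1 - SplatIdx) * 8
                                                 : SplatIdx * 8);
  // The fix in the second hunk: when the load is exactly as wide as the
  // splat element, a non-zero offset would read memory that was never
  // loaded, so re-load from the original address instead.
  if (LoadWidthInBits == (IsFourByte ? 32u : 64u))
    Offset = 0;
  return Offset;
}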
@@ -14200,13 +14206,24 @@ static SDValue isScalarToVec(SDValue Op) {
return SDValue();
}
+// Fix up the shuffle mask to account for the fact that the result of
+// scalar_to_vector is not in lane zero. This just takes all values in
+// the ranges specified by the min/max indices and adds the number of
+// elements required to ensure each element comes from the respective
+// position in the valid lane.
+// On little endian, that's just the corresponding element in the other
+// half of the vector. On big endian, it is in the same half but right
+// justified rather than left justified in that half.
static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
int LHSMaxIdx, int RHSMinIdx,
- int RHSMaxIdx, int HalfVec) {
+ int RHSMaxIdx, int HalfVec,
+ unsigned ValidLaneWidth,
+ const PPCSubtarget &Subtarget) {
for (int i = 0, e = ShuffV.size(); i < e; i++) {
int Idx = ShuffV[i];
if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
- ShuffV[i] += HalfVec;
+ ShuffV[i] +=
+ Subtarget.isLittleEndian() ? HalfVec : HalfVec - ValidLaneWidth;
}
}
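To make the adjustment concrete, a self-contained sketch with values chosen for a v4i32 shuffle whose left input is a permuted scalar_to_vector (the driver values are illustrative, not taken from the patch):

#include <cstdio>
#include <vector>

// Standalone copy of the fixup loop above. On little endian the value sits
// in the other half of the vector (add HalfVec); on big endian it sits in
// the same half but right-justified (add HalfVec - ValidLaneWidth).
static void fixupMask(std::vector<int> &ShuffV, int LHSMaxIdx, int RHSMinIdx,
                      int RHSMaxIdx, int HalfVec, int ValidLaneWidth,
                      bool IsLittleEndian) {
  for (int &Idx : ShuffV)
    if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
      Idx += IsLittleEndian ? HalfVec : HalfVec - ValidLaneWidth;
}

int main() {
  // Only mask entry 0 refers to the scalar (LHSMaxIdx == 1); HalfVec == 2
  // and ValidLaneWidth == 1 for a v4i32 shuffle with no bitcast.
  std::vector<int> BE = {0, 4, 1, 5}, LE = BE;
  fixupMask(BE, 1, 0, 0, 2, 1, /*IsLittleEndian=*/false);
  fixupMask(LE, 1, 0, 0, 2, 1, /*IsLittleEndian=*/true);
  printf("BE: %d %d %d %d\n", BE[0], BE[1], BE[2], BE[3]); // BE: 1 4 1 5
  printf("LE: %d %d %d %d\n", LE[0], LE[1], LE[2], LE[3]); // LE: 2 4 1 5
}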
@@ -14215,7 +14232,8 @@ static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
// In such a case, just change the shuffle mask to extract the element
// from the permuted index.
-static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG) {
+static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) {
SDLoc dl(OrigSToV);
EVT VT = OrigSToV.getValueType();
assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
@@ -14229,8 +14247,14 @@ static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG) {
// Can't handle non-const element indices or different vector types
// for the input to the extract and the output of the scalar_to_vector.
if (Idx && VT == OrigVector.getValueType()) {
- SmallVector<int, 16> NewMask(VT.getVectorNumElements(), -1);
- NewMask[VT.getVectorNumElements() / 2] = Idx->getZExtValue();
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(
+ NumElts > 1 &&
+ "Cannot produce a permuted scalar_to_vector for one element vector");
+ SmallVector<int, 16> NewMask(NumElts, -1);
+ unsigned ResultInElt = NumElts / 2;
+ ResultInElt -= Subtarget.isLittleEndian() ? 0 : 1;
+ NewMask[ResultInElt] = Idx->getZExtValue();
return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask);
}
}
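The only endianness-dependent choice here is which element receives the scalar; a one-function sketch of that computation (the helper name is an assumption, not part of the patch):

#include <cassert>

// Index of the element that holds the scalar in the permuted
// scalar_to_vector built above.
unsigned permutedResultElt(unsigned NumElts, bool IsLittleEndian) {
  assert(NumElts > 1 && "no permuted form for one-element vectors");
  return NumElts / 2 - (IsLittleEndian ? 0 : 1);
}
// permutedResultElt(4, true)  == 2: first element of the high half (LE).
// permutedResultElt(4, false) == 1: rightmost element of the left half (BE).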
@@ -14246,6 +14270,10 @@ static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG) {
// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
// to put the value into element zero. Adjust the shuffle mask so that the
// vector can remain in permuted form (to prevent a swap prior to a shuffle).
+// On big endian targets, this is still useful for SCALAR_TO_VECTOR
+// nodes with elements smaller than doubleword because all the ways
+// of getting scalar data into a vector register put the value in the
+// rightmost element of the left half of the vector.
SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG) const {
SDValue LHS = SVN->getOperand(0);
@@ -14254,10 +14282,12 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
int NumElts = LHS.getValueType().getVectorNumElements();
SDValue Res(SVN, 0);
SDLoc dl(SVN);
+ bool IsLittleEndian = Subtarget.isLittleEndian();
- // None of these combines are useful on big endian systems since the ISA
- // already has a big endian bias.
- if (!Subtarget.isLittleEndian() || !Subtarget.hasVSX())
+ // On little endian targets, do these combines on all VSX targets since
+ // canonical shuffles match efficient permutes. On big endian targets,
+ // this is only useful for targets with direct moves.
+ if (!Subtarget.hasDirectMove() && !(IsLittleEndian && Subtarget.hasVSX()))
return Res;
// If this is not a shuffle of a shuffle and the first element comes from
@@ -14280,6 +14310,18 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements()
: SToVRHS.getValueType().getVectorNumElements();
int NumEltsOut = ShuffV.size();
+ unsigned InElemSizeInBits =
+ SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits()
+ : SToVRHS.getValueType().getScalarSizeInBits();
+ unsigned OutElemSizeInBits = SToVLHS
+ ? LHS.getValueType().getScalarSizeInBits()
+ : RHS.getValueType().getScalarSizeInBits();
+
+ // The width of the "valid lane" (i.e. the lane that contains the value that
+ // is vectorized) needs to be expressed in terms of the number of elements
+ // of the shuffle. It is therefore the ratio of the element sizes before
+ // and after any bitcast.
+ unsigned ValidLaneWidth = InElemSizeInBits / OutElemSizeInBits;
// Initially assume that neither input is permuted. These will be adjusted
// accordingly if either input is.
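For example, a scalar_to_vector of an i64 that is bitcast and shuffled as v16i8 has a valid lane spanning 64 / 8 = 8 shuffle elements. A trivial sketch of the ratio computed above (the helper is illustrative only):

// ValidLaneWidth expresses the scalar_to_vector lane in units of the
// shuffle's element type, i.e. the element-size ratio across any bitcast.
unsigned validLaneWidth(unsigned InElemSizeInBits, unsigned OutElemSizeInBits) {
  return InElemSizeInBits / OutElemSizeInBits;
}
// validLaneWidth(64, 8)  == 8: one i64 lane covers eight v16i8 elements.
// validLaneWidth(32, 32) == 1: no bitcast, one shuffle element per lane.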
@@ -14290,18 +14332,25 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
// Get the permuted scalar to vector nodes for the source(s) that come from
// ISD::SCALAR_TO_VECTOR.
+ // On big endian systems, this only makes sense for element sizes smaller
+ // than 64 bits since for 64-bit elements, all instructions already put
+ // the value into element zero.
if (SToVLHS) {
+ if (!IsLittleEndian && InElemSizeInBits >= 64)
+ return Res;
// Set up the values for the shuffle vector fixup.
LHSMaxIdx = NumEltsOut / NumEltsIn;
- SToVLHS = getSToVPermuted(SToVLHS, DAG);
+ SToVLHS = getSToVPermuted(SToVLHS, DAG, Subtarget);
if (SToVLHS.getValueType() != LHS.getValueType())
SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS);
LHS = SToVLHS;
}
if (SToVRHS) {
+ if (!IsLittleEndian && InElemSizeInBits >= 64)
+ return Res;
RHSMinIdx = NumEltsOut;
RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx;
- SToVRHS = getSToVPermuted(SToVRHS, DAG);
+ SToVRHS = getSToVPermuted(SToVRHS, DAG, Subtarget);
if (SToVRHS.getValueType() != RHS.getValueType())
SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS);
RHS = SToVRHS;
@@ -14311,10 +14360,9 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
// The minimum and maximum indices that correspond to element zero for both
// the LHS and RHS are computed and will control which shuffle mask entries
// are to be changed. For example, if the RHS is permuted, any shuffle mask
- // entries in the range [RHSMinIdx,RHSMaxIdx) will be incremented by
- // HalfVec to refer to the corresponding element in the permuted vector.
+ // entries in the range [RHSMinIdx,RHSMaxIdx) will be adjusted.
fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx,
- HalfVec);
+ HalfVec, ValidLaneWidth, Subtarget);
Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
// We may have simplified away the shuffle. We won't be able to do anything
@@ -14324,12 +14372,13 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
}
+ SDValue TheSplat = IsLittleEndian ? RHS : LHS;
// The common case after we commuted the shuffle is that the RHS is a splat
// and we have elements coming in from the splat at indices that are not
// conducive to using a merge.
// Example:
// vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
- if (!isSplatBV(RHS))
+ if (!isSplatBV(TheSplat))
return Res;
// We are looking for a mask such that all even elements are from
@@ -14339,24 +14388,41 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
// Adjust the mask so we are pulling in the same index from the splat
// as the index from the interesting vector in consecutive elements.
- // Example (even elements from first vector):
- // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
- if (Mask[0] < NumElts)
- for (int i = 1, e = Mask.size(); i < e; i += 2)
- ShuffV[i] = (ShuffV[i - 1] + NumElts);
- // Example (odd elements from first vector):
- // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
- else
- for (int i = 0, e = Mask.size(); i < e; i += 2)
- ShuffV[i] = (ShuffV[i + 1] + NumElts);
+ if (IsLittleEndian) {
+ // Example (even elements from first vector):
+ // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
+ if (Mask[0] < NumElts)
+ for (int i = 1, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = (ShuffV[i - 1] + NumElts);
+ // Example (odd elements from first vector):
+ // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
+ else
+ for (int i = 0, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = (ShuffV[i + 1] + NumElts);
+ } else {
+ // Example (even elements from first vector):
+ // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> <zero>, t1
+ if (Mask[0] < NumElts)
+ for (int i = 0, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = ShuffV[i + 1] - NumElts;
+ // Example (odd elements from first vector):
+ // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> <zero>, t1
+ else
+ for (int i = 1, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = ShuffV[i - 1] - NumElts;
+ }
// If the splat input has undefs, we need to remove them since we may have
// created a shuffle that adds those instead of the splat value.
- SDValue SplatVal = cast<BuildVectorSDNode>(RHS.getNode())->getSplatValue();
- RHS = DAG.getSplatBuildVector(RHS.getValueType(), dl, SplatVal);
+ SDValue SplatVal =
+ cast<BuildVectorSDNode>(TheSplat.getNode())->getSplatValue();
+ TheSplat = DAG.getSplatBuildVector(TheSplat.getValueType(), dl, SplatVal);
- Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
- return Res;
+ if (IsLittleEndian)
+ RHS = TheSplat;
+ else
+ LHS = TheSplat;
+ return DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
}
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
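To trace the splat-merge adjustment end to end: on little endian the splat is the RHS and the odd mask slots are rewritten from their even neighbours (plus NumElts); on big endian the splat is the LHS and the even slots are rewritten (minus NumElts). A runnable sketch of the little endian, even-elements-from-first-vector case, using the mask from the comment in the hunk above:

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 16;
  // vector_shuffle<0,17,1,19,...> t1, <splat>: elements of t1 interleaved
  // with splat elements at indices not conducive to a merge.
  std::vector<int> ShuffV = {0, 17, 1, 19, 2, 21, 3, 23,
                             4, 25, 5, 27, 6, 29, 7, 31};
  // Little endian, even elements from the first vector: make each odd slot
  // pull the same index from the splat so the shuffle becomes a merge.
  if (ShuffV[0] < NumElts)
    for (size_t i = 1; i < ShuffV.size(); i += 2)
      ShuffV[i] = ShuffV[i - 1] + NumElts;
  for (int I : ShuffV)
    printf("%d ", I);
  printf("\n"); // prints: 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23
}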
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 869e06c..e57f299 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3088,6 +3088,8 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps, IsBigEndian] in {
def : Pat<(store v4i32:$XT, xoaddr:$dst), (STXVW4X $XT, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
(STXVW4X $rS, xoaddr:$dst)>;
+ def : Pat<(v2i64 (scalar_to_vector (i64 (load xoaddr:$src)))),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 xoaddr:$src), sub_64)>;
} // HasVSX, HasOnlySwappingMemOps, IsBigEndian
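The defm lines below instantiate the ScalToVecWPermute multiclass defined earlier in PPCInstrVSX.td: the third operand is the output pattern for a plain scalar_to_vector and the fourth is the pattern for the permuted PPCSToV node produced by combineVectorShuffle. Roughly, and only as a sketch from memory (the exact definition and parameter names in the file may differ):

multiclass ScalToVecWPermute<ValueType Ty, dag In, dag NonPermOut,
                             dag PermOut> {
  // Canonical scalar_to_vector: the value must land in element zero.
  def : Pat<(Ty (scalar_to_vector In)), (Ty NonPermOut)>;
  // Permuted form: the value may stay wherever the instruction puts it.
  def : Pat<(Ty (PPCSToV In)), (Ty PermOut)>;
}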
// Any Power8 VSX subtarget.
@@ -3181,8 +3183,7 @@ def : Pat<DWToSPExtractConv.El1US1,
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
// v4f32 scalar <-> vector conversions (BE)
-def : Pat<(v4f32 (scalar_to_vector f32:$A)),
- (v4f32 (XSCVDPSPN $A))>;
+defm : ScalToVecWPermute<v4f32, (f32 f32:$A), (XSCVDPSPN $A), (XSCVDPSPN $A)>;
def : Pat<(f32 (vector_extract v4f32:$S, 0)),
(f32 (XSCVSPDPN $S))>;
def : Pat<(f32 (vector_extract v4f32:$S, 1)),
@@ -3228,10 +3229,14 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
(v2i64 (SUBREG_TO_REG (i64 1), (LIWAX xoaddr:$src), sub_64))>;
def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
(v2i64 (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$src), sub_64))>;
-def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
- (v4i32 (XXSLDWIs (LIWZX xoaddr:$src), 1))>;
-def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
- (v4f32 (XXSLDWIs (LIWZX xoaddr:$src), 1))>;
+defm : ScalToVecWPermute<
+ v4i32, (i32 (load xoaddr:$src)),
+ (XXSLDWIs (LIWZX xoaddr:$src), 1),
+ (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v4f32, (f32 (load xoaddr:$src)),
+ (XXSLDWIs (LIWZX xoaddr:$src), 1),
+ (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$src), sub_64)>;
def : Pat<DWToSPExtractConv.BVU,
(v4f32 (VPKUDUM (XXSLDWI (XVCVUXDSP $S1), (XVCVUXDSP $S1), 3),
@@ -3272,12 +3277,9 @@ def : Pat<DWToSPExtractConv.El1US1,
(f64 (COPY_TO_REGCLASS $S1, VSRC)), VSFRC)))>;
// v4f32 scalar <-> vector conversions (LE)
- // The permuted version is no better than the version that puts the value
- // into the right element because XSCVDPSPN is different from all the other
- // instructions used for PPCSToV.
defm : ScalToVecWPermute<v4f32, (f32 f32:$A),
(XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1),
- (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 3)>;
+ (XSCVDPSPN $A)>;
def : Pat<(f32 (vector_extract v4f32:$S, 0)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 1)),
@@ -3439,12 +3441,18 @@ def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
// Big endian VSX subtarget with direct moves.
let Predicates = [HasVSX, HasDirectMove, IsBigEndian] in {
// v16i8 scalar <-> vector conversions (BE)
-def : Pat<(v16i8 (scalar_to_vector i32:$A)),
- (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>;
-def : Pat<(v8i16 (scalar_to_vector i32:$A)),
- (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>;
-def : Pat<(v4i32 (scalar_to_vector i32:$A)),
- (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
+defm : ScalToVecWPermute<
+ v16i8, (i32 i32:$A),
+ (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64),
+ (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
+defm : ScalToVecWPermute<
+ v8i16, (i32 i32:$A),
+ (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64),
+ (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
+defm : ScalToVecWPermute<
+ v4i32, (i32 i32:$A),
+ (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64),
+ (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
def : Pat<(v2i64 (scalar_to_vector i64:$A)),
(v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
@@ -3770,33 +3778,39 @@ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
// Build vectors from i8 loads
defm : ScalToVecWPermute<v8i16, ScalarLoads.ZELi8,
(VSPLTHs 3, (LXSIBZX xoaddr:$src)),
- (VSPLTHs 3, (LXSIBZX xoaddr:$src))>;
+ (SUBREG_TO_REG (i64 1), (LXSIBZX xoaddr:$src), sub_64)>;
defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi8,
(XXSPLTWs (LXSIBZX xoaddr:$src), 1),
- (XXSPLTWs (LXSIBZX xoaddr:$src), 1)>;
+ (SUBREG_TO_REG (i64 1), (LXSIBZX xoaddr:$src), sub_64)>;
defm : ScalToVecWPermute<v2i64, ScalarLoads.ZELi8i64,
(XXPERMDIs (LXSIBZX xoaddr:$src), 0),
- (XXPERMDIs (LXSIBZX xoaddr:$src), 0)>;
-defm : ScalToVecWPermute<v4i32, ScalarLoads.SELi8,
- (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1),
- (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1)>;
-defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi8i64,
- (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0),
- (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0)>;
+ (SUBREG_TO_REG (i64 1), (LXSIBZX xoaddr:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v4i32, ScalarLoads.SELi8,
+ (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1),
+ (SUBREG_TO_REG (i64 1), (VEXTSB2Ws (LXSIBZX xoaddr:$src)), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, ScalarLoads.SELi8i64,
+ (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0),
+ (SUBREG_TO_REG (i64 1), (VEXTSB2Ds (LXSIBZX xoaddr:$src)), sub_64)>;
// Build vectors from i16 loads
-defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi16,
- (XXSPLTWs (LXSIHZX xoaddr:$src), 1),
- (XXSPLTWs (LXSIHZX xoaddr:$src), 1)>;
-defm : ScalToVecWPermute<v2i64, ScalarLoads.ZELi16i64,
- (XXPERMDIs (LXSIHZX xoaddr:$src), 0),
- (XXPERMDIs (LXSIHZX xoaddr:$src), 0)>;
-defm : ScalToVecWPermute<v4i32, ScalarLoads.SELi16,
- (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1),
- (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1)>;
-defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi16i64,
- (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0),
- (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0)>;
+defm : ScalToVecWPermute<
+ v4i32, ScalarLoads.ZELi16,
+ (XXSPLTWs (LXSIHZX xoaddr:$src), 1),
+ (SUBREG_TO_REG (i64 1), (LXSIHZX xoaddr:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, ScalarLoads.ZELi16i64,
+ (XXPERMDIs (LXSIHZX xoaddr:$src), 0),
+ (SUBREG_TO_REG (i64 1), (LXSIHZX xoaddr:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v4i32, ScalarLoads.SELi16,
+ (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1),
+ (SUBREG_TO_REG (i64 1), (VEXTSH2Ws (LXSIHZX xoaddr:$src)), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, ScalarLoads.SELi16i64,
+ (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0),
+ (SUBREG_TO_REG (i64 1), (VEXTSH2Ds (LXSIHZX xoaddr:$src)), sub_64)>;
// Load/convert and convert/store patterns for f16.
def : Pat<(f64 (extloadf16 xoaddr:$src)),
@@ -3938,7 +3952,8 @@ def : Pat<(f32 (PPCxsminc f32:$XA, f32:$XB)),
VSSRC))>;
// Endianness-neutral patterns for const splats with ISA 3.0 instructions.
-defm : ScalToVecWPermute<v4i32, (i32 i32:$A), (MTVSRWS $A), (MTVSRWS $A)>;
+defm : ScalToVecWPermute<v4i32, (i32 i32:$A), (MTVSRWS $A),
+ (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
(v4i32 (MTVSRWS $A))>;
def : Pat<(v16i8 (build_vector immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
@@ -3950,12 +3965,14 @@ def : Pat<(v16i8 (build_vector immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A)),
(v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
-defm : ScalToVecWPermute<v4i32, FltToIntLoad.A,
- (XVCVSPSXWS (LXVWSX xoaddr:$A)),
- (XVCVSPSXWS (LXVWSX xoaddr:$A))>;
-defm : ScalToVecWPermute<v4i32, FltToUIntLoad.A,
- (XVCVSPUXWS (LXVWSX xoaddr:$A)),
- (XVCVSPUXWS (LXVWSX xoaddr:$A))>;
+defm : ScalToVecWPermute<
+ v4i32, FltToIntLoad.A,
+ (XVCVSPSXWS (LXVWSX xoaddr:$A)),
+ (XVCVSPSXWS (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$A), sub_64))>;
+defm : ScalToVecWPermute<
+ v4i32, FltToUIntLoad.A,
+ (XVCVSPUXWS (LXVWSX xoaddr:$A)),
+ (XVCVSPUXWS (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$A), sub_64))>;
defm : ScalToVecWPermute<
v4i32, DblToIntLoadP9.A,
(XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPSXWS (DFLOADf64 iaddrX4:$A)), sub_64), 1),
@@ -3991,13 +4008,15 @@ let Predicates = [HasVSX, HasP9Vector, NoP10Vector] in {
// COPY_TO_REGCLASS. The COPY_TO_REGCLASS makes it appear to need two instructions
// to perform the operation, when only one instruction is produced in practice.
// The NoP10Vector predicate excludes these patterns from Power10 VSX subtargets.
-defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
- (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
- (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
+defm : ScalToVecWPermute<
+ v16i8, ScalarLoads.Li8,
+ (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
+ (SUBREG_TO_REG (i64 1), (LXSIBZX xoaddr:$src), sub_64)>;
// Build vectors from i16 loads
-defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
- (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
- (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
+defm : ScalToVecWPermute<
+ v8i16, ScalarLoads.Li16,
+ (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
+ (SUBREG_TO_REG (i64 1), (LXSIHZX xoaddr:$src), sub_64)>;
} // HasVSX, HasP9Vector, NoP10Vector
// Any big endian Power9 VSX subtarget
@@ -4005,13 +4024,15 @@ let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in {
// Power10 VSX subtargets produce a shorter pattern for little endian targets
// but this is still the best pattern for Power9 and Power10 VSX big endian
// Build vectors from i8 loads
-defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
- (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
- (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
+defm : ScalToVecWPermute<
+ v16i8, ScalarLoads.Li8,
+ (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
+ (SUBREG_TO_REG (i64 1), (LXSIBZX xoaddr:$src), sub_64)>;
// Build vectors from i16 loads
-defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
- (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
- (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
+defm : ScalToVecWPermute<
+ v8i16, ScalarLoads.Li16,
+ (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
+ (SUBREG_TO_REG (i64 1), (LXSIHZX xoaddr:$src), sub_64)>;
} // HasVSX, HasP9Vector, IsBigEndian
// Big endian 64Bit Power9 subtarget.