diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2025-04-01 08:39:29 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-04-01 08:39:29 +0100 |
commit | c5afcfe0bb44067b2cd050ed9cff311eada9cc37 (patch) | |
tree | 89d9bbfb93b413ac35bc4f03d89b6d5ec188c3f7 /llvm/lib | |
parent | 5ff8c036063d83c6eff495de7709b12875113d62 (diff) | |
download | llvm-c5afcfe0bb44067b2cd050ed9cff311eada9cc37.zip llvm-c5afcfe0bb44067b2cd050ed9cff311eada9cc37.tar.gz llvm-c5afcfe0bb44067b2cd050ed9cff311eada9cc37.tar.bz2 |
[X86] combineINSERT_SUBVECTOR - fold insert_subvector(base,extract_subvector(broadcast)) -> blend shuffle(base,broadcast) (REAPPLIED) (#133724)
If the broadcast is already the full vector width, prefer a blend/vshuff64x2 over a vector insertion: the blend/shuffle usually has lower latency (and sometimes a lower uop count), and it reduces changes in vector size that can interfere with further combines.
Updated version of #133083, which led to infinite loops because shuffle lowering recreated the INSERT_SUBVECTOR pattern; this variant creates the BLENDI/SHUF128 nodes directly.
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 24 |
1 files changed, 24 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 76de7e8..5fff78f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58896,6 +58896,30 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask); } + // If we're broadcasting, see if we can use a blend instead of + // extract/insert pair. Ensure that the subvector is aligned with the + // insertion/extractions. + if ((ExtIdxVal % SubVecNumElts) == 0 && (IdxVal % SubVecNumElts) == 0 && + (ExtSrc.getOpcode() == X86ISD::VBROADCAST || + ExtSrc.getOpcode() == X86ISD::VBROADCAST_LOAD || + (ExtSrc.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && + cast<MemIntrinsicSDNode>(ExtSrc)->getMemoryVT() == SubVecVT))) { + if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + uint64_t BlendMask = IdxVal == 0 ? 0x0F : 0xF0; + SDValue Blend = DAG.getNode( + X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec), + DAG.getBitcast(MVT::v8f32, ExtSrc), + DAG.getTargetConstant(BlendMask, dl, MVT::i8)); + return DAG.getBitcast(OpVT, Blend); + } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) { + SDValue Lo = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? ExtSrc : Vec); + SDValue Hi = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? Vec : ExtSrc); + SDValue Shuffle = + DAG.getNode(X86ISD::SHUF128, dl, MVT::v8f64, Lo, Hi, + getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG)); + return DAG.getBitcast(OpVT, Shuffle); + } + } } // Match concat_vector style patterns. |