aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2025-04-01 08:39:29 +0100
committerGitHub <noreply@github.com>2025-04-01 08:39:29 +0100
commitc5afcfe0bb44067b2cd050ed9cff311eada9cc37 (patch)
tree89d9bbfb93b413ac35bc4f03d89b6d5ec188c3f7 /llvm/lib
parent5ff8c036063d83c6eff495de7709b12875113d62 (diff)
downloadllvm-c5afcfe0bb44067b2cd050ed9cff311eada9cc37.zip
llvm-c5afcfe0bb44067b2cd050ed9cff311eada9cc37.tar.gz
llvm-c5afcfe0bb44067b2cd050ed9cff311eada9cc37.tar.bz2
[X86] combineINSERT_SUBVECTOR - fold insert_subvector(base,extract_subvector(broadcast)) -> blend shuffle(base,broadcast) (REAPPLIED) (#133724)
If the broadcast is already the full vector width, try to prefer a blend/vshuff64x2 over a vector insertion, which is usually lower latency (and sometimes a lower uop count) and reduces changes in vector sizes that can interfere with further combines. This is an updated version of #133083, which led to infinite loops due to shuffle lowering recreating the INSERT_SUBVECTOR pattern; this variant creates the BLENDI/SHUF128 nodes directly.
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp24
1 files changed, 24 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 76de7e8..5fff78f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58896,6 +58896,30 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
}
+ // If we're broadcasting, see if we can use a blend instead of
+ // extract/insert pair. Ensure that the subvector is aligned with the
+ // insertion/extractions.
+ if ((ExtIdxVal % SubVecNumElts) == 0 && (IdxVal % SubVecNumElts) == 0 &&
+ (ExtSrc.getOpcode() == X86ISD::VBROADCAST ||
+ ExtSrc.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ (ExtSrc.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(ExtSrc)->getMemoryVT() == SubVecVT))) {
+ if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
+ uint64_t BlendMask = IdxVal == 0 ? 0x0F : 0xF0;
+ SDValue Blend = DAG.getNode(
+ X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
+ DAG.getBitcast(MVT::v8f32, ExtSrc),
+ DAG.getTargetConstant(BlendMask, dl, MVT::i8));
+ return DAG.getBitcast(OpVT, Blend);
+ } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
+ SDValue Lo = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? ExtSrc : Vec);
+ SDValue Hi = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? Vec : ExtSrc);
+ SDValue Shuffle =
+ DAG.getNode(X86ISD::SHUF128, dl, MVT::v8f64, Lo, Hi,
+ getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
+ return DAG.getBitcast(OpVT, Shuffle);
+ }
+ }
}
// Match concat_vector style patterns.