author    Peter Collingbourne <peter@pcc.me.uk>  2025-05-27 21:55:04 -0700
committer Peter Collingbourne <peter@pcc.me.uk>  2025-05-27 21:55:04 -0700
commit    b74ffea23f0cd7f96f35f467da82a302c425e9db
tree      f0d755b2f88916c2d8be848727eb94a5e9b3cd2a
parent    782a9e9f64dfa21ca21f4b5c09569b35dd041bb0
parent    852b18f51c4f3636b8850c74497e96ec22667514
branch    llvm users/pcc/spr/codegenprepare-x86-support-sinking-phi-nodes
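
CodeGenPrepare: X86: Support sinking phi nodes

tryToSinkFreeOperands previously refused to sink an operand defined by a
PHI node. It now looks through PHIs whose incoming values are all the same
value or pairwise identical instructions, using the new
PHINode::hasIdenticalValue() helper, and sinks a clone of one
representative definition instead. X86TTIImpl::isProfitableToSinkOperands
is extended the same way for shift and funnel-shift amounts, and it now
also treats a cast of a constant expression, the pattern LowerTypeTests
emits for rotate amounts derived from global addresses, as profitable to
sink, so that hoisted or CSE'd zexts of constants are rematerialized next
to their users before instruction selection.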
Created using spr 1.3.6-beta.1
 llvm/include/llvm/IR/Instructions.h                                        |   5 +
 llvm/lib/CodeGen/CodeGenPrepare.cpp                                        |  17 +-
 llvm/lib/IR/Instructions.cpp                                               |  35 +++
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp                             |  35 ++-
 llvm/test/Transforms/CodeGenPrepare/X86/inhibit-zext-constant-hoist-phi.ll |  92 ++++++
 llvm/test/Transforms/CodeGenPrepare/X86/inhibit-zext-constant-hoist.ll     | 178 ++++++++++
 6 files changed, 349 insertions(+), 13 deletions(-)
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index c164f76..e822903 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -2822,6 +2822,11 @@ public:
/// non-undef value.
bool hasConstantOrUndefValue() const;
+ /// If the specified PHI node (possibly via other PHI nodes) merges together
+ /// the same or identical (i.e. Instruction::isIdenticalTo() returns true)
+ /// values, return one of the values, otherwise return null.
+ Value *hasIdenticalValue();
+
/// If the PHI node is complete which means all of its parent's predecessors
/// have incoming value in this PHI, return true, otherwise return false.
bool isComplete() const {
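A minimal usage sketch of the new API, assuming only the declaration
above; the helper name getSoleMergedInstruction is hypothetical, not part
of the patch:

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Returns the single instruction that PN transparently merges, or null.
  static Instruction *getSoleMergedInstruction(PHINode *PN) {
    // hasIdenticalValue() may return null, and the merged value may be a
    // non-instruction such as an argument, hence dyn_cast_or_null.
    return dyn_cast_or_null<Instruction>(PN->hasIdenticalValue());
  }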
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 5226302..422916a 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -7739,9 +7739,14 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
for (Use *U : reverse(OpsToSink)) {
auto *UI = cast<Instruction>(U->get());
- if (isa<PHINode>(UI))
- continue;
- if (UI->getParent() == TargetBB) {
+ if (auto *PN = dyn_cast<PHINode>(UI)) {
+ auto *I0 = dyn_cast_or_null<Instruction>(PN->hasIdenticalValue());
+ if (!I0)
+ continue;
+ if (I0->getParent() == TargetBB &&
+ InstOrdering[I0] < InstOrdering[InsertPoint])
+ InsertPoint = I0;
+ } else if (UI->getParent() == TargetBB) {
if (InstOrdering[UI] < InstOrdering[InsertPoint])
InsertPoint = UI;
continue;
@@ -7753,7 +7758,11 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
DenseMap<Instruction *, Instruction *> NewInstructions;
for (Use *U : ToReplace) {
auto *UI = cast<Instruction>(U->get());
- Instruction *NI = UI->clone();
+ Instruction *NI;
+ if (auto *PN = dyn_cast<PHINode>(UI))
+ NI = cast<Instruction>(PN->hasIdenticalValue())->clone();
+ else
+ NI = UI->clone();
if (IsHugeFunc) {
// Now we clone an instruction, its operands' defs may sink to this BB
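Both hunks treat a PHI over identical definitions as a transparent alias
for one of them. Distilled into one hypothetical helper (a sketch, not
the patch's code; the NewInstructions map and huge-function bookkeeping
shown above are elided):

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Redirect a use chosen for sinking to a clone placed in the target
  // block; for a PHI, clone the representative identical definition.
  static void sinkUseToClone(Use *U, Instruction *InsertPoint) {
    auto *UI = cast<Instruction>(U->get());
    Instruction *Source = UI;
    if (auto *PN = dyn_cast<PHINode>(UI))
      Source = cast<Instruction>(PN->hasIdenticalValue());
    Instruction *NI = Source->clone();            // clone the def
    NI->insertBefore(InsertPoint->getIterator()); // sink into target BB
    U->set(NI);                                   // rewire the use
  }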
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index f404e11..9c59378 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -48,6 +48,7 @@
#include <cassert>
#include <cstdint>
#include <optional>
+#include <set>
#include <vector>
using namespace llvm;
@@ -239,6 +240,40 @@ bool PHINode::hasConstantOrUndefValue() const {
return true;
}
+/// If the specified PHI node (possibly via other PHI nodes) merges together the
+/// same or identical (i.e. Instruction::isIdenticalTo() returns true) values,
+/// return one of the values, otherwise return null.
+Value *PHINode::hasIdenticalValue() {
+ std::vector<PHINode *> Worklist;
+ std::set<PHINode *> Seen;
+ Value *Result = nullptr;
+ Worklist.push_back(this);
+ while (!Worklist.empty()) {
+ PHINode *PN = Worklist.back();
+ Worklist.pop_back();
+ if (!Seen.insert(PN).second)
+ continue;
+ for (Value *V : PN->incoming_values()) {
+ if (auto *IncomingPN = dyn_cast<PHINode>(V)) {
+ Worklist.push_back(IncomingPN);
+ continue;
+ }
+ if (!Result) {
+ Result = V;
+ continue;
+ }
+ if (V == Result)
+ continue;
+ if (auto *I = dyn_cast<Instruction>(V))
+ if (auto *ResultI = dyn_cast<Instruction>(Result))
+ if (I->isIdenticalTo(ResultI))
+ continue;
+ return nullptr;
+ }
+ }
+ return Result;
+}
+
//===----------------------------------------------------------------------===//
// LandingPadInst Implementation
//===----------------------------------------------------------------------===//
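The traversal above is a standard worklist walk with a Seen set, so it
terminates even on cyclic PHI graphs such as loop-header PHIs that feed
themselves. A self-contained toy analogue of the same algorithm
(illustrative only; plain value equality stands in for both the pointer
comparison and Instruction::isIdenticalTo()):

  #include <set>
  #include <vector>

  // Nodes are either "phi" nodes that merge inputs or leaves carrying a
  // value. Returns the unique leaf value reachable through phis, or
  // nullptr if two leaves disagree.
  struct Node {
    bool IsPhi = false;
    int Leaf = 0;
    std::vector<Node *> In;
  };

  const int *uniqueLeaf(Node *Root) {
    std::vector<Node *> Worklist{Root};
    std::set<Node *> Seen;
    const int *Result = nullptr;
    while (!Worklist.empty()) {
      Node *N = Worklist.back();
      Worklist.pop_back();
      if (!Seen.insert(N).second)
        continue; // already visited: this is what makes cycles terminate
      for (Node *V : N->In) {
        if (V->IsPhi) {
          Worklist.push_back(V); // look through nested phis
          continue;
        }
        if (!Result) {
          Result = &V->Leaf; // first leaf becomes the candidate
          continue;
        }
        if (V->Leaf != *Result)
          return nullptr; // a second, different leaf: no single value
      }
    }
    return Result;
  }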
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 9864adc..5baf5e3 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -7132,10 +7132,8 @@ bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
using namespace llvm::PatternMatch;
FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
- if (!VTy)
- return false;
- if (I->getOpcode() == Instruction::Mul &&
+ if (VTy && I->getOpcode() == Instruction::Mul &&
VTy->getElementType()->isIntegerTy(64)) {
for (auto &Op : I->operands()) {
// Make sure we are not already sinking this operand
@@ -7159,9 +7157,6 @@ bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
return !Ops.empty();
}
- // A uniform shift amount in a vector shift or funnel shift may be much
- // cheaper than a generic variable vector shift, so make that pattern visible
- // to SDAG by sinking the shuffle instruction next to the shift.
int ShiftAmountOpNum = -1;
if (I->isShift())
ShiftAmountOpNum = 1;
@@ -7170,16 +7165,38 @@ bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
II->getIntrinsicID() == Intrinsic::fshr)
ShiftAmountOpNum = 2;
}
-
if (ShiftAmountOpNum == -1)
return false;
+ auto *ShiftAmountUse = &I->getOperandUse(ShiftAmountOpNum);
+
+ Value *ShiftAmount = ShiftAmountUse->get();
+ if (auto *PN = dyn_cast<PHINode>(ShiftAmount)) {
+ ShiftAmount = PN->hasIdenticalValue();
+ if (!ShiftAmount)
+ return false;
+ }
- auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
+ // A uniform shift amount in a vector shift or funnel shift may be much
+ // cheaper than a generic variable vector shift, so make that pattern visible
+ // to SDAG by sinking the shuffle instruction next to the shift.
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(ShiftAmount);
if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
isVectorShiftByScalarCheap(I->getType())) {
- Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
+ Ops.push_back(ShiftAmountUse);
return true;
}
+ // Casts of a constant expression (generally derived from a global
+ // variable address) are profitable to sink: they appear as
+ // subexpressions in the sequence generated by LowerTypeTests, and
+ // sinking them next to the shift lets SDAG fold the constant into the
+ // rotate instruction's immediate operand.
+ if (auto *CI = dyn_cast<CastInst>(ShiftAmount)) {
+ if (isa<ConstantExpr>(CI->getOperand(0))) {
+ Ops.push_back(ShiftAmountUse);
+ return true;
+ }
+ }
+
return false;
}
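Restated as a single predicate, the new shift-amount logic matches two
patterns. This is a sketch using only APIs that appear in the patch;
isProfitableShiftAmount is a hypothetical name, and the target cost check
(isVectorShiftByScalarCheap) and the Ops bookkeeping are deliberately
left out:

  #include "llvm/Analysis/VectorUtils.h" // getSplatIndex
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static bool isProfitableShiftAmount(Value *ShiftAmount) {
    // Look through a PHI that merges identical definitions.
    if (auto *PN = dyn_cast<PHINode>(ShiftAmount)) {
      ShiftAmount = PN->hasIdenticalValue();
      if (!ShiftAmount)
        return false;
    }
    // Pattern 1: a splat shuffle, i.e. a uniform shift amount that SDAG
    // can select as a cheaper vector shift by scalar.
    if (auto *Shuf = dyn_cast<ShuffleVectorInst>(ShiftAmount))
      return getSplatIndex(Shuf->getShuffleMask()) >= 0;
    // Pattern 2: a cast of a constant expression, the shape that
    // LowerTypeTests emits for rotate amounts from global addresses.
    if (auto *CI = dyn_cast<CastInst>(ShiftAmount))
      return isa<ConstantExpr>(CI->getOperand(0));
    return false;
  }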
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/inhibit-zext-constant-hoist-phi.ll b/llvm/test/Transforms/CodeGenPrepare/X86/inhibit-zext-constant-hoist-phi.ll
new file mode 100644
index 0000000..98f71cd
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/inhibit-zext-constant-hoist-phi.ll
@@ -0,0 +1,92 @@
+; Make sure that if a phi with identical incoming values gets created,
+; CodeGenPrepare undoes it.
+
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@__typeid__ZTS1S_global_addr = external hidden global [0 x i8], code_model "small"
+@__typeid__ZTS1S_align = external hidden global [0 x i8], !absolute_symbol !0
+@__typeid__ZTS1S_size_m1 = external hidden global [0 x i8], !absolute_symbol !1
+
+; Check that we recover the third pair of zexts from the phi.
+
+; CHECK: define void @f4
+define void @f4(i1 noundef zeroext %0, ptr noundef %1, ptr noundef %2, ptr noundef %3) {
+ br i1 %0, label %5, label %18
+
+5:
+ %6 = load ptr, ptr %1, align 8
+ %7 = ptrtoint ptr %6 to i64
+ %8 = sub i64 %7, ptrtoint (ptr @__typeid__ZTS1S_global_addr to i64)
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %9 = zext nneg i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8) to i64
+ %10 = lshr i64 %8, %9
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %11 = zext nneg i8 sub (i8 64, i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8)) to i64
+ %12 = shl i64 %8, %11
+ %13 = or i64 %10, %12
+ %14 = icmp ugt i64 %13, ptrtoint (ptr @__typeid__ZTS1S_size_m1 to i64)
+ br i1 %14, label %15, label %16
+
+15:
+ tail call void @llvm.ubsantrap(i8 2)
+ unreachable
+
+16:
+ %17 = load ptr, ptr %6, align 8
+ tail call void %17(ptr noundef nonnull align 8 dereferenceable(8) %1)
+ br label %31
+
+18:
+ %19 = load ptr, ptr %2, align 8
+ %20 = ptrtoint ptr %19 to i64
+ %21 = sub i64 %20, ptrtoint (ptr @__typeid__ZTS1S_global_addr to i64)
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %22 = zext nneg i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8) to i64
+ %23 = lshr i64 %21, %22
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %24 = zext nneg i8 sub (i8 64, i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8)) to i64
+ %25 = shl i64 %21, %24
+ %26 = or i64 %23, %25
+ %27 = icmp ugt i64 %26, ptrtoint (ptr @__typeid__ZTS1S_size_m1 to i64)
+ br i1 %27, label %28, label %29
+
+28:
+ tail call void @llvm.ubsantrap(i8 2)
+ unreachable
+
+29:
+ %30 = load ptr, ptr %19, align 8
+ tail call void %30(ptr noundef nonnull align 8 dereferenceable(8) %2)
+ br label %31
+
+31:
+ %32 = phi i64 [ %24, %29 ], [ %11, %16 ]
+ %33 = phi i64 [ %22, %29 ], [ %9, %16 ]
+ %34 = load ptr, ptr %3, align 8
+ %35 = ptrtoint ptr %34 to i64
+ %36 = sub i64 %35, ptrtoint (ptr @__typeid__ZTS1S_global_addr to i64)
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %37 = lshr i64 %36, %33
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %38 = shl i64 %36, %32
+ %39 = or i64 %37, %38
+ %40 = icmp ugt i64 %39, ptrtoint (ptr @__typeid__ZTS1S_size_m1 to i64)
+ br i1 %40, label %41, label %42
+
+41:
+ tail call void @llvm.ubsantrap(i8 2)
+ unreachable
+
+42:
+ %43 = load ptr, ptr %34, align 8
+ tail call void %43(ptr noundef nonnull align 8 dereferenceable(8) %3)
+ ret void
+}
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.ubsantrap(i8 immarg)
+
+!0 = !{i64 0, i64 256}
+!1 = !{i64 0, i64 128}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/inhibit-zext-constant-hoist.ll b/llvm/test/Transforms/CodeGenPrepare/X86/inhibit-zext-constant-hoist.ll
new file mode 100644
index 0000000..301f2248
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/inhibit-zext-constant-hoist.ll
@@ -0,0 +1,178 @@
+; Make sure that if optimizations hoist or CSE a zext of a constant,
+; CodeGenPrepare undoes it.
+
+; This IR is normally generated by LowerTypeTests during ThinLTO importing
+; so it will go through the ThinLTO pass pipeline.
+; RUN: opt -passes='thinlto<O0>' -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+; RUN: opt -passes='thinlto<O1>' -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+; RUN: opt -passes='thinlto<O2>' -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+; RUN: opt -passes='thinlto<O3>' -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+
+; Also check the regular pipelines for completeness.
+; RUN: opt -O0 -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+; RUN: opt -O1 -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+; RUN: opt -O2 -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+; RUN: opt -O3 -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+; RUN: opt -Os -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+; RUN: opt -Oz -S < %s | opt -S -passes='require<profile-summary>,function(codegenprepare)' | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@__typeid__ZTS1S_global_addr = external hidden global [0 x i8], code_model "small"
+@__typeid__ZTS1S_align = external hidden global [0 x i8], !absolute_symbol !0
+@__typeid__ZTS1S_size_m1 = external hidden global [0 x i8], !absolute_symbol !1
+
+; Check that we still have two pairs of zexts (non-dominating case).
+
+; CHECK: define void @f1
+define void @f1(i1 noundef zeroext %0, ptr noundef %1, ptr noundef %2) {
+ br i1 %0, label %4, label %17
+
+4:
+ %5 = load ptr, ptr %1, align 8
+ %6 = ptrtoint ptr %5 to i64
+ %7 = sub i64 %6, ptrtoint (ptr @__typeid__ZTS1S_global_addr to i64)
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %8 = zext i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8) to i64
+ %9 = lshr i64 %7, %8
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %10 = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8)) to i64
+ %11 = shl i64 %7, %10
+ %12 = or i64 %9, %11
+ %13 = icmp ule i64 %12, ptrtoint (ptr @__typeid__ZTS1S_size_m1 to i64)
+ br i1 %13, label %15, label %14
+
+14:
+ call void @llvm.ubsantrap(i8 2)
+ unreachable
+
+15:
+ %16 = load ptr, ptr %5, align 8
+ call void %16(ptr noundef nonnull align 8 dereferenceable(8) %1)
+ br label %30
+
+17:
+ %18 = load ptr, ptr %2, align 8
+ %19 = ptrtoint ptr %18 to i64
+ %20 = sub i64 %19, ptrtoint (ptr @__typeid__ZTS1S_global_addr to i64)
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %21 = zext i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8) to i64
+ %22 = lshr i64 %20, %21
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %23 = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8)) to i64
+ %24 = shl i64 %20, %23
+ %25 = or i64 %22, %24
+ %26 = icmp ule i64 %25, ptrtoint (ptr @__typeid__ZTS1S_size_m1 to i64)
+ br i1 %26, label %28, label %27
+
+27:
+ call void @llvm.ubsantrap(i8 2)
+ unreachable
+
+28:
+ %29 = load ptr, ptr %18, align 8
+ call void %29(ptr noundef nonnull align 8 dereferenceable(8) %2)
+ br label %30
+
+30:
+ ret void
+}
+
+; Check that we still have two pairs of zexts (dominating case).
+
+; CHECK: define void @f2
+define void @f2(i1 noundef zeroext %0, ptr noundef %1, ptr noundef %2) {
+ %4 = load ptr, ptr %1, align 8
+ %5 = ptrtoint ptr %4 to i64
+ %6 = sub i64 %5, ptrtoint (ptr @__typeid__ZTS1S_global_addr to i64)
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %7 = zext i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8) to i64
+ %8 = lshr i64 %6, %7
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %9 = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8)) to i64
+ %10 = shl i64 %6, %9
+ %11 = or i64 %8, %10
+ %12 = icmp ule i64 %11, ptrtoint (ptr @__typeid__ZTS1S_size_m1 to i64)
+ br i1 %12, label %14, label %13
+
+13: ; preds = %3
+ call void @llvm.ubsantrap(i8 2)
+ unreachable
+
+14: ; preds = %3
+ %15 = load ptr, ptr %4, align 8
+ call void %15(ptr noundef nonnull align 8 dereferenceable(8) %1)
+ br i1 %0, label %16, label %29
+
+16: ; preds = %14
+ %17 = load ptr, ptr %2, align 8
+ %18 = ptrtoint ptr %17 to i64
+ %19 = sub i64 %18, ptrtoint (ptr @__typeid__ZTS1S_global_addr to i64)
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %20 = zext i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8) to i64
+ %21 = lshr i64 %19, %20
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %22 = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8)) to i64
+ %23 = shl i64 %19, %22
+ %24 = or i64 %21, %23
+ %25 = icmp ule i64 %24, ptrtoint (ptr @__typeid__ZTS1S_size_m1 to i64)
+ br i1 %25, label %27, label %26
+
+26: ; preds = %16
+ call void @llvm.ubsantrap(i8 2)
+ unreachable
+
+27: ; preds = %16
+ %28 = load ptr, ptr %17, align 8
+ call void %28(ptr noundef nonnull align 8 dereferenceable(8) %2)
+ br label %29
+
+29: ; preds = %27, %14
+ ret void
+}
+
+; Check that the zexts aren't moved to the preheader (or anywhere else)
+; and stay in the same basic block.
+
+; CHECK: define void @f3
+define void @f3(ptr noundef readonly captures(address) %0, ptr noundef readnone captures(address) %1) {
+ %3 = icmp eq ptr %0, %1
+ br i1 %3, label %21, label %4
+
+4:
+ ; CHECK: = phi
+ %5 = phi ptr [ %19, %17 ], [ %0, %2 ]
+ %6 = load ptr, ptr %5, align 8
+ %7 = load ptr, ptr %6, align 8
+ %8 = ptrtoint ptr %7 to i64
+ %9 = sub i64 %8, ptrtoint (ptr @__typeid__ZTS1S_global_addr to i64)
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %10 = zext i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8) to i64
+ %11 = lshr i64 %9, %10
+ ; CHECK: zext {{.*}} @__typeid__ZTS1S_align
+ %12 = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid__ZTS1S_align to i8)) to i64
+ %13 = shl i64 %9, %12
+ %14 = or i64 %11, %13
+ %15 = icmp ule i64 %14, ptrtoint (ptr @__typeid__ZTS1S_size_m1 to i64)
+ br i1 %15, label %17, label %16
+
+16:
+ call void @llvm.ubsantrap(i8 2)
+ unreachable
+
+17:
+ %18 = load ptr, ptr %7, align 8
+ call void %18(ptr noundef nonnull align 8 dereferenceable(8) %6)
+ %19 = getelementptr inbounds nuw i8, ptr %5, i64 8
+ %20 = icmp eq ptr %19, %1
+ br i1 %20, label %21, label %4
+
+21:
+ ret void
+}
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.ubsantrap(i8 immarg)
+
+!0 = !{i64 0, i64 256}
+!1 = !{i64 0, i64 128}