author     Bjorn Pettersson <bjorn.a.pettersson@ericsson.com>  2022-01-18 19:50:50 +0100
committer  Bjorn Pettersson <bjorn.a.pettersson@ericsson.com>  2022-01-24 12:22:04 +0100
commit     46cacdbb21c221a4304c489cb4a1abbc51967bb1 (patch)
tree       5224b75a3334472d7e595ff5fdd6f1885aa0484c
parent     12a499eb00e36bb0944c6b1f7f8721fd90a5bd8f (diff)
[DAGCombiner] Adjust some checks in DAGCombiner::reduceLoadWidth
In code review for D117104 two slightly weird checks were found in DAGCombiner::reduceLoadWidth. They were typically checking if BitsA was a multiple of BitsB by looking at (BitsA & (BitsB - 1)), but such a comparison only makes sense if BitsB is a power of two.

The checks were related to the code that attempted to shrink a load based on the fact that the loaded value would be right shifted. AFAICT the legality of the value types is checked later (typically in isLegalNarrowLdSt), so the existing checks were both overly conservative and wrong whenever ExtVTBits wasn't a power of two. The latter situation was triggered by a number of lit tests, so we could not simply assert on ExtVTBits being a power of two.

When attempting to simply remove the checks I found some problems that seem to have been guarded by the checks (maybe just out of luck). A typical example would be a pattern like this:

  t1 = load i96* ptr
  t2 = srl t1, 64
  t3 = truncate t2 to i64

When DAGCombine is visiting the truncate, reduceLoadWidth is called attempting to narrow the load to 64 bits (ExtVT := MVT::i64). Then the SRL is detected and we set ShAmt to 64.

In the past we bailed out because i96 is not a multiple of 64. If we simply removed that check, we would end up replacing the load with a new load that reads 64 bits but with a base pointer adjusted by 64 bits. So we would read 32 bits that weren't accessed by the original load.

This patch instead utilizes the fact that the logical right shift (SRL) can be folded away by using a zextload. Thus, the pattern above is now combined into

  t3 = load i32* ptr+offset, zext to i64

Another case is shown in the X86/shift-folding.ll test case:

  t1 = load i32* ptr
  t2 = srl i32 t1, 8
  t3 = truncate t2 to i16

In the past we bailed out because the shift count (8) is not a multiple of 16. Now the narrowing kicks in and we get

  t3 = load i16* ptr+offset

Differential Revision: https://reviews.llvm.org/D117406
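To make the width arithmetic concrete, here is a minimal standalone sketch (plain C++, independent of LLVM; the helper narrowedLoadWidth and its parameters are invented for illustration and are not part of the patch). It mirrors the new ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt check and reproduces the two examples above:

  // Standalone illustration (assumed names, not LLVM code) of the narrowing
  // decision: how many bits should the replacement (zext)load read?
  #include <cassert>
  #include <cstdint>
  #include <iostream>

  // Returns the width of the narrowed load, or 0 if the combine must bail
  // out (all loaded bits shifted out, or a sextload that must not be turned
  // into a zextload).
  uint64_t narrowedLoadWidth(uint64_t MemoryWidth, uint64_t ShAmt,
                             uint64_t ExtVTBits, bool IsSextLoad) {
    if (ShAmt >= MemoryWidth)
      return 0; // shift+trunc is zero/undef, handled elsewhere
    // Only MemoryWidth - ShAmt bits survive the right shift. If the requested
    // type wants more than that, narrow further and zero-extend instead of
    // moving the base pointer past the originally accessed memory.
    if (ExtVTBits > MemoryWidth - ShAmt) {
      if (IsSextLoad)
        return 0; // don't replace a sextload with a zextload
      return MemoryWidth - ShAmt; // zextload of the remaining bits
    }
    return ExtVTBits;
  }

  int main() {
    // i96 load, srl 64, trunc to i64: only 32 bits remain -> 32-bit zextload.
    assert(narrowedLoadWidth(96, 64, 64, false) == 32);
    // i32 load, srl 8, trunc to i16: 24 bits remain, 16 requested -> i16 load.
    assert(narrowedLoadWidth(32, 8, 16, false) == 16);
    std::cout << "narrowing examples check out\n";
    return 0;
  }

As in the real combine, the resulting narrowed type is still subject to the legality checks performed later (e.g. in isLegalNarrowLdSt).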
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  25
-rw-r--r--  llvm/test/CodeGen/ARM/shift-combine.ll         20
-rw-r--r--  llvm/test/CodeGen/X86/shift-folding.ll          4
3 files changed, 22 insertions, 27 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 861beee..bf4409e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12214,7 +12214,8 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
     // accessing any of the loaded bytes. If the load was a zextload/extload
     // then the result of the shift+trunc is zero/undef (handled elsewhere).
     ShAmt = SRL1C->getZExtValue();
-    if (ShAmt >= LN->getMemoryVT().getSizeInBits())
+    uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits();
+    if (ShAmt >= MemoryWidth)
       return SDValue();
 
     // Because a SRL must be assumed to *need* to zero-extend the high bits
@@ -12223,13 +12224,19 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
     if (LN->getExtensionType() == ISD::SEXTLOAD)
       return SDValue();
 
-    unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
-    // Is the shift amount a multiple of size of ExtVT?
-    if ((ShAmt & (ExtVTBits - 1)) != 0)
-      return SDValue();
-    // Is the load width a multiple of size of ExtVT?
-    if ((SRL.getScalarValueSizeInBits() & (ExtVTBits - 1)) != 0)
-      return SDValue();
+    // Avoid reading outside the memory accessed by the original load (could
+    // happen if we only adjust the load base pointer by ShAmt). Instead we
+    // try to narrow the load even further. The typical scenario here is:
+    //   (i64 (truncate (i96 (srl (load x), 64)))) ->
+    //     (i64 (truncate (i96 (zextload (load i32 + offset) from i32))))
+    if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) {
+      // Don't replace sextload by zextload.
+      if (ExtType == ISD::SEXTLOAD)
+        return SDValue();
+      // Narrow the load.
+      ExtType = ISD::ZEXTLOAD;
+      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
+    }
 
     // If the SRL is only used by a masking AND, we may be able to adjust
     // the ExtVT to make the AND redundant.
@@ -12241,7 +12248,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
         EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
                                          ShiftMask.countTrailingOnes());
         // If the mask is smaller, recompute the type.
-        if ((ExtVTBits > MaskedVT.getScalarSizeInBits()) &&
+        if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
             TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
           ExtVT = MaskedVT;
       }
diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll
index de1beb7..549d709 100644
--- a/llvm/test/CodeGen/ARM/shift-combine.ll
+++ b/llvm/test/CodeGen/ARM/shift-combine.ll
@@ -302,9 +302,7 @@ define arm_aapcscc i32 @test_lshr_load64_4_unaligned(i64* %a) {
;
; CHECK-BE-LABEL: test_lshr_load64_4_unaligned:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: ldr r1, [r0]
-; CHECK-BE-NEXT: ldrh r0, [r0, #4]
-; CHECK-BE-NEXT: orr r0, r0, r1, lsl #16
+; CHECK-BE-NEXT: ldr r0, [r0, #2]
; CHECK-BE-NEXT: bx lr
;
; CHECK-THUMB-LABEL: test_lshr_load64_4_unaligned:
@@ -341,9 +339,7 @@ define arm_aapcscc i32 @test_lshr_load64_1_lsb(i64* %a) {
;
; CHECK-BE-LABEL: test_lshr_load64_1_lsb:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: ldr r1, [r0]
-; CHECK-BE-NEXT: ldrb r0, [r0, #4]
-; CHECK-BE-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-BE-NEXT: ldr r0, [r0, #1]
; CHECK-BE-NEXT: bx lr
;
; CHECK-THUMB-LABEL: test_lshr_load64_1_lsb:
@@ -441,23 +437,17 @@ entry:
define arm_aapcscc i32 @test_lshr_load4_fail(i64* %a) {
; CHECK-ARM-LABEL: test_lshr_load4_fail:
; CHECK-ARM: @ %bb.0: @ %entry
-; CHECK-ARM-NEXT: ldrd r0, r1, [r0]
-; CHECK-ARM-NEXT: lsr r0, r0, #8
-; CHECK-ARM-NEXT: orr r0, r0, r1, lsl #24
+; CHECK-ARM-NEXT: ldr r0, [r0, #1]
; CHECK-ARM-NEXT: bx lr
;
; CHECK-BE-LABEL: test_lshr_load4_fail:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: ldrd r0, r1, [r0]
-; CHECK-BE-NEXT: lsr r1, r1, #8
-; CHECK-BE-NEXT: orr r0, r1, r0, lsl #24
+; CHECK-BE-NEXT: ldr r0, [r0, #3]
; CHECK-BE-NEXT: bx lr
;
; CHECK-THUMB-LABEL: test_lshr_load4_fail:
; CHECK-THUMB: @ %bb.0: @ %entry
-; CHECK-THUMB-NEXT: ldrd r0, r1, [r0]
-; CHECK-THUMB-NEXT: lsrs r0, r0, #8
-; CHECK-THUMB-NEXT: orr.w r0, r0, r1, lsl #24
+; CHECK-THUMB-NEXT: ldr.w r0, [r0, #1]
; CHECK-THUMB-NEXT: bx lr
;
; CHECK-ALIGN-LABEL: test_lshr_load4_fail:
diff --git a/llvm/test/CodeGen/X86/shift-folding.ll b/llvm/test/CodeGen/X86/shift-folding.ll
index cc03b4b..bb59cdf 100644
--- a/llvm/test/CodeGen/X86/shift-folding.ll
+++ b/llvm/test/CodeGen/X86/shift-folding.ll
@@ -88,9 +88,7 @@ define i16 @srl_load_narrowing1(i32* %arg) {
; CHECK-LABEL: srl_load_narrowing1:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl (%eax), %eax
-; CHECK-NEXT: shrl $8, %eax
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: movzwl 1(%eax), %eax
; CHECK-NEXT: retl
%tmp1 = load i32, i32* %arg, align 1
%tmp2 = lshr i32 %tmp1, 8