aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
diff options
context:
space:
mode:
authorDominik Steenken <dost@de.ibm.com>2024-04-12 18:05:30 +0200
committerGitHub <noreply@github.com>2024-04-12 18:05:30 +0200
commitb794dc23255505dd7735f995b8ff1192305a072e (patch)
tree87825d610148c0a1f551f5df088a40f2acc2b5ec /llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
parentb614e5b0340f783ad355899248c52cb22a04b014 (diff)
downloadllvm-b794dc23255505dd7735f995b8ff1192305a072e.zip
llvm-b794dc23255505dd7735f995b8ff1192305a072e.tar.gz
llvm-b794dc23255505dd7735f995b8ff1192305a072e.tar.bz2
[SystemZ] Add custom handling of legal vectors with reduce-add. (#88495)
This commit skips the expansion of the `vector.reduce.add` intrinsic on vector-enabled SystemZ targets in order to introduce custom handling of `vector.reduce.add` for legal vector types using the VSUM instructions. This is limited to full vectors with scalar types up to `i32` due to performance concerns. It also adds testing for the generation of such custom handling, and adapts the related cost computation, as well as the testing for that. The expected result is a performance boost in certain benchmarks that make heavy use of `vector.reduce.add` with other benchmarks remaining constant. For instance, the assembly for `vector.reduce.add<4 x i32>` changes from ```hlasm vmrlg %v0, %v24, %v24 vaf %v0, %v24, %v0 vrepf %v1, %v0, 1 vaf %v0, %v0, %v1 vlgvf %r2, %v0, 0 ``` to ```hlasm vgbm %v0, 0 vsumqf %v0, %v24, %v0 vlgvf %r2, %v0, 3 ```
Diffstat (limited to 'llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll')
-rw-r--r--llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll24
1 files changed, 12 insertions, 12 deletions
diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
index 061e5ec..90b5b746 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
@@ -7,19 +7,19 @@ define void @reduce(ptr %src, ptr %dst) {
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
-; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
;
-; CHECK: Cost Model: Found an estimated cost of 15 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)
; REDUCEADD64