[SystemZ] Add custom handling of legal vectors with reduce-add. (#88495)

This commit skips the expansion of the `vector.reduce.add` intrinsic on vector-enabled SystemZ targets in order to introduce custom handling of `vector.reduce.add` for legal vector types using the VSUM instructions. This is limited to full vectors with scalar types up to `i32` due to performance concerns. It also adds testing for the generation of such custom handling, and adapts the related cost computation, as well as the testing for that. The expected result is a performance boost in certain benchmarks that make heavy use of `vector.reduce.add` with other benchmarks remaining constant. For instance, the assembly for `vector.reduce.add<4 x i32>` changes from ```hlasm vmrlg %v0, %v24, %v24 vaf %v0, %v24, %v0 vrepf %v1, %v0, 1 vaf %v0, %v0, %v1 vlgvf %r2, %v0, 0 ``` to ```hlasm vgbm %v0, 0 vsumqf %v0, %v24, %v0 vlgvf %r2, %v0, 3 ```
author: Dominik Steenken <dost@de.ibm.com> 2024-04-12 18:05:30 +0200
committer: GitHub <noreply@github.com> 2024-04-12 18:05:30 +0200
commit: b794dc23255505dd7735f995b8ff1192305a072e (patch)
tree: 87825d610148c0a1f551f5df088a40f2acc2b5ec /llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
parent: b614e5b0340f783ad355899248c52cb22a04b014 (diff)
download: llvm-b794dc23255505dd7735f995b8ff1192305a072e.zip
llvm-b794dc23255505dd7735f995b8ff1192305a072e.tar.gz
llvm-b794dc23255505dd7735f995b8ff1192305a072e.tar.bz2
1 files changed, 12 insertions, 12 deletions
diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
index 061e5ec..90b5b746 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
@@ -7,19 +7,19 @@ define void @reduce(ptr %src, ptr %dst) {
 ; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
 ; CHECK:  Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
 ; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
-; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
-; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
-; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
-; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
-; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
-; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
-; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
-; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
-; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
-; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
-; CHECK:  Cost Model: Found an estimated cost of 8 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
+; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
+; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
 ;
-; CHECK:  Cost Model: Found an estimated cost of 15 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
+; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
 ; CHECK:  Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)
 
   ; REDUCEADD64
author	Dominik Steenken <dost@de.ibm.com>	2024-04-12 18:05:30 +0200
committer	GitHub <noreply@github.com>	2024-04-12 18:05:30 +0200
commit	b794dc23255505dd7735f995b8ff1192305a072e (patch)
tree	87825d610148c0a1f551f5df088a40f2acc2b5ec /llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
parent	b614e5b0340f783ad355899248c52cb22a04b014 (diff)
download	llvm-b794dc23255505dd7735f995b8ff1192305a072e.zip llvm-b794dc23255505dd7735f995b8ff1192305a072e.tar.gz llvm-b794dc23255505dd7735f995b8ff1192305a072e.tar.bz2