author     Dominik Steenken <dost@de.ibm.com>  2024-04-12 18:05:30 +0200
committer  GitHub <noreply@github.com>         2024-04-12 18:05:30 +0200
commit     b794dc23255505dd7735f995b8ff1192305a072e
tree       87825d610148c0a1f551f5df088a40f2acc2b5ec
parent     b614e5b0340f783ad355899248c52cb22a04b014
[SystemZ] Add custom handling of legal vectors with reduce-add. (#88495)
This commit skips the expansion of the `vector.reduce.add` intrinsic on vector-enabled SystemZ targets in order to introduce custom handling of `vector.reduce.add` for legal vector types using the VSUM instructions. This is limited to full vectors with scalar types up to `i32` due to performance concerns. It also adds testing for the generation of such custom handling, and adapts the related cost computation, as well as the testing for that.

The expected result is a performance boost in certain benchmarks that make heavy use of `vector.reduce.add`, with other benchmarks remaining constant. For instance, the assembly for `vector.reduce.add<4 x i32>` changes from

```hlasm
vmrlg  %v0, %v24, %v24
vaf    %v0, %v24, %v0
vrepf  %v1, %v0, 1
vaf    %v0, %v0, %v1
vlgvf  %r2, %v0, 0
```

to

```hlasm
vgbm    %v0, 0
vsumqf  %v0, %v24, %v0
vlgvf   %r2, %v0, 3
```
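For reference, a minimal IR sketch that reproduces the `<4 x i32>` example above (mirroring the `f1_3` case in the new `vec-reduce-add-01.ll` test; the function name `reduce_v4i32` is chosen here purely for illustration) when compiled with the test's RUN line options:

```llvm
; Minimal reproducer for the <4 x i32> case shown above.
; Compile with: llc -mtriple=s390x-linux-gnu -mcpu=z16 reduce.ll
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @reduce_v4i32(<4 x i32> %a) {
  ; With this patch, the reduction is lowered via the custom
  ; ISD::VECREDUCE_ADD handling (VGBM + VSUMQF + VLGVF) instead of
  ; being expanded into a shuffle/add sequence.
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  ret i32 %sum
}
```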
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZISelLowering.cpp           43
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZISelLowering.h              1
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp    31
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h       2
-rw-r--r--  llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll        24
-rw-r--r--  llvm/test/CodeGen/SystemZ/vec-reduce-add-01.ll           289
6 files changed, 373 insertions, 17 deletions
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index efffd66..3b3057f 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -16,6 +16,7 @@
#include "SystemZMachineFunctionInfo.h"
#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -23,6 +24,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsS390.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include <cctype>
#include <optional>
@@ -451,6 +453,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
+ // Add ISD::VECREDUCE_ADD as custom in order to implement
+ // it with VZERO+VSUM
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+
// Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
// and inverting the result as necessary.
setOperationAction(ISD::SETCC, VT, Custom);
@@ -6167,6 +6173,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerOR(Op, DAG);
case ISD::CTPOP:
return lowerCTPOP(Op, DAG);
+ case ISD::VECREDUCE_ADD:
+ return lowerVECREDUCE_ADD(Op, DAG);
case ISD::ATOMIC_FENCE:
return lowerATOMIC_FENCE(Op, DAG);
case ISD::ATOMIC_SWAP:
@@ -9600,3 +9608,38 @@ SDValue SystemZTargetLowering::lowerGET_ROUNDING(SDValue Op,
return DAG.getMergeValues({RetVal, Chain}, dl);
}
+
+SDValue SystemZTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ Op = Op.getOperand(0);
+ EVT OpVT = Op.getValueType();
+
+ assert(OpVT.isVector() && "Operand type for VECREDUCE_ADD is not a vector.");
+
+ SDLoc DL(Op);
+
+ // load a 0 vector for the third operand of VSUM.
+ SDValue Zero = DAG.getSplatBuildVector(OpVT, DL, DAG.getConstant(0, DL, VT));
+
+ // execute VSUM.
+ switch (OpVT.getScalarSizeInBits()) {
+ case 8:
+ case 16:
+ Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Zero);
+ LLVM_FALLTHROUGH;
+ case 32:
+ case 64:
+ Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::i128, Op,
+ DAG.getBitcast(Op.getValueType(), Zero));
+ break;
+ case 128:
+ break; // VSUM over v1i128 should not happen and would be a noop
+ default:
+ llvm_unreachable("Unexpected scalar size.");
+ }
+ // Cast to original vector type, retrieve last element.
+ return DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(OpVT, Op),
+ DAG.getConstant(OpVT.getVectorNumElements() - 1, DL, MVT::i32));
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 7140287..2290a7d 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -696,6 +696,7 @@ private:
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 17e534f..4c9e78c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
@@ -1293,18 +1294,14 @@ getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
if (ID == Intrinsic::vector_reduce_add) {
// Retrieve number and size of elements for the vector op.
auto *VTy = cast<FixedVectorType>(ParamTys.front());
- unsigned NumElements = VTy->getNumElements();
unsigned ScalarSize = VTy->getScalarSizeInBits();
// For scalar sizes >128 bits, we fall back to the generic cost estimate.
if (ScalarSize > SystemZ::VectorBits)
return -1;
- // A single vector register can hold this many elements.
- unsigned MaxElemsPerVector = SystemZ::VectorBits / ScalarSize;
// This many vector regs are needed to represent the input elements (V).
unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
// This many instructions are needed for the final sum of vector elems (S).
- unsigned LastVectorHandling =
- 2 * Log2_32_Ceil(std::min(NumElements, MaxElemsPerVector));
+ unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
// We use vector adds to create a sum vector, which takes
// V/2 + V/4 + ... = V - 1 operations.
// Then, we need S operations to sum up the elements of that sum vector,
@@ -1324,3 +1321,27 @@ SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return Cost;
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
+
+bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+ // Always expand on Subtargets without vector instructions
+ if (!ST->hasVector())
+ return true;
+
+ // Always expand for operands that do not fill one vector reg
+ auto *Type = cast<FixedVectorType>(II->getOperand(0)->getType());
+ unsigned NumElts = Type->getNumElements();
+ unsigned ScalarSize = Type->getScalarSizeInBits();
+ unsigned MaxElts = SystemZ::VectorBits / ScalarSize;
+ if (NumElts < MaxElts)
+ return true;
+
+ // Otherwise
+ switch (II->getIntrinsicID()) {
+ // Do not expand vector.reduce.add
+ case Intrinsic::vector_reduce_add:
+ // Except for i64, since the performance benefit is dubious there
+ return ScalarSize >= 64;
+ default:
+ return true;
+ }
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 1d824d3..696d887 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -127,6 +127,8 @@ public:
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const;
/// @}
};
diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
index 061e5ec..90b5b746 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
@@ -7,19 +7,19 @@ define void @reduce(ptr %src, ptr %dst) {
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
-; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
-; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
;
-; CHECK: Cost Model: Found an estimated cost of 15 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)
; REDUCEADD64
diff --git a/llvm/test/CodeGen/SystemZ/vec-reduce-add-01.ll b/llvm/test/CodeGen/SystemZ/vec-reduce-add-01.ll
new file mode 100644
index 0000000..56b151d
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-reduce-add-01.ll
@@ -0,0 +1,289 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; Test vector add reduction intrinsic
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s
+
+; 1 vector length
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
+declare i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a)
+; 2 vector lengths
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a)
+declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a)
+; ; TODO
+; ; 4 vector lengths
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a)
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a)
+declare i128 @llvm.vector.reduce.add.v4i128(<4 x i128> %a)
+; ; Subvector lengths
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
+declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
+
+define i8 @f1_1(<16 x i8> %a) {
+; CHECK-LABEL: f1_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgbm %v0, 0
+; CHECK-NEXT: vsumb %v1, %v24, %v0
+; CHECK-NEXT: vsumqf %v0, %v1, %v0
+; CHECK-NEXT: vlgvf %r2, %v0, 3
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+ %redadd = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
+ ret i8 %redadd
+}
+
+define i16 @f1_2(<8 x i16> %a) {
+; CHECK-LABEL: f1_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgbm %v0, 0
+; CHECK-NEXT: vsumh %v1, %v24, %v0
+; CHECK-NEXT: vsumqf %v0, %v1, %v0
+; CHECK-NEXT: vlgvf %r2, %v0, 3
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+ %redadd = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
+ ret i16 %redadd
+}
+
+define i32 @f1_3(<4 x i32> %a) {
+; CHECK-LABEL: f1_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgbm %v0, 0
+; CHECK-NEXT: vsumqf %v0, %v24, %v0
+; CHECK-NEXT: vlgvf %r2, %v0, 3
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+
+ %redadd = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+ ret i32 %redadd
+}
+
+define i64 @f1_4(<2 x i64> %a) {
+; CHECK-LABEL: f1_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrepg %v0, %v24, 1
+; CHECK-NEXT: vag %v0, %v24, %v0
+; CHECK-NEXT: vlgvg %r2, %v0, 0
+; CHECK-NEXT: br %r14
+
+ %redadd = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
+ ret i64 %redadd
+}
+
+define i128 @f1_5(<1 x i128> %a) {
+; CHECK-LABEL: f1_5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vst %v24, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %redadd = call i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a)
+ ret i128 %redadd
+}
+
+define i8 @f2_1(<32 x i8> %a) {
+; CHECK-LABEL: f2_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vab %v0, %v24, %v26
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vsumb %v0, %v0, %v1
+; CHECK-NEXT: vsumqf %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r2, %v0, 3
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+ %redadd = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a)
+ ret i8 %redadd
+}
+
+define i16 @f2_2(<16 x i16> %a) {
+; CHECK-LABEL: f2_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vah %v0, %v24, %v26
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vsumh %v0, %v0, %v1
+; CHECK-NEXT: vsumqf %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r2, %v0, 3
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+ %redadd = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a)
+ ret i16 %redadd
+}
+
+define i32 @f2_3(<8 x i32> %a) {
+; CHECK-LABEL: f2_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vaf %v0, %v24, %v26
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vsumqf %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r2, %v0, 3
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+
+ %redadd = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
+ ret i32 %redadd
+}
+
+define i64 @f2_4(<4 x i64> %a) {
+; CHECK-LABEL: f2_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vag %v0, %v24, %v26
+; CHECK-NEXT: vrepg %v1, %v0, 1
+; CHECK-NEXT: vag %v0, %v0, %v1
+; CHECK-NEXT: vlgvg %r2, %v0, 0
+; CHECK-NEXT: br %r14
+
+ %redadd = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a)
+ ret i64 %redadd
+}
+
+define i128 @f2_5(<2 x i128> %a) {
+; CHECK-LABEL: f2_5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 16(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %redadd = call i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a)
+ ret i128 %redadd
+}
+
+define i8 @f3_1(<64 x i8> %a) {
+; CHECK-LABEL: f3_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vab %v0, %v26, %v30
+; CHECK-NEXT: vab %v1, %v24, %v28
+; CHECK-NEXT: vab %v0, %v1, %v0
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vsumb %v0, %v0, %v1
+; CHECK-NEXT: vsumqf %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r2, %v0, 3
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+ %redadd = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a)
+ ret i8 %redadd
+}
+
+define i16 @f3_2(<32 x i16> %a) {
+; CHECK-LABEL: f3_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vah %v0, %v26, %v30
+; CHECK-NEXT: vah %v1, %v24, %v28
+; CHECK-NEXT: vah %v0, %v1, %v0
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vsumh %v0, %v0, %v1
+; CHECK-NEXT: vsumqf %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r2, %v0, 3
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+ %redadd = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a)
+ ret i16 %redadd
+}
+
+define i32 @f3_3(<16 x i32> %a) {
+; CHECK-LABEL: f3_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vaf %v0, %v26, %v30
+; CHECK-NEXT: vaf %v1, %v24, %v28
+; CHECK-NEXT: vaf %v0, %v1, %v0
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vsumqf %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r2, %v0, 3
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+
+ %redadd = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a)
+ ret i32 %redadd
+}
+
+define i64 @f3_4(<8 x i64> %a) {
+; CHECK-LABEL: f3_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vag %v0, %v26, %v30
+; CHECK-NEXT: vag %v1, %v24, %v28
+; CHECK-NEXT: vag %v0, %v1, %v0
+; CHECK-NEXT: vrepg %v1, %v0, 1
+; CHECK-NEXT: vag %v0, %v0, %v1
+; CHECK-NEXT: vlgvg %r2, %v0, 0
+; CHECK-NEXT: br %r14
+
+ %redadd = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a)
+ ret i64 %redadd
+}
+
+define i128 @f3_5(<4 x i128> %a) {
+; CHECK-LABEL: f3_5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 32(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v2, 48(%r3), 3
+; CHECK-NEXT: vl %v3, 16(%r3), 3
+; CHECK-NEXT: vaq %v2, %v3, %v2
+; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vaq %v0, %v0, %v2
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %redadd = call i128 @llvm.vector.reduce.add.v4i128(<4 x i128> %a)
+ ret i128 %redadd
+}
+
+
+define i8 @f4_1(<8 x i8> %a) {
+; CHECK-LABEL: f4_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpkg %v0, %v24, %v24
+; CHECK-NEXT: vab %v0, %v24, %v0
+; CHECK-NEXT: vpkf %v1, %v0, %v0
+; CHECK-NEXT: vab %v0, %v0, %v1
+; CHECK-NEXT: vrepb %v1, %v0, 1
+; CHECK-NEXT: vab %v0, %v0, %v1
+; CHECK-NEXT: vlgvb %r2, %v0, 0
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+ %redadd = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
+ ret i8 %redadd
+}
+
+define i16 @f4_2(<4 x i16> %a) {
+; CHECK-LABEL: f4_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpkg %v0, %v24, %v24
+; CHECK-NEXT: vah %v0, %v24, %v0
+; CHECK-NEXT: vreph %v1, %v0, 1
+; CHECK-NEXT: vah %v0, %v0, %v1
+; CHECK-NEXT: vlgvh %r2, %v0, 0
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+ %redadd = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
+ ret i16 %redadd
+}
+
+define i32 @f4_3(<2 x i32> %a) {
+; CHECK-LABEL: f4_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrepf %v0, %v24, 1
+; CHECK-NEXT: vaf %v0, %v24, %v0
+; CHECK-NEXT: vlgvf %r2, %v0, 0
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
+
+ %redadd = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
+ ret i32 %redadd
+}
+
+define i64 @f4_4(<1 x i64> %a) {
+; CHECK-LABEL: f4_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlgvg %r2, %v24, 0
+; CHECK-NEXT: br %r14
+
+ %redadd = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
+ ret i64 %redadd
+}