From bd07c22e5372789c3eb47b9009029d5e99e0ef9f Mon Sep 17 00:00:00 2001
From: Lawrence Benson
Date: Mon, 29 Apr 2024 16:45:33 +0200
Subject: [Clang] Add support for scalable vectors in __builtin_reduce_*
 functions (#87750)

Currently, a lot of `__builtin_reduce_*` functions do not support scalable
vectors, i.e., ARM SVE and RISC-V V. This PR adds support for them. The main
code change is to use a different path to extract the element type from the
vectors; the rest is the same, as LLVM already supports the reduce intrinsics
for `vscale` vectors.

This PR adds scalable vector support for:

- `__builtin_reduce_add`
- `__builtin_reduce_mul`
- `__builtin_reduce_xor`
- `__builtin_reduce_or`
- `__builtin_reduce_and`
- `__builtin_reduce_min`
- `__builtin_reduce_max`

Note: For all except `min`/`max`, the element type must still be an integer
value. Adding floating-point support for `add` and `mul` is still an open
TODO.
---
 clang/docs/LanguageExtensions.rst            |  2 ++
 clang/docs/ReleaseNotes.rst                  |  1 +
 clang/include/clang/AST/Type.h               |  4 +++
 clang/lib/AST/Type.cpp                       | 12 +++++++
 clang/lib/CodeGen/CGBuiltin.cpp              | 10 ++++--
 clang/lib/Sema/SemaChecking.cpp              | 23 +++++++++---
 clang/test/CodeGen/builtins-reduction-math.c | 53 ++++++++++++++++++++++++++++
 7 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 127d1b6..87cb743 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -711,6 +711,8 @@ even-odd element pair with indices ``i * 2`` and ``i * 2 + 1`` with
 power of 2, the vector is widened with neutral elements for the reduction
 at the end to the next power of 2.
 
+These reductions support both fixed-sized and scalable vector types.
+
 Example:
 
 .. code-block:: c++
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 347c812..4cb2462 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -222,6 +222,7 @@ Non-comprehensive list of changes in this release
 - ``__typeof_unqual__`` is available in all C modes as an extension, which
   behaves like ``typeof_unqual`` from C23, similar to ``__typeof__`` and
   ``typeof``.
+- ``__builtin_reduce_{add|mul|xor|or|and|min|max}`` builtins now support scalable vectors.
 
 * Shared libraries linked with either the ``-ffast-math``, ``-Ofast``, or
   ``-funsafe-math-optimizations`` flags will no longer enable flush-to-zero
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index dff02d4..fa2b47e 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2378,6 +2378,10 @@ public:
   /// 'riscv_rvv_vector_bits' type attribute as VectorType.
   QualType getRVVEltType(const ASTContext &Ctx) const;
 
+  /// Returns the representative type for the element of a sizeless vector
+  /// builtin type.
+  QualType getSizelessVectorEltType(const ASTContext &Ctx) const;
+
   /// Types are partitioned into 3 broad categories (C99 6.2.5p1):
   /// object types, function types, and incomplete types.
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 8aaa680..68e81f4 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2510,6 +2510,18 @@ bool Type::isSveVLSBuiltinType() const {
   return false;
 }
 
+QualType Type::getSizelessVectorEltType(const ASTContext &Ctx) const {
+  assert(isSizelessVectorType() && "Must be sizeless vector type");
+  // Currently supports SVE and RVV
+  if (isSVESizelessBuiltinType())
+    return getSveEltType(Ctx);
+
+  if (isRVVSizelessBuiltinType())
+    return getRVVEltType(Ctx);
+
+  llvm_unreachable("Unhandled type");
+}
+
 QualType Type::getSveEltType(const ASTContext &Ctx) const {
   assert(isSveVLSBuiltinType() && "unsupported type!");
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d08ab53..a370734 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3885,9 +3885,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   }
 
   case Builtin::BI__builtin_reduce_max: {
-    auto GetIntrinsicID = [](QualType QT) {
+    auto GetIntrinsicID = [this](QualType QT) {
       if (auto *VecTy = QT->getAs<VectorType>())
         QT = VecTy->getElementType();
+      else if (QT->isSizelessVectorType())
+        QT = QT->getSizelessVectorEltType(CGM.getContext());
+
       if (QT->isSignedIntegerType())
         return llvm::Intrinsic::vector_reduce_smax;
       if (QT->isUnsignedIntegerType())
@@ -3900,9 +3903,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   }
 
   case Builtin::BI__builtin_reduce_min: {
-    auto GetIntrinsicID = [](QualType QT) {
+    auto GetIntrinsicID = [this](QualType QT) {
       if (auto *VecTy = QT->getAs<VectorType>())
         QT = VecTy->getElementType();
+      else if (QT->isSizelessVectorType())
+        QT = QT->getSizelessVectorEltType(CGM.getContext());
+
       if (QT->isSignedIntegerType())
         return llvm::Intrinsic::vector_reduce_smin;
       if (QT->isUnsignedIntegerType())
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index e33113a..e26cf20 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3164,13 +3164,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
 
     const Expr *Arg = TheCall->getArg(0);
     const auto *TyA = Arg->getType()->getAs<VectorType>();
-    if (!TyA) {
+
+    QualType ElTy;
+    if (TyA)
+      ElTy = TyA->getElementType();
+    else if (Arg->getType()->isSizelessVectorType())
+      ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+    if (ElTy.isNull()) {
       Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
           << 1 << /* vector ty*/ 4 << Arg->getType();
       return ExprError();
     }
 
-    TheCall->setType(TyA->getElementType());
+    TheCall->setType(ElTy);
     break;
   }
@@ -3186,12 +3193,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
 
     const Expr *Arg = TheCall->getArg(0);
     const auto *TyA = Arg->getType()->getAs<VectorType>();
-    if (!TyA || !TyA->getElementType()->isIntegerType()) {
+
+    QualType ElTy;
+    if (TyA)
+      ElTy = TyA->getElementType();
+    else if (Arg->getType()->isSizelessVectorType())
+      ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+    if (ElTy.isNull() || !ElTy->isIntegerType()) {
       Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
           << 1 << /* vector of integers */ 6 << Arg->getType();
       return ExprError();
     }
-    TheCall->setType(TyA->getElementType());
+
+    TheCall->setType(ElTy);
     break;
   }
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index 34f39ce..acafe92 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -1,5 +1,8 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE %s
+
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef short int si8 __attribute__((ext_vector_type(8)));
 typedef unsigned int u4 __attribute__((ext_vector_type(4)));
@@ -134,3 +137,53 @@ void test_builtin_reduce_and(si8 vi1, u4 vu1) {
   // CHECK-NEXT: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[VU1]])
   unsigned r3 = __builtin_reduce_and(vu1);
 }
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+
+void test_builtin_reduce_SVE(int a, unsigned long long b, short c, float d) {
+  // SVE-LABEL: void @test_builtin_reduce_SVE(
+
+  svint32_t vec_a = svdup_s32(a);
+  svuint64_t vec_b = svdup_u64(b);
+  svint16_t vec_c1 = svdup_s16(c);
+  svuint16_t vec_c2 = svdup_u16(c);
+  svfloat32_t vec_d = svdup_f32(d);
+
+  // SVE: [[VF1:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
+  // SVE-NEXT: call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[VF1]])
+  int r1 = __builtin_reduce_add(vec_a);
+
+  // SVE: [[VF2:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
+  // SVE-NEXT: call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> [[VF2]])
+  int r2 = __builtin_reduce_mul(vec_a);
+
+  // SVE: [[VF3:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+  // SVE-NEXT: call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> [[VF3]])
+  long long r3 = __builtin_reduce_xor(vec_b);
+
+  // SVE: [[VF4:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+  // SVE-NEXT: call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[VF4]])
+  long long r4 = __builtin_reduce_or(vec_b);
+
+  // SVE: [[VF5:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+  // SVE-NEXT: call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[VF5]])
+  long long r5 = __builtin_reduce_and(vec_b);
+
+  // SVE: [[VF6:%.+]] = load <vscale x 8 x i16>, ptr %vec_c1
+  // SVE-NEXT: call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> [[VF6]])
+  short r6 = __builtin_reduce_max(vec_c1);
+
+  // SVE: [[VF7:%.+]] = load <vscale x 8 x i16>, ptr %vec_c2
+  // SVE-NEXT: call i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16> [[VF7]])
+  unsigned short r7 = __builtin_reduce_min(vec_c2);
+
+  // SVE: [[VF8:%.+]] = load <vscale x 4 x float>, ptr %vec_d
+  // SVE-NEXT: call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[VF8]])
+  float r8 = __builtin_reduce_max(vec_d);
+
+  // SVE: [[VF9:%.+]] = load <vscale x 4 x float>, ptr %vec_d
+  // SVE-NEXT: call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[VF9]])
+  float r9 = __builtin_reduce_min(vec_d);
+}
+#endif
--
cgit v1.1
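
For context, below is a minimal user-side sketch of what this patch enables;
text after the signature line is ignored by git-am. The snippet is
illustrative and not part of the commit: the function names are made up, and
it assumes a Clang with this patch targeting SVE, e.g.
`clang --target=aarch64-linux-gnu -march=armv8-a+sve -O1 -S example.c`.

  #include <arm_sve.h>

  /* Integer reductions now accept sizeless SVE vector types directly;
   * this one lowers to @llvm.vector.reduce.add.nxv2i64. */
  long sum_lanes(svint64_t v) {
    return __builtin_reduce_add(v);
  }

  /* min/max additionally accept floating-point scalable vectors
   * (lowering to @llvm.vector.reduce.fmax.nxv4f32 here); floating-point
   * add/mul remain an open TODO per the commit message. */
  float max_lane(svfloat32_t v) {
    return __builtin_reduce_max(v);
  }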