diff options
Diffstat (limited to 'llvm')
54 files changed, 2142 insertions, 732 deletions
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index 1ade9ce..db781b58 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -1268,6 +1268,15 @@ struct WriteT { using EmptyTrait = std::true_type; }; +// V6: [6.4.7] Looprange clause +template <typename T, typename I, typename E> struct LoopRangeT { + using Begin = E; + using End = E; + + using TupleTrait = std::true_type; + std::tuple<Begin, End> t; +}; + // --- template <typename T, typename I, typename E> @@ -1300,8 +1309,8 @@ using TupleClausesT = DoacrossT<T, I, E>, DynGroupprivateT<T, I, E>, FromT<T, I, E>, GrainsizeT<T, I, E>, IfT<T, I, E>, InitT<T, I, E>, InReductionT<T, I, E>, LastprivateT<T, I, E>, LinearT<T, I, E>, - MapT<T, I, E>, NumTasksT<T, I, E>, OrderT<T, I, E>, - ReductionT<T, I, E>, ScheduleT<T, I, E>, + LoopRangeT<T, I, E>, MapT<T, I, E>, NumTasksT<T, I, E>, + OrderT<T, I, E>, ReductionT<T, I, E>, ScheduleT<T, I, E>, TaskReductionT<T, I, E>, ToT<T, I, E>>; template <typename T, typename I, typename E> diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index ffefa26..38f95a1 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -284,6 +284,10 @@ def OMPC_Linear : Clause<[Spelling<"linear">]> { def OMPC_Link : Clause<[Spelling<"link">]> { let flangClass = "OmpObjectList"; } +def OMPC_LoopRange : Clause<[Spelling<"looprange">]> { + let clangClass = "OMPLoopRangeClause"; + let flangClass = "OmpLoopRangeClause"; +} def OMPC_Map : Clause<[Spelling<"map">]> { let clangClass = "OMPMapClause"; let flangClass = "OmpMapClause"; @@ -902,6 +906,11 @@ def OMP_Groupprivate : Directive<[Spelling<"groupprivate">]> { let category = CA_Declarative; let languages = [L_C, L_Fortran]; } +def OMP_Fuse : Directive<[Spelling<"fuse">]> { + let allowedOnceClauses = [VersionedClause<OMPC_LoopRange, 60>]; + let association = AS_Block; + let category = CA_Executable; +} def OMP_Interchange : Directive<[Spelling<"interchange">]> { let allowedOnceClauses = [ VersionedClause<OMPC_Permutation>, diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 3c4ed94..96da698 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -960,8 +960,12 @@ def int_instrprof_mcdc_tvbitmap_update : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty, llvm_i32_ty, llvm_ptr_ty]>; -def int_call_preallocated_setup : DefaultAttrsIntrinsic<[llvm_token_ty], [llvm_i32_ty]>; -def int_call_preallocated_arg : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i32_ty]>; +def int_call_preallocated_setup + : DefaultAttrsIntrinsic<[llvm_token_ty], [llvm_i32_ty], + [ImmArg<ArgIndex<0>>]>; +def int_call_preallocated_arg + : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i32_ty], + [ImmArg<ArgIndex<1>>]>; def int_call_preallocated_teardown : DefaultAttrsIntrinsic<[], [llvm_token_ty]>; // This intrinsic is intentionally undocumented and users shouldn't call it; diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index d7211a9..3eb7c0b 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -57,25 +57,9 @@ namespace llvm { namespace trailing_objects_internal { -/// Helper template to calculate the max alignment requirement for a set of -/// objects. -template <typename First, typename... Rest> class AlignmentCalcHelper { -private: - enum { - FirstAlignment = alignof(First), - RestAlignment = AlignmentCalcHelper<Rest...>::Alignment, - }; -public: - enum { - Alignment = FirstAlignment > RestAlignment ? FirstAlignment : RestAlignment - }; -}; - -template <typename First> class AlignmentCalcHelper<First> { -public: - enum { Alignment = alignof(First) }; -}; +template <typename... T> +inline constexpr size_t MaxAlignment = std::max({alignof(T)...}); /// The base class for TrailingObjects* classes. class TrailingObjectsBase { @@ -209,11 +193,10 @@ protected: /// See the file comment for details on the usage of the /// TrailingObjects type. template <typename BaseTy, typename... TrailingTys> -class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl< - trailing_objects_internal::AlignmentCalcHelper< - TrailingTys...>::Alignment, - BaseTy, TrailingObjects<BaseTy, TrailingTys...>, - BaseTy, TrailingTys...> { +class TrailingObjects + : private trailing_objects_internal::TrailingObjectsImpl< + trailing_objects_internal::MaxAlignment<TrailingTys...>, BaseTy, + TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...> { template <int A, typename B, typename T, typename P, typename... M> friend class trailing_objects_internal::TrailingObjectsImpl; @@ -221,8 +204,8 @@ class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl< template <typename... Tys> class Foo {}; typedef trailing_objects_internal::TrailingObjectsImpl< - trailing_objects_internal::AlignmentCalcHelper<TrailingTys...>::Alignment, - BaseTy, TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...> + trailing_objects_internal::MaxAlignment<TrailingTys...>, BaseTy, + TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...> ParentType; using TrailingObjectsBase = trailing_objects_internal::TrailingObjectsBase; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index dba5a8c..cc503d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7492,7 +7492,6 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, // Pre-increment recursion depth for use in recursive calls. ++Depth; const SDNodeFlags Flags = Op->getFlags(); - const TargetOptions &Options = DAG.getTarget().Options; EVT VT = Op.getValueType(); unsigned Opcode = Op.getOpcode(); @@ -7572,7 +7571,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, return DAG.getBuildVector(VT, DL, Ops); } case ISD::FADD: { - if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + if (!Flags.hasNoSignedZeros()) break; // After operation legalization, it might not be legal to create new FSUBs. @@ -7617,7 +7616,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } case ISD::FSUB: { // We can't turn -(A-B) into B-A when we honor signed zeros. - if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + if (!Flags.hasNoSignedZeros()) break; SDValue X = Op.getOperand(0), Y = Op.getOperand(1); @@ -7678,7 +7677,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } case ISD::FMA: case ISD::FMAD: { - if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + if (!Flags.hasNoSignedZeros()) break; SDValue X = Op.getOperand(0), Y = Op.getOperand(1), Z = Op.getOperand(2); @@ -8797,7 +8796,6 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, EVT VT = Node->getValueType(0); EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); bool IsMax = Opc == ISD::FMAXIMUMNUM; - const TargetOptions &Options = DAG.getTarget().Options; SDNodeFlags Flags = Node->getFlags(); unsigned NewOp = @@ -8858,8 +8856,8 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, // TODO: We need quiet sNaN if strictfp. // Fixup signed zero behavior. - if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros() || - DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)) { + if (Flags.hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(LHS) || + DAG.isKnownNeverZeroFloat(RHS)) { return MinMax; } SDValue TestZero = diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index b2e76cc..8c03d6f 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5869,9 +5869,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } case Intrinsic::call_preallocated_setup: { - auto *NumArgs = dyn_cast<ConstantInt>(Call.getArgOperand(0)); - Check(NumArgs != nullptr, - "llvm.call.preallocated.setup argument must be a constant"); + auto *NumArgs = cast<ConstantInt>(Call.getArgOperand(0)); bool FoundCall = false; for (User *U : Call.users()) { auto *UseCall = dyn_cast<CallBase>(U); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a4c1e26..899baa9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8086,13 +8086,76 @@ static SDValue getZT0FrameIndex(MachineFrameInfo &MFI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); } +// Emit a call to __arm_sme_save or __arm_sme_restore. +static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, + SelectionDAG &DAG, + AArch64FunctionInfo *Info, SDLoc DL, + SDValue Chain, bool IsSave) { + MachineFunction &MF = DAG.getMachineFunction(); + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + FuncInfo->setSMESaveBufferUsed(); + TargetLowering::ArgListTy Args; + Args.emplace_back( + DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), + PointerType::getUnqual(*DAG.getContext())); + + RTLIB::Libcall LC = + IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy(DAG.getDataLayout())); + auto *RetTy = Type::getVoidTy(*DAG.getContext()); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( + TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); + return TLI.LowerCallTo(CLI).second; +} + +static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, + const AArch64TargetLowering &TLI, + const AArch64RegisterInfo &TRI, + AArch64FunctionInfo &FuncInfo, + SelectionDAG &DAG) { + // Conditionally restore the lazy save using a pseudo node. + RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; + TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj(); + SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask( + DAG.getMachineFunction(), TLI.getLibcallCallingConv(LC))); + SDValue RestoreRoutine = DAG.getTargetExternalSymbol( + TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); + SDValue TPIDR2_EL0 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + // Copy the address of the TPIDR2 block into X0 before 'calling' the + // RESTORE_ZA pseudo. + SDValue Glue; + SDValue TPIDR2Block = DAG.getFrameIndex( + TPIDR2.FrameIndex, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue); + Chain = + DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, + {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), + RestoreRoutine, RegMask, Chain.getValue(1)}); + // Finally reset the TPIDR2_EL0 register to 0. + Chain = DAG.getNode( + ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64)); + TPIDR2.Uses++; + return Chain; +} + SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, SelectionDAG &DAG) const { assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); SDValue Glue = Chain.getValue(1); MachineFunction &MF = DAG.getMachineFunction(); - SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs(); + auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>(); + auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); + + SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs(); // The following conditions are true on entry to an exception handler: // - PSTATE.SM is 0. @@ -8107,14 +8170,43 @@ SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, // These mode changes are usually optimized away in catch blocks as they // occur before the __cxa_begin_catch (which is a non-streaming function), // but are necessary in some cases (such as for cleanups). + // + // Additionally, if the function has ZA or ZT0 state, we must restore it. + // [COND_]SMSTART SM if (SMEFnAttrs.hasStreamingInterfaceOrBody()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, - /*Glue*/ Glue, AArch64SME::Always); + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, + /*Glue*/ Glue, AArch64SME::Always); + else if (SMEFnAttrs.hasStreamingCompatibleInterface()) + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, + AArch64SME::IfCallerIsStreaming); - if (SMEFnAttrs.hasStreamingCompatibleInterface()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, - AArch64SME::IfCallerIsStreaming); + if (getTM().useNewSMEABILowering()) + return Chain; + + if (SMEFnAttrs.hasAgnosticZAInterface()) { + // Restore full ZA + Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain, + /*IsSave=*/false); + } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) { + // SMSTART ZA + Chain = DAG.getNode( + AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, + DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32)); + + // Restore ZT0 + if (SMEFnAttrs.hasZT0State()) { + SDValue ZT0FrameIndex = + getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG); + Chain = + DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other), + {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex}); + } + + // Restore ZA + if (SMEFnAttrs.hasZAState()) + Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG); + } return Chain; } @@ -9232,30 +9324,6 @@ SDValue AArch64TargetLowering::changeStreamingMode( return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1)); } -// Emit a call to __arm_sme_save or __arm_sme_restore. -static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, - SelectionDAG &DAG, - AArch64FunctionInfo *Info, SDLoc DL, - SDValue Chain, bool IsSave) { - MachineFunction &MF = DAG.getMachineFunction(); - AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); - FuncInfo->setSMESaveBufferUsed(); - TargetLowering::ArgListTy Args; - Args.emplace_back( - DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), - PointerType::getUnqual(*DAG.getContext())); - - RTLIB::Libcall LC = - IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; - SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), - TLI.getPointerTy(DAG.getDataLayout())); - auto *RetTy = Type::getVoidTy(*DAG.getContext()); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); - return TLI.LowerCallTo(CLI).second; -} - static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs) { if (!CallAttrs.caller().hasStreamingCompatibleInterface() || @@ -10015,33 +10083,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); if (RequiresLazySave) { - // Conditionally restore the lazy save using a pseudo node. - RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; - TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - SDValue RegMask = DAG.getRegisterMask( - TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC))); - SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - SDValue TPIDR2_EL0 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - // Copy the address of the TPIDR2 block into X0 before 'calling' the - // RESTORE_ZA pseudo. - SDValue Glue; - SDValue TPIDR2Block = DAG.getFrameIndex( - TPIDR2.FrameIndex, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); - Result = - DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, - {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), - RestoreRoutine, RegMask, Result.getValue(1)}); - // Finally reset the TPIDR2_EL0 register to 0. - Result = DAG.getNode( - ISD::INTRINSIC_VOID, DL, MVT::Other, Result, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64)); - TPIDR2.Uses++; + Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG); } else if (RequiresSaveAllZA) { Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result, /*IsSave=*/false); @@ -11736,6 +11778,28 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return DAG.getNode(ISD::AND, DL, VT, LHS, Shift); } + // Check for sign bit test patterns that can use TST optimization. + // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + // (SELECT_CC setlt, sign_extend, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() && + (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG || + LHS.getOpcode() == ISD::SIGN_EXTEND)) { + + uint64_t SignBitPos; + std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); + EVT TestVT = LHS.getValueType(); + SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT); + SDValue TST = + DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32), + LHS, SignBitConst); + + SDValue Flags = TST.getValue(1); + return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal, + DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags); + } + // Canonicalise absolute difference patterns: // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 04b3c90..f788c75 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -9907,8 +9907,14 @@ def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 (REV64v4i16 FPR64:$src))>; } -def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4bf16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), + (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4f16 FPR64:$src))), + (v4bf16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; @@ -10236,8 +10242,14 @@ def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 (REV32v8i16 FPR128:$src))>; } -def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), + (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8bf16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8f16 FPR128:$src))), + (v8bf16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index ecd003c..098bcfa 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -9559,3 +9559,20 @@ bool LoongArchTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { EVT ScalarVT = VecVT.getScalarType(); return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); } + +bool LoongArchTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + // Extract a 128-bit subvector from index 0 of a 256-bit vector is free. + return Index == 0; +} + +bool LoongArchTargetLowering::isExtractVecEltCheap(EVT VT, + unsigned Index) const { + EVT EltVT = VT.getScalarType(); + + // Extract a scalar FP value from index 0 of a vector is free. + return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 3c00296..9b60a9f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -338,6 +338,9 @@ public: unsigned Depth) const override; bool shouldScalarizeBinop(SDValue VecOp) const override; + bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const override; + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override; /// Check if a constant splat can be generated using [x]vldi, where imm[12] /// is 1. diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp index 33dc0a2..a1d4e0b 100644 --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction( return true; }; + // Is ADD(X,X) more efficient than SHL(X,1)? + auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool { + if (MI.getOperand(NumOperands - 1).getImm() != 1) + return false; + if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true)) + return false; + LLVM_DEBUG(dbgs() << "Replacing: " << MI); + { + MI.setDesc(TII->get(AddOpc)); + MI.removeOperand(NumOperands - 1); + MI.addOperand(MI.getOperand(NumOperands - 2)); + } + LLVM_DEBUG(dbgs() << " With: " << MI); + return false; + }; + switch (Opc) { case X86::BLENDPDrri: return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1); @@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction( return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz); case X86::VUNPCKHPSZrmkz: return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz); + + case X86::PSLLWri: + return ProcessShiftLeftToAdd(X86::PADDWrr); + case X86::VPSLLWri: + return ProcessShiftLeftToAdd(X86::VPADDWrr); + case X86::VPSLLWYri: + return ProcessShiftLeftToAdd(X86::VPADDWYrr); + case X86::VPSLLWZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ128rr); + case X86::VPSLLWZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ256rr); + case X86::VPSLLWZri: + return ProcessShiftLeftToAdd(X86::VPADDWZrr); + case X86::PSLLDri: + return ProcessShiftLeftToAdd(X86::PADDDrr); + case X86::VPSLLDri: + return ProcessShiftLeftToAdd(X86::VPADDDrr); + case X86::VPSLLDYri: + return ProcessShiftLeftToAdd(X86::VPADDDYrr); + case X86::VPSLLDZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ128rr); + case X86::VPSLLDZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ256rr); + case X86::VPSLLDZri: + return ProcessShiftLeftToAdd(X86::VPADDDZrr); + case X86::PSLLQri: + return ProcessShiftLeftToAdd(X86::PADDQrr); + case X86::VPSLLQri: + return ProcessShiftLeftToAdd(X86::VPADDQrr); + case X86::VPSLLQYri: + return ProcessShiftLeftToAdd(X86::VPADDQYrr); + case X86::VPSLLQZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDQZ128rr); + case X86::VPSLLQZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDQZ256rr); + case X86::VPSLLQZri: + return ProcessShiftLeftToAdd(X86::VPADDQZrr); + default: return false; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index efeddd7..e7eb67a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4456,8 +4456,8 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, bool AllowAVX512 = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; - if ((CheckBWI && Subtarget.useBWIRegs()) || - (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) { + if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) || + (!CheckBWI && Subtarget.useAVX512Regs()))) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); @@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); - if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) { - // Hardware support for vector shifts is sparse which makes us scalarize the - // vector operations in many cases. Also, on sandybridge ADD is faster than - // shl: (shl V, 1) -> (add (freeze V), (freeze V)) - if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { - // R may be undef at run-time, but (shl R, 1) must be an even number (LSB - // must be 0). (add undef, undef) however can be any value. To make this - // safe, we must freeze R to ensure that register allocation uses the same - // register for an undefined value. This ensures that the result will - // still be even and preserves the original semantics. - R = DAG.getFreeze(R); - return DAG.getNode(ISD::ADD, dl, VT, R, R); - } - + if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); - } // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || @@ -46211,7 +46197,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, SDValue Zero = DAG.getConstant(0, DL, DpVT); return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1}, - DpBuilder, false); + DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI()); } // Create a PSADBW given two sources representable as zexts of vXi8. diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index c215228..89980d5 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/DropUnnecessaryAssumes.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicInst.h" @@ -17,13 +18,48 @@ using namespace llvm; using namespace llvm::PatternMatch; static bool affectedValuesAreEphemeral(ArrayRef<Value *> Affected) { - // If all the affected uses have only one use (part of the assume), then - // the assume does not provide useful information. Note that additional - // users may appear as a result of inlining and CSE, so we should only - // make this assumption late in the optimization pipeline. - // TODO: Handle dead cyclic usages. - // TODO: Handle multiple dead assumes on the same value. - return all_of(Affected, match_fn(m_OneUse(m_Value()))); + // Check whether all the uses are ephemeral, i.e. recursively only used + // by assumes. In that case, the assume does not provide useful information. + // Note that additional users may appear as a result of inlining and CSE, + // so we should only make this assumption late in the optimization pipeline. + SmallSetVector<Instruction *, 32> Worklist; + auto AddUsers = [&](Value *V) { + for (User *U : V->users()) { + // Bail out if we need to inspect too many users. + if (Worklist.size() >= 32) + return false; + Worklist.insert(cast<Instruction>(U)); + } + return true; + }; + + for (Value *V : Affected) { + // Do not handle assumes on globals for now. The use list for them may + // contain uses in other functions. + if (!isa<Instruction, Argument>(V)) + return false; + + if (!AddUsers(V)) + return false; + } + + for (unsigned Idx = 0; Idx < Worklist.size(); ++Idx) { + Instruction *I = Worklist[Idx]; + + // Use in assume is ephemeral. + if (isa<AssumeInst>(I)) + continue; + + // Use in side-effecting instruction is non-ephemeral. + if (I->mayHaveSideEffects() || I->isTerminator()) + return false; + + // Otherwise, recursively look at the users. + if (!AddUsers(I)) + return false; + } + + return true; } PreservedAnalyses diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 96f52076..ab5c9c9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3902,7 +3902,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4159,7 +4160,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, + *CM.PSE.getSE()); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -6834,7 +6836,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7067,7 +7069,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -8597,7 +8600,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -10054,7 +10058,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind); + CM.CostKind, *CM.PSE.getSE()); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 81f1956..728d291 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1750,7 +1750,8 @@ VPCostContext::getOperandInfo(VPValue *V) const { } InstructionCost VPCostContext::getScalarizationOverhead( - Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) { + Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF, + bool AlwaysIncludeReplicatingR) { if (VF.isScalar()) return 0; @@ -1770,7 +1771,9 @@ InstructionCost VPCostContext::getScalarizationOverhead( SmallPtrSet<const VPValue *, 4> UniqueOperands; SmallVector<Type *> Tys; for (auto *Op : Operands) { - if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) || + if (Op->isLiveIn() || + (!AlwaysIncludeReplicatingR && + isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fe59774..2a8baec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -349,12 +349,14 @@ struct VPCostContext { LoopVectorizationCostModel &CM; SmallPtrSet<Instruction *, 8> SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; + ScalarEvolution &SE; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind) + TargetTransformInfo::TargetCostKind CostKind, + ScalarEvolution &SE) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind) {} + CostKind(CostKind), SE(SE) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -374,10 +376,12 @@ struct VPCostContext { /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy /// and \p Operands with \p VF. This is a convenience wrapper for the - /// type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Type *ResultTy, - ArrayRef<const VPValue *> Operands, - ElementCount VF); + /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR + /// is true, always compute the cost of scalarizing replicating operands. + InstructionCost + getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands, + ElementCount VF, + bool AlwaysIncludeReplicatingR = false); }; /// This class can be used to assign names to VPValues. For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index cf5e6bf..b5e30cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const { }); } +/// Returns true if \p Ptr is a pointer computation for which the legacy cost +/// model computes a SCEV expression when computing the address cost. +static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { + auto *PtrR = Ptr->getDefiningRecipe(); + if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) && + cast<VPReplicateRecipe>(PtrR)->getOpcode() == + Instruction::GetElementPtr) || + isa<VPWidenGEPRecipe>(PtrR))) + return false; + + // We are looking for a GEP where all indices are either loop invariant or + // inductions. + for (VPValue *Opd : drop_begin(PtrR->operands())) { + if (!Opd->isDefinedOutsideLoopRegions() && + !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd)) + return false; + } + + return true; +} + +/// Returns true if \p V is used as part of the address of another load or +/// store. +static bool isUsedByLoadStoreAddress(const VPUser *V) { + SmallPtrSet<const VPUser *, 4> Seen; + SmallVector<const VPUser *> WorkList = {V}; + + while (!WorkList.empty()) { + auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val()); + if (!Cur || !Seen.insert(Cur).second) + continue; + + for (VPUser *U : Cur->users()) { + if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U)) + if (InterleaveR->getAddr() == Cur) + return true; + if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) { + if (RepR->getOpcode() == Instruction::Load && + RepR->getOperand(0) == Cur) + return true; + if (RepR->getOpcode() == Instruction::Store && + RepR->getOperand(1) == Cur) + return true; + } + if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) { + if (MemR->getAddr() == Cur && MemR->isConsecutive()) + return true; + } + } + + append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users()); + } + return false; +} + InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { Instruction *UI = cast<Instruction>(getUnderlyingValue()); @@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, } case Instruction::Load: case Instruction::Store: { - if (isSingleScalar()) { - bool IsLoad = UI->getOpcode() == Instruction::Load; - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = getLoadStoreAddressSpace(UI); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); - return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); - } + if (VF.isScalable() && !isSingleScalar()) + return InstructionCost::getInvalid(); + // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - break; + const VPRegionBlock *ParentRegion = getParent()->getParent(); + if (ParentRegion && ParentRegion->isReplicator()) + break; + + bool IsLoad = UI->getOpcode() == Instruction::Load; + const VPValue *PtrOp = getOperand(!IsLoad); + // TODO: Handle cases where we need to pass a SCEV to + // getAddressComputationCost. + if (shouldUseAddressAccessSCEV(PtrOp)) + break; + + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); + + Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); + + InstructionCost ScalarCost = + ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + PtrTy, &Ctx.SE, nullptr, Ctx.CostKind); + if (isSingleScalar()) + return ScalarCost; + + SmallVector<const VPValue *> OpsToScalarize; + Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); + // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we + // don't assign scalarization overhead in general, if the target prefers + // vectorized addressing or the loaded value is used as part of an address + // of another load or store. + bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); + if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) { + bool EfficientVectorLoadStore = + Ctx.TTI.supportsEfficientVectorElementLoadStore(); + if (!(IsLoad && !PreferVectorizedAddressing) && + !(!IsLoad && EfficientVectorLoadStore)) + append_range(OpsToScalarize, operands()); + + if (!EfficientVectorLoadStore) + ResultTy = Ctx.Types.inferScalarType(this); + } + + return (ScalarCost * VF.getFixedValue()) + + Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); } } diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll index 1c216e7..e371748 100644 --- a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll @@ -11,6 +11,16 @@ entry: ret <4 x i16> %1 } +define <4 x half> @v4bf16_to_v4f16(float, <4 x bfloat> %a) nounwind { +; CHECK-LABEL: v4bf16_to_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret +entry: + %1 = bitcast <4 x bfloat> %a to <4 x half> + ret <4 x half> %1 +} + define <2 x i32> @v4bf16_to_v2i32(float, <4 x bfloat> %a) nounwind { ; CHECK-LABEL: v4bf16_to_v2i32: ; CHECK: // %bb.0: // %entry @@ -82,6 +92,16 @@ entry: ret <4 x bfloat> %1 } +define <4 x bfloat> @v4f16_to_v4bf16(float, <4 x half> %a) nounwind { +; CHECK-LABEL: v4f16_to_v4bf16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret +entry: + %1 = bitcast <4 x half> %a to <4 x bfloat> + ret <4 x bfloat> %1 +} + define <4 x bfloat> @v2i32_to_v4bf16(float, <2 x i32> %a) nounwind { ; CHECK-LABEL: v2i32_to_v4bf16: ; CHECK: // %bb.0: // %entry @@ -152,6 +172,16 @@ entry: ret <8 x i16> %1 } +define <8 x half> @v8bf16_to_v8f16(float, <8 x bfloat> %a) nounwind { +; CHECK-LABEL: v8bf16_to_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %1 = bitcast <8 x bfloat> %a to <8 x half> + ret <8 x half> %1 +} + define <4 x i32> @v8bf16_to_v4i32(float, <8 x bfloat> %a) nounwind { ; CHECK-LABEL: v8bf16_to_v4i32: ; CHECK: // %bb.0: // %entry @@ -202,6 +232,16 @@ entry: ret <8 x bfloat> %1 } +define <8 x bfloat> @v8f16_to_v8bf16(float, <8 x half> %a) nounwind { +; CHECK-LABEL: v8f16_to_v8bf16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %1 = bitcast <8 x half> %a to <8 x bfloat> + ret <8 x bfloat> %1 +} + define <8 x bfloat> @v4i32_to_v8bf16(float, <4 x i32> %a) nounwind { ; CHECK-LABEL: v4i32_to_v8bf16: ; CHECK: // %bb.0: // %entry diff --git a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll index 0960c4c..a56d5b1 100644 --- a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll +++ b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll @@ -78,9 +78,8 @@ B: define i32 @g_i8_sign_extend_inreg(i8 %in, i32 %a, i32 %b) nounwind { ; CHECK-LABEL: g_i8_sign_extend_inreg: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w8, w1, w2, mi +; CHECK-NEXT: tst w0, #0x80 +; CHECK-NEXT: csel w8, w1, w2, ne ; CHECK-NEXT: add w0, w8, w0, uxtb ; CHECK-NEXT: ret entry: @@ -100,9 +99,8 @@ B: define i32 @g_i16_sign_extend_inreg(i16 %in, i32 %a, i32 %b) nounwind { ; CHECK-LABEL: g_i16_sign_extend_inreg: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w8, w1, w2, mi +; CHECK-NEXT: tst w0, #0x8000 +; CHECK-NEXT: csel w8, w1, w2, ne ; CHECK-NEXT: add w0, w8, w0, uxth ; CHECK-NEXT: ret entry: @@ -167,10 +165,8 @@ B: define i64 @g_i32_sign_extend_i64(i32 %in, i64 %a, i64 %b) nounwind { ; CHECK-LABEL: g_i32_sign_extend_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: csel x8, x1, x2, mi +; CHECK-NEXT: tst w0, #0x80000000 +; CHECK-NEXT: csel x8, x1, x2, ne ; CHECK-NEXT: add x0, x8, w0, uxtw ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 18665bc..7195e2b 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -2093,3 +2093,54 @@ define <2 x i1> @icmp_slt_v2i64_Zero_LHS(<2 x i64> %a) { %c = icmp slt <2 x i64> <i64 0, i64 0>, %a ret <2 x i1> %c } + +; Test TST optimization for i8 sign bit testing with cross-type select +; This tests the pattern: icmp slt i8 %val, 0; select i1 %cmp, i32 %a, i32 %b +; The optimization should convert sxtb+cmp to tst for sign bit testing. + +define i32 @i8_signbit_tst_constants(i8 %x, i8 %y) { +; CHECK-SD-LABEL: i8_signbit_tst_constants: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w9, w0, w1 +; CHECK-SD-NEXT: mov w8, #42 // =0x2a +; CHECK-SD-NEXT: tst w9, #0x80 +; CHECK-SD-NEXT: mov w9, #20894 // =0x519e +; CHECK-SD-NEXT: csel w0, w9, w8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i8_signbit_tst_constants: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, w1 +; CHECK-GI-NEXT: mov w9, #42 // =0x2a +; CHECK-GI-NEXT: mov w10, #20894 // =0x519e +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: cmp w8, #0 +; CHECK-GI-NEXT: csel w0, w10, w9, mi +; CHECK-GI-NEXT: ret + %add = add i8 %x, %y + %cmp = icmp slt i8 %add, 0 + %sel = select i1 %cmp, i32 20894, i32 42 + ret i32 %sel +} + +; Test i8 sign bit testing with variable select values (problematic case) +define i32 @i8_signbit_variables(i8 %x, i8 %y, i32 %a, i32 %b) { +; CHECK-SD-LABEL: i8_signbit_variables: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x80 +; CHECK-SD-NEXT: csel w0, w2, w3, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i8_signbit_variables: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, w1 +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: cmp w8, #0 +; CHECK-GI-NEXT: csel w0, w2, w3, mi +; CHECK-GI-NEXT: ret + %add = add i8 %x, %y + %cmp = icmp slt i8 %add, 0 + %sel = select i1 %cmp, i32 %a, i32 %b + ret i32 %sel +} diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index fc43c71..b6dee97e 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-SDAG ; A simple EH test case that corresponds to the following C++ source: ; @@ -87,6 +88,90 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl _Unwind_Resume +; +; CHECK-SDAG-LABEL: za_with_raii: +; CHECK-SDAG: .Lfunc_begin0: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception0 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: tbnz w0, #0, .LBB0_2 +; CHECK-SDAG-NEXT: // %bb.1: // %return_normally +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: b shared_za_call +; CHECK-SDAG-NEXT: .LBB0_2: // %throw_exception +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: mov w0, #8 // =0x8 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl __cxa_allocate_exception +; CHECK-SDAG-NEXT: mov x8, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x9, .LBB0_4 +; CHECK-SDAG-NEXT: // %bb.3: // %throw_exception +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_4: // %throw_exception +; CHECK-SDAG-NEXT: adrp x9, .L.str +; CHECK-SDAG-NEXT: add x9, x9, :lo12:.L.str +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: str x9, [x8] +; CHECK-SDAG-NEXT: .Ltmp0: // EH_LABEL +; CHECK-SDAG-NEXT: adrp x1, :got:typeinfo_for_char_const_ptr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: mov x0, x8 +; CHECK-SDAG-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr] +; CHECK-SDAG-NEXT: mov x2, xzr +; CHECK-SDAG-NEXT: bl __cxa_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_6 +; CHECK-SDAG-NEXT: // %bb.5: // %throw_exception +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_6: // %throw_exception +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp1: // EH_LABEL +; CHECK-SDAG-NEXT: // %bb.7: // %throw_fail +; CHECK-SDAG-NEXT: .LBB0_8: // %unwind_dtors +; CHECK-SDAG-NEXT: .Ltmp2: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_10 +; CHECK-SDAG-NEXT: // %bb.9: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_10: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl _Unwind_Resume +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_12 +; CHECK-SDAG-NEXT: // %bb.11: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_12: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr br i1 %fail, label %throw_exception, label %return_normally throw_exception: @@ -124,7 +209,7 @@ throw_fail: ; } ; shared_za_call(); ; } -define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v0 { +define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: try_catch: ; CHECK: .Lfunc_begin1: ; CHECK-NEXT: .cfi_startproc @@ -142,11 +227,11 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: .Ltmp3: // EH_LABEL ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl may_throw -; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: .Ltmp4: // EH_LABEL ; CHECK-NEXT: .LBB1_1: // %after_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -160,7 +245,7 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: b shared_za_call ; CHECK-NEXT: .LBB1_4: // %catch -; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: .Ltmp5: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -175,6 +260,78 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl __cxa_end_catch ; CHECK-NEXT: b .LBB1_1 +; +; CHECK-SDAG-LABEL: try_catch: +; CHECK-SDAG: .Lfunc_begin1: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception1 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: .Ltmp3: // EH_LABEL +; CHECK-SDAG-NEXT: sub x19, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_2: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp4: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB1_3: // %after_catch +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: b shared_za_call +; CHECK-SDAG-NEXT: .LBB1_4: // %catch +; CHECK-SDAG-NEXT: .Ltmp5: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_6 +; CHECK-SDAG-NEXT: // %bb.5: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_6: // %catch +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_8 +; CHECK-SDAG-NEXT: // %bb.7: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_8: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_10 +; CHECK-SDAG-NEXT: // %bb.9: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_10: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b .LBB1_3 invoke void @may_throw() to label %after_catch unwind label %catch @@ -235,16 +392,16 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx ; CHECK-NEXT: zero {za} ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: .Ltmp6: // EH_LABEL ; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: .Ltmp7: // EH_LABEL ; CHECK-NEXT: .LBB2_3: // %exit ; CHECK-NEXT: smstop za ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_4: // %catch -; CHECK-NEXT: .Ltmp8: +; CHECK-NEXT: .Ltmp8: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -260,6 +417,78 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx ; CHECK-NEXT: bl __cxa_end_catch ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: b .LBB2_3 +; +; CHECK-SDAG-LABEL: try_catch_shared_za_callee: +; CHECK-SDAG: .Lfunc_begin2: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception2 +; CHECK-SDAG-NEXT: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB2_2 +; CHECK-SDAG-NEXT: // %bb.1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB2_2: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero {za} +; CHECK-SDAG-NEXT: .Ltmp6: // EH_LABEL +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: .Ltmp7: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB2_3: // %exit +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB2_4: // %catch +; CHECK-SDAG-NEXT: .Ltmp8: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: sub x19, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_6 +; CHECK-SDAG-NEXT: // %bb.5: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_6: // %catch +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_8 +; CHECK-SDAG-NEXT: // %bb.7: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_8: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl noexcept_shared_za_call +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_10 +; CHECK-SDAG-NEXT: // %bb.9: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_10: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b .LBB2_3 invoke void @shared_za_call() #4 to label %exit unwind label %catch catch: @@ -275,6 +504,234 @@ exit: ret void } +; A simple ZT0 exception example that corresponds to: +; +; struct ZT0Resource { +; ~ZT0Resource() __arm_inout("zt0") { +; shared_zt0_call(); // simulate cleanup in destructor +; } +; }; +; +; void za_with_raii() __arm_inout("zt0") { +; ZT0Resource r; +; may_throw(); +; } +; +; This code may require reloading ZT0 in the cleanup for ~ZT0Resource(). +; +; FIXME: Codegen with `-aarch64-new-sme-abi` is broken with ZT0 (as it is not implemented). +define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_shared_zt0_callee: +; CHECK: .Lfunc_begin3: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception3 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .Ltmp9: // EH_LABEL +; CHECK-NEXT: sub x19, x29, #64 +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: .Ltmp10: // EH_LABEL +; CHECK-NEXT: // %bb.1: // %return_normally +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_2: // %unwind_dtors +; CHECK-NEXT: .Ltmp11: // EH_LABEL +; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB3_4 +; CHECK-NEXT: // %bb.3: // %unwind_dtors +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB3_4: // %unwind_dtors +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: bl shared_zt0_call +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; +; CHECK-SDAG-LABEL: try_catch_shared_zt0_callee: +; CHECK-SDAG: .Lfunc_begin3: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception3 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: sub sp, sp, #96 +; CHECK-SDAG-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 96 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -32 +; CHECK-SDAG-NEXT: .Ltmp9: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: .Ltmp10: // EH_LABEL +; CHECK-SDAG-NEXT: // %bb.1: // %return_normally +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: add sp, sp, #96 +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB3_2: // %unwind_dtors +; CHECK-SDAG-NEXT: .Ltmp11: // EH_LABEL +; CHECK-SDAG-NEXT: mov x20, sp +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: bl shared_zt0_call +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl _Unwind_Resume +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] + invoke void @may_throw() + to label %return_normally unwind label %unwind_dtors + +unwind_dtors: + %5 = landingpad { ptr, i32 } + cleanup + tail call void @shared_zt0_call() + resume { ptr, i32 } %5 + +return_normally: + ret void +} + +; This example corresponds to: +; +; __arm_agnostic("sme_za_state") void try_catch_agnostic_za() +; { +; try { +; may_throw(); +; } catch(...) { +; } +; } +; +; In this example we must execute __arm_sme_restore once we enter the catch block +; (before executing __arm_sme_save again, which would invalidate the prior save). +define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_agnostic_za: +; CHECK: .Lfunc_begin4: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception4 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .Ltmp12: // EH_LABEL +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp13: // EH_LABEL +; CHECK-NEXT: .LBB4_1: // %exit +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_2: // %catch +; CHECK-NEXT: .Ltmp14: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB4_1 +; +; CHECK-SDAG-LABEL: try_catch_agnostic_za: +; CHECK-SDAG: .Lfunc_begin4: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception4 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: sub sp, sp, x0 +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: .Ltmp12: // EH_LABEL +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: .Ltmp13: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB4_1: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB4_2: // %catch +; CHECK-SDAG-NEXT: .Ltmp14: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: b .LBB4_1 + invoke void @may_throw() + to label %exit unwind label %catch +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + declare ptr @__cxa_allocate_exception(i64) declare void @__cxa_throw(ptr, ptr, ptr) declare ptr @__cxa_begin_catch(ptr) @@ -284,3 +741,4 @@ declare i32 @__gxx_personality_v0(...) declare void @may_throw() declare void @shared_za_call() "aarch64_inout_za" declare void @noexcept_shared_za_call() "aarch64_inout_za" +declare void @shared_zt0_call() "aarch64_inout_zt0" diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll index 62d41fc..19e1aa5 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll @@ -26,9 +26,9 @@ define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_and_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.b[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: tst w8, #0x80 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x) @@ -120,9 +120,9 @@ define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_and_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: tst w8, #0x8000 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i16> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x) @@ -305,9 +305,9 @@ define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_or_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.b[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: tst w8, #0x80 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) @@ -399,9 +399,9 @@ define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_or_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: tst w8, #0x8000 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i16> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) @@ -584,9 +584,9 @@ define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_xor_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.b[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: tst w8, #0x80 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) @@ -679,9 +679,9 @@ define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_xor_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: tst w8, #0x8000 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i16> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index 92d3277..bb22144 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -4148,28 +4148,28 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; -------------------------------------------------------------------------------- define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) { -; CI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_add_f32_e32 v3, 4.0, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v2, 4.0, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc -; CI-SAFE-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: select_fneg_posk_src_add_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: ; VI-SAFE: ; %bb.0: @@ -4229,21 +4229,6 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: -; CI-NSZ: ; %bb.0: -; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NSZ-NEXT: v_sub_f32_e32 v2, -4.0, v2 -; CI-NSZ-NEXT: v_sub_f32_e32 v3, -4.0, v3 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NSZ-NEXT: s_setpc_b64 s[30:31] -; ; VI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4302,6 +4287,105 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ret <2 x half> %select } +define <2 x half> @select_fneg_posk_src_add_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %y) { +; CI-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v2, -4.0, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq <2 x i32> %c, zeroinitializer + %add = fadd nsz <2 x half> %x, <half 4.0, half 4.0> + %fneg = fneg <2 x half> %add + %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> <half 2.0, half 2.0> + ret <2 x half> %select +} + define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16: ; CI-SAFE: ; %bb.0: @@ -4704,34 +4788,34 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < } define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %z) { -; CI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-SAFE-NEXT: v_mul_f32_e32 v3, 4.0, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v3, v3, v5 -; CI-SAFE-NEXT: v_mul_f32_e32 v2, 4.0, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v2, v2, v4 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc -; CI-SAFE-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: select_fneg_posk_src_fmad_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_mul_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_add_f32_e32 v3, v3, v5 +; CI-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_add_f32_e32 v2, v2, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-SAFE: ; %bb.0: @@ -4793,27 +4877,6 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: -; CI-NSZ: ; %bb.0: -; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NSZ-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NSZ-NEXT: v_mul_f32_e32 v3, -4.0, v3 -; CI-NSZ-NEXT: v_sub_f32_e32 v2, v2, v4 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_sub_f32_e32 v3, v3, v5 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NSZ-NEXT: s_setpc_b64 s[30:31] -; ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4873,6 +4936,112 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ret <2 x half> %select } +define <2 x half> @select_fneg_posk_src_fmad_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %z) { +; CI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_sub_f32_e32 v2, v2, v4 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_sub_f32_e32 v3, v3, v5 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_fma_f16 v1, v4, -4.0, -v1 +; VI-NEXT: v_fma_f16 v2, v2, -4.0, -v3 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq <2 x i32> %c, zeroinitializer + %fmad = call nsz <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> <half 4.0, half 4.0>, <2 x half> %z) + %fneg = fneg <2 x half> %fmad + %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> <half 2.0, half 2.0> + ret <2 x half> %select +} + declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll index c128715..f5dc824 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -116,7 +116,7 @@ entry: ; GCN-LABEL: {{^}}nsz_mad_sub0_src0: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) { entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -125,7 +125,7 @@ entry: %b = load float, ptr addrspace(1) %b_ptr %c = load float, ptr addrspace(1) %c_ptr - %neg_a = fsub float 0.0, %a + %neg_a = fsub nsz float 0.0, %a %tmp0 = fmul float %neg_a, %b %tmp1 = fadd float %tmp0, %c @@ -176,7 +176,7 @@ entry: ; GCN-LABEL: {{^}}nsz_mad_sub0_src1: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) { entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -185,7 +185,7 @@ entry: %b = load float, ptr addrspace(1) %b_ptr %c = load float, ptr addrspace(1) %c_ptr - %neg_b = fsub float 0.0, %b + %neg_b = fsub nsz float 0.0, %b %tmp0 = fmul float %a, %neg_b %tmp1 = fadd float %tmp0, %c @@ -310,6 +310,5 @@ define float @v_mac_f32_dynamic_ftz(float %a, float %b, float %c) "denormal-fp-m declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" } -attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" } attributes #2 = { nounwind readnone } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll index bcc60b0..8da6f23 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -236,7 +236,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %c.val = load half, ptr addrspace(1) %c - %a.neg = fsub half 0.0, %a.val + %a.neg = fsub nsz half 0.0, %a.val %t.val = fmul half %a.neg, %b.val %r.val = fadd half %t.val, %c.val @@ -263,7 +263,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %c.val = load half, ptr addrspace(1) %c - %b.neg = fsub half 0.0, %b.val + %b.neg = fsub nsz half 0.0, %b.val %t.val = fmul half %a.val, %b.neg %r.val = fadd half %t.val, %c.val @@ -290,7 +290,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %c.val = load half, ptr addrspace(1) %c - %c.neg = fsub half 0.0, %c.val + %c.neg = fsub nsz half 0.0, %c.val %t.val = fmul half %a.val, %b.val %r.val = fadd half %t.val, %c.neg @@ -601,7 +601,7 @@ entry: %b.val = load <2 x half>, ptr addrspace(1) %b %c.val = load <2 x half>, ptr addrspace(1) %c - %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val + %a.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %a.val %t.val = fmul <2 x half> %a.neg, %b.val %r.val = fadd <2 x half> %t.val, %c.val @@ -634,7 +634,7 @@ entry: %b.val = load <2 x half>, ptr addrspace(1) %b %c.val = load <2 x half>, ptr addrspace(1) %c - %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val + %b.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %b.val %t.val = fmul <2 x half> %a.val, %b.neg %r.val = fadd <2 x half> %t.val, %c.val @@ -667,7 +667,7 @@ entry: %b.val = load <2 x half>, ptr addrspace(1) %b %c.val = load <2 x half>, ptr addrspace(1) %c - %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val + %c.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %c.val %t.val = fmul <2 x half> %a.val, %b.val %r.val = fadd <2 x half> %t.val, %c.neg @@ -678,5 +678,5 @@ entry: declare void @llvm.amdgcn.s.barrier() #2 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" } -attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #2 = { nounwind convergent } diff --git a/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll new file mode 100644 index 0000000..39ac647 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s + +define <8 x float> @fadd_elt0_v8f32(float %a) nounwind { +; CHECK-LABEL: fadd_elt0_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -1168 +; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %b = insertelement <8 x float> poison, float %a, i32 0 + %c = fadd <8 x float> %b, <float 1.0, float poison, float poison, float poison, float poison, float poison, float poison, float poison> + ret <8 x float> %c +} + +define <4 x double> @fadd_elt0_v4f64(double %a) nounwind { +; CHECK-LABEL: fadd_elt0_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -912 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x double> poison, double %a, i32 0 + %c = fadd <4 x double> %b, <double 1.0, double poison, double poison, double poison> + ret <4 x double> %c +} + +define <8 x float> @fsub_splat_v8f32(float %a, float %b) nounwind { +; CHECK-LABEL: fsub_splat_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsub.s $fa0, $fa0, $fa1 +; CHECK-NEXT: xvreplve0.w $xr0, $xr0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <8 x float> poison, float %a, i32 0 + %insb = insertelement <8 x float> poison, float %b, i32 0 + %va = shufflevector <8 x float> %insa, <8 x float> poison, <8 x i32> zeroinitializer + %vb = shufflevector <8 x float> %insb, <8 x float> poison, <8 x i32> zeroinitializer + %c = fsub <8 x float> %va, %vb + ret <8 x float> %c +} + +define <4 x double> @fsub_splat_v4f64(double %a) nounwind { +; CHECK-LABEL: fsub_splat_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -784 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 +; CHECK-NEXT: xvreplve0.d $xr0, $xr0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <4 x double> poison, double %a, i32 0 + %insb = insertelement <4 x double> poison, double 1.0, i32 0 + %va = shufflevector <4 x double> %insa, <4 x double> poison, <4 x i32> zeroinitializer + %vb = shufflevector <4 x double> %insb, <4 x double> poison, <4 x i32> zeroinitializer + %c = fsub <4 x double> %va, %vb + ret <4 x double> %c +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll new file mode 100644 index 0000000..b651f11 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s + +define <4 x float> @fadd_elt0_v4f32(float %a) nounwind { +; CHECK-LABEL: fadd_elt0_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -1168 +; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x float> poison, float %a, i32 0 + %c = fadd <4 x float> %b, <float 1.0, float poison, float poison, float poison> + ret <4 x float> %c +} + +define <2 x double> @fadd_elt0_v2f64(double %a) nounwind { +; CHECK-LABEL: fadd_elt0_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -912 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x double> poison, double %a, i32 0 + %c = fadd <2 x double> %b, <double 1.0, double poison> + ret <2 x double> %c +} + +define <4 x float> @fsub_splat_v4f32(float %b) nounwind { +; CHECK-LABEL: fsub_splat_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -1168 +; CHECK-NEXT: fsub.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <4 x float> poison, float 1.0, i32 0 + %insb = insertelement <4 x float> poison, float %b, i32 0 + %va = shufflevector <4 x float> %insa, <4 x float> poison, <4 x i32> zeroinitializer + %vb = shufflevector <4 x float> %insb, <4 x float> poison, <4 x i32> zeroinitializer + %c = fsub <4 x float> %va, %vb + ret <4 x float> %c +} + +define <2 x double> @fsub_splat_v2f64(double %a, double %b) nounwind { +; CHECK-LABEL: fsub_splat_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsub.d $fa0, $fa0, $fa1 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <2 x double> poison, double %a, i32 0 + %insb = insertelement <2 x double> poison, double %b, i32 0 + %va = shufflevector <2 x double> %insa, <2 x double> poison, <2 x i32> zeroinitializer + %vb = shufflevector <2 x double> %insb, <2 x double> poison, <2 x i32> zeroinitializer + %c = fsub <2 x double> %va, %vb + ret <2 x double> %c +} diff --git a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll index aaabd76e..fd0b494 100644 --- a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll @@ -20,18 +20,18 @@ define float @select_oeq_float(float %a, float %b, float %c, float %d) { ; FAST-P8-LABEL: select_oeq_float: ; FAST-P8: # %bb.0: # %entry -; FAST-P8-NEXT: xssubsp f0, f2, f1 -; FAST-P8-NEXT: xssubsp f1, f1, f2 -; FAST-P8-NEXT: fsel f1, f1, f3, f4 -; FAST-P8-NEXT: fsel f1, f0, f1, f4 +; FAST-P8-NEXT: xssubsp f0, f1, f2 +; FAST-P8-NEXT: xsnegdp f1, f0 +; FAST-P8-NEXT: fsel f0, f0, f3, f4 +; FAST-P8-NEXT: fsel f1, f1, f0, f4 ; FAST-P8-NEXT: blr ; ; FAST-P9-LABEL: select_oeq_float: ; FAST-P9: # %bb.0: # %entry -; FAST-P9-NEXT: xssubsp f0, f2, f1 -; FAST-P9-NEXT: xssubsp f1, f1, f2 -; FAST-P9-NEXT: fsel f1, f1, f3, f4 -; FAST-P9-NEXT: fsel f1, f0, f1, f4 +; FAST-P9-NEXT: xssubsp f0, f1, f2 +; FAST-P9-NEXT: xsnegdp f1, f0 +; FAST-P9-NEXT: fsel f0, f0, f3, f4 +; FAST-P9-NEXT: fsel f1, f1, f0, f4 ; FAST-P9-NEXT: blr ; ; NO-FAST-P8-LABEL: select_oeq_float: @@ -59,6 +59,48 @@ entry: ret float %cond } +define float @select_oeq_float_nsz(float %a, float %b, float %c, float %d) { +; FAST-P8-LABEL: select_oeq_float_nsz: +; FAST-P8: # %bb.0: # %entry +; FAST-P8-NEXT: xssubsp f0, f2, f1 +; FAST-P8-NEXT: xssubsp f1, f1, f2 +; FAST-P8-NEXT: fsel f1, f1, f3, f4 +; FAST-P8-NEXT: fsel f1, f0, f1, f4 +; FAST-P8-NEXT: blr +; +; FAST-P9-LABEL: select_oeq_float_nsz: +; FAST-P9: # %bb.0: # %entry +; FAST-P9-NEXT: xssubsp f0, f2, f1 +; FAST-P9-NEXT: xssubsp f1, f1, f2 +; FAST-P9-NEXT: fsel f1, f1, f3, f4 +; FAST-P9-NEXT: fsel f1, f0, f1, f4 +; FAST-P9-NEXT: blr +; +; NO-FAST-P8-LABEL: select_oeq_float_nsz: +; NO-FAST-P8: # %bb.0: # %entry +; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P8-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P8-NEXT: # %bb.1: # %entry +; NO-FAST-P8-NEXT: fmr f3, f4 +; NO-FAST-P8-NEXT: .LBB1_2: # %entry +; NO-FAST-P8-NEXT: fmr f1, f3 +; NO-FAST-P8-NEXT: blr +; +; NO-FAST-P9-LABEL: select_oeq_float_nsz: +; NO-FAST-P9: # %bb.0: # %entry +; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P9-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P9-NEXT: # %bb.1: # %entry +; NO-FAST-P9-NEXT: fmr f3, f4 +; NO-FAST-P9-NEXT: .LBB1_2: # %entry +; NO-FAST-P9-NEXT: fmr f1, f3 +; NO-FAST-P9-NEXT: blr +entry: + %cmp = fcmp nsz oeq float %a, %b + %cond = select i1 %cmp, float %c, float %d + ret float %cond +} + define double @select_oeq_double(double %a, double %b, double %c, double %d) { ; FAST-P8-LABEL: select_oeq_double: ; FAST-P8: # %bb.0: # %entry @@ -79,20 +121,20 @@ define double @select_oeq_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8-LABEL: select_oeq_double: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P8-NEXT: beq cr0, .LBB2_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB1_2: # %entry +; NO-FAST-P8-NEXT: .LBB2_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_oeq_double: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P9-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P9-NEXT: beq cr0, .LBB2_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB1_2: # %entry +; NO-FAST-P9-NEXT: .LBB2_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -182,13 +224,57 @@ entry: define float @select_one_float(float %a, float %b, float %c, float %d) { ; FAST-P8-LABEL: select_one_float: ; FAST-P8: # %bb.0: # %entry +; FAST-P8-NEXT: xssubsp f0, f1, f2 +; FAST-P8-NEXT: xsnegdp f1, f0 +; FAST-P8-NEXT: fsel f0, f0, f4, f3 +; FAST-P8-NEXT: fsel f1, f1, f0, f3 +; FAST-P8-NEXT: blr +; +; FAST-P9-LABEL: select_one_float: +; FAST-P9: # %bb.0: # %entry +; FAST-P9-NEXT: xssubsp f0, f1, f2 +; FAST-P9-NEXT: xsnegdp f1, f0 +; FAST-P9-NEXT: fsel f0, f0, f4, f3 +; FAST-P9-NEXT: fsel f1, f1, f0, f3 +; FAST-P9-NEXT: blr +; +; NO-FAST-P8-LABEL: select_one_float: +; NO-FAST-P8: # %bb.0: # %entry +; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P8-NEXT: # %bb.1: # %entry +; NO-FAST-P8-NEXT: fmr f3, f4 +; NO-FAST-P8-NEXT: .LBB5_2: # %entry +; NO-FAST-P8-NEXT: fmr f1, f3 +; NO-FAST-P8-NEXT: blr +; +; NO-FAST-P9-LABEL: select_one_float: +; NO-FAST-P9: # %bb.0: # %entry +; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P9-NEXT: # %bb.1: # %entry +; NO-FAST-P9-NEXT: fmr f3, f4 +; NO-FAST-P9-NEXT: .LBB5_2: # %entry +; NO-FAST-P9-NEXT: fmr f1, f3 +; NO-FAST-P9-NEXT: blr +entry: + %cmp = fcmp one float %a, %b + %cond = select i1 %cmp, float %c, float %d + ret float %cond +} + +define float @select_one_float_nsz(float %a, float %b, float %c, float %d) { +; FAST-P8-LABEL: select_one_float_nsz: +; FAST-P8: # %bb.0: # %entry ; FAST-P8-NEXT: xssubsp f0, f2, f1 ; FAST-P8-NEXT: xssubsp f1, f1, f2 ; FAST-P8-NEXT: fsel f1, f1, f4, f3 ; FAST-P8-NEXT: fsel f1, f0, f1, f3 ; FAST-P8-NEXT: blr ; -; FAST-P9-LABEL: select_one_float: +; FAST-P9-LABEL: select_one_float_nsz: ; FAST-P9: # %bb.0: # %entry ; FAST-P9-NEXT: xssubsp f0, f2, f1 ; FAST-P9-NEXT: xssubsp f1, f1, f2 @@ -196,29 +282,29 @@ define float @select_one_float(float %a, float %b, float %c, float %d) { ; FAST-P9-NEXT: fsel f1, f0, f1, f3 ; FAST-P9-NEXT: blr ; -; NO-FAST-P8-LABEL: select_one_float: +; NO-FAST-P8-LABEL: select_one_float_nsz: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB4_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB6_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB4_2: # %entry +; NO-FAST-P8-NEXT: .LBB6_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; -; NO-FAST-P9-LABEL: select_one_float: +; NO-FAST-P9-LABEL: select_one_float_nsz: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB4_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB6_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB4_2: # %entry +; NO-FAST-P9-NEXT: .LBB6_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: - %cmp = fcmp one float %a, %b + %cmp = fcmp nsz one float %a, %b %cond = select i1 %cmp, float %c, float %d ret float %cond } @@ -244,10 +330,10 @@ define double @select_one_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB7_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB5_2: # %entry +; NO-FAST-P8-NEXT: .LBB7_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -255,10 +341,10 @@ define double @select_one_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB7_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB5_2: # %entry +; NO-FAST-P9-NEXT: .LBB7_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -362,10 +448,10 @@ define float @select_oge_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB8_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB10_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB8_2: # %entry +; NO-FAST-P8-NEXT: .LBB10_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -373,10 +459,10 @@ define float @select_oge_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB8_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB10_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB8_2: # %entry +; NO-FAST-P9-NEXT: .LBB10_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -402,10 +488,10 @@ define double @select_oge_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB9_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB11_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB9_2: # %entry +; NO-FAST-P8-NEXT: .LBB11_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -413,10 +499,10 @@ define double @select_oge_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB9_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB11_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB9_2: # %entry +; NO-FAST-P9-NEXT: .LBB11_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -503,20 +589,20 @@ define float @select_olt_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8-LABEL: select_olt_float: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: blt cr0, .LBB12_2 +; NO-FAST-P8-NEXT: blt cr0, .LBB14_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB12_2: # %entry +; NO-FAST-P8-NEXT: .LBB14_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_olt_float: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P9-NEXT: blt cr0, .LBB12_2 +; NO-FAST-P9-NEXT: blt cr0, .LBB14_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB12_2: # %entry +; NO-FAST-P9-NEXT: .LBB14_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -541,20 +627,20 @@ define double @select_olt_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8-LABEL: select_olt_double: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: blt cr0, .LBB13_2 +; NO-FAST-P8-NEXT: blt cr0, .LBB15_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB13_2: # %entry +; NO-FAST-P8-NEXT: .LBB15_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_olt_double: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P9-NEXT: blt cr0, .LBB13_2 +; NO-FAST-P9-NEXT: blt cr0, .LBB15_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB13_2: # %entry +; NO-FAST-P9-NEXT: .LBB15_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -641,20 +727,20 @@ define float @select_ogt_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8-LABEL: select_ogt_float: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: bgt cr0, .LBB16_2 +; NO-FAST-P8-NEXT: bgt cr0, .LBB18_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB16_2: # %entry +; NO-FAST-P8-NEXT: .LBB18_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_ogt_float: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P9-NEXT: bgt cr0, .LBB16_2 +; NO-FAST-P9-NEXT: bgt cr0, .LBB18_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB16_2: # %entry +; NO-FAST-P9-NEXT: .LBB18_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -679,20 +765,20 @@ define double @select_ogt_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8-LABEL: select_ogt_double: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: bgt cr0, .LBB17_2 +; NO-FAST-P8-NEXT: bgt cr0, .LBB19_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB17_2: # %entry +; NO-FAST-P8-NEXT: .LBB19_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_ogt_double: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P9-NEXT: bgt cr0, .LBB17_2 +; NO-FAST-P9-NEXT: bgt cr0, .LBB19_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB17_2: # %entry +; NO-FAST-P9-NEXT: .LBB19_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -780,10 +866,10 @@ define float @select_ole_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB20_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB22_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB20_2: # %entry +; NO-FAST-P8-NEXT: .LBB22_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -791,10 +877,10 @@ define float @select_ole_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB20_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB22_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB20_2: # %entry +; NO-FAST-P9-NEXT: .LBB22_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -820,10 +906,10 @@ define double @select_ole_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB21_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB23_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB21_2: # %entry +; NO-FAST-P8-NEXT: .LBB23_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -831,10 +917,10 @@ define double @select_ole_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB21_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB23_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB21_2: # %entry +; NO-FAST-P9-NEXT: .LBB23_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -926,13 +1012,13 @@ define double @onecmp1(double %a, double %y, double %z) { ; NO-FAST-P8-NEXT: vspltisw v2, 1 ; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f0 -; NO-FAST-P8-NEXT: bc 12, lt, .LBB24_3 +; NO-FAST-P8-NEXT: bc 12, lt, .LBB26_3 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f1 -; NO-FAST-P8-NEXT: bc 12, un, .LBB24_3 +; NO-FAST-P8-NEXT: bc 12, un, .LBB26_3 ; NO-FAST-P8-NEXT: # %bb.2: # %entry ; NO-FAST-P8-NEXT: fmr f3, f2 -; NO-FAST-P8-NEXT: .LBB24_3: # %entry +; NO-FAST-P8-NEXT: .LBB26_3: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -941,13 +1027,13 @@ define double @onecmp1(double %a, double %y, double %z) { ; NO-FAST-P9-NEXT: vspltisw v2, 1 ; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f0 -; NO-FAST-P9-NEXT: bc 12, lt, .LBB24_3 +; NO-FAST-P9-NEXT: bc 12, lt, .LBB26_3 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f1 -; NO-FAST-P9-NEXT: bc 12, un, .LBB24_3 +; NO-FAST-P9-NEXT: bc 12, un, .LBB26_3 ; NO-FAST-P9-NEXT: # %bb.2: # %entry ; NO-FAST-P9-NEXT: fmr f3, f2 -; NO-FAST-P9-NEXT: .LBB24_3: # %entry +; NO-FAST-P9-NEXT: .LBB26_3: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -978,10 +1064,10 @@ define double @onecmp2(double %a, double %y, double %z) { ; NO-FAST-P8-NEXT: vspltisw v2, 1 ; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f0 -; NO-FAST-P8-NEXT: bgt cr0, .LBB25_2 +; NO-FAST-P8-NEXT: bgt cr0, .LBB27_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f2, f3 -; NO-FAST-P8-NEXT: .LBB25_2: # %entry +; NO-FAST-P8-NEXT: .LBB27_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f2 ; NO-FAST-P8-NEXT: blr ; @@ -990,10 +1076,10 @@ define double @onecmp2(double %a, double %y, double %z) { ; NO-FAST-P9-NEXT: vspltisw v2, 1 ; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f0 -; NO-FAST-P9-NEXT: bgt cr0, .LBB25_2 +; NO-FAST-P9-NEXT: bgt cr0, .LBB27_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f2, f3 -; NO-FAST-P9-NEXT: .LBB25_2: # %entry +; NO-FAST-P9-NEXT: .LBB27_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f2 ; NO-FAST-P9-NEXT: blr entry: @@ -1028,10 +1114,10 @@ define double @onecmp3(double %a, double %y, double %z) { ; NO-FAST-P8-NEXT: vspltisw v2, 1 ; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f0 -; NO-FAST-P8-NEXT: beq cr0, .LBB26_2 +; NO-FAST-P8-NEXT: beq cr0, .LBB28_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f2, f3 -; NO-FAST-P8-NEXT: .LBB26_2: # %entry +; NO-FAST-P8-NEXT: .LBB28_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f2 ; NO-FAST-P8-NEXT: blr ; @@ -1040,10 +1126,10 @@ define double @onecmp3(double %a, double %y, double %z) { ; NO-FAST-P9-NEXT: vspltisw v2, 1 ; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f0 -; NO-FAST-P9-NEXT: beq cr0, .LBB26_2 +; NO-FAST-P9-NEXT: beq cr0, .LBB28_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f2, f3 -; NO-FAST-P9-NEXT: .LBB26_2: # %entry +; NO-FAST-P9-NEXT: .LBB28_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f2 ; NO-FAST-P9-NEXT: blr entry: diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll index ff9f995..51a8bf5 100644 --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -235,10 +235,10 @@ define void @PR52039(ptr %pa, ptr %pb) { ; SSE-NEXT: psubd %xmm1, %xmm3 ; SSE-NEXT: psubd %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm1 ; SSE-NEXT: paddd %xmm3, %xmm1 ; SSE-NEXT: movdqu %xmm3, 16(%rsi) ; SSE-NEXT: movdqu %xmm2, (%rsi) diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll index 8e4a50e..ae4d24f 100644 --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -81,7 +81,7 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) { ; SSE-LABEL: combine_vec_mul_pow2c: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: paddq %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm2, %xmm2 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psllq $4, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 98187d6..6bcbfe1 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2187,13 +2187,13 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128] ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: paddw %xmm4, %xmm4 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7] +; SSE41-NEXT: pmovsxbw %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: paddw %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2],xmm0[3,4,5],xmm2[6],xmm0[7] ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: paddb %xmm1, %xmm2 @@ -2201,15 +2201,14 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: psraw $8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: paddw %xmm0, %xmm3 -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] -; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: psllw $7, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7] +; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE41-NEXT: psraw $8, %xmm2 ; SSE41-NEXT: psllw $7, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] @@ -2225,18 +2224,17 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128] ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5],xmm3[6],xmm2[7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 -; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index 3aa77c3..7bd22d5 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -1,40 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=AVXVNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=AVX512,AVX512VNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: no_dpbusd: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq -; -; AVX512-LABEL: no_dpbusd: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; CHECK-LABEL: no_dpbusd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; CHECK-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = zext <16 x i8> %0 to <16 x i32> @@ -99,25 +84,44 @@ entry: } define i32 @mul_zext(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: mul_zext: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1 -; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_zext: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_zext: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_zext: ; AVX512: # %bb.0: # %entry @@ -153,25 +157,44 @@ entry: } define i32 @mul_sext(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: mul_sext: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1 -; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpmovsxwd %xmm1, %ymm1 -; AVXVNNI-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_sext: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_sext: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_sext: ; AVX512: # %bb.0: # %entry @@ -312,17 +335,30 @@ entry: declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: vpdpbusd_128: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: vpdpbusd_128: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXVNNI-AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 +; AVXVNNI-AVX-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: vpdpbusd_128: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 +; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: vpdpbusd_128: ; AVX512VNNI: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index 456e6e8..bb47df5 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -1,20 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=ALL,AVXVNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VLVNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI define i32 @mul_4xi8_zc_exceed(<4 x i8> %a, i32 %c) { -; ALL-LABEL: mul_4xi8_zc_exceed: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: addl %edi, %eax -; ALL-NEXT: retq +; CHECK-LABEL: mul_4xi8_zc_exceed: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq entry: %0 = zext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> %0, <i32 0, i32 1, i32 2, i32 128> @@ -24,14 +25,24 @@ entry: } define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi8_zc: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi8_zc: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi8_zc: ; AVX512VNNI: # %bb.0: # %entry @@ -62,16 +73,26 @@ entry: } define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi4_cz: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi4_cz: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVXVNNI-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi4_cz: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovdb %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi4_cz: ; AVX512VNNI: # %bb.0: # %entry @@ -104,15 +125,26 @@ entry: } define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_cs: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi8_cs: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi8_cs: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX512-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm0, %xmm1, %xmm2 +; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi8_cs: ; AVX512VNNI: # %bb.0: # %entry @@ -145,17 +177,17 @@ entry: } define i32 @mul_4xi8_cs_exceed(<4 x i8> %a, i32 %c) { -; ALL-LABEL: mul_4xi8_cs_exceed: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vpmovsxbd %xmm0, %xmm0 -; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: addl %edi, %eax -; ALL-NEXT: retq +; CHECK-LABEL: mul_4xi8_cs_exceed: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq entry: %0 = sext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 256>, %0 @@ -265,24 +297,44 @@ entry: } define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_64xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 -; AVXVNNI-NEXT: vpaddd %ymm4, %ymm3, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_64xi8_zc: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] +; AVXVNNI-AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVXVNNI-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 +; AVXVNNI-AVX-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_64xi8_zc: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] +; AVXVNNI-AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVXVNNI-AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 +; AVXVNNI-AVX512-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_64xi8_zc: ; AVX512: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll index 473fecc..57d557d 100644 --- a/llvm/test/CodeGen/X86/known-signbits-shl.ll +++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll @@ -137,7 +137,7 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind { ; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-NEXT: por %xmm2, %xmm1 ; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: paddw %xmm0, %xmm2 +; X64-NEXT: paddw %xmm2, %xmm2 ; X64-NEXT: movdqa %xmm2, %xmm3 ; X64-NEXT: psraw $1, %xmm3 ; X64-NEXT: pcmpeqw %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 4e6f666..4cde581 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -4806,9 +4806,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq @@ -4830,9 +4829,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: retq @@ -4842,10 +4840,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: retq @@ -4875,9 +4872,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq @@ -4899,9 +4895,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: retq @@ -4911,10 +4906,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: retq @@ -4944,9 +4938,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0 +; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} @@ -4972,9 +4965,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0 +; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2 ; X64-SKX-SMALL-NEXT: kmovw %k1, %k2 ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} @@ -4986,10 +4978,9 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm2 ; X64-SKX-LARGE-NEXT: kmovw %k1, %k2 ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} diff --git a/llvm/test/CodeGen/X86/negative-sin.ll b/llvm/test/CodeGen/X86/negative-sin.ll index f24507d..4836da2 100644 --- a/llvm/test/CodeGen/X86/negative-sin.ll +++ b/llvm/test/CodeGen/X86/negative-sin.ll @@ -82,18 +82,13 @@ define double @semi_strict2(double %e) nounwind { ret double %h } -; FIXME: -; Auto-upgrade function attribute to IR-level fast-math-flags. - -define double @fn_attr(double %e) nounwind #0 { -; CHECK-LABEL: fn_attr: +define double @nsz_flag(double %e) nounwind { +; CHECK-LABEL: nsz_flag: ; CHECK: # %bb.0: ; CHECK-NEXT: jmp sin@PLT # TAILCALL - %f = fsub double 0.0, %e - %g = call double @sin(double %f) readonly - %h = fsub double 0.0, %g + %f = fsub nsz double 0.0, %e + %g = call nsz double @sin(double %f) readonly + %h = fsub nsz double 0.0, %g ret double %h } -attributes #0 = { "unsafe-fp-math"="true" "no-signed-zeros-fp-math"="true" } - diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index f539830..5df1867 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -155,10 +155,10 @@ define <16 x i32> @PR42819(ptr %a0) { define void @PR42833() { ; SSE2-LABEL: PR42833: ; SSE2: # %bb.0: +; SSE2-NEXT: movl b(%rip), %eax ; SSE2-NEXT: movdqa c+144(%rip), %xmm2 ; SSE2-NEXT: movdqa c+128(%rip), %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: addl b(%rip), %eax +; SSE2-NEXT: addl c+128(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 @@ -166,7 +166,7 @@ define void @PR42833() { ; SSE2-NEXT: psubd %xmm2, %xmm4 ; SSE2-NEXT: paddd %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm5 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] ; SSE2-NEXT: movdqa %xmm2, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) @@ -191,17 +191,17 @@ define void @PR42833() { ; ; SSE42-LABEL: PR42833: ; SSE42: # %bb.0: +; SSE42-NEXT: movl b(%rip), %eax ; SSE42-NEXT: movdqa c+144(%rip), %xmm1 ; SSE42-NEXT: movdqa c+128(%rip), %xmm0 -; SSE42-NEXT: movd %xmm0, %eax -; SSE42-NEXT: addl b(%rip), %eax +; SSE42-NEXT: addl c+128(%rip), %eax ; SSE42-NEXT: movd %eax, %xmm2 ; SSE42-NEXT: paddd %xmm0, %xmm2 ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 ; SSE42-NEXT: psubd %xmm1, %xmm3 ; SSE42-NEXT: paddd %xmm1, %xmm1 ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: paddd %xmm0, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm4 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, c+144(%rip) ; SSE42-NEXT: movdqa %xmm4, c+128(%rip) diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll index ce03f8f..161e965 100644 --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -26,27 +26,33 @@ define i64 @PR62286(i32 %a) { ; AVX1-LABEL: PR62286: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR62286: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -59,12 +65,12 @@ define i64 @PR62286(i32 %a) { ; AVX512-LABEL: PR62286: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd %edi, %xmm0 -; AVX512-NEXT: movb $8, %al +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm1 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: movw $4369, %ax # imm = 0x1111 ; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll index ceccee0..5895526 100644 --- a/llvm/test/CodeGen/X86/pr74736.ll +++ b/llvm/test/CodeGen/X86/pr74736.ll @@ -6,8 +6,8 @@ define void @main(<16 x i32> %0, i32 %1) { ; SSE-LABEL: main: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = [0,1,0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] ; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: paddd %xmm1, %xmm1 ; SSE-NEXT: paddd %xmm3, %xmm3 @@ -32,20 +32,20 @@ define void @main(<16 x i32> %0, i32 %1) { ; AVX-LABEL: main: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3] ; AVX-NEXT: movl $1, %eax ; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; AVX-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm1 -; AVX-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7] -; AVX-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vpaddd %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm3 ; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] ; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vpxor %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,1,3,3,5,5,7] +; AVX-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpxor %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll index 756019d..03b61d9 100644 --- a/llvm/test/CodeGen/X86/shift-i512.ll +++ b/llvm/test/CodeGen/X86/shift-i512.ll @@ -10,7 +10,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm3 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm3 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpsrlq $63, %xmm4, %xmm4 ; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm2 @@ -34,7 +34,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) { ; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX512VBMI-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 -; AVX512VBMI-NEXT: vpsllq $1, %xmm0, %xmm4 +; AVX512VBMI-NEXT: vpaddq %xmm0, %xmm0, %xmm4 ; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -51,7 +51,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) { ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2 ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm4 +; ZNVER4-NEXT: vpaddq %xmm0, %xmm0, %xmm4 ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 ; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2 diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index 3f48b22..a48be03 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -5791,20 +5791,20 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi16: ; SSE: # %bb.0: -; SSE-NEXT: psllw $1, %xmm0 # encoding: [0x66,0x0f,0x71,0xf0,0x01] +; SSE-NEXT: psllw $2, %xmm0 # encoding: [0x66,0x0f,0x71,0xf0,0x02] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x01] +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x02] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x01] +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x02] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1) + %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 2) %bc = bitcast <8 x i16> %res to <2 x i64> ret <2 x i64> %bc } @@ -5813,20 +5813,20 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi32: ; SSE: # %bb.0: -; SSE-NEXT: pslld $1, %xmm0 # encoding: [0x66,0x0f,0x72,0xf0,0x01] +; SSE-NEXT: pslld $2, %xmm0 # encoding: [0x66,0x0f,0x72,0xf0,0x02] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xf0,0x01] +; AVX1-NEXT: vpslld $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xf0,0x02] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x01] +; AVX512-NEXT: vpslld $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x02] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <4 x i32> - %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1) + %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 2) %bc = bitcast <4 x i32> %res to <2 x i64> ret <2 x i64> %bc } @@ -5835,19 +5835,19 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi64: ; SSE: # %bb.0: -; SSE-NEXT: psllq $1, %xmm0 # encoding: [0x66,0x0f,0x73,0xf0,0x01] +; SSE-NEXT: psllq $2, %xmm0 # encoding: [0x66,0x0f,0x73,0xf0,0x02] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x73,0xf0,0x01] +; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x73,0xf0,0x02] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x01] +; AVX512-NEXT: vpsllq $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x02] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1) + %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 2) ret <2 x i64> %res } declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll index 71e659c..219e32c 100644 --- a/llvm/test/CodeGen/X86/vec_shift6.ll +++ b/llvm/test/CodeGen/X86/vec_shift6.ll @@ -28,14 +28,14 @@ define <8 x i16> @test2(<8 x i16> %a) { ; SSE2-LABEL: test2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test2: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; @@ -56,7 +56,7 @@ define <4 x i32> @test3(<4 x i32> %a) { ; SSE2-LABEL: test3: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm1 ; SSE2-NEXT: pslld $2, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq @@ -81,14 +81,14 @@ define <4 x i32> @test4(<4 x i32> %a) { ; SSE2-LABEL: test4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test4: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: paddd %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll index 5c48559..b4cffcd 100644 --- a/llvm/test/CodeGen/X86/vector-gep.ll +++ b/llvm/test/CodeGen/X86/vector-gep.ll @@ -122,91 +122,87 @@ define <64 x ptr> @AGEP9(ptr %param, <64 x i32> %off) nounwind { ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-32, %esp ; CHECK-NEXT: subl $160, %esp -; CHECK-NEXT: vmovdqa %ymm2, %ymm5 -; CHECK-NEXT: vmovdqa %ymm1, %ymm3 -; CHECK-NEXT: vmovdqa %ymm0, %ymm1 -; CHECK-NEXT: vmovdqa 72(%ebp), %ymm0 -; CHECK-NEXT: vmovdqa 40(%ebp), %ymm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm4 -; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm7 -; CHECK-NEXT: vpaddd %xmm4, %xmm7, %xmm4 -; CHECK-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 +; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm5 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 104(%ebp), %ymm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 136(%ebp), %ymm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 168(%ebp), %ymm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, (%esp) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovdqa 40(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm2 -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm7, %xmm1 -; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm7, %xmm6 -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm7, %xmm3 -; CHECK-NEXT: vmovdqa %ymm5, %ymm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm5 -; CHECK-NEXT: vpaddd %xmm5, %xmm7, %xmm5 -; CHECK-NEXT: vextractf128 $1, %ymm4, %xmm4 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 56(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 72(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill +; CHECK-NEXT: vmovdqa 88(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm2 +; CHECK-NEXT: vmovdqa 104(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vmovdqa 120(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa 136(%ebp), %xmm6 +; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 +; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6 +; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7 +; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 +; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7 +; CHECK-NEXT: vmovdqa 168(%ebp), %xmm4 ; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm7, %xmm4 +; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 ; CHECK-NEXT: movl 8(%ebp), %eax -; CHECK-NEXT: vmovdqa %xmm4, 80(%eax) -; CHECK-NEXT: vmovdqa %xmm5, 64(%eax) -; CHECK-NEXT: vmovdqa %xmm3, 48(%eax) -; CHECK-NEXT: vmovdqa %xmm6, 32(%eax) -; CHECK-NEXT: vmovdqa %xmm1, 16(%eax) -; CHECK-NEXT: vmovdqa %xmm0, (%eax) -; CHECK-NEXT: vmovdqa %xmm2, 240(%eax) +; CHECK-NEXT: vmovdqa %xmm3, 240(%eax) +; CHECK-NEXT: vmovdqa %xmm4, 224(%eax) +; CHECK-NEXT: vmovdqa %xmm7, 208(%eax) +; CHECK-NEXT: vmovdqa %xmm6, 192(%eax) +; CHECK-NEXT: vmovdqa %xmm0, 176(%eax) +; CHECK-NEXT: vmovdqa %xmm1, 160(%eax) +; CHECK-NEXT: vmovdqa %xmm2, 144(%eax) ; CHECK-NEXT: vmovaps (%esp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 224(%eax) +; CHECK-NEXT: vmovaps %xmm0, 128(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 208(%eax) +; CHECK-NEXT: vmovaps %xmm0, 112(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 192(%eax) +; CHECK-NEXT: vmovaps %xmm0, 96(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 176(%eax) +; CHECK-NEXT: vmovaps %xmm0, 80(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 160(%eax) +; CHECK-NEXT: vmovaps %xmm0, 64(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 144(%eax) +; CHECK-NEXT: vmovaps %xmm0, 48(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 128(%eax) +; CHECK-NEXT: vmovaps %xmm0, 32(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 112(%eax) +; CHECK-NEXT: vmovaps %xmm0, 16(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 96(%eax) +; CHECK-NEXT: vmovaps %xmm0, (%eax) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 13f7d68..33d80f6 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -652,7 +652,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: paddb %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psllw $1, %xmm2 +; SSE2-NEXT: paddw %xmm2, %xmm2 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -678,7 +678,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllw $1, %xmm2 +; SSE41-NEXT: paddw %xmm2, %xmm2 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $2, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -701,7 +701,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -720,7 +720,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -739,7 +739,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX512BW-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll index 1a5c373..e43108f 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -590,7 +590,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsllw $1, %xmm3, %xmm5 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 @@ -609,7 +609,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 @@ -633,7 +633,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX2NOBW-NEXT: vpsllw $1, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpaddw %ymm1, %ymm1, %ymm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -651,7 +651,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsllw $1, %ymm1, %ymm2 +; AVX512BW-NEXT: vpaddw %ymm1, %ymm1, %ymm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll index 9c56894..bf98bcc 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -485,7 +485,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vpsllw $1, %ymm3, %ymm5 +; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm5 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 @@ -504,7 +504,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw $1, %ymm2, %ymm3 +; AVX512F-NEXT: vpaddw %ymm2, %ymm2, %ymm3 ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2 @@ -528,7 +528,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpsllw $1, %zmm1, %zmm2 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm2 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 13b21a7..6e1bf25 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -821,10 +821,10 @@ define <16 x i16> @madd_v16i16_3(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp ; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: paddw %xmm1, %xmm3 +; X86-SSE-NEXT: paddw %xmm3, %xmm3 ; X86-SSE-NEXT: paddw %xmm3, %xmm1 ; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: paddw %xmm0, %xmm3 +; X86-SSE-NEXT: paddw %xmm3, %xmm3 ; X86-SSE-NEXT: paddw %xmm2, %xmm0 ; X86-SSE-NEXT: paddw %xmm3, %xmm0 ; X86-SSE-NEXT: paddw 8(%ebp), %xmm1 @@ -835,9 +835,9 @@ define <16 x i16> @madd_v16i16_3(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; X64-SSE-LABEL: madd_v16i16_3: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE-NEXT: paddw %xmm1, %xmm4 +; X64-SSE-NEXT: paddw %xmm4, %xmm4 ; X64-SSE-NEXT: movdqa %xmm0, %xmm5 -; X64-SSE-NEXT: paddw %xmm0, %xmm5 +; X64-SSE-NEXT: paddw %xmm5, %xmm5 ; X64-SSE-NEXT: paddw %xmm2, %xmm0 ; X64-SSE-NEXT: paddw %xmm5, %xmm0 ; X64-SSE-NEXT: paddw %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 227e000..ab1feba 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -907,7 +907,7 @@ define i1 @mask_v8i32_2(<8 x i32> %a0) { ; SSE2-LABEL: mask_v8i32_2: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 2b1cf5b..99dac74 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -927,7 +927,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: paddq %xmm1, %xmm1 ; SSE2-NEXT: psllq $7, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq @@ -975,7 +975,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; X86-SSE-LABEL: constant_shift_v2i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: paddq %xmm0, %xmm1 +; X86-SSE-NEXT: paddq %xmm1, %xmm1 ; X86-SSE-NEXT: psllq $7, %xmm0 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X86-SSE-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 5b61de5..ee9d8a5 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3550,14 +3550,14 @@ define <8 x i16> @PR141475(i32 %in) { ; SSE-LABEL: PR141475: ; SSE: # %bb.0: ; SSE-NEXT: movd %edi, %xmm0 -; SSE-NEXT: pslld $1, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: PR141475: ; AVX: # %bb.0: ; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: retq %mul = shl i32 %in, 1 diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll index 54dc107..3b93734 100644 --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -1438,26 +1438,26 @@ define <8 x i16> @test_128_i16_x_8_65024_mask_ashr_10(<8 x i16> %a0) { define <8 x i16> @test_128_i16_x_8_127_mask_shl_1(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddw %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <8 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> %t1 = shl <8 x i16> %t0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> @@ -1656,26 +1656,26 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_6(<8 x i16> %a0) { define <8 x i16> @test_128_i16_x_8_65024_mask_shl_1(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddw %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <8 x i16> %a0, <i16 65024, i16 65024, i16 65024, i16 65024, i16 65024, i16 65024, i16 65024, i16 65024> %t1 = shl <8 x i16> %t0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> @@ -2373,40 +2373,40 @@ define <4 x i32> @test_128_i32_x_4_4294836224_mask_ashr_18(<4 x i32> %a0) { define <4 x i32> @test_128_i32_x_4_32767_mask_shl_1(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddd %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq %t0 = and <4 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767> %t1 = shl <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1> @@ -2675,40 +2675,40 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_10(<4 x i32> %a0) { define <4 x i32> @test_128_i32_x_4_4294836224_mask_shl_1(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddd %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq %t0 = and <4 x i32> %a0, <i32 4294836224, i32 4294836224, i32 4294836224, i32 4294836224> %t1 = shl <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1> @@ -3325,26 +3325,26 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_34(<2 x i64> % define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_1(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddq %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <2 x i64> %a0, <i64 2147483647, i64 2147483647> %t1 = shl <2 x i64> %t0, <i64 1, i64 1> @@ -3543,26 +3543,26 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_18(<2 x i64> %a0) { define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_shl_1(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddq %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024> %t1 = shl <2 x i64> %t0, <i64 1, i64 1> diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll index e2a9b4e..8a6f60b 100644 --- a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll +++ b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes=drop-unnecessary-assumes < %s | FileCheck %s +declare void @use(i32 %x) +declare i32 @get() + define void @basic_dead(i32 %x) { ; CHECK-LABEL: define void @basic_dead( ; CHECK-SAME: i32 [[X:%.*]]) { @@ -180,3 +183,136 @@ define void @type_test(ptr %x) { call void @llvm.assume(i1 %test) ret void } + +define void @multiple_dead_conds(i32 %x) { +; CHECK-LABEL: define void @multiple_dead_conds( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: ret void +; + %cond1 = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond1) + %cond2 = icmp ne i32 %x, 64 + call void @llvm.assume(i1 %cond2) + ret void +} + +define void @multiple_dead_bundles(ptr %x) { +; CHECK-LABEL: define void @multiple_dead_bundles( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr %x, i64 8), "nonnull"(ptr %x)] + ret void +} + +; The assume is eliminated, but currently leaves behind a dead cycle. +define void @dead_cycle(i1 %loop.cond) { +; CHECK-LABEL: define void @dead_cycle( +; CHECK-SAME: i1 [[LOOP_COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %cond = icmp ne i32 %iv, 64 + call void @llvm.assume(i1 %cond) + %iv.next = add i32 %iv, 1 + br i1 %loop.cond, label %loop, label %exit + +exit: + ret void +} + +define void @use_in_side_effect(i32 %x) { +; CHECK-LABEL: define void @use_in_side_effect( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: call void @use(i32 [[X]]) +; CHECK-NEXT: ret void +; + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + call void @use(i32 %x) + ret void +} + +define void @indirect_use_in_side_effect(i32 %x) { +; CHECK-LABEL: define void @indirect_use_in_side_effect( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 +; CHECK-NEXT: call void @use(i32 [[ADD]]) +; CHECK-NEXT: ret void +; + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + %add = add i32 %x, 1 + call void @use(i32 %add) + ret void +} + +; The affected value itself has a side effect, but we can still drop the +; assume. +define void @affected_value_has_side_effect() { +; CHECK-LABEL: define void @affected_value_has_side_effect() { +; CHECK-NEXT: [[X:%.*]] = call i32 @get() +; CHECK-NEXT: ret void +; + %x = call i32 @get() + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + ret void +} + +define i32 @affected_value_has_side_effect_and_is_used() { +; CHECK-LABEL: define i32 @affected_value_has_side_effect_and_is_used() { +; CHECK-NEXT: [[X:%.*]] = call i32 @get() +; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @get() + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + ret i32 %x +} + +@g = external global i8 +@g2 = external global i8 + +; Assumes on globals are currently not supported. +define void @assume_on_global() { +; CHECK-LABEL: define void @assume_on_global() { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr @g, i64 8) ] +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr @g, i64 8)] + ret void +} + +define void @assume_on_global_used_in_other_func() { +; CHECK-LABEL: define void @assume_on_global_used_in_other_func() { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr @g2, i64 8) ] +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr @g2, i64 8)] + ret void +} + +define ptr @other_func() { +; CHECK-LABEL: define ptr @other_func() { +; CHECK-NEXT: ret ptr @g2 +; + ret ptr @g2 +} diff --git a/llvm/test/Verifier/preallocated-invalid.ll b/llvm/test/Verifier/preallocated-invalid.ll index 38ed106..2c5aff2 100644 --- a/llvm/test/Verifier/preallocated-invalid.ll +++ b/llvm/test/Verifier/preallocated-invalid.ll @@ -65,13 +65,21 @@ define void @preallocated_one_call() { ret void } -; CHECK: must be a constant +; CHECK: immarg operand has non-immediate parameter define void @preallocated_setup_constant() { %ac = call i32 @blackbox() %cs = call token @llvm.call.preallocated.setup(i32 %ac) ret void } +; CHECK: llvm.call.preallocated.alloc arg index must be a constant +define void @preallocated_arg_constant() { + %ac = call i32 @blackbox() + %cs = call token @llvm.call.preallocated.setup(i32 3) + call token @llvm.call.preallocated.arg(token %cs, i32 %ac) + ret void +} + ; CHECK: must be between 0 and corresponding define void @preallocated_setup_arg_index_in_bounds() { %cs = call token @llvm.call.preallocated.setup(i32 2) diff --git a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp index ff86c8d..4c07c05 100644 --- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp +++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp @@ -118,6 +118,7 @@ json::Value renderCondState(const coverage::MCDCRecord::CondState CondState) { case coverage::MCDCRecord::MCDC_False: return json::Value(false); } + llvm_unreachable("Unknown llvm::coverage::MCDCRecord::CondState enum"); } json::Array gatherTestVectors(coverage::MCDCRecord &Record) { |