diff options
10 files changed, 93 insertions, 42 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 1795470..19797e6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -142,6 +142,10 @@ public: return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); } + VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); + } + //===--------------------------------------------------------------------===// // RAII helpers. //===--------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 47635db..d36e078 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8195,8 +8195,15 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (BI->getSuccessor(0) != Dst) EdgeMask = Builder.createNot(EdgeMask); - if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. - EdgeMask = Builder.createAnd(EdgeMask, SrcMask); + if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. + // The condition is 'SrcMask && EdgeMask', which is equivalent to + // 'select i1 SrcMask, i1 EdgeMask, i1 false'. + // The select version does not introduce new UB if SrcMask is false and + // EdgeMask is poison. Using 'and' here introduces undefined behavior. + VPValue *False = Plan->getOrAddVPValue( + ConstantInt::getFalse(BI->getCondition()->getType())); + EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + } return EdgeMaskCache[Edge] = EdgeMask; } diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index dddedcb..b464389 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -2042,10 +2042,10 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true> -; AVX1-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX1-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX1-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX1-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] +; AVX1-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 ; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) @@ -2166,10 +2166,10 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true> -; AVX2-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX2-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX2-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX2-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] +; AVX2-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 ; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) @@ -2290,10 +2290,10 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea ; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> ; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> ; AVX512-NEXT: [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> -; AVX512-NEXT: [[TMP52:%.*]] = and <8 x i1> [[TMP48]], [[TMP28]] -; AVX512-NEXT: [[TMP53:%.*]] = and <8 x i1> [[TMP49]], [[TMP29]] -; AVX512-NEXT: [[TMP54:%.*]] = and <8 x i1> [[TMP50]], [[TMP30]] -; AVX512-NEXT: [[TMP55:%.*]] = and <8 x i1> [[TMP51]], [[TMP31]] +; AVX512-NEXT: [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 ; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]]) @@ -2459,10 +2459,10 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true> -; AVX1-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX1-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX1-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX1-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] +; AVX1-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 ; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) @@ -2583,10 +2583,10 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true> -; AVX2-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX2-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX2-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX2-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] +; AVX2-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 ; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) @@ -2707,10 +2707,10 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea ; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> ; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> ; AVX512-NEXT: [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> -; AVX512-NEXT: [[TMP52:%.*]] = and <8 x i1> [[TMP48]], [[TMP28]] -; AVX512-NEXT: [[TMP53:%.*]] = and <8 x i1> [[TMP49]], [[TMP29]] -; AVX512-NEXT: [[TMP54:%.*]] = and <8 x i1> [[TMP50]], [[TMP30]] -; AVX512-NEXT: [[TMP55:%.*]] = and <8 x i1> [[TMP51]], [[TMP31]] +; AVX512-NEXT: [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 ; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index 285f460..aa8b136 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -408,7 +408,7 @@ define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if: @@ -520,7 +520,7 @@ define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false> @@ -615,7 +615,7 @@ define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if: @@ -727,7 +727,7 @@ define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul nsw i32 [[INDEX]], 3 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <24 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false> @@ -1535,7 +1535,7 @@ define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if: @@ -1871,7 +1871,7 @@ define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll index 0cba3fc..f218869 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -39,7 +39,7 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 19, i32 19, i32 19, i32 19> ; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 4, i32 4, i32 4, i32 4> ; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 5, i32 5, i32 5, i32 5> -; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i1> [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true> ; CHECK-NEXT: [[TMP16:%.*]] = and <4 x i1> [[TMP10]], [[TMP15]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> <i32 9, i32 9, i32 9, i32 9> diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll index 308377f..b8d9b45 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -161,7 +161,7 @@ for.cond.cleanup: ; preds = %if.end ; CHECK: %[[CMP1:.+]] = icmp slt <2 x i32> %[[VAL:.+]], <i32 100, i32 100> ; CHECK: %[[CMP2:.+]] = icmp sge <2 x i32> %[[VAL]], <i32 200, i32 200> ; CHECK: %[[NOT:.+]] = xor <2 x i1> %[[CMP1]], <i1 true, i1 true> -; CHECK: %[[AND:.+]] = and <2 x i1> %[[CMP2]], %[[NOT]] +; CHECK: %[[AND:.+]] = select <2 x i1> %[[NOT]], <2 x i1> %[[CMP2]], <2 x i1> zeroinitializer ; CHECK: %[[OR:.+]] = or <2 x i1> %[[AND]], %[[CMP1]] ; CHECK: %[[EXTRACT:.+]] = extractelement <2 x i1> %[[OR]], i32 0 ; CHECK: br i1 %[[EXTRACT]], label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]] diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll index a973016..bde4fbc 100644 --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -610,9 +610,9 @@ for.end: ; preds = %for.body, %entry ; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], <float 3.000000e+00, ; CHECK-DAG: %[[M2:.*]] = fmul fast <4 x float> %[[V0]], <float 2.000000e+00, ; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true, -; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]] +; CHECK-DAG: %[[C12:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C2]], <4 x i1> zeroinitializer ; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true, -; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]] +; CHECK: %[[C22:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer ; CHECK: %[[S1:.*]] = select <4 x i1> %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]] ; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]] ; CHECK: fadd fast <4 x float> %[[S2]], @@ -678,9 +678,9 @@ for.end: ; preds = %for.inc, %entry ; CHECK-DAG: %[[SUB:.*]] = fsub fast <4 x float> ; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float> ; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true, -; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]] +; CHECK-DAG: %[[C12:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C2]], <4 x i1> zeroinitializer ; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true, -; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]] +; CHECK: %[[C22:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer ; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]] ; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]] define float @fcmp_fadd_fsub(float* nocapture readonly %a, i32 %n) nounwind readonly { diff --git a/llvm/test/Transforms/LoopVectorize/pr48832.ll b/llvm/test/Transforms/LoopVectorize/pr48832.ll new file mode 100644 index 0000000..620da91 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/pr48832.ll @@ -0,0 +1,40 @@ +; RUN: opt -loop-vectorize -force-vector-width=4 -S -o - < %s | FileCheck %s +%arrayt = type [64 x i32] + +@v_146 = external global %arrayt, align 1 + +; Since the program has well defined behavior, it should not introduce store poison +; CHECK: vector.ph: +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK: store <4 x i32> zeroinitializer, +; CHECK: br i1 %{{.*}}, label %middle.block, label %vector.body + +define void @foo() { +entry: + br label %for.cond + +for.cond: ; preds = %cond.end, %entry + %storemerge = phi i16 [ 0, %entry ], [ %inc, %cond.end ] + %cmp = icmp slt i16 %storemerge, 15 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + br i1 true, label %cond.false, label %land.rhs + +land.rhs: ; preds = %for.body + br i1 poison, label %cond.end, label %cond.false + +cond.false: ; preds = %for.body, %land.rhs + br label %cond.end + +cond.end: ; preds = %land.rhs, %cond.false + %cond = phi i32 [ 0, %cond.false ], [ 1, %land.rhs ] + %arrayidx = getelementptr inbounds %arrayt, %arrayt* @v_146, i16 0, i16 %storemerge + store i32 %cond, i32* %arrayidx, align 1 + %inc = add nsw i16 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll index d1b99e4..e8271b9 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -1542,8 +1542,8 @@ define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) { ; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> ; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> -; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true> ; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true> diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index 23bfc39..b295090 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -814,8 +814,8 @@ define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) { ; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> ; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> -; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true> ; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true> |