aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Florian Mayer <fmayer@google.com> 2025-07-07 11:24:17 -0700
committer GitHub <noreply@github.com> 2025-07-07 11:24:17 -0700
commit 0032148ea642dfb2f17b36201e82fee454fa6ebe (patch)
tree 6df31ce7e056bfbd2f7bd2b978e395b6320bfdf1
parent 778f60d92d30d7327dc426e3c1a94d9aae93987e (diff)
downloadllvm-0032148ea642dfb2f17b36201e82fee454fa6ebe.zip
llvm-0032148ea642dfb2f17b36201e82fee454fa6ebe.tar.gz
llvm-0032148ea642dfb2f17b36201e82fee454fa6ebe.tar.bz2
[MSAN] handle permi2var (#146437)
-rw-r--r-- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp 74
-rw-r--r-- llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll 814
-rw-r--r-- llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll 812
-rw-r--r-- llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll 860
-rw-r--r-- llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll 828
5 files changed, 2852 insertions, 536 deletions
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 82bafa3..ec94dca 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -158,6 +158,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/bit.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -4272,6 +4273,25 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, PtrSrcOrigin);
}
+  // Emit a shadow check for the bits of an AVX permutation index vector that
+  // can select a lane.
+  //
+  //   IRB - builder used to emit the masking instruction.
+  //   Idx - the index operand; its type must be a fixed vector whose element
+  //         count is a power of two.
+  //   I   - the instruction the shadow check is attached to.
+  //
+  // With NumElts lanes, only the low log2(NumElts) bits of each index element
+  // are treated as significant, so poison in the remaining high bits is
+  // ignored.
+  //
+  // NOTE(review): vpermi2var appears to also consume the next-higher index
+  // bit to choose between its two source tables; confirm against the ISA
+  // reference whether that caller needs a wider mask.
+  void maskedCheckAVXIndexShadow(IRBuilder<> &IRB, Value *Idx, Instruction *I) {
+    auto IdxVectorSize =
+        cast<FixedVectorType>(Idx->getType())->getNumElements();
+    assert(isPowerOf2_64(IdxVectorSize));
+    Value *IdxShadow = getShadow(Idx);
+    assert(IdxShadow->getType() == Idx->getType());
+    // Splat of (NumElts - 1): the per-lane mask of significant index bits.
+    Constant *IndexBits =
+        ConstantInt::get(IdxShadow->getType(), IdxVectorSize - 1);
+    // Mask and check the whole shadow vector at once so that every index
+    // element is covered, not just a single lane, and so that the high,
+    // unused bits really are dropped before the check.
+    Value *UsedIdxShadow = IRB.CreateAnd(IdxShadow, IndexBits);
+    insertShadowCheck(UsedIdxShadow, getOrigin(Idx), I);
+  }
+
// Instrument AVX permutation intrinsic.
// We apply the same permutation (argument index 1) to the shadow.
void handleAVXVpermilvar(IntrinsicInst &I) {
@@ -4289,6 +4309,39 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+  // Instrument the AVX-512 vpermi2var permutation intrinsics.
+  // The shadows of the two data operands (arguments 0 and 2) are fed through
+  // the very same intrinsic, steered by the original index vector
+  // (argument 1); the index shadow itself is checked separately.
+  void handleAVXVpermi2var(IntrinsicInst &I) {
+    assert(I.arg_size() == 3);
+    Value *A = I.getArgOperand(0);
+    Value *Idx = I.getArgOperand(1);
+    Value *B = I.getArgOperand(2);
+    Type *ATy = A->getType();
+    [[maybe_unused]] Type *IdxTy = Idx->getType();
+    Type *BTy = B->getType();
+
+    // Sanity-check the operand shapes: three fixed vectors with the same
+    // lane count, matching data/result types and an integer index vector.
+    assert(isa<FixedVectorType>(ATy));
+    assert(isa<FixedVectorType>(IdxTy));
+    assert(isa<FixedVectorType>(BTy));
+    [[maybe_unused]] auto NumLanes =
+        cast<FixedVectorType>(ATy)->getNumElements();
+    assert(cast<FixedVectorType>(IdxTy)->getNumElements() == NumLanes);
+    assert(cast<FixedVectorType>(BTy)->getNumElements() == NumLanes);
+    assert(ATy == BTy);
+    assert(I.getType() == ATy);
+    assert(IdxTy->isIntOrIntVectorTy());
+
+    IRBuilder<> IRB(&I);
+    // Fetch the data shadows before the index check; the order of these
+    // calls is kept as-is on purpose.
+    Value *ShadowA = getShadow(&I, 0);
+    Value *ShadowB = getShadow(&I, 2);
+
+    maskedCheckAVXIndexShadow(IRB, Idx, &I);
+
+    // Shadows are integer-typed, but the intrinsic may expect another
+    // element type (e.g. floating point), so cast to the operand types
+    // first and back to the shadow type afterwards.
+    ShadowA = IRB.CreateBitCast(ShadowA, ATy);
+    ShadowB = IRB.CreateBitCast(ShadowB, BTy);
+    CallInst *PermutedShadow = IRB.CreateIntrinsic(
+        I.getType(), I.getIntrinsicID(), {ShadowA, Idx, ShadowB});
+    setShadow(&I, IRB.CreateBitCast(PermutedShadow, getShadowTy(&I)));
+    setOriginForNaryOp(I);
+  }
+
// Instrument BMI / BMI2 intrinsics.
// All of these intrinsics are Z = I(X, Y)
// where the types of all operands and the result match, and are either i32 or
@@ -5244,6 +5297,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
+ case Intrinsic::x86_avx512_vpermi2var_d_128:
+ case Intrinsic::x86_avx512_vpermi2var_d_256:
+ case Intrinsic::x86_avx512_vpermi2var_d_512:
+ case Intrinsic::x86_avx512_vpermi2var_hi_128:
+ case Intrinsic::x86_avx512_vpermi2var_hi_256:
+ case Intrinsic::x86_avx512_vpermi2var_hi_512:
+ case Intrinsic::x86_avx512_vpermi2var_pd_128:
+ case Intrinsic::x86_avx512_vpermi2var_pd_256:
+ case Intrinsic::x86_avx512_vpermi2var_pd_512:
+ case Intrinsic::x86_avx512_vpermi2var_ps_128:
+ case Intrinsic::x86_avx512_vpermi2var_ps_256:
+ case Intrinsic::x86_avx512_vpermi2var_ps_512:
+ case Intrinsic::x86_avx512_vpermi2var_q_128:
+ case Intrinsic::x86_avx512_vpermi2var_q_256:
+ case Intrinsic::x86_avx512_vpermi2var_q_512:
+ case Intrinsic::x86_avx512_vpermi2var_qi_128:
+ case Intrinsic::x86_avx512_vpermi2var_qi_256:
+ case Intrinsic::x86_avx512_vpermi2var_qi_512:
+ handleAVXVpermi2var(I);
+ break;
+
case Intrinsic::x86_avx512fp16_mask_add_sh_round:
case Intrinsic::x86_avx512fp16_mask_sub_sh_round:
case Intrinsic::x86_avx512fp16_mask_mul_sh_round:
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
index 5aeaa12..2c9c6c7 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
@@ -13700,8 +13700,8 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32
; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -13714,9 +13714,62 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32
; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP4]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X4:%.*]])
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i32> [[TMP14]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP42]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP42]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP14]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <16 x i32> [[TMP14]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP45]], 15
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i32> [[TMP14]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 15
+; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP14]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP14]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP14]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP14]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP14]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP14]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP14]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP58:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP14]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP14]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i32 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP14]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i32> [[TMP14]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP38]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP14]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[TMP4]])
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP57]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP59:%.*]], label [[TMP60:%.*]], !prof [[PROF1]]
+; CHECK: 59:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 60:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X4:%.*]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP10]]
;
@@ -13744,9 +13797,62 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16
; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP42]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP42]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP46]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP48]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP50]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP58:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP3]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP3]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP3]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP64:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP3]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP65:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP3]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP66:%.*]] = or i32 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP3]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP67:%.*]] = or i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i32> [[TMP3]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP38]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP3]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP57]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP59:%.*]], label [[TMP60:%.*]], !prof [[PROF1]]
+; CHECK: 59:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 60:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
@@ -13768,25 +13874,46 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i64> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 7
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP8]], i64 1
+; CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP6]], 7
+; CHECK-NEXT: [[TMP29:%.*]] = or i64 [[TMP6]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i64> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i64 [[TMP23]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i64> [[TMP8]], i64 3
+; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i64> [[TMP8]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i64> [[TMP8]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = or i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i64> [[TMP8]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i64> [[TMP8]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP18]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = or i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP36]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP32:%.*]], label [[TMP33:%.*]], !prof [[PROF1]]
+; CHECK: 32:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 33:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x double> [[TMP9]]
;
%res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -13797,32 +13924,53 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0,
;
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP21]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = and i64 [[TMP7]], 7
+; CHECK-NEXT: [[TMP38:%.*]] = or i64 [[TMP7]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i64> [[TMP2]], i64 2
+; CHECK-NEXT: [[TMP27:%.*]] = and i64 [[TMP24]], 7
+; CHECK-NEXT: [[TMP39:%.*]] = or i64 [[TMP24]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i64> [[TMP2]], i64 3
+; CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 7
+; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i64> [[TMP2]], i64 4
+; CHECK-NEXT: [[TMP31:%.*]] = and i64 [[TMP30]], 7
+; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i64> [[TMP2]], i64 5
+; CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 7
+; CHECK-NEXT: [[TMP42:%.*]] = or i64 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i64> [[TMP2]], i64 6
+; CHECK-NEXT: [[TMP35:%.*]] = and i64 [[TMP34]], 7
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i64> [[TMP2]], i64 7
+; CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 7
+; CHECK-NEXT: [[TMP43:%.*]] = or i64 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP43]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP44:%.*]], label [[TMP45:%.*]], !prof [[PROF1]]
+; CHECK: 33:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 34:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP8]], <8 x i64> [[TMP2]]
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP8]]
; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]]
; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]]
@@ -13838,25 +13986,70 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[TMP8]], i64 1
+; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP6]], 15
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP6]], [[TMP38]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i32> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP39]], 15
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP39]], [[TMP42]]
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <16 x i32> [[TMP8]], i64 3
+; CHECK-NEXT: [[TMP44:%.*]] = and i32 [[TMP43]], 15
+; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP43]], [[TMP44]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP8]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP8]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 15
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i32> [[TMP8]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP8]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP8]], i64 8
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP8]], i64 9
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP8]], i64 10
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP58:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP8]], i64 11
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP8]], i64 12
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP60:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP8]], i64 13
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP45:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP8]], i64 14
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP8]], i64 15
+; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP35:%.*]] = or i32 [[TMP34]], [[TMP50]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[TMP35]], 0
+; CHECK-NEXT: br i1 [[_MSCMP28]], label [[TMP56:%.*]], label [[TMP57:%.*]], !prof [[PROF1]]
+; CHECK: 56:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 57:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x float> [[TMP9]]
;
%res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -13867,32 +14060,77 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0,
;
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 15
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP7]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i32 [[TMP7]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP2]], i64 2
+; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP40]], [[TMP43]]
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i32> [[TMP2]], i64 3
+; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP44]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP44]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP2]], i64 4
+; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP46]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[TMP2]], i64 5
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP48]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i32> [[TMP2]], i64 6
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP50]], 15
+; CHECK-NEXT: [[TMP60:%.*]] = or i32 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[TMP2]], i64 7
+; CHECK-NEXT: [[TMP53:%.*]] = and i32 [[TMP52]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP52]], [[TMP53]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[TMP2]], i64 8
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i32> [[TMP2]], i64 9
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i64 10
+; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 15
+; CHECK-NEXT: [[TMP64:%.*]] = or i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[TMP2]], i64 11
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP27]], 15
+; CHECK-NEXT: [[TMP65:%.*]] = or i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[TMP2]], i64 12
+; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[TMP29]], 15
+; CHECK-NEXT: [[TMP66:%.*]] = or i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i64 13
+; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[TMP31]], 15
+; CHECK-NEXT: [[TMP67:%.*]] = or i32 [[TMP31]], [[TMP32]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[TMP2]], i64 14
+; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], 15
+; CHECK-NEXT: [[TMP68:%.*]] = or i32 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[TMP2]], i64 15
+; CHECK-NEXT: [[TMP36:%.*]] = and i32 [[TMP35]], 15
+; CHECK-NEXT: [[TMP69:%.*]] = or i32 [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP69]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF1]]
+; CHECK: 57:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 58:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP8]], <16 x i32> [[TMP2]]
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP8]]
; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]]
@@ -13908,12 +14146,41 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <
define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i64> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP20]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[TMP20]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP8]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], 7
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i64> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP23]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP23]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP8]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i64> [[TMP8]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i64> [[TMP8]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], 7
+; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i64> [[TMP8]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i64> [[TMP8]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP18]], 7
+; CHECK-NEXT: [[TMP27:%.*]] = or i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP27]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
+; CHECK: 29:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 30:
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i64> [[TMP4]]
;
@@ -13925,13 +14192,42 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i
;
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = and i64 [[TMP21]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP21]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i64> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP26:%.*]] = and i64 [[TMP25]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i64> [[TMP2]], i64 2
+; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i64> [[TMP2]], i64 3
+; CHECK-NEXT: [[TMP30:%.*]] = and i64 [[TMP29]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i64> [[TMP2]], i64 4
+; CHECK-NEXT: [[TMP14:%.*]] = and i64 [[TMP13]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = or i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP2]], i64 5
+; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP15]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i64> [[TMP2]], i64 6
+; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP17]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = or i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i64> [[TMP2]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i64 [[TMP19]], 7
+; CHECK-NEXT: [[TMP37:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP38:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]]
@@ -13968,9 +14264,62 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16
; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP42]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP42]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP46]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP48]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP50]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP58:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP3]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP3]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP3]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP64:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP3]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP65:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP3]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP66:%.*]] = or i32 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP3]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP67:%.*]] = or i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i32> [[TMP3]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP38]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP3]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X0:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP57]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP59:%.*]], label [[TMP60:%.*]], !prof [[PROF1]]
+; CHECK: 59:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 60:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
@@ -13999,7 +14348,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
; CHECK: 6:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT: unreachable
@@ -14013,26 +14362,47 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> [[EXTRA_PARAM:%.*]], double [[X2S]], i32 0
; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> [[TMP5]], <8 x i32> zeroinitializer
; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> [[EXTRA_PARAM]], <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
-; CHECK: 14:
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP27]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP27]], [[TMP12]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 7
+; CHECK-NEXT: [[TMP42:%.*]] = or i64 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i64> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP30]], 7
+; CHECK-NEXT: [[TMP43:%.*]] = or i64 [[TMP30]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i64> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP35:%.*]] = and i64 [[TMP34]], 7
+; CHECK-NEXT: [[TMP44:%.*]] = or i64 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i64> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 7
+; CHECK-NEXT: [[TMP45:%.*]] = or i64 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i64> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP39:%.*]] = and i64 [[TMP38]], 7
+; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i64> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP41:%.*]] = and i64 [[TMP40]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i64> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP26:%.*]] = and i64 [[TMP25]], 7
+; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to <8 x double>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <8 x double>
+; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP11]], <8 x i64> [[X0:%.*]], <8 x double> [[TMP24]])
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[TMP13]] to <8 x i64>
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP47]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF1]]
+; CHECK: 39:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT: unreachable
-; CHECK: 15:
-; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]])
+; CHECK: 40:
+; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0]], <8 x double> [[X2]])
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP14]], <8 x i64> zeroinitializer
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64>
; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP14]]
; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]]
; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer
@@ -14052,30 +14422,75 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0,
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 15
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP7]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i32 [[TMP7]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP40]], [[TMP43]]
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i32> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP44]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP44]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP46]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP48]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i32> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP50]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP52]], 15
+; CHECK-NEXT: [[TMP60:%.*]] = or i32 [[TMP52]], [[TMP20]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[TMP9]], i64 8
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i32> [[TMP9]], i64 9
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP9]], i64 10
+; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[TMP9]], i64 11
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP27]], 15
+; CHECK-NEXT: [[TMP64:%.*]] = or i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[TMP9]], i64 12
+; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[TMP29]], 15
+; CHECK-NEXT: [[TMP65:%.*]] = or i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP9]], i64 13
+; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[TMP31]], 15
+; CHECK-NEXT: [[TMP66:%.*]] = or i32 [[TMP31]], [[TMP32]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[TMP9]], i64 14
+; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], 15
+; CHECK-NEXT: [[TMP67:%.*]] = or i32 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[TMP9]], i64 15
+; CHECK-NEXT: [[TMP36:%.*]] = and i32 [[TMP35]], 15
+; CHECK-NEXT: [[TMP68:%.*]] = or i32 [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X0:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP19]] to <16 x i32>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP68]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF1]]
+; CHECK: 57:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 58:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0]], <16 x float> [[X2:%.*]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP8]]
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer
@@ -14093,13 +14508,42 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i64> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = and i64 [[TMP21]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP21]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i64> [[TMP13]], i64 1
+; CHECK-NEXT: [[TMP26:%.*]] = and i64 [[TMP25]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i64> [[TMP13]], i64 2
+; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i64> [[TMP13]], i64 3
+; CHECK-NEXT: [[TMP30:%.*]] = and i64 [[TMP29]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i64> [[TMP13]], i64 4
+; CHECK-NEXT: [[TMP14:%.*]] = and i64 [[TMP31]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = or i64 [[TMP31]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP13]], i64 5
+; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP15]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i64> [[TMP13]], i64 6
+; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP17]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = or i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i64> [[TMP13]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i64 [[TMP19]], 7
+; CHECK-NEXT: [[TMP37:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0]], <8 x i64> [[X2:%.*]])
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
@@ -14120,12 +14564,65 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>
define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[TMP36]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[TMP8]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP6]], 15
+; CHECK-NEXT: [[TMP38:%.*]] = or i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i32> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP39]], 15
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 15
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP8]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP8]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i32> [[TMP8]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 15
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP8]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP8]], i64 8
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP8]], i64 9
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP8]], i64 10
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP8]], i64 11
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP8]], i64 12
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP8]], i64 13
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP45:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP8]], i64 14
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP8]], i64 15
+; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP35:%.*]] = or i32 [[TMP34]], [[TMP50]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[TMP35]], 0
+; CHECK-NEXT: br i1 [[_MSCMP28]], label [[TMP53:%.*]], label [[TMP54:%.*]], !prof [[PROF1]]
+; CHECK: 53:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 54:
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
@@ -14137,13 +14634,66 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16
;
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[TMP37]], 15
+; CHECK-NEXT: [[TMP38:%.*]] = or i32 [[TMP37]], [[TMP40]]
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i32> [[TMP13]], i64 1
+; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 15
+; CHECK-NEXT: [[TMP39:%.*]] = or i32 [[TMP41]], [[TMP42]]
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <16 x i32> [[TMP13]], i64 2
+; CHECK-NEXT: [[TMP44:%.*]] = and i32 [[TMP43]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i32 [[TMP43]], [[TMP44]]
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <16 x i32> [[TMP13]], i64 3
+; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], 15
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP45]], [[TMP46]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i32> [[TMP13]], i64 4
+; CHECK-NEXT: [[TMP14:%.*]] = and i32 [[TMP47]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP47]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i32> [[TMP13]], i64 5
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP13]], i64 6
+; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i32> [[TMP13]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 15
+; CHECK-NEXT: [[TMP58:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i32> [[TMP13]], i64 9
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP13]], i64 10
+; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 15
+; CHECK-NEXT: [[TMP60:%.*]] = or i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[TMP13]], i64 11
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP27]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[TMP13]], i64 12
+; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[TMP29]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP13]], i64 13
+; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[TMP31]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP31]], [[TMP32]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[TMP13]], i64 14
+; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[TMP13]], i64 15
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP35]], 15
+; CHECK-NEXT: [[TMP36:%.*]] = or i32 [[TMP35]], [[TMP51]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[TMP36]], 0
+; CHECK-NEXT: br i1 [[_MSCMP28]], label [[TMP54:%.*]], label [[TMP55:%.*]], !prof [[PROF1]]
+; CHECK: 54:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 55:
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
index 1644a5e..43595dc 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
@@ -5467,9 +5467,62 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP41]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i32 [[TMP41]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP12:%.*]] = and i32 [[TMP11]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP14:%.*]] = and i32 [[TMP13]], 15
+; CHECK-NEXT: [[TMP45:%.*]] = or i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 15
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 15
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP3]], i64 8
+; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[TMP3]], i64 9
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP27]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[TMP3]], i64 10
+; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[TMP29]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP3]], i64 11
+; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[TMP31]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP31]], [[TMP32]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[TMP3]], i64 12
+; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], 15
+; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[TMP3]], i64 13
+; CHECK-NEXT: [[TMP36:%.*]] = and i32 [[TMP35]], 15
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP3]], i64 14
+; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i32> [[TMP3]], i64 15
+; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[TMP39]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP39]], [[TMP40]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP56]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP58:%.*]], label [[TMP59:%.*]], !prof [[PROF1]]
+; CHECK: 58:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 59:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP9]]
;
@@ -5496,9 +5549,62 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16
; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP42]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP42]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP46]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP48]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP50]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP58:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP3]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP3]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP3]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP64:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP3]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP65:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP3]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP66:%.*]] = or i32 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP3]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP67:%.*]] = or i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i32> [[TMP3]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP38]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP3]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP57]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP59:%.*]], label [[TMP60:%.*]], !prof [[PROF1]]
+; CHECK: 59:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 60:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
@@ -5522,24 +5628,45 @@ declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>,
define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i64> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 7
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP8]], i64 1
+; CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP6]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = or i64 [[TMP6]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i64> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 7
+; CHECK-NEXT: [[TMP29:%.*]] = or i64 [[TMP23]], [[TMP26]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP8]], i64 3
+; CHECK-NEXT: [[TMP27:%.*]] = and i64 [[TMP10]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i64 [[TMP10]], [[TMP27]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i64> [[TMP8]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i64> [[TMP8]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = or i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i64> [[TMP8]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i64> [[TMP8]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP18]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = or i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP35]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP32:%.*]], label [[TMP33:%.*]], !prof [[PROF1]]
+; CHECK: 32:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK: 33:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
+; CHECK-NEXT: store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x double> [[TMP9]]
;
%1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
@@ -5549,32 +5676,53 @@ define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x
define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP21]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = and i64 [[TMP7]], 7
+; CHECK-NEXT: [[TMP38:%.*]] = or i64 [[TMP7]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i64> [[TMP2]], i64 2
+; CHECK-NEXT: [[TMP27:%.*]] = and i64 [[TMP24]], 7
+; CHECK-NEXT: [[TMP39:%.*]] = or i64 [[TMP24]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i64> [[TMP2]], i64 3
+; CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 7
+; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i64> [[TMP2]], i64 4
+; CHECK-NEXT: [[TMP31:%.*]] = and i64 [[TMP30]], 7
+; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i64> [[TMP2]], i64 5
+; CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 7
+; CHECK-NEXT: [[TMP42:%.*]] = or i64 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i64> [[TMP2]], i64 6
+; CHECK-NEXT: [[TMP35:%.*]] = and i64 [[TMP34]], 7
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i64> [[TMP2]], i64 7
+; CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 7
+; CHECK-NEXT: [[TMP43:%.*]] = or i64 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP43]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP44:%.*]], label [[TMP45:%.*]], !prof [[PROF1]]
+; CHECK: 33:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 34:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP8]], <8 x i64> [[TMP2]]
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP8]]
; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]]
; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]]
@@ -5593,24 +5741,69 @@ declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>
define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[TMP8]], i64 1
+; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP6]], 15
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP6]], [[TMP38]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i32> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP39]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP39]], [[TMP42]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i64 3
+; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[TMP10]], 15
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP10]], [[TMP43]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP8]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 15
+; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP8]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i32> [[TMP8]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 15
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP8]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP8]], i64 8
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP8]], i64 9
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP8]], i64 10
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP8]], i64 11
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP58:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP8]], i64 12
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP8]], i64 13
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP45:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP8]], i64 14
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP8]], i64 15
+; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP35:%.*]] = or i32 [[TMP34]], [[TMP50]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[TMP35]], 0
+; CHECK-NEXT: br i1 [[_MSCMP28]], label [[TMP56:%.*]], label [[TMP57:%.*]], !prof [[PROF1]]
+; CHECK: 56:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 57:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT: store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x float> [[TMP9]]
;
%1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
@@ -5620,32 +5813,77 @@ define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16
define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 15
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP7]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i32 [[TMP7]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP2]], i64 2
+; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP40]], [[TMP43]]
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i32> [[TMP2]], i64 3
+; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP44]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP44]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP2]], i64 4
+; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP46]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[TMP2]], i64 5
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP48]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i32> [[TMP2]], i64 6
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP50]], 15
+; CHECK-NEXT: [[TMP60:%.*]] = or i32 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[TMP2]], i64 7
+; CHECK-NEXT: [[TMP53:%.*]] = and i32 [[TMP52]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP52]], [[TMP53]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[TMP2]], i64 8
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i32> [[TMP2]], i64 9
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i64 10
+; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 15
+; CHECK-NEXT: [[TMP64:%.*]] = or i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[TMP2]], i64 11
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP27]], 15
+; CHECK-NEXT: [[TMP65:%.*]] = or i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[TMP2]], i64 12
+; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[TMP29]], 15
+; CHECK-NEXT: [[TMP66:%.*]] = or i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i64 13
+; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[TMP31]], 15
+; CHECK-NEXT: [[TMP67:%.*]] = or i32 [[TMP31]], [[TMP32]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[TMP2]], i64 14
+; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], 15
+; CHECK-NEXT: [[TMP68:%.*]] = or i32 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[TMP2]], i64 15
+; CHECK-NEXT: [[TMP36:%.*]] = and i32 [[TMP35]], 15
+; CHECK-NEXT: [[TMP69:%.*]] = or i32 [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP69]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF1]]
+; CHECK: 57:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 58:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP8]], <16 x i32> [[TMP2]]
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP8]]
; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]]
@@ -5664,12 +5902,41 @@ declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i64> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP20]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[TMP20]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP8]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], 7
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i64> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP23]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP23]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP8]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i64> [[TMP8]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i64> [[TMP8]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], 7
+; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i64> [[TMP8]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i64> [[TMP8]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP18]], 7
+; CHECK-NEXT: [[TMP27:%.*]] = or i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP27]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
+; CHECK: 29:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 30:
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i64> [[TMP4]]
;
@@ -5680,13 +5947,42 @@ define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %
define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = and i64 [[TMP21]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP21]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i64> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP26:%.*]] = and i64 [[TMP25]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i64> [[TMP2]], i64 2
+; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i64> [[TMP2]], i64 3
+; CHECK-NEXT: [[TMP30:%.*]] = and i64 [[TMP29]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i64> [[TMP2]], i64 4
+; CHECK-NEXT: [[TMP14:%.*]] = and i64 [[TMP13]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = or i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP2]], i64 5
+; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP15]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i64> [[TMP2]], i64 6
+; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP17]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = or i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i64> [[TMP2]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i64 [[TMP19]], 7
+; CHECK-NEXT: [[TMP37:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP38:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]]
@@ -5722,9 +6018,62 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16
; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP42]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP42]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP46]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP48]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP50]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP58:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP3]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP3]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP3]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP64:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP3]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP65:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP3]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP66:%.*]] = or i32 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP3]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP67:%.*]] = or i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i32> [[TMP3]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP38]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP3]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X0:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP57]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP59:%.*]], label [[TMP60:%.*]], !prof [[PROF1]]
+; CHECK: 59:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 60:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
@@ -5753,7 +6102,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
@@ -5767,26 +6116,47 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> [[EXTRA_PARAM:%.*]], double [[X2S]], i32 0
; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> [[TMP6]], <8 x i32> zeroinitializer
; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> [[EXTRA_PARAM2:%.*]], <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]])
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP12]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP12]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP30:%.*]] = and i64 [[TMP29]], 7
+; CHECK-NEXT: [[TMP43:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i64> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 7
+; CHECK-NEXT: [[TMP44:%.*]] = or i64 [[TMP31]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i64> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP36:%.*]] = and i64 [[TMP35]], 7
+; CHECK-NEXT: [[TMP45:%.*]] = or i64 [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i64> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP38:%.*]] = and i64 [[TMP37]], 7
+; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i64> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP40:%.*]] = and i64 [[TMP39]], 7
+; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP39]], [[TMP40]]
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i64> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP42:%.*]] = and i64 [[TMP41]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP41]], [[TMP42]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i64> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP27:%.*]] = and i64 [[TMP26]], 7
+; CHECK-NEXT: [[TMP48:%.*]] = or i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i64> [[TMP2]] to <8 x double>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <8 x double>
+; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP24]], <8 x i64> [[X0:%.*]], <8 x double> [[TMP13]])
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x double> [[TMP14]] to <8 x i64>
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP48]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP49:%.*]], label [[TMP50:%.*]], !prof [[PROF1]]
+; CHECK: 40:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 41:
+; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0]], <8 x double> [[X2]])
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64>
; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP25]]
; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]]
; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer
@@ -5805,30 +6175,75 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 15
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP7]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i32 [[TMP7]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP40]], [[TMP43]]
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i32> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP44]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP44]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i32> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP46]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <16 x i32> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP48]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i32> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP50]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP52]], 15
+; CHECK-NEXT: [[TMP60:%.*]] = or i32 [[TMP52]], [[TMP20]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[TMP9]], i64 8
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i32> [[TMP9]], i64 9
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP9]], i64 10
+; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[TMP9]], i64 11
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP27]], 15
+; CHECK-NEXT: [[TMP64:%.*]] = or i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[TMP9]], i64 12
+; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[TMP29]], 15
+; CHECK-NEXT: [[TMP65:%.*]] = or i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP9]], i64 13
+; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[TMP31]], 15
+; CHECK-NEXT: [[TMP66:%.*]] = or i32 [[TMP31]], [[TMP32]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[TMP9]], i64 14
+; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], 15
+; CHECK-NEXT: [[TMP67:%.*]] = or i32 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[TMP9]], i64 15
+; CHECK-NEXT: [[TMP36:%.*]] = and i32 [[TMP35]], 15
+; CHECK-NEXT: [[TMP68:%.*]] = or i32 [[TMP35]], [[TMP36]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT: [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X0:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP19]] to <16 x i32>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP68]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF1]]
+; CHECK: 57:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 58:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0]], <16 x float> [[X2:%.*]])
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP8]]
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]]
; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer
@@ -5844,13 +6259,42 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0,
define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i64> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = and i64 [[TMP21]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP21]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i64> [[TMP13]], i64 1
+; CHECK-NEXT: [[TMP26:%.*]] = and i64 [[TMP25]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i64> [[TMP13]], i64 2
+; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i64> [[TMP13]], i64 3
+; CHECK-NEXT: [[TMP30:%.*]] = and i64 [[TMP29]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i64> [[TMP13]], i64 4
+; CHECK-NEXT: [[TMP14:%.*]] = and i64 [[TMP31]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = or i64 [[TMP31]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP13]], i64 5
+; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP15]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i64> [[TMP13]], i64 6
+; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP17]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = or i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i64> [[TMP13]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i64 [[TMP19]], 7
+; CHECK-NEXT: [[TMP37:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0]], <8 x i64> [[X2:%.*]])
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
@@ -5871,12 +6315,65 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x
define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[TMP36]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[TMP8]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP6]], 15
+; CHECK-NEXT: [[TMP38:%.*]] = or i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i32> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP39]], 15
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 15
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP8]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP8]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i32> [[TMP8]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 15
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP8]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP8]], i64 8
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP8]], i64 9
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP8]], i64 10
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP8]], i64 11
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP8]], i64 12
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP8]], i64 13
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP45:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP8]], i64 14
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP8]], i64 15
+; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP35:%.*]] = or i32 [[TMP34]], [[TMP50]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[TMP35]], 0
+; CHECK-NEXT: br i1 [[_MSCMP28]], label [[TMP53:%.*]], label [[TMP54:%.*]], !prof [[PROF1]]
+; CHECK: 53:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 54:
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[TMP4]]
;
@@ -5887,13 +6384,66 @@ define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32
define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i32> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[TMP37]], 15
+; CHECK-NEXT: [[TMP38:%.*]] = or i32 [[TMP37]], [[TMP40]]
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i32> [[TMP13]], i64 1
+; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 15
+; CHECK-NEXT: [[TMP39:%.*]] = or i32 [[TMP41]], [[TMP42]]
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <16 x i32> [[TMP13]], i64 2
+; CHECK-NEXT: [[TMP44:%.*]] = and i32 [[TMP43]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i32 [[TMP43]], [[TMP44]]
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <16 x i32> [[TMP13]], i64 3
+; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], 15
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP45]], [[TMP46]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i32> [[TMP13]], i64 4
+; CHECK-NEXT: [[TMP14:%.*]] = and i32 [[TMP47]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP47]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i32> [[TMP13]], i64 5
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP13]], i64 6
+; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i32> [[TMP13]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[TMP13]], i64 8
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 15
+; CHECK-NEXT: [[TMP58:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i32> [[TMP13]], i64 9
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP13]], i64 10
+; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 15
+; CHECK-NEXT: [[TMP60:%.*]] = or i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[TMP13]], i64 11
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP27]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = or i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[TMP13]], i64 12
+; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[TMP29]], 15
+; CHECK-NEXT: [[TMP62:%.*]] = or i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP13]], i64 13
+; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[TMP31]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = or i32 [[TMP31]], [[TMP32]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[TMP13]], i64 14
+; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[TMP13]], i64 15
+; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP35]], 15
+; CHECK-NEXT: [[TMP36:%.*]] = or i32 [[TMP35]], [[TMP51]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[TMP36]], 0
+; CHECK-NEXT: br i1 [[_MSCMP28]], label [[TMP54:%.*]], label [[TMP55:%.*]], !prof [[PROF1]]
+; CHECK: 54:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 55:
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
index 14d68b4..7d45cec 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
@@ -1901,11 +1901,28 @@ define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %
; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermi2var_d_128(
; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP14]], 3
+; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP14]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[X1]], <4 x i32> [[TMP5]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK: [[BB17]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB18]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
@@ -1919,12 +1936,29 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i
; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermi2var_d_128(
; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP13]], 3
+; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP9]], 3
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP9]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 3
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X1]], <4 x i32> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF1]]
+; CHECK: [[BB18]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB19]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -1950,11 +1984,28 @@ define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %
; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermt2var_d_128(
; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP14]], 3
+; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP14]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[X0]], <4 x i32> [[TMP5]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK: [[BB17]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB18]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
@@ -1968,12 +2019,29 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i
; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermt2var_d_128(
; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP13]], 3
+; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP9]], 3
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP9]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 3
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF1]]
+; CHECK: [[BB18]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB19]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2000,12 +2068,29 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_128(
; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 3
+; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 3
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 3
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP9]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF1]]
+; CHECK: [[BB18]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB19]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2033,11 +2118,40 @@ define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %
; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpermi2var_d_256(
; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP22]], 7
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP22]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = or i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 7
+; CHECK-NEXT: [[TMP29:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 7
+; CHECK-NEXT: [[TMP27:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[X1]], <8 x i32> [[TMP5]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP27]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB29:.*]], label %[[BB30:.*]], !prof [[PROF1]]
+; CHECK: [[BB29]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB30]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
@@ -2051,12 +2165,41 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i
; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermi2var_d_256(
; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP9]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP9]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP13]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = or i32 [[TMP13]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP29]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = or i32 [[TMP29]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = or i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 7
+; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X1]], <8 x i32> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]]
+; CHECK: [[BB30]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB31]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2079,11 +2222,40 @@ define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i3
; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_ask_vpermt2var_d_256(
; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP22]], 7
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP22]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = or i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 7
+; CHECK-NEXT: [[TMP29:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 7
+; CHECK-NEXT: [[TMP27:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[X0]], <8 x i32> [[TMP5]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP27]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB29:.*]], label %[[BB30:.*]], !prof [[PROF1]]
+; CHECK: [[BB29]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB30]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
@@ -2097,12 +2269,41 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i
; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermt2var_d_256(
; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP9]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP9]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP13]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = or i32 [[TMP13]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP29]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = or i32 [[TMP29]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = or i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 7
+; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]]
+; CHECK: [[BB30]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB31]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2126,12 +2327,41 @@ define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x
; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_256(
; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP9]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP27]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP14:%.*]] = and i32 [[TMP29]], 7
+; CHECK-NEXT: [[TMP34:%.*]] = or i32 [[TMP29]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7
+; CHECK-NEXT: [[TMP35:%.*]] = or i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = or i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 7
+; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP9]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]]
+; CHECK: [[BB30]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB31]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2156,24 +2386,27 @@ define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x
; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_vpermi2var_pd_128(
; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK: [[BB7]]:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP6]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP6]], [[TMP13]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to <2 x double>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <2 x double>
+; CHECK-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[TMP8]], <2 x i64> [[X1]], <2 x double> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x double> [[TMP10]] to <2 x i64>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
+; CHECK: [[BB14]]:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: [[BB8]]:
+; CHECK: [[BB15]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2)
@@ -2185,34 +2418,37 @@ define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0,
; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_vpermi2var_pd_128(
; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP13]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP8]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK: [[BB8]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP15]], 1
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP13]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = and i64 [[TMP22]], 1
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to <2 x double>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP8]] to <2 x double>
+; CHECK-NEXT: [[TMP17:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[TMP9]], <2 x i64> [[X1]], <2 x double> [[TMP12]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <2 x double> [[TMP17]] to <2 x i64>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB16]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[X1]] to <2 x double>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP18]], <2 x i64> [[TMP13]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP20:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = or <2 x i64> [[TMP7]], [[TMP18]]
; CHECK-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP20]], [[TMP13]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP21]], <2 x i64> [[TMP16]]
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]
@@ -2233,24 +2469,33 @@ define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x
; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_vpermi2var_pd_256(
; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP4]] to i256
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK: [[BB7]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP14:%.*]] = and i64 [[TMP6]], 3
+; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP6]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP15]], 3
+; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP17]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[TMP17]], [[TMP11]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to <4 x double>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP4]] to <4 x double>
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[TMP8]], <4 x i64> [[X1]], <4 x double> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x double> [[TMP10]] to <4 x i64>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]]
+; CHECK: [[BB20]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB21]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
-; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x double> [[TMP1]]
;
%1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2)
@@ -2262,34 +2507,43 @@ define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0,
; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_vpermi2var_pd_256(
; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP13]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK: [[BB8]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP15]], 3
+; CHECK-NEXT: [[TMP28:%.*]] = or i64 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP13]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = and i64 [[TMP22]], 3
+; CHECK-NEXT: [[TMP29:%.*]] = or i64 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP13]], i64 2
+; CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 3
+; CHECK-NEXT: [[TMP30:%.*]] = or i64 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP13]], i64 3
+; CHECK-NEXT: [[TMP27:%.*]] = and i64 [[TMP26]], 3
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to <4 x double>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP8]] to <4 x double>
+; CHECK-NEXT: [[TMP17:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[TMP9]], <4 x i64> [[X1]], <4 x double> [[TMP12]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x double> [[TMP17]] to <4 x i64>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP31]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
+; CHECK: [[BB21]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB22]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[X1]] to <4 x double>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP18]], <4 x i64> [[TMP13]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i64> [[TMP7]], [[TMP18]]
; CHECK-NEXT: [[TMP21:%.*]] = or <4 x i64> [[TMP20]], [[TMP13]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP21]], <4 x i64> [[TMP16]]
; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[TMP2]]
@@ -2310,24 +2564,33 @@ define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i
; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_vpermi2var_ps_128(
; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK: [[BB7]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP14:%.*]] = and i32 [[TMP6]], 3
+; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP6]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 3
+; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP17]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP17]], [[TMP11]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to <4 x float>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP8]], <4 x i32> [[X1]], <4 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP21]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB20:.*]], label %[[BB21:.*]], !prof [[PROF1]]
+; CHECK: [[BB20]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB21]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2)
@@ -2339,34 +2602,43 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <
; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128(
; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP8]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK: [[BB8]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP15]], 3
+; CHECK-NEXT: [[TMP28:%.*]] = or i32 [[TMP15]], [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP13]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 3
+; CHECK-NEXT: [[TMP29:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP13]], i64 2
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 3
+; CHECK-NEXT: [[TMP30:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 3
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to <4 x float>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+; CHECK-NEXT: [[TMP17:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP9]], <4 x i32> [[X1]], <4 x float> [[TMP12]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x float> [[TMP17]] to <4 x i32>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP31]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
+; CHECK: [[BB21]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB22]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[X1]] to <4 x float>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP7]], [[TMP18]]
; CHECK-NEXT: [[TMP21:%.*]] = or <4 x i32> [[TMP20]], [[TMP13]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP21]], <4 x i32> [[TMP16]]
; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
@@ -2392,30 +2664,39 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP11]] to <4 x i32>
; CHECK-NEXT: [[X1CAST:%.*]] = bitcast <2 x i64> [[X1]] to <4 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP12]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP19]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP14]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
-; CHECK: [[BB9]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB10]]:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP14]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP20]], 3
+; CHECK-NEXT: [[TMP29:%.*]] = or i32 [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP14]], i64 1
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP30:%.*]] = or i32 [[TMP8]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP14]], i64 2
+; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], 3
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP14]], i64 3
+; CHECK-NEXT: [[TMP28:%.*]] = and i32 [[TMP27]], 3
+; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x i32> [[TMP13]] to <4 x float>
+; CHECK-NEXT: [[TMP19:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP16]], <4 x i32> [[X1CAST]], <4 x float> [[TMP18]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x float> [[TMP19]] to <4 x i32>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP32]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB22:.*]], label %[[BB23:.*]], !prof [[PROF1]]
+; CHECK: [[BB22]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB23]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1CAST]], <4 x float> [[X2]])
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[X1CAST]] to <4 x float>
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP9]], <4 x i32> [[TMP14]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP21:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <4 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEXT: [[TMP22:%.*]] = or <4 x i32> [[TMP21]], [[TMP14]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP22]], <4 x i32> [[TMP17]]
; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
@@ -2437,24 +2718,45 @@ define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i
; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_vpermi2var_ps_256(
; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP4]] to i256
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK: [[BB7]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB8]]:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP6]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = or i32 [[TMP6]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 7
+; CHECK-NEXT: [[TMP29:%.*]] = or i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP25]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i32 [[TMP25]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 7
+; CHECK-NEXT: [[TMP27:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to <8 x float>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[TMP8]], <8 x i32> [[X1]], <8 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x float> [[TMP10]] to <8 x i32>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP27]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB32:.*]], label %[[BB33:.*]], !prof [[PROF1]]
+; CHECK: [[BB32]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB33]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x float> [[TMP1]]
;
%1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2)
@@ -2466,32 +2768,53 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_vpermi2var_ps_256(
; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP13]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i32> [[TMP8]] to i256
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK: [[BB8]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 7
+; CHECK-NEXT: [[TMP36:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP13]], i64 1
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 7
+; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP13]], i64 2
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 7
+; CHECK-NEXT: [[TMP38:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP13]], i64 3
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 7
+; CHECK-NEXT: [[TMP39:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP13]], i64 4
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 7
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP13]], i64 5
+; CHECK-NEXT: [[TMP32:%.*]] = and i32 [[TMP15]], 7
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP15]], [[TMP32]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP13]], i64 6
+; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], 7
+; CHECK-NEXT: [[TMP42:%.*]] = or i32 [[TMP33]], [[TMP34]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP13]], i64 7
+; CHECK-NEXT: [[TMP35:%.*]] = and i32 [[TMP19]], 7
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP19]], [[TMP35]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to <8 x float>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
+; CHECK-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[TMP9]], <8 x i32> [[X1]], <8 x float> [[TMP12]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x float> [[TMP17]] to <8 x i32>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP43]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB33:.*]], label %[[BB34:.*]], !prof [[PROF1]]
+; CHECK: [[BB33]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB34]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[X1]] to <8 x float>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> zeroinitializer, <8 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP18]], <8 x i32> [[TMP13]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i32> [[TMP7]], [[TMP18]]
; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i32> [[TMP20]], [[TMP13]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP21]], <8 x i32> [[TMP16]]
; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP1]], <8 x float> [[TMP2]]
@@ -2511,11 +2834,22 @@ define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %
; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermi2var_q_128(
; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[X1]], <2 x i64> [[TMP5]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP12]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]]
+; CHECK: [[BB11]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB12]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
@@ -2529,12 +2863,23 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i
; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermi2var_q_128(
; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP9]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP9]], [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X1]], <2 x i64> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP19]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK: [[BB12]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB13]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2560,11 +2905,22 @@ define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %
; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermt2var_q_128(
; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[X0]], <2 x i64> [[TMP5]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP12]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]]
+; CHECK: [[BB11]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB12]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
@@ -2578,12 +2934,23 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i
; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermt2var_q_128(
; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP9]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP9]], [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP19]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK: [[BB12]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB13]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2610,12 +2977,23 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x
; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_128(
; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP9]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP19]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK: [[BB12]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB13]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2643,11 +3021,28 @@ define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %
; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermi2var_q_256(
; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP14]], 3
+; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP14]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[X1]], <4 x i64> [[TMP5]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK: [[BB17]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB18]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i64> [[TMP1]]
@@ -2661,12 +3056,29 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i
; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermi2var_q_256(
; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP13]], 3
+; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[TMP13]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP17]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP9]], 3
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP9]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 3
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X1]], <4 x i64> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF1]]
+; CHECK: [[BB18]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB19]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2692,11 +3104,28 @@ define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %
; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermt2var_q_256(
; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP14]], 3
+; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP14]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[X0]], <4 x i64> [[TMP5]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK: [[BB17]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB18]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i64> [[TMP1]]
@@ -2710,12 +3139,29 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i
; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermt2var_q_256(
; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP13]], 3
+; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[TMP13]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP17]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP9]], 3
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP9]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 3
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF1]]
+; CHECK: [[BB18]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB19]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2742,12 +3188,29 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x
; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_256(
; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], 3
+; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP18]], 3
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 3
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP9]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP25]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB18:.*]], label %[[BB19:.*]], !prof [[PROF1]]
+; CHECK: [[BB18]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB19]]:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -12267,8 +12730,7 @@ define <8 x i32> @combine_vpermi2d_vpermps(<16 x i32> noundef %a) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> splat (i32 -1), <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[_MSPROP]], <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> [[_MSPROP1]])
; CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> [[TMP2]])
; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[TMP3]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
index 2350d75..95c1bbf 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
@@ -16,8 +16,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64(<2 x i64> %x0, <2 x i64> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP1]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X1]])
; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[R]]
@@ -31,8 +30,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) #0 {
; CHECK-SAME: <2 x i64> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP1]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X0]])
; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[R]]
@@ -55,8 +53,19 @@ define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 4>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[T]], <2 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: [[BB17]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB18]]:
; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[R]]
@@ -80,8 +89,19 @@ define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2
; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 2>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[T]], <2 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK: [[BB17]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB18]]:
; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[R]]
@@ -97,8 +117,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64(<4 x i64> %x0, <4 x i64> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP1]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X1]])
; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i64> [[R]]
@@ -112,8 +131,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) #0 {
; CHECK-SAME: <4 x i64> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP1]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X0]])
; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i64> [[R]]
@@ -136,8 +154,25 @@ define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i64> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <4 x i64> [[M]], <i64 0, i64 8, i64 16, i64 32>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 3
+; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 3
+; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[T]], <4 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB23:.*]], label %[[BB24:.*]], !prof [[PROF1]]
+; CHECK: [[BB23]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB24]]:
; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]])
; CHECK-NEXT: store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i64> [[R]]
@@ -153,8 +188,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64(<8 x i64> %x0, <8 x i64> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X1]])
; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i64> [[R]]
@@ -168,8 +202,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) #0 {
; CHECK-SAME: <8 x i64> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X0]])
; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i64> [[R]]
@@ -192,8 +225,37 @@ define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x
; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <8 x i64> [[M]], <i64 0, i64 16, i64 32, i64 64, i64 256, i64 512, i64 1024, i64 -16>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i64> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i64> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], 7
+; CHECK-NEXT: [[TMP29:%.*]] = or i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i64> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i64> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP18]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i64> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 7
+; CHECK-NEXT: [[TMP27:%.*]] = or i64 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i64> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i64 [[TMP22]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i64 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i64> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP6]], <8 x i64> [[T]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP33]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]]
+; CHECK: [[BB35]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB36]]:
; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x i64> [[X1]])
; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i64> [[R]]
@@ -213,8 +275,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32(<4 x i32> %x0, <4 x i32> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP1]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X1]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[R]]
@@ -228,8 +289,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) #0 {
; CHECK-SAME: <4 x i32> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP1]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X0]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[R]]
@@ -252,8 +312,25 @@ define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <4 x i32> [[M]], <i32 0, i32 8, i32 16, i32 32>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 3
+; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 3
+; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[T]], <4 x i32> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP21]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB23:.*]], label %[[BB24:.*]], !prof [[PROF1]]
+; CHECK: [[BB23]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB24]]:
; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]])
; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[R]]
@@ -269,8 +346,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32(<8 x i32> %x0, <8 x i32> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X1]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[R]]
@@ -284,8 +360,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) #0 {
; CHECK-SAME: <8 x i32> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X0]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[R]]
@@ -308,8 +383,37 @@ define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x
; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <8 x i32> [[M]], <i32 0, i32 16, i32 32, i32 64, i32 256, i32 512, i32 -16, i32 -32>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 7
+; CHECK-NEXT: [[TMP29:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 7
+; CHECK-NEXT: [[TMP27:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[T]], <8 x i32> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP33]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]]
+; CHECK: [[BB35]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB36]]:
; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[R]]
@@ -325,8 +429,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32(<16 x i32> %x0, <16 x i32> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X1]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[R]]
@@ -340,8 +443,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) #0 {
; CHECK-SAME: <16 x i32> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X0]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[R]]
@@ -364,8 +466,61 @@ define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32
; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <16 x i32> [[M]], <i32 0, i32 32, i32 64, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 -32, i32 -64, i32 -128, i32 -256, i32 -512, i32 -1024, i32 -2048>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 15
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i32> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 15
+; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i32> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i32> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 15
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i32> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP24]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i32> [[TMP9]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[TMP9]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP28]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[TMP9]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP30]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[TMP9]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 15
+; CHECK-NEXT: [[TMP45:%.*]] = or i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[TMP9]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i32 [[TMP34]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i32 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i32> [[TMP9]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i32 [[TMP36]], 15
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i32> [[TMP9]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP38]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i32 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i32> [[TMP9]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP40]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i32 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP6]], <16 x i32> [[T]], <16 x i32> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP57]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB59:.*]], label %[[BB60:.*]], !prof [[PROF1]]
+; CHECK: [[BB59]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB60]]:
; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]])
; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i32> [[R]]
@@ -385,8 +540,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16(<8 x i16> %x0, <8 x i16> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP1]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X1]])
; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i16> [[R]]
@@ -400,8 +554,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) #0 {
; CHECK-SAME: <8 x i16> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP1]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X0]])
; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i16> [[R]]
@@ -424,8 +577,37 @@ define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x
; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i16> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <8 x i16> [[M]], <i16 0, i16 16, i16 32, i16 64, i16 256, i16 512, i16 -16, i16 -32>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i16 [[TMP10]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i16 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i16 [[TMP12]], 7
+; CHECK-NEXT: [[TMP28:%.*]] = or i16 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i16 [[TMP14]], 7
+; CHECK-NEXT: [[TMP29:%.*]] = or i16 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i16 [[TMP16]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i16 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i16 [[TMP18]], 7
+; CHECK-NEXT: [[TMP32:%.*]] = or i16 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i16 [[TMP20]], 7
+; CHECK-NEXT: [[TMP27:%.*]] = or i16 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i16> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i16 [[TMP22]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i16 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i16> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i16 [[TMP24]], 7
+; CHECK-NEXT: [[TMP33:%.*]] = or i16 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP6]], <8 x i16> [[T]], <8 x i16> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i16 [[TMP33]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]]
+; CHECK: [[BB35]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB36]]:
; CHECK-NEXT: [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[T]], <8 x i16> [[X1]])
; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i16> [[R]]
@@ -441,8 +623,7 @@ define <16 x i16> @shuffle_vpermv3_v16i16(<16 x i16> %x0, <16 x i16> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP1]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X1]])
; CHECK-NEXT: store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i16> [[R]]
@@ -456,8 +637,7 @@ define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) #0 {
; CHECK-SAME: <16 x i16> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP1]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X0]])
; CHECK-NEXT: store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i16> [[R]]
@@ -480,8 +660,61 @@ define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16
; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i16> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <16 x i16> [[M]], <i16 0, i16 32, i16 64, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 -32, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i16 [[TMP10]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i16 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i16 [[TMP12]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i16 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i16 [[TMP14]], 15
+; CHECK-NEXT: [[TMP46:%.*]] = or i16 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i16 [[TMP16]], 15
+; CHECK-NEXT: [[TMP47:%.*]] = or i16 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i16 [[TMP18]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i16 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i16> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i16 [[TMP20]], 15
+; CHECK-NEXT: [[TMP50:%.*]] = or i16 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i16> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i16 [[TMP22]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i16 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i16> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i16 [[TMP24]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i16 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i16> [[TMP9]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i16 [[TMP26]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i16 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i16> [[TMP9]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i16 [[TMP28]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i16 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i16> [[TMP9]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i16 [[TMP30]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i16 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i16> [[TMP9]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i16 [[TMP32]], 15
+; CHECK-NEXT: [[TMP45:%.*]] = or i16 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i16> [[TMP9]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i16 [[TMP34]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i16 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i16> [[TMP9]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i16 [[TMP36]], 15
+; CHECK-NEXT: [[TMP51:%.*]] = or i16 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i16> [[TMP9]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i16 [[TMP38]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i16 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i16> [[TMP9]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i16 [[TMP40]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i16 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP6]], <16 x i16> [[T]], <16 x i16> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i16 [[TMP57]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB59:.*]], label %[[BB60:.*]], !prof [[PROF1]]
+; CHECK: [[BB59]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB60]]:
; CHECK-NEXT: [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[T]], <16 x i16> [[X1]])
; CHECK-NEXT: store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i16> [[R]]
@@ -497,8 +730,7 @@ define <32 x i16> @shuffle_vpermv3_v32i16(<32 x i16> %x0, <32 x i16> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X1]])
; CHECK-NEXT: store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <32 x i16> [[R]]
@@ -512,8 +744,7 @@ define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) #0 {
; CHECK-SAME: <32 x i16> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X0]])
; CHECK-NEXT: store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <32 x i16> [[R]]
@@ -536,8 +767,109 @@ define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16
; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <32 x i16> [[M]], <i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096, i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i16> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i16 [[TMP10]], 31
+; CHECK-NEXT: [[TMP74:%.*]] = or i16 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <32 x i16> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i16 [[TMP12]], 31
+; CHECK-NEXT: [[TMP76:%.*]] = or i16 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <32 x i16> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i16 [[TMP14]], 31
+; CHECK-NEXT: [[TMP77:%.*]] = or i16 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <32 x i16> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i16 [[TMP16]], 31
+; CHECK-NEXT: [[TMP79:%.*]] = or i16 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <32 x i16> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i16 [[TMP18]], 31
+; CHECK-NEXT: [[TMP80:%.*]] = or i16 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <32 x i16> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i16 [[TMP20]], 31
+; CHECK-NEXT: [[TMP82:%.*]] = or i16 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <32 x i16> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i16 [[TMP22]], 31
+; CHECK-NEXT: [[TMP83:%.*]] = or i16 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <32 x i16> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i16 [[TMP24]], 31
+; CHECK-NEXT: [[TMP85:%.*]] = or i16 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <32 x i16> [[TMP9]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i16 [[TMP26]], 31
+; CHECK-NEXT: [[TMP86:%.*]] = or i16 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <32 x i16> [[TMP9]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i16 [[TMP28]], 31
+; CHECK-NEXT: [[TMP88:%.*]] = or i16 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <32 x i16> [[TMP9]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i16 [[TMP30]], 31
+; CHECK-NEXT: [[TMP89:%.*]] = or i16 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <32 x i16> [[TMP9]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i16 [[TMP32]], 31
+; CHECK-NEXT: [[TMP91:%.*]] = or i16 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <32 x i16> [[TMP9]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i16 [[TMP34]], 31
+; CHECK-NEXT: [[TMP92:%.*]] = or i16 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <32 x i16> [[TMP9]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i16 [[TMP36]], 31
+; CHECK-NEXT: [[TMP94:%.*]] = or i16 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <32 x i16> [[TMP9]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i16 [[TMP38]], 31
+; CHECK-NEXT: [[TMP95:%.*]] = or i16 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <32 x i16> [[TMP9]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i16 [[TMP40]], 31
+; CHECK-NEXT: [[TMP97:%.*]] = or i16 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <32 x i16> [[TMP9]], i64 16
+; CHECK-NEXT: [[TMP43:%.*]] = and i16 [[TMP42]], 31
+; CHECK-NEXT: [[TMP98:%.*]] = or i16 [[TMP42]], [[TMP43]]
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <32 x i16> [[TMP9]], i64 17
+; CHECK-NEXT: [[TMP45:%.*]] = and i16 [[TMP44]], 31
+; CHECK-NEXT: [[TMP100:%.*]] = or i16 [[TMP44]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <32 x i16> [[TMP9]], i64 18
+; CHECK-NEXT: [[TMP47:%.*]] = and i16 [[TMP46]], 31
+; CHECK-NEXT: [[TMP101:%.*]] = or i16 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <32 x i16> [[TMP9]], i64 19
+; CHECK-NEXT: [[TMP49:%.*]] = and i16 [[TMP48]], 31
+; CHECK-NEXT: [[TMP103:%.*]] = or i16 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <32 x i16> [[TMP9]], i64 20
+; CHECK-NEXT: [[TMP51:%.*]] = and i16 [[TMP50]], 31
+; CHECK-NEXT: [[TMP105:%.*]] = or i16 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <32 x i16> [[TMP9]], i64 21
+; CHECK-NEXT: [[TMP53:%.*]] = and i16 [[TMP52]], 31
+; CHECK-NEXT: [[TMP75:%.*]] = or i16 [[TMP52]], [[TMP53]]
+; CHECK-NEXT: [[TMP54:%.*]] = extractelement <32 x i16> [[TMP9]], i64 22
+; CHECK-NEXT: [[TMP55:%.*]] = and i16 [[TMP54]], 31
+; CHECK-NEXT: [[TMP78:%.*]] = or i16 [[TMP54]], [[TMP55]]
+; CHECK-NEXT: [[TMP56:%.*]] = extractelement <32 x i16> [[TMP9]], i64 23
+; CHECK-NEXT: [[TMP57:%.*]] = and i16 [[TMP56]], 31
+; CHECK-NEXT: [[TMP81:%.*]] = or i16 [[TMP56]], [[TMP57]]
+; CHECK-NEXT: [[TMP58:%.*]] = extractelement <32 x i16> [[TMP9]], i64 24
+; CHECK-NEXT: [[TMP59:%.*]] = and i16 [[TMP58]], 31
+; CHECK-NEXT: [[TMP84:%.*]] = or i16 [[TMP58]], [[TMP59]]
+; CHECK-NEXT: [[TMP60:%.*]] = extractelement <32 x i16> [[TMP9]], i64 25
+; CHECK-NEXT: [[TMP61:%.*]] = and i16 [[TMP60]], 31
+; CHECK-NEXT: [[TMP87:%.*]] = or i16 [[TMP60]], [[TMP61]]
+; CHECK-NEXT: [[TMP62:%.*]] = extractelement <32 x i16> [[TMP9]], i64 26
+; CHECK-NEXT: [[TMP63:%.*]] = and i16 [[TMP62]], 31
+; CHECK-NEXT: [[TMP90:%.*]] = or i16 [[TMP62]], [[TMP63]]
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <32 x i16> [[TMP9]], i64 27
+; CHECK-NEXT: [[TMP65:%.*]] = and i16 [[TMP64]], 31
+; CHECK-NEXT: [[TMP93:%.*]] = or i16 [[TMP64]], [[TMP65]]
+; CHECK-NEXT: [[TMP66:%.*]] = extractelement <32 x i16> [[TMP9]], i64 28
+; CHECK-NEXT: [[TMP67:%.*]] = and i16 [[TMP66]], 31
+; CHECK-NEXT: [[TMP96:%.*]] = or i16 [[TMP66]], [[TMP67]]
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <32 x i16> [[TMP9]], i64 29
+; CHECK-NEXT: [[TMP69:%.*]] = and i16 [[TMP68]], 31
+; CHECK-NEXT: [[TMP99:%.*]] = or i16 [[TMP68]], [[TMP69]]
+; CHECK-NEXT: [[TMP70:%.*]] = extractelement <32 x i16> [[TMP9]], i64 30
+; CHECK-NEXT: [[TMP71:%.*]] = and i16 [[TMP70]], 31
+; CHECK-NEXT: [[TMP102:%.*]] = or i16 [[TMP70]], [[TMP71]]
+; CHECK-NEXT: [[TMP72:%.*]] = extractelement <32 x i16> [[TMP9]], i64 31
+; CHECK-NEXT: [[TMP104:%.*]] = and i16 [[TMP72]], 31
+; CHECK-NEXT: [[TMP73:%.*]] = or i16 [[TMP72]], [[TMP104]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP6]], <32 x i16> [[T]], <32 x i16> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP60:%.*]] = icmp ne i16 [[TMP73]], 0
+; CHECK-NEXT: br i1 [[_MSCMP60]], label %[[BB107:.*]], label %[[BB108:.*]], !prof [[PROF1]]
+; CHECK: [[BB107]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB108]]:
; CHECK-NEXT: [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[T]], <32 x i16> [[X1]])
; CHECK-NEXT: store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <32 x i16> [[R]]
@@ -557,8 +889,7 @@ define <16 x i8> @shuffle_vpermv3_v16i8(<16 x i8> %x0, <16 x i8> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP1]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X1]])
; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i8> [[R]]
@@ -572,8 +903,7 @@ define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) #0 {
; CHECK-SAME: <16 x i8> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP1]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X0]])
; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i8> [[R]]
@@ -596,8 +926,61 @@ define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x
; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i8> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i8> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <16 x i8> [[M]], <i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128, i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], 15
+; CHECK-NEXT: [[TMP43:%.*]] = or i8 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], 15
+; CHECK-NEXT: [[TMP44:%.*]] = or i8 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i8 [[TMP14]], 15
+; CHECK-NEXT: [[TMP46:%.*]] = or i8 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i8 [[TMP16]], 15
+; CHECK-NEXT: [[TMP47:%.*]] = or i8 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i8 [[TMP18]], 15
+; CHECK-NEXT: [[TMP49:%.*]] = or i8 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i8 [[TMP20]], 15
+; CHECK-NEXT: [[TMP50:%.*]] = or i8 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i8 [[TMP22]], 15
+; CHECK-NEXT: [[TMP52:%.*]] = or i8 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i8 [[TMP24]], 15
+; CHECK-NEXT: [[TMP53:%.*]] = or i8 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[TMP9]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i8 [[TMP26]], 15
+; CHECK-NEXT: [[TMP55:%.*]] = or i8 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[TMP9]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i8 [[TMP28]], 15
+; CHECK-NEXT: [[TMP56:%.*]] = or i8 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[TMP9]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i8 [[TMP30]], 15
+; CHECK-NEXT: [[TMP42:%.*]] = or i8 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[TMP9]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i8 [[TMP32]], 15
+; CHECK-NEXT: [[TMP45:%.*]] = or i8 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[TMP9]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i8 [[TMP34]], 15
+; CHECK-NEXT: [[TMP48:%.*]] = or i8 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[TMP9]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i8 [[TMP36]], 15
+; CHECK-NEXT: [[TMP51:%.*]] = or i8 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[TMP9]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i8 [[TMP38]], 15
+; CHECK-NEXT: [[TMP54:%.*]] = or i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[TMP9]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i8 [[TMP40]], 15
+; CHECK-NEXT: [[TMP57:%.*]] = or i8 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP6]], <16 x i8> [[T]], <16 x i8> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP57]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB59:.*]], label %[[BB60:.*]], !prof [[PROF1]]
+; CHECK: [[BB59]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB60]]:
; CHECK-NEXT: [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[T]], <16 x i8> [[X1]])
; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i8> [[R]]
@@ -613,8 +996,7 @@ define <32 x i8> @shuffle_vpermv3_v32i8(<32 x i8> %x0, <32 x i8> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP1]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X1]])
; CHECK-NEXT: store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <32 x i8> [[R]]
@@ -628,8 +1010,7 @@ define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) #0 {
; CHECK-SAME: <32 x i8> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP1]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X0]])
; CHECK-NEXT: store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <32 x i8> [[R]]
@@ -652,8 +1033,109 @@ define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x
; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i8> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <32 x i8> [[M]], <i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i8> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], 31
+; CHECK-NEXT: [[TMP74:%.*]] = or i8 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <32 x i8> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], 31
+; CHECK-NEXT: [[TMP76:%.*]] = or i8 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <32 x i8> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i8 [[TMP14]], 31
+; CHECK-NEXT: [[TMP77:%.*]] = or i8 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <32 x i8> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i8 [[TMP16]], 31
+; CHECK-NEXT: [[TMP79:%.*]] = or i8 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <32 x i8> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i8 [[TMP18]], 31
+; CHECK-NEXT: [[TMP80:%.*]] = or i8 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <32 x i8> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i8 [[TMP20]], 31
+; CHECK-NEXT: [[TMP82:%.*]] = or i8 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <32 x i8> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i8 [[TMP22]], 31
+; CHECK-NEXT: [[TMP83:%.*]] = or i8 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <32 x i8> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i8 [[TMP24]], 31
+; CHECK-NEXT: [[TMP85:%.*]] = or i8 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <32 x i8> [[TMP9]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i8 [[TMP26]], 31
+; CHECK-NEXT: [[TMP86:%.*]] = or i8 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <32 x i8> [[TMP9]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i8 [[TMP28]], 31
+; CHECK-NEXT: [[TMP88:%.*]] = or i8 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <32 x i8> [[TMP9]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i8 [[TMP30]], 31
+; CHECK-NEXT: [[TMP89:%.*]] = or i8 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <32 x i8> [[TMP9]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i8 [[TMP32]], 31
+; CHECK-NEXT: [[TMP91:%.*]] = or i8 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <32 x i8> [[TMP9]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i8 [[TMP34]], 31
+; CHECK-NEXT: [[TMP92:%.*]] = or i8 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <32 x i8> [[TMP9]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i8 [[TMP36]], 31
+; CHECK-NEXT: [[TMP94:%.*]] = or i8 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <32 x i8> [[TMP9]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i8 [[TMP38]], 31
+; CHECK-NEXT: [[TMP95:%.*]] = or i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <32 x i8> [[TMP9]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i8 [[TMP40]], 31
+; CHECK-NEXT: [[TMP97:%.*]] = or i8 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <32 x i8> [[TMP9]], i64 16
+; CHECK-NEXT: [[TMP43:%.*]] = and i8 [[TMP42]], 31
+; CHECK-NEXT: [[TMP98:%.*]] = or i8 [[TMP42]], [[TMP43]]
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <32 x i8> [[TMP9]], i64 17
+; CHECK-NEXT: [[TMP45:%.*]] = and i8 [[TMP44]], 31
+; CHECK-NEXT: [[TMP100:%.*]] = or i8 [[TMP44]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <32 x i8> [[TMP9]], i64 18
+; CHECK-NEXT: [[TMP47:%.*]] = and i8 [[TMP46]], 31
+; CHECK-NEXT: [[TMP101:%.*]] = or i8 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <32 x i8> [[TMP9]], i64 19
+; CHECK-NEXT: [[TMP49:%.*]] = and i8 [[TMP48]], 31
+; CHECK-NEXT: [[TMP103:%.*]] = or i8 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <32 x i8> [[TMP9]], i64 20
+; CHECK-NEXT: [[TMP51:%.*]] = and i8 [[TMP50]], 31
+; CHECK-NEXT: [[TMP105:%.*]] = or i8 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <32 x i8> [[TMP9]], i64 21
+; CHECK-NEXT: [[TMP53:%.*]] = and i8 [[TMP52]], 31
+; CHECK-NEXT: [[TMP75:%.*]] = or i8 [[TMP52]], [[TMP53]]
+; CHECK-NEXT: [[TMP54:%.*]] = extractelement <32 x i8> [[TMP9]], i64 22
+; CHECK-NEXT: [[TMP55:%.*]] = and i8 [[TMP54]], 31
+; CHECK-NEXT: [[TMP78:%.*]] = or i8 [[TMP54]], [[TMP55]]
+; CHECK-NEXT: [[TMP56:%.*]] = extractelement <32 x i8> [[TMP9]], i64 23
+; CHECK-NEXT: [[TMP57:%.*]] = and i8 [[TMP56]], 31
+; CHECK-NEXT: [[TMP81:%.*]] = or i8 [[TMP56]], [[TMP57]]
+; CHECK-NEXT: [[TMP58:%.*]] = extractelement <32 x i8> [[TMP9]], i64 24
+; CHECK-NEXT: [[TMP59:%.*]] = and i8 [[TMP58]], 31
+; CHECK-NEXT: [[TMP84:%.*]] = or i8 [[TMP58]], [[TMP59]]
+; CHECK-NEXT: [[TMP60:%.*]] = extractelement <32 x i8> [[TMP9]], i64 25
+; CHECK-NEXT: [[TMP61:%.*]] = and i8 [[TMP60]], 31
+; CHECK-NEXT: [[TMP87:%.*]] = or i8 [[TMP60]], [[TMP61]]
+; CHECK-NEXT: [[TMP62:%.*]] = extractelement <32 x i8> [[TMP9]], i64 26
+; CHECK-NEXT: [[TMP63:%.*]] = and i8 [[TMP62]], 31
+; CHECK-NEXT: [[TMP90:%.*]] = or i8 [[TMP62]], [[TMP63]]
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <32 x i8> [[TMP9]], i64 27
+; CHECK-NEXT: [[TMP65:%.*]] = and i8 [[TMP64]], 31
+; CHECK-NEXT: [[TMP93:%.*]] = or i8 [[TMP64]], [[TMP65]]
+; CHECK-NEXT: [[TMP66:%.*]] = extractelement <32 x i8> [[TMP9]], i64 28
+; CHECK-NEXT: [[TMP67:%.*]] = and i8 [[TMP66]], 31
+; CHECK-NEXT: [[TMP96:%.*]] = or i8 [[TMP66]], [[TMP67]]
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <32 x i8> [[TMP9]], i64 29
+; CHECK-NEXT: [[TMP69:%.*]] = and i8 [[TMP68]], 31
+; CHECK-NEXT: [[TMP99:%.*]] = or i8 [[TMP68]], [[TMP69]]
+; CHECK-NEXT: [[TMP70:%.*]] = extractelement <32 x i8> [[TMP9]], i64 30
+; CHECK-NEXT: [[TMP71:%.*]] = and i8 [[TMP70]], 31
+; CHECK-NEXT: [[TMP102:%.*]] = or i8 [[TMP70]], [[TMP71]]
+; CHECK-NEXT: [[TMP72:%.*]] = extractelement <32 x i8> [[TMP9]], i64 31
+; CHECK-NEXT: [[TMP104:%.*]] = and i8 [[TMP72]], 31
+; CHECK-NEXT: [[TMP73:%.*]] = or i8 [[TMP72]], [[TMP104]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP6]], <32 x i8> [[T]], <32 x i8> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP60:%.*]] = icmp ne i8 [[TMP73]], 0
+; CHECK-NEXT: br i1 [[_MSCMP60]], label %[[BB107:.*]], label %[[BB108:.*]], !prof [[PROF1]]
+; CHECK: [[BB107]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB108]]:
; CHECK-NEXT: [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[T]], <32 x i8> [[X1]])
; CHECK-NEXT: store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <32 x i8> [[R]]
@@ -669,8 +1151,7 @@ define <64 x i8> @shuffle_vpermv3_v64i8(<64 x i8> %x0, <64 x i8> %x1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP1]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X1]])
; CHECK-NEXT: store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <64 x i8> [[R]]
@@ -684,8 +1165,7 @@ define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) #0 {
; CHECK-SAME: <64 x i8> [[X0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP1]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X0]])
; CHECK-NEXT: store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <64 x i8> [[R]]
@@ -708,8 +1188,205 @@ define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x
; CHECK-NEXT: [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[T:%.*]] = or <64 x i8> [[M]], <i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = and i8 [[TMP10]], 63
+; CHECK-NEXT: [[TMP139:%.*]] = or i8 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <64 x i8> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], 63
+; CHECK-NEXT: [[TMP140:%.*]] = or i8 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <64 x i8> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = and i8 [[TMP14]], 63
+; CHECK-NEXT: [[TMP142:%.*]] = or i8 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = and i8 [[TMP16]], 63
+; CHECK-NEXT: [[TMP143:%.*]] = or i8 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i8> [[TMP9]], i64 4
+; CHECK-NEXT: [[TMP19:%.*]] = and i8 [[TMP18]], 63
+; CHECK-NEXT: [[TMP145:%.*]] = or i8 [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <64 x i8> [[TMP9]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = and i8 [[TMP20]], 63
+; CHECK-NEXT: [[TMP146:%.*]] = or i8 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[TMP9]], i64 6
+; CHECK-NEXT: [[TMP23:%.*]] = and i8 [[TMP22]], 63
+; CHECK-NEXT: [[TMP148:%.*]] = or i8 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i8> [[TMP9]], i64 7
+; CHECK-NEXT: [[TMP25:%.*]] = and i8 [[TMP24]], 63
+; CHECK-NEXT: [[TMP149:%.*]] = or i8 [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <64 x i8> [[TMP9]], i64 8
+; CHECK-NEXT: [[TMP27:%.*]] = and i8 [[TMP26]], 63
+; CHECK-NEXT: [[TMP151:%.*]] = or i8 [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i8> [[TMP9]], i64 9
+; CHECK-NEXT: [[TMP29:%.*]] = and i8 [[TMP28]], 63
+; CHECK-NEXT: [[TMP152:%.*]] = or i8 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i8> [[TMP9]], i64 10
+; CHECK-NEXT: [[TMP31:%.*]] = and i8 [[TMP30]], 63
+; CHECK-NEXT: [[TMP154:%.*]] = or i8 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i8> [[TMP9]], i64 11
+; CHECK-NEXT: [[TMP33:%.*]] = and i8 [[TMP32]], 63
+; CHECK-NEXT: [[TMP155:%.*]] = or i8 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i8> [[TMP9]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = and i8 [[TMP34]], 63
+; CHECK-NEXT: [[TMP157:%.*]] = or i8 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i8> [[TMP9]], i64 13
+; CHECK-NEXT: [[TMP37:%.*]] = and i8 [[TMP36]], 63
+; CHECK-NEXT: [[TMP158:%.*]] = or i8 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i8> [[TMP9]], i64 14
+; CHECK-NEXT: [[TMP39:%.*]] = and i8 [[TMP38]], 63
+; CHECK-NEXT: [[TMP160:%.*]] = or i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i8> [[TMP9]], i64 15
+; CHECK-NEXT: [[TMP41:%.*]] = and i8 [[TMP40]], 63
+; CHECK-NEXT: [[TMP161:%.*]] = or i8 [[TMP40]], [[TMP41]]
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i8> [[TMP9]], i64 16
+; CHECK-NEXT: [[TMP43:%.*]] = and i8 [[TMP42]], 63
+; CHECK-NEXT: [[TMP163:%.*]] = or i8 [[TMP42]], [[TMP43]]
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i8> [[TMP9]], i64 17
+; CHECK-NEXT: [[TMP45:%.*]] = and i8 [[TMP44]], 63
+; CHECK-NEXT: [[TMP164:%.*]] = or i8 [[TMP44]], [[TMP45]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i8> [[TMP9]], i64 18
+; CHECK-NEXT: [[TMP47:%.*]] = and i8 [[TMP46]], 63
+; CHECK-NEXT: [[TMP166:%.*]] = or i8 [[TMP46]], [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i8> [[TMP9]], i64 19
+; CHECK-NEXT: [[TMP49:%.*]] = and i8 [[TMP48]], 63
+; CHECK-NEXT: [[TMP167:%.*]] = or i8 [[TMP48]], [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i8> [[TMP9]], i64 20
+; CHECK-NEXT: [[TMP51:%.*]] = and i8 [[TMP50]], 63
+; CHECK-NEXT: [[TMP169:%.*]] = or i8 [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i8> [[TMP9]], i64 21
+; CHECK-NEXT: [[TMP53:%.*]] = and i8 [[TMP52]], 63
+; CHECK-NEXT: [[TMP170:%.*]] = or i8 [[TMP52]], [[TMP53]]
+; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i8> [[TMP9]], i64 22
+; CHECK-NEXT: [[TMP55:%.*]] = and i8 [[TMP54]], 63
+; CHECK-NEXT: [[TMP172:%.*]] = or i8 [[TMP54]], [[TMP55]]
+; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i8> [[TMP9]], i64 23
+; CHECK-NEXT: [[TMP57:%.*]] = and i8 [[TMP56]], 63
+; CHECK-NEXT: [[TMP173:%.*]] = or i8 [[TMP56]], [[TMP57]]
+; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i8> [[TMP9]], i64 24
+; CHECK-NEXT: [[TMP59:%.*]] = and i8 [[TMP58]], 63
+; CHECK-NEXT: [[TMP175:%.*]] = or i8 [[TMP58]], [[TMP59]]
+; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i8> [[TMP9]], i64 25
+; CHECK-NEXT: [[TMP61:%.*]] = and i8 [[TMP60]], 63
+; CHECK-NEXT: [[TMP176:%.*]] = or i8 [[TMP60]], [[TMP61]]
+; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i8> [[TMP9]], i64 26
+; CHECK-NEXT: [[TMP63:%.*]] = and i8 [[TMP62]], 63
+; CHECK-NEXT: [[TMP178:%.*]] = or i8 [[TMP62]], [[TMP63]]
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i8> [[TMP9]], i64 27
+; CHECK-NEXT: [[TMP65:%.*]] = and i8 [[TMP64]], 63
+; CHECK-NEXT: [[TMP179:%.*]] = or i8 [[TMP64]], [[TMP65]]
+; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i8> [[TMP9]], i64 28
+; CHECK-NEXT: [[TMP67:%.*]] = and i8 [[TMP66]], 63
+; CHECK-NEXT: [[TMP181:%.*]] = or i8 [[TMP66]], [[TMP67]]
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i8> [[TMP9]], i64 29
+; CHECK-NEXT: [[TMP69:%.*]] = and i8 [[TMP68]], 63
+; CHECK-NEXT: [[TMP182:%.*]] = or i8 [[TMP68]], [[TMP69]]
+; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i8> [[TMP9]], i64 30
+; CHECK-NEXT: [[TMP71:%.*]] = and i8 [[TMP70]], 63
+; CHECK-NEXT: [[TMP184:%.*]] = or i8 [[TMP70]], [[TMP71]]
+; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i8> [[TMP9]], i64 31
+; CHECK-NEXT: [[TMP73:%.*]] = and i8 [[TMP72]], 63
+; CHECK-NEXT: [[TMP185:%.*]] = or i8 [[TMP72]], [[TMP73]]
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i8> [[TMP9]], i64 32
+; CHECK-NEXT: [[TMP75:%.*]] = and i8 [[TMP74]], 63
+; CHECK-NEXT: [[TMP187:%.*]] = or i8 [[TMP74]], [[TMP75]]
+; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i8> [[TMP9]], i64 33
+; CHECK-NEXT: [[TMP77:%.*]] = and i8 [[TMP76]], 63
+; CHECK-NEXT: [[TMP188:%.*]] = or i8 [[TMP76]], [[TMP77]]
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i8> [[TMP9]], i64 34
+; CHECK-NEXT: [[TMP79:%.*]] = and i8 [[TMP78]], 63
+; CHECK-NEXT: [[TMP190:%.*]] = or i8 [[TMP78]], [[TMP79]]
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i8> [[TMP9]], i64 35
+; CHECK-NEXT: [[TMP81:%.*]] = and i8 [[TMP80]], 63
+; CHECK-NEXT: [[TMP191:%.*]] = or i8 [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[TMP82:%.*]] = extractelement <64 x i8> [[TMP9]], i64 36
+; CHECK-NEXT: [[TMP83:%.*]] = and i8 [[TMP82]], 63
+; CHECK-NEXT: [[TMP193:%.*]] = or i8 [[TMP82]], [[TMP83]]
+; CHECK-NEXT: [[TMP84:%.*]] = extractelement <64 x i8> [[TMP9]], i64 37
+; CHECK-NEXT: [[TMP85:%.*]] = and i8 [[TMP84]], 63
+; CHECK-NEXT: [[TMP194:%.*]] = or i8 [[TMP84]], [[TMP85]]
+; CHECK-NEXT: [[TMP86:%.*]] = extractelement <64 x i8> [[TMP9]], i64 38
+; CHECK-NEXT: [[TMP87:%.*]] = and i8 [[TMP86]], 63
+; CHECK-NEXT: [[TMP196:%.*]] = or i8 [[TMP86]], [[TMP87]]
+; CHECK-NEXT: [[TMP88:%.*]] = extractelement <64 x i8> [[TMP9]], i64 39
+; CHECK-NEXT: [[TMP89:%.*]] = and i8 [[TMP88]], 63
+; CHECK-NEXT: [[TMP197:%.*]] = or i8 [[TMP88]], [[TMP89]]
+; CHECK-NEXT: [[TMP90:%.*]] = extractelement <64 x i8> [[TMP9]], i64 40
+; CHECK-NEXT: [[TMP91:%.*]] = and i8 [[TMP90]], 63
+; CHECK-NEXT: [[TMP199:%.*]] = or i8 [[TMP90]], [[TMP91]]
+; CHECK-NEXT: [[TMP92:%.*]] = extractelement <64 x i8> [[TMP9]], i64 41
+; CHECK-NEXT: [[TMP93:%.*]] = and i8 [[TMP92]], 63
+; CHECK-NEXT: [[TMP201:%.*]] = or i8 [[TMP92]], [[TMP93]]
+; CHECK-NEXT: [[TMP94:%.*]] = extractelement <64 x i8> [[TMP9]], i64 42
+; CHECK-NEXT: [[TMP95:%.*]] = and i8 [[TMP94]], 63
+; CHECK-NEXT: [[TMP138:%.*]] = or i8 [[TMP94]], [[TMP95]]
+; CHECK-NEXT: [[TMP96:%.*]] = extractelement <64 x i8> [[TMP9]], i64 43
+; CHECK-NEXT: [[TMP97:%.*]] = and i8 [[TMP96]], 63
+; CHECK-NEXT: [[TMP141:%.*]] = or i8 [[TMP96]], [[TMP97]]
+; CHECK-NEXT: [[TMP98:%.*]] = extractelement <64 x i8> [[TMP9]], i64 44
+; CHECK-NEXT: [[TMP99:%.*]] = and i8 [[TMP98]], 63
+; CHECK-NEXT: [[TMP144:%.*]] = or i8 [[TMP98]], [[TMP99]]
+; CHECK-NEXT: [[TMP100:%.*]] = extractelement <64 x i8> [[TMP9]], i64 45
+; CHECK-NEXT: [[TMP101:%.*]] = and i8 [[TMP100]], 63
+; CHECK-NEXT: [[TMP147:%.*]] = or i8 [[TMP100]], [[TMP101]]
+; CHECK-NEXT: [[TMP102:%.*]] = extractelement <64 x i8> [[TMP9]], i64 46
+; CHECK-NEXT: [[TMP103:%.*]] = and i8 [[TMP102]], 63
+; CHECK-NEXT: [[TMP150:%.*]] = or i8 [[TMP102]], [[TMP103]]
+; CHECK-NEXT: [[TMP104:%.*]] = extractelement <64 x i8> [[TMP9]], i64 47
+; CHECK-NEXT: [[TMP105:%.*]] = and i8 [[TMP104]], 63
+; CHECK-NEXT: [[TMP153:%.*]] = or i8 [[TMP104]], [[TMP105]]
+; CHECK-NEXT: [[TMP106:%.*]] = extractelement <64 x i8> [[TMP9]], i64 48
+; CHECK-NEXT: [[TMP107:%.*]] = and i8 [[TMP106]], 63
+; CHECK-NEXT: [[TMP156:%.*]] = or i8 [[TMP106]], [[TMP107]]
+; CHECK-NEXT: [[TMP108:%.*]] = extractelement <64 x i8> [[TMP9]], i64 49
+; CHECK-NEXT: [[TMP109:%.*]] = and i8 [[TMP108]], 63
+; CHECK-NEXT: [[TMP159:%.*]] = or i8 [[TMP108]], [[TMP109]]
+; CHECK-NEXT: [[TMP110:%.*]] = extractelement <64 x i8> [[TMP9]], i64 50
+; CHECK-NEXT: [[TMP111:%.*]] = and i8 [[TMP110]], 63
+; CHECK-NEXT: [[TMP162:%.*]] = or i8 [[TMP110]], [[TMP111]]
+; CHECK-NEXT: [[TMP112:%.*]] = extractelement <64 x i8> [[TMP9]], i64 51
+; CHECK-NEXT: [[TMP113:%.*]] = and i8 [[TMP112]], 63
+; CHECK-NEXT: [[TMP165:%.*]] = or i8 [[TMP112]], [[TMP113]]
+; CHECK-NEXT: [[TMP114:%.*]] = extractelement <64 x i8> [[TMP9]], i64 52
+; CHECK-NEXT: [[TMP115:%.*]] = and i8 [[TMP114]], 63
+; CHECK-NEXT: [[TMP168:%.*]] = or i8 [[TMP114]], [[TMP115]]
+; CHECK-NEXT: [[TMP116:%.*]] = extractelement <64 x i8> [[TMP9]], i64 53
+; CHECK-NEXT: [[TMP117:%.*]] = and i8 [[TMP116]], 63
+; CHECK-NEXT: [[TMP171:%.*]] = or i8 [[TMP116]], [[TMP117]]
+; CHECK-NEXT: [[TMP118:%.*]] = extractelement <64 x i8> [[TMP9]], i64 54
+; CHECK-NEXT: [[TMP119:%.*]] = and i8 [[TMP118]], 63
+; CHECK-NEXT: [[TMP174:%.*]] = or i8 [[TMP118]], [[TMP119]]
+; CHECK-NEXT: [[TMP120:%.*]] = extractelement <64 x i8> [[TMP9]], i64 55
+; CHECK-NEXT: [[TMP121:%.*]] = and i8 [[TMP120]], 63
+; CHECK-NEXT: [[TMP177:%.*]] = or i8 [[TMP120]], [[TMP121]]
+; CHECK-NEXT: [[TMP122:%.*]] = extractelement <64 x i8> [[TMP9]], i64 56
+; CHECK-NEXT: [[TMP123:%.*]] = and i8 [[TMP122]], 63
+; CHECK-NEXT: [[TMP180:%.*]] = or i8 [[TMP122]], [[TMP123]]
+; CHECK-NEXT: [[TMP124:%.*]] = extractelement <64 x i8> [[TMP9]], i64 57
+; CHECK-NEXT: [[TMP125:%.*]] = and i8 [[TMP124]], 63
+; CHECK-NEXT: [[TMP183:%.*]] = or i8 [[TMP124]], [[TMP125]]
+; CHECK-NEXT: [[TMP126:%.*]] = extractelement <64 x i8> [[TMP9]], i64 58
+; CHECK-NEXT: [[TMP127:%.*]] = and i8 [[TMP126]], 63
+; CHECK-NEXT: [[TMP186:%.*]] = or i8 [[TMP126]], [[TMP127]]
+; CHECK-NEXT: [[TMP128:%.*]] = extractelement <64 x i8> [[TMP9]], i64 59
+; CHECK-NEXT: [[TMP129:%.*]] = and i8 [[TMP128]], 63
+; CHECK-NEXT: [[TMP189:%.*]] = or i8 [[TMP128]], [[TMP129]]
+; CHECK-NEXT: [[TMP130:%.*]] = extractelement <64 x i8> [[TMP9]], i64 60
+; CHECK-NEXT: [[TMP131:%.*]] = and i8 [[TMP130]], 63
+; CHECK-NEXT: [[TMP192:%.*]] = or i8 [[TMP130]], [[TMP131]]
+; CHECK-NEXT: [[TMP132:%.*]] = extractelement <64 x i8> [[TMP9]], i64 61
+; CHECK-NEXT: [[TMP133:%.*]] = and i8 [[TMP132]], 63
+; CHECK-NEXT: [[TMP195:%.*]] = or i8 [[TMP132]], [[TMP133]]
+; CHECK-NEXT: [[TMP134:%.*]] = extractelement <64 x i8> [[TMP9]], i64 62
+; CHECK-NEXT: [[TMP135:%.*]] = and i8 [[TMP134]], 63
+; CHECK-NEXT: [[TMP198:%.*]] = or i8 [[TMP134]], [[TMP135]]
+; CHECK-NEXT: [[TMP136:%.*]] = extractelement <64 x i8> [[TMP9]], i64 63
+; CHECK-NEXT: [[TMP200:%.*]] = and i8 [[TMP136]], 63
+; CHECK-NEXT: [[TMP137:%.*]] = or i8 [[TMP136]], [[TMP200]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP6]], <64 x i8> [[T]], <64 x i8> [[TMP3]])
+; CHECK-NEXT: [[_MSCMP124:%.*]] = icmp ne i8 [[TMP137]], 0
+; CHECK-NEXT: br i1 [[_MSCMP124]], label %[[BB203:.*]], label %[[BB204:.*]], !prof [[PROF1]]
+; CHECK: [[BB203]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB204]]:
; CHECK-NEXT: [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[T]], <64 x i8> [[X1]])
; CHECK-NEXT: store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <64 x i8> [[R]]
@@ -720,3 +1397,6 @@ define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x
}
attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.