diff options
Diffstat (limited to 'llvm/test/Transforms/LowerMatrixIntrinsics')
4 files changed, 861 insertions, 8 deletions
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll new file mode 100644 index 0000000..d281905 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll @@ -0,0 +1,539 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:128:128' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR128 +; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64 +; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32 + +; REQUIRES: aarch64-registered-target + +; See the comment in `data-layout.ll` for an explanation. + +target triple = "aarch64-unknown-unknown" + +define void @multiply(ptr %A, ptr %B, ptr %C) { +; PTR128-LABEL: @multiply( +; PTR128-NEXT: entry: +; PTR128-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i128 +; PTR128-NEXT: [[STORE_END:%.*]] = add nuw nsw i128 [[STORE_BEGIN]], 128 +; PTR128-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i128 +; PTR128-NEXT: [[TMP0:%.*]] = icmp ugt i128 [[STORE_END]], [[LOAD_BEGIN]] +; PTR128-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]] +; PTR128: alias_cont: +; PTR128-NEXT: [[LOAD_END:%.*]] = add nuw nsw i128 [[LOAD_BEGIN]], 128 +; PTR128-NEXT: [[TMP1:%.*]] = icmp ugt i128 [[LOAD_END]], [[STORE_BEGIN]] +; PTR128-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]] +; PTR128: copy: +; PTR128-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8 +; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false) +; PTR128-NEXT: br label [[NO_ALIAS]] +; PTR128: no_alias: +; PTR128-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ] +; PTR128-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i128 +; PTR128-NEXT: [[STORE_END5:%.*]] = add nuw nsw i128 [[STORE_BEGIN4]], 128 +; PTR128-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i128 +; PTR128-NEXT: [[TMP4:%.*]] = icmp ugt i128 [[STORE_END5]], [[LOAD_BEGIN6]] +; PTR128-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]] +; PTR128: alias_cont1: +; PTR128-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i128 [[LOAD_BEGIN6]], 128 +; PTR128-NEXT: [[TMP5:%.*]] = icmp ugt i128 [[LOAD_END7]], [[STORE_BEGIN4]] +; PTR128-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]] +; PTR128: copy2: +; PTR128-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8 +; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false) +; PTR128-NEXT: br label [[NO_ALIAS3]] +; PTR128: no_alias3: +; PTR128-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ] +; PTR128-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 +; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32 +; PTR128-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 +; PTR128-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 +; PTR128-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32 +; PTR128-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8 +; PTR128-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] +; PTR128-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]]) +; PTR128-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]] +; PTR128-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]]) +; PTR128-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64 +; PTR128-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8 +; PTR128-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96 +; PTR128-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8 +; PTR128-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16 +; PTR128-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8 +; PTR128-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48 +; PTR128-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8 +; PTR128-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]]) +; PTR128-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]]) +; PTR128-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]]) +; PTR128-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]]) +; PTR128-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8 +; PTR128-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i128 32 +; PTR128-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8 +; PTR128-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16 +; PTR128-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8 +; PTR128-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48 +; PTR128-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8 +; PTR128-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 +; PTR128-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32 +; PTR128-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8 +; PTR128-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]] +; PTR128-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]]) +; PTR128-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]] +; PTR128-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]]) +; PTR128-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80 +; PTR128-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8 +; PTR128-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112 +; PTR128-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8 +; PTR128-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16 +; PTR128-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8 +; PTR128-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48 +; PTR128-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8 +; PTR128-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]]) +; PTR128-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]]) +; PTR128-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]]) +; PTR128-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]]) +; PTR128-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i128 16 +; PTR128-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8 +; PTR128-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i128 48 +; PTR128-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8 +; PTR128-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 +; PTR128-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32 +; PTR128-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8 +; PTR128-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64 +; PTR128-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8 +; PTR128-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96 +; PTR128-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8 +; PTR128-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]] +; PTR128-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]]) +; PTR128-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]] +; PTR128-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]]) +; PTR128-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64 +; PTR128-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8 +; PTR128-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96 +; PTR128-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8 +; PTR128-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80 +; PTR128-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8 +; PTR128-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112 +; PTR128-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8 +; PTR128-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]]) +; PTR128-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]]) +; PTR128-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]]) +; PTR128-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]]) +; PTR128-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i128 64 +; PTR128-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8 +; PTR128-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i128 96 +; PTR128-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8 +; PTR128-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16 +; PTR128-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8 +; PTR128-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48 +; PTR128-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8 +; PTR128-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64 +; PTR128-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8 +; PTR128-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96 +; PTR128-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8 +; PTR128-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]] +; PTR128-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]]) +; PTR128-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]] +; PTR128-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]]) +; PTR128-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80 +; PTR128-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8 +; PTR128-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112 +; PTR128-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8 +; PTR128-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80 +; PTR128-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8 +; PTR128-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112 +; PTR128-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8 +; PTR128-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]]) +; PTR128-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]]) +; PTR128-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR128-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]]) +; PTR128-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR128-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]]) +; PTR128-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i128 80 +; PTR128-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8 +; PTR128-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i128 112 +; PTR128-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8 +; PTR128-NEXT: ret void +; +; PTR64-LABEL: @multiply( +; PTR64-NEXT: entry: +; PTR64-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; PTR64-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 128 +; PTR64-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; PTR64-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]] +; PTR64-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]] +; PTR64: alias_cont: +; PTR64-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 128 +; PTR64-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]] +; PTR64-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]] +; PTR64: copy: +; PTR64-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8 +; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false) +; PTR64-NEXT: br label [[NO_ALIAS]] +; PTR64: no_alias: +; PTR64-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ] +; PTR64-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i64 +; PTR64-NEXT: [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 128 +; PTR64-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i64 +; PTR64-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]] +; PTR64-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]] +; PTR64: alias_cont1: +; PTR64-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 128 +; PTR64-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]] +; PTR64-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]] +; PTR64: copy2: +; PTR64-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8 +; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false) +; PTR64-NEXT: br label [[NO_ALIAS3]] +; PTR64: no_alias3: +; PTR64-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ] +; PTR64-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 +; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32 +; PTR64-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 +; PTR64-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 +; PTR64-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32 +; PTR64-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8 +; PTR64-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] +; PTR64-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]]) +; PTR64-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]] +; PTR64-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]]) +; PTR64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64 +; PTR64-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8 +; PTR64-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96 +; PTR64-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8 +; PTR64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16 +; PTR64-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8 +; PTR64-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48 +; PTR64-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8 +; PTR64-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]]) +; PTR64-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]]) +; PTR64-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]]) +; PTR64-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]]) +; PTR64-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8 +; PTR64-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i64 32 +; PTR64-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8 +; PTR64-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 +; PTR64-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8 +; PTR64-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48 +; PTR64-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8 +; PTR64-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 +; PTR64-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32 +; PTR64-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8 +; PTR64-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]] +; PTR64-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]]) +; PTR64-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]] +; PTR64-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]]) +; PTR64-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80 +; PTR64-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8 +; PTR64-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112 +; PTR64-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8 +; PTR64-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16 +; PTR64-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8 +; PTR64-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48 +; PTR64-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8 +; PTR64-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]]) +; PTR64-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]]) +; PTR64-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]]) +; PTR64-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]]) +; PTR64-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i64 16 +; PTR64-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8 +; PTR64-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i64 48 +; PTR64-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8 +; PTR64-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 +; PTR64-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32 +; PTR64-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8 +; PTR64-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64 +; PTR64-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8 +; PTR64-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96 +; PTR64-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8 +; PTR64-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]] +; PTR64-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]]) +; PTR64-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]] +; PTR64-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]]) +; PTR64-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64 +; PTR64-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8 +; PTR64-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96 +; PTR64-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8 +; PTR64-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80 +; PTR64-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8 +; PTR64-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112 +; PTR64-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8 +; PTR64-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]]) +; PTR64-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]]) +; PTR64-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]]) +; PTR64-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]]) +; PTR64-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i64 64 +; PTR64-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8 +; PTR64-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i64 96 +; PTR64-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8 +; PTR64-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 +; PTR64-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8 +; PTR64-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48 +; PTR64-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8 +; PTR64-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64 +; PTR64-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8 +; PTR64-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96 +; PTR64-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8 +; PTR64-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]] +; PTR64-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]]) +; PTR64-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]] +; PTR64-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]]) +; PTR64-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80 +; PTR64-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8 +; PTR64-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112 +; PTR64-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8 +; PTR64-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80 +; PTR64-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8 +; PTR64-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112 +; PTR64-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8 +; PTR64-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]]) +; PTR64-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]]) +; PTR64-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR64-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]]) +; PTR64-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR64-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]]) +; PTR64-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i64 80 +; PTR64-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8 +; PTR64-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112 +; PTR64-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8 +; PTR64-NEXT: ret void +; +; PTR32-LABEL: @multiply( +; PTR32-NEXT: entry: +; PTR32-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i32 +; PTR32-NEXT: [[STORE_END:%.*]] = add nuw nsw i32 [[STORE_BEGIN]], 128 +; PTR32-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i32 +; PTR32-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[STORE_END]], [[LOAD_BEGIN]] +; PTR32-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]] +; PTR32: alias_cont: +; PTR32-NEXT: [[LOAD_END:%.*]] = add nuw nsw i32 [[LOAD_BEGIN]], 128 +; PTR32-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[LOAD_END]], [[STORE_BEGIN]] +; PTR32-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]] +; PTR32: copy: +; PTR32-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8 +; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false) +; PTR32-NEXT: br label [[NO_ALIAS]] +; PTR32: no_alias: +; PTR32-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ] +; PTR32-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i32 +; PTR32-NEXT: [[STORE_END5:%.*]] = add nuw nsw i32 [[STORE_BEGIN4]], 128 +; PTR32-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i32 +; PTR32-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[STORE_END5]], [[LOAD_BEGIN6]] +; PTR32-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]] +; PTR32: alias_cont1: +; PTR32-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i32 [[LOAD_BEGIN6]], 128 +; PTR32-NEXT: [[TMP5:%.*]] = icmp ugt i32 [[LOAD_END7]], [[STORE_BEGIN4]] +; PTR32-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]] +; PTR32: copy2: +; PTR32-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8 +; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false) +; PTR32-NEXT: br label [[NO_ALIAS3]] +; PTR32: no_alias3: +; PTR32-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ] +; PTR32-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 +; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32 +; PTR32-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 +; PTR32-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 +; PTR32-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32 +; PTR32-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8 +; PTR32-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] +; PTR32-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]]) +; PTR32-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]] +; PTR32-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]]) +; PTR32-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64 +; PTR32-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8 +; PTR32-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96 +; PTR32-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8 +; PTR32-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16 +; PTR32-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8 +; PTR32-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48 +; PTR32-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8 +; PTR32-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]]) +; PTR32-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]]) +; PTR32-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]]) +; PTR32-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]]) +; PTR32-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8 +; PTR32-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i32 32 +; PTR32-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8 +; PTR32-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16 +; PTR32-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8 +; PTR32-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48 +; PTR32-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8 +; PTR32-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 +; PTR32-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32 +; PTR32-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8 +; PTR32-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]] +; PTR32-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]]) +; PTR32-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]] +; PTR32-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]]) +; PTR32-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80 +; PTR32-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8 +; PTR32-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112 +; PTR32-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8 +; PTR32-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16 +; PTR32-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8 +; PTR32-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48 +; PTR32-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8 +; PTR32-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]]) +; PTR32-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]]) +; PTR32-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]]) +; PTR32-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]]) +; PTR32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i32 16 +; PTR32-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8 +; PTR32-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i32 48 +; PTR32-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8 +; PTR32-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 +; PTR32-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32 +; PTR32-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8 +; PTR32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64 +; PTR32-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8 +; PTR32-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96 +; PTR32-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8 +; PTR32-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]] +; PTR32-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]]) +; PTR32-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]] +; PTR32-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]]) +; PTR32-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64 +; PTR32-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8 +; PTR32-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96 +; PTR32-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8 +; PTR32-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80 +; PTR32-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8 +; PTR32-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112 +; PTR32-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8 +; PTR32-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]]) +; PTR32-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]]) +; PTR32-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]]) +; PTR32-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]]) +; PTR32-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i32 64 +; PTR32-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8 +; PTR32-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i32 96 +; PTR32-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8 +; PTR32-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16 +; PTR32-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8 +; PTR32-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48 +; PTR32-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8 +; PTR32-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64 +; PTR32-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8 +; PTR32-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96 +; PTR32-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8 +; PTR32-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]] +; PTR32-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]]) +; PTR32-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]] +; PTR32-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]]) +; PTR32-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80 +; PTR32-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8 +; PTR32-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112 +; PTR32-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8 +; PTR32-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80 +; PTR32-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8 +; PTR32-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112 +; PTR32-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8 +; PTR32-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]]) +; PTR32-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]]) +; PTR32-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer +; PTR32-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]]) +; PTR32-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1> +; PTR32-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]]) +; PTR32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i32 80 +; PTR32-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8 +; PTR32-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i32 112 +; PTR32-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8 +; PTR32-NEXT: ret void +; +entry: + %a = load <16 x double>, ptr %A, align 8 + %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %a, i32 4, i32 4, i32 4) + store <16 x double> %c, ptr %C, align 8 + ret void +} + +declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll new file mode 100644 index 0000000..87def6b --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll @@ -0,0 +1,312 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:128:128' -S < %s | FileCheck %s --check-prefix=PTR128 +; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:64:64' -S < %s | FileCheck %s --check-prefix=PTR64 +; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:32:32' -S < %s | FileCheck %s --check-prefix=PTR32 + +; To properly support the matrix intrinsics on, e.g., 32-bit platforms (without +; the need to emit `libc` calls), we perform strided index calculations using +; the same pointer bit-width as the matrix pointers, as determined by the data +; layout. To verify this behaviour, this test runs several strided loads and +; stores through the lowering pass with (32|64|128)-bit pointers, and verifies +; the generated code extends / truncates strides accordingly. Similarly, +; `data-layout-multiply-fused.ll` adopts this approach to verify the same +; behaviour for index calculations emitted while lowering fused matrix +; multiplies. + +define <9 x double> @strided_load_3x3_i128(ptr %in, i128 %stride) { +; PTR128-LABEL: @strided_load_3x3_i128( +; PTR128-NEXT: entry: +; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE:%.*]] +; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]] +; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE]] +; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]] +; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE]] +; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]] +; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 +; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR128-NEXT: ret <9 x double> [[TMP2]] +; +; PTR64-LABEL: @strided_load_3x3_i128( +; PTR64-NEXT: entry: +; PTR64-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i64 +; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]] +; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]] +; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]] +; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]] +; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]] +; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]] +; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 +; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR64-NEXT: ret <9 x double> [[TMP2]] +; +; PTR32-LABEL: @strided_load_3x3_i128( +; PTR32-NEXT: entry: +; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i32 +; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]] +; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]] +; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]] +; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]] +; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]] +; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]] +; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 +; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR32-NEXT: ret <9 x double> [[TMP2]] +; +entry: + %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 %stride, i1 false, i32 3, i32 3) + ret <9 x double> %load +} + +define <9 x double> @strided_load_3x3_const_stride_i128(ptr %in) { +; PTR128-LABEL: @strided_load_3x3_const_stride_i128( +; PTR128-NEXT: entry: +; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 +; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16 +; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32 +; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR128-NEXT: ret <9 x double> [[TMP2]] +; +; PTR64-LABEL: @strided_load_3x3_const_stride_i128( +; PTR64-NEXT: entry: +; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 +; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16 +; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32 +; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR64-NEXT: ret <9 x double> [[TMP2]] +; +; PTR32-LABEL: @strided_load_3x3_const_stride_i128( +; PTR32-NEXT: entry: +; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 +; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16 +; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32 +; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR32-NEXT: ret <9 x double> [[TMP2]] +; +entry: + %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 16, i1 false, i32 3, i32 3) + ret <9 x double> %load +} + +define <9 x double> @strided_load_3x3_i64(ptr %in, i64 %stride) { +; PTR128-LABEL: @strided_load_3x3_i64( +; PTR128-NEXT: entry: +; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i64 [[STRIDE:%.*]] to i128 +; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]] +; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]] +; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]] +; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]] +; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]] +; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]] +; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 +; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR128-NEXT: ret <9 x double> [[TMP2]] +; +; PTR64-LABEL: @strided_load_3x3_i64( +; PTR64-NEXT: entry: +; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]] +; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]] +; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]] +; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]] +; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]] +; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]] +; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 +; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR64-NEXT: ret <9 x double> [[TMP2]] +; +; PTR32-LABEL: @strided_load_3x3_i64( +; PTR32-NEXT: entry: +; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i64 [[STRIDE:%.*]] to i32 +; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]] +; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]] +; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]] +; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]] +; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]] +; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]] +; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 +; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR32-NEXT: ret <9 x double> [[TMP2]] +; +entry: + %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 %stride, i1 false, i32 3, i32 3) + ret <9 x double> %load +} + +define <9 x double> @strided_load_3x3_const_stride_i64(ptr %in) { +; PTR128-LABEL: @strided_load_3x3_const_stride_i64( +; PTR128-NEXT: entry: +; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 +; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16 +; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32 +; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR128-NEXT: ret <9 x double> [[TMP2]] +; +; PTR64-LABEL: @strided_load_3x3_const_stride_i64( +; PTR64-NEXT: entry: +; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 +; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16 +; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32 +; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR64-NEXT: ret <9 x double> [[TMP2]] +; +; PTR32-LABEL: @strided_load_3x3_const_stride_i64( +; PTR32-NEXT: entry: +; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 +; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16 +; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32 +; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR32-NEXT: ret <9 x double> [[TMP2]] +; +entry: + %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 16, i1 false, i32 3, i32 3) + ret <9 x double> %load +} + +define <9 x double> @strided_load_3x3_i32(ptr %in, i32 %stride) { +; PTR128-LABEL: @strided_load_3x3_i32( +; PTR128-NEXT: entry: +; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i128 +; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]] +; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]] +; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]] +; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]] +; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]] +; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]] +; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 +; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR128-NEXT: ret <9 x double> [[TMP2]] +; +; PTR64-LABEL: @strided_load_3x3_i32( +; PTR64-NEXT: entry: +; PTR64-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64 +; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]] +; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]] +; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]] +; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]] +; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]] +; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]] +; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 +; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR64-NEXT: ret <9 x double> [[TMP2]] +; +; PTR32-LABEL: @strided_load_3x3_i32( +; PTR32-NEXT: entry: +; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] +; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]] +; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] +; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]] +; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE]] +; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]] +; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8 +; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR32-NEXT: ret <9 x double> [[TMP2]] +; +entry: + %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 %stride, i1 false, i32 3, i32 3) + ret <9 x double> %load +} + +define <9 x double> @strided_load_3x3_const_stride_i32(ptr %in) { +; PTR128-LABEL: @strided_load_3x3_const_stride_i32( +; PTR128-NEXT: entry: +; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 +; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16 +; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32 +; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR128-NEXT: ret <9 x double> [[TMP2]] +; +; PTR64-LABEL: @strided_load_3x3_const_stride_i32( +; PTR64-NEXT: entry: +; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 +; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16 +; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32 +; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR64-NEXT: ret <9 x double> [[TMP2]] +; +; PTR32-LABEL: @strided_load_3x3_const_stride_i32( +; PTR32-NEXT: entry: +; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8 +; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16 +; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8 +; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32 +; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8 +; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> +; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison> +; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> +; PTR32-NEXT: ret <9 x double> [[TMP2]] +; +entry: + %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 16, i1 false, i32 3, i32 3) + ret <9 x double> %load +} + +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr, i128, i1, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr, i64, i1, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll index ae7da19..abc4705 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll @@ -62,11 +62,12 @@ declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, define <8 x double> @strided_load_4x2_stride_i32(ptr %in, i32 %stride) { ; CHECK-LABEL: @strided_load_4x2_stride_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64 +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]] ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]] +; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]] +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]] ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, ptr [[VEC_GEP2]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: ret <8 x double> [[TMP0]] diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll index 28e9cdb..81b8507 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll @@ -34,11 +34,12 @@ define void @strided_store_3x2_nonconst_i32_stride(<6 x double> %in, i32 %stride ; CHECK-LABEL: @strided_store_3x2_nonconst_i32_stride( ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> poison, <3 x i32> <i32 0, i32 1, i32 2> ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> poison, <3 x i32> <i32 3, i32 4, i32 5> -; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]] -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i32 [[VEC_START]] +; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64 +; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]] +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[VEC_START]] ; CHECK-NEXT: store <3 x double> [[SPLIT]], ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]] -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i32 [[VEC_START2]] +; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE_CAST]] +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i64 [[VEC_START2]] ; CHECK-NEXT: store <3 x double> [[SPLIT1]], ptr [[VEC_GEP3]], align 8 ; CHECK-NEXT: ret void ; |