Diffstat (limited to 'llvm/test/Transforms')
96 files changed, 6358 insertions, 3343 deletions
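The new AggressiveInstCombine tests below exercise a store-merging fold: a chain of narrow stores that each write a truncated, shifted slice of one value to consecutive byte offsets is rewritten into a single wide store, provided the slices land in the order the target's endianness expects. A minimal before/after sketch in LLVM IR, distilled from the @test_i16 case in store-merge.ll (the @merge_demo name is illustrative; the merged form and its align 1 come from that test's CHECK lines):

define void @merge_demo(i16 %x, ptr %p) {
  %lo = trunc i16 %x to i8
  store i8 %lo, ptr %p                     ; byte 0: low 8 bits of %x
  %shr = lshr i16 %x, 8
  %hi = trunc i16 %shr to i8
  %gep = getelementptr i8, ptr %p, i64 1
  store i8 %hi, ptr %gep                   ; byte 1: high 8 bits of %x
  ret void
}
; After opt -passes=aggressive-instcombine on little-endian x86_64,
; the two i8 stores collapse into one wide store:
;   store i16 %x, ptr %p, align 1

store-merge-be.ll pins down the limitation called out in its FIXME: under a big-endian data layout the equivalent byte pattern is currently left unmerged. The remaining cases in store-merge.ll probe the expected blockers: intervening aliasing loads and stores, may-unwind calls, volatile and atomic stores, gaps between slices, non-byte-sized parts, and offsets that do not match the target's byte order.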
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll new file mode 100644 index 0000000..34f3924 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu -data-layout="E-n64" < %s | FileCheck %s + +; Pretend X86 is big endian. + +; FIXME: Big endian not supported yet. + +define void @test_i32_be(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_be( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[TMP1]] to i8 +; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + %gep.0 = getelementptr i8, ptr %p, i64 3 + store i8 %x.0, ptr %gep.0 + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 1 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + store i8 %x.3, ptr %p + ret void +} + +define void @test_i32_le(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_le( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + ret void +} + +define void @test_i32_mixed_parts(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_mixed_parts( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: 
[[X_0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i16 [[X_1]], ptr [[GEP_1]], align 2 +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8 +; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + %gep.0 = getelementptr i8, ptr %p, i64 3 + store i8 %x.0, ptr %gep.0 + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i16 %x.1, ptr %gep.1 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + store i8 %x.3, ptr %p + ret void +} diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll new file mode 100644 index 0000000..56786d0 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll @@ -0,0 +1,901 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +declare void @use.i16(i16) +declare void @use.i32(i32) + +define void @test_i16(i16 %x, ptr %p) { +; CHECK-LABEL: define void @test_i16( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_i8_parts(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_i8_parts( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + ret void +} + +define void @test_i32_i16_parts(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_i16_parts( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_mixed_parts(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_mixed_parts( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i16 %x.1, ptr %gep.1 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + ret void +} + +define void @test_i64(i64 %x, ptr %p) { +; 
CHECK-LABEL: define void @test_i64( +; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i64 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i64 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i64 %x, 8 + %x.1 = trunc i64 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i64 %x, 16 + %x.2 = trunc i64 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i64 %x, 24 + %x.3 = trunc i64 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + %shr.4 = lshr i64 %x, 32 + %x.4 = trunc i64 %shr.4 to i8 + %gep.4 = getelementptr i8, ptr %p, i64 4 + store i8 %x.4, ptr %gep.4 + %shr.5 = lshr i64 %x, 40 + %x.5 = trunc i64 %shr.5 to i8 + %gep.5 = getelementptr i8, ptr %p, i64 5 + store i8 %x.5, ptr %gep.5 + %shr.6 = lshr i64 %x, 48 + %x.6 = trunc i64 %shr.6 to i8 + %gep.6 = getelementptr i8, ptr %p, i64 6 + store i8 %x.6, ptr %gep.6 + %shr.7 = lshr i64 %x, 56 + %x.7 = trunc i64 %shr.7 to i8 + %gep.7 = getelementptr i8, ptr %p, i64 7 + store i8 %x.7, ptr %gep.7 + ret void +} + +define void @test_i128(i128 %x, ptr %p) { +; CHECK-LABEL: define void @test_i128( +; CHECK-SAME: i128 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i128 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i128 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i128 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i128 [[X]], 16 +; CHECK-NEXT: [[X_2:%.*]] = trunc i128 [[SHR_2]] to i8 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i128 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i128 [[SHR_3]] to i8 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1 +; CHECK-NEXT: [[SHR_4:%.*]] = lshr i128 [[X]], 32 +; CHECK-NEXT: [[X_4:%.*]] = trunc i128 [[SHR_4]] to i8 +; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[P]], i64 4 +; CHECK-NEXT: store i8 [[X_4]], ptr [[GEP_4]], align 1 +; CHECK-NEXT: [[SHR_5:%.*]] = lshr i128 [[X]], 40 +; CHECK-NEXT: [[X_5:%.*]] = trunc i128 [[SHR_5]] to i8 +; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr i8, ptr [[P]], i64 5 +; CHECK-NEXT: store i8 [[X_5]], ptr [[GEP_5]], align 1 +; CHECK-NEXT: [[SHR_6:%.*]] = lshr i128 [[X]], 48 +; CHECK-NEXT: [[X_6:%.*]] = trunc i128 [[SHR_6]] to i8 +; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr i8, ptr [[P]], i64 6 +; CHECK-NEXT: store i8 [[X_6]], ptr [[GEP_6]], align 1 +; CHECK-NEXT: [[SHR_7:%.*]] = lshr i128 [[X]], 56 +; CHECK-NEXT: [[X_7:%.*]] = trunc i128 [[SHR_7]] to i8 +; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr i8, ptr [[P]], i64 7 +; CHECK-NEXT: store i8 [[X_7]], ptr [[GEP_7]], align 1 +; CHECK-NEXT: [[SHR_8:%.*]] = lshr i128 [[X]], 64 +; CHECK-NEXT: [[X_8:%.*]] = trunc i128 [[SHR_8]] to i8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i8 [[X_8]], ptr [[GEP_8]], align 1 +; CHECK-NEXT: [[SHR_9:%.*]] = lshr i128 [[X]], 72 +; CHECK-NEXT: [[X_9:%.*]] = trunc i128 [[SHR_9]] to i8 +; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr i8, ptr [[P]], i64 9 +; CHECK-NEXT: store i8 [[X_9]], ptr [[GEP_9]], align 1 +; CHECK-NEXT: [[SHR_10:%.*]] = lshr i128 [[X]], 80 +; CHECK-NEXT: [[X_10:%.*]] = trunc i128 [[SHR_10]] to i8 +; 
CHECK-NEXT: [[GEP_10:%.*]] = getelementptr i8, ptr [[P]], i64 10 +; CHECK-NEXT: store i8 [[X_10]], ptr [[GEP_10]], align 1 +; CHECK-NEXT: [[SHR_11:%.*]] = lshr i128 [[X]], 88 +; CHECK-NEXT: [[X_11:%.*]] = trunc i128 [[SHR_11]] to i8 +; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr i8, ptr [[P]], i64 11 +; CHECK-NEXT: store i8 [[X_11]], ptr [[GEP_11]], align 1 +; CHECK-NEXT: [[SHR_12:%.*]] = lshr i128 [[X]], 96 +; CHECK-NEXT: [[X_12:%.*]] = trunc i128 [[SHR_12]] to i8 +; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr i8, ptr [[P]], i64 12 +; CHECK-NEXT: store i8 [[X_12]], ptr [[GEP_12]], align 1 +; CHECK-NEXT: [[SHR_13:%.*]] = lshr i128 [[X]], 104 +; CHECK-NEXT: [[X_13:%.*]] = trunc i128 [[SHR_13]] to i8 +; CHECK-NEXT: [[GEP_13:%.*]] = getelementptr i8, ptr [[P]], i64 13 +; CHECK-NEXT: store i8 [[X_13]], ptr [[GEP_13]], align 1 +; CHECK-NEXT: [[SHR_14:%.*]] = lshr i128 [[X]], 112 +; CHECK-NEXT: [[X_14:%.*]] = trunc i128 [[SHR_14]] to i8 +; CHECK-NEXT: [[GEP_14:%.*]] = getelementptr i8, ptr [[P]], i64 14 +; CHECK-NEXT: store i8 [[X_14]], ptr [[GEP_14]], align 1 +; CHECK-NEXT: [[SHR_15:%.*]] = lshr i128 [[X]], 120 +; CHECK-NEXT: [[X_15:%.*]] = trunc i128 [[SHR_15]] to i8 +; CHECK-NEXT: [[GEP_15:%.*]] = getelementptr i8, ptr [[P]], i64 15 +; CHECK-NEXT: store i8 [[X_15]], ptr [[GEP_15]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i128 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i128 %x, 8 + %x.1 = trunc i128 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i128 %x, 16 + %x.2 = trunc i128 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i128 %x, 24 + %x.3 = trunc i128 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + %shr.4 = lshr i128 %x, 32 + %x.4 = trunc i128 %shr.4 to i8 + %gep.4 = getelementptr i8, ptr %p, i64 4 + store i8 %x.4, ptr %gep.4 + %shr.5 = lshr i128 %x, 40 + %x.5 = trunc i128 %shr.5 to i8 + %gep.5 = getelementptr i8, ptr %p, i64 5 + store i8 %x.5, ptr %gep.5 + %shr.6 = lshr i128 %x, 48 + %x.6 = trunc i128 %shr.6 to i8 + %gep.6 = getelementptr i8, ptr %p, i64 6 + store i8 %x.6, ptr %gep.6 + %shr.7 = lshr i128 %x, 56 + %x.7 = trunc i128 %shr.7 to i8 + %gep.7 = getelementptr i8, ptr %p, i64 7 + store i8 %x.7, ptr %gep.7 + %shr.8 = lshr i128 %x, 64 + %x.8 = trunc i128 %shr.8 to i8 + %gep.8 = getelementptr i8, ptr %p, i64 8 + store i8 %x.8, ptr %gep.8 + %shr.9 = lshr i128 %x, 72 + %x.9 = trunc i128 %shr.9 to i8 + %gep.9 = getelementptr i8, ptr %p, i64 9 + store i8 %x.9, ptr %gep.9 + %shr.10 = lshr i128 %x, 80 + %x.10 = trunc i128 %shr.10 to i8 + %gep.10 = getelementptr i8, ptr %p, i64 10 + store i8 %x.10, ptr %gep.10 + %shr.11 = lshr i128 %x, 88 + %x.11 = trunc i128 %shr.11 to i8 + %gep.11 = getelementptr i8, ptr %p, i64 11 + store i8 %x.11, ptr %gep.11 + %shr.12 = lshr i128 %x, 96 + %x.12 = trunc i128 %shr.12 to i8 + %gep.12 = getelementptr i8, ptr %p, i64 12 + store i8 %x.12, ptr %gep.12 + %shr.13 = lshr i128 %x, 104 + %x.13 = trunc i128 %shr.13 to i8 + %gep.13 = getelementptr i8, ptr %p, i64 13 + store i8 %x.13, ptr %gep.13 + %shr.14 = lshr i128 %x, 112 + %x.14 = trunc i128 %shr.14 to i8 + %gep.14 = getelementptr i8, ptr %p, i64 14 + store i8 %x.14, ptr %gep.14 + %shr.15 = lshr i128 %x, 120 + %x.15 = trunc i128 %shr.15 to i8 + %gep.15 = getelementptr i8, ptr %p, i64 15 + store i8 %x.15, ptr %gep.15 + ret void +} + +define void @test_i32_lo(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_lo( +; CHECK-SAME: i32 [[X:%.*]], ptr 
[[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_hi(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_hi( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_0]] to i16 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 16 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 24 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_mid(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_mid( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 10 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 10 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 18 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_shift_in_zeros(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_shift_in_zeros( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 20 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_0]] to i16 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 20 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 28 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_base_ptr_with_offset(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_base_ptr_with_offset( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 7 +; CHECK-NEXT: store i32 [[X]], ptr [[TMP1]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + %gep.0 = getelementptr i8, ptr %p, i64 7 + store i16 %x.0, ptr %gep.0 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 9 + store i16 %x.1, ptr %gep.1 + ret void +} + +define void @test_aliasing_store(i16 %x, ptr %p, ptr %p2) { +; CHECK-LABEL: define void @test_aliasing_store( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: store i8 0, ptr [[P2]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + store i8 0, ptr %p2 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_non_aliasing_store(i16 %x, ptr noalias %p, ptr noalias %p2) { +; CHECK-LABEL: define void @test_non_aliasing_store( +; CHECK-SAME: i16 [[X:%.*]], ptr noalias 
[[P:%.*]], ptr noalias [[P2:%.*]]) { +; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: store i8 0, ptr [[P2]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + store i8 0, ptr %p2 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define i8 @test_aliasing_load(i16 %x, ptr %p, ptr %p2) { +; CHECK-LABEL: define i8 @test_aliasing_load( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[P2]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret i8 [[V]] +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + %v = load i8, ptr %p2 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret i8 %v +} + +define i8 @test_non_aliasing_load(i16 %x, ptr noalias %p, ptr noalias %p2) { +; CHECK-LABEL: define i8 @test_non_aliasing_load( +; CHECK-SAME: i16 [[X:%.*]], ptr noalias [[P:%.*]], ptr noalias [[P2:%.*]]) { +; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[P2]], align 1 +; CHECK-NEXT: ret i8 [[V]] +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + %v = load i8, ptr %p2 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret i8 %v +} + +define i8 @test_aliasing_load_partially_mergeable(i32 %x, ptr %p, ptr %p2) { +; CHECK-LABEL: define i8 @test_aliasing_load_partially_mergeable( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1 +; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: store i16 [[TMP3]], ptr [[TMP4]], align 1 +; CHECK-NEXT: ret i8 [[V]] +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %v = load i8, ptr %p2 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + ret i8 %v +} + +declare void @may_unwind() memory(none) + +define void @test_unwind(i16 %x, ptr %p, ptr %p2) { +; CHECK-LABEL: define void @test_unwind( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i16 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: call void @may_unwind() +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i16 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i16 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p + call void @may_unwind() + %shr.1 = 
lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_multi_group(i16 %x, ptr %p1, i16 %y, ptr %p2) { +; CHECK-LABEL: define void @test_multi_group( +; CHECK-SAME: i16 [[X:%.*]], ptr [[P1:%.*]], i16 [[Y:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: store i16 [[X]], ptr [[P1]], align 1 +; CHECK-NEXT: call void @may_unwind() +; CHECK-NEXT: store i16 [[Y]], ptr [[P2]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p1 + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p1, i64 1 + store i8 %x.1, ptr %gep.1 + call void @may_unwind() + %y.0 = trunc i16 %y to i8 + store i8 %y.0, ptr %p2 + %shr.2 = lshr i16 %y, 8 + %y.1 = trunc i16 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p2, i64 1 + store i8 %y.1, ptr %gep.2 + ret void +} + +define void @test_stores_out_of_order(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_stores_out_of_order( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.2, ptr %gep.2 + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_gap(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_gap( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 7 +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[SHR_0]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 7 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_non_byte_sized(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_non_byte_sized( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i15 +; CHECK-NEXT: store i15 [[X_0]], ptr [[P]], align 2 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 15 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i17 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i17 [[X_1]], ptr [[GEP_1]], align 4 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i15 + store i15 %x.0, ptr %p + %shr.1 = lshr i32 %x, 15 + %x.1 = trunc i32 %shr.1 to i17 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i17 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_wrong_ptr_offset(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_wrong_ptr_offset( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[SHR_0]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = 
getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 8 + %x.0 = trunc i32 %shr.0 to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_wrong_endian(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_wrong_endian( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8 +; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + %gep.0 = getelementptr i8, ptr %p, i64 3 + store i8 %x.0, ptr %gep.0 + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i8 %x.1, ptr %gep.1 + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 1 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + store i8 %x.3, ptr %p + ret void +} + +define void @test_i32_volatile(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_volatile( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[TMP1]] to i8 +; CHECK-NEXT: store volatile i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 8 + %x.0 = trunc i32 %shr.0 to i8 + store volatile i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_atomic(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_atomic( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 8 +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[SHR_0]] to i8 +; CHECK-NEXT: store atomic i8 [[X_0]], ptr [[P]] monotonic, align 1 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1 +; CHECK-NEXT: ret void +; + %shr.0 = lshr i32 %x, 8 + %x.0 = trunc i32 %shr.0 to i8 + store atomic i8 %x.0, ptr %p monotonic, align 1 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + ret void +} + +define void @test_i32_multiple_pointers(i32 %x, i32 %y, ptr %p, ptr %p2) { +; CHECK-LABEL: define void @test_i32_multiple_pointers( +; CHECK-SAME: i32 [[X:%.*]], i32 
[[Y:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2 +; CHECK-NEXT: store i32 [[Y]], ptr [[P2]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1 + + %y.0 = trunc i32 %y to i16 + store i16 %y.0, ptr %p2 + %y.shr.1 = lshr i32 %y, 16 + %y.1 = trunc i32 %y.shr.1 to i16 + %p2.gep.1 = getelementptr i8, ptr %p2, i64 2 + store i16 %y.1, ptr %p2.gep.1 + ret void +} + +define void @test_i32_multiple_pointers_interleaved(i32 %x, i32 %y, ptr noalias %p, ptr noalias %p2) { +; CHECK-LABEL: define void @test_i32_multiple_pointers_interleaved( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr noalias [[P:%.*]], ptr noalias [[P2:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i16 [[X_0]], ptr [[P]], align 2 +; CHECK-NEXT: [[Y_0:%.*]] = trunc i32 [[Y]] to i16 +; CHECK-NEXT: store i16 [[Y_0]], ptr [[P2]], align 2 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: store i16 [[X_1]], ptr [[GEP_1]], align 2 +; CHECK-NEXT: [[Y_SHR_1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[Y_1:%.*]] = trunc i32 [[Y_SHR_1]] to i16 +; CHECK-NEXT: [[P2_GEP_1:%.*]] = getelementptr i8, ptr [[P2]], i64 2 +; CHECK-NEXT: store i16 [[Y_1]], ptr [[P2_GEP_1]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p + %y.0 = trunc i32 %y to i16 + store i16 %y.0, ptr %p2 + + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1 + %y.shr.1 = lshr i32 %y, 16 + %y.1 = trunc i32 %y.shr.1 to i16 + %p2.gep.1 = getelementptr i8, ptr %p2, i64 2 + store i16 %y.1, ptr %p2.gep.1 + ret void +} + +define void @test_i32_multi_use(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_multi_use( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16 +; CHECK-NEXT: call void @use.i16(i16 [[X_0]]) +; CHECK-NEXT: call void @use.i16(i16 [[X_1]]) +; CHECK-NEXT: call void @use.i32(i32 [[SHR_1]]) +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1 + call void @use.i16(i16 %x.0) + call void @use.i16(i16 %x.1) + call void @use.i32(i32 %shr.1) + ret void +} + +define void @test_i32_scoped_aa_same(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_scoped_aa_same( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2, !noalias [[META0:![0-9]+]] +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p, !noalias !0 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1, !noalias !0 + ret void +} + +define void @test_i32_scoped_aa_different(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_scoped_aa_different( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2, !noalias [[META3:![0-9]+]] +; CHECK-NEXT: ret void +; + %x.0 = trunc 
i32 %x to i16 + store i16 %x.0, ptr %p, !noalias !0 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1, !noalias !3 + ret void +} + +define void @test_i32_tbaa(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_i32_tbaa( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 2 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i16 + store i16 %x.0, ptr %p, !tbaa !6 + %shr.1 = lshr i32 %x, 16 + %x.1 = trunc i32 %shr.1 to i16 + %gep.1 = getelementptr i8, ptr %p, i64 2 + store i16 %x.1, ptr %gep.1, !tbaa !6 + ret void +} + +define void @test_multiple_parts_with_gap1(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_multiple_parts_with_gap1( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1 +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24 +; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + ret void +} + +define void @test_multiple_parts_with_gap2(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_multiple_parts_with_gap2( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + %gep.2 = getelementptr i8, ptr %p, i64 1 + store i8 %x.2, ptr %gep.2 + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 2 + store i8 %x.3, ptr %gep.3 + ret void +} + +define void @test_multiple_parts_with_gap3(i64 %x, ptr %p) { +; CHECK-LABEL: define void @test_multiple_parts_with_gap3( +; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X]] to i16 +; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[X]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: store i16 [[TMP3]], ptr [[GEP_3]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i64 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i64 %x, 8 + %x.1 = trunc i64 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %shr.3 = lshr i64 %x, 24 + %x.3 = trunc i64 %shr.3 to i8 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.3, ptr %gep.3 + %shr.4 = lshr i64 %x, 32 + %x.4 = trunc i64 %shr.4 to i8 + %gep.4 = getelementptr i8, ptr %p, i64 4 + store i8 %x.4, ptr %gep.4 + ret void +} + +define void @test_store_same_parts_twice(i32 %x, ptr %p) { +; CHECK-LABEL: define void @test_store_same_parts_twice( +; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = trunc 
i32 [[X]] to i16 +; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[X]] to i16 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1 +; CHECK-NEXT: ret void +; + %x.0 = trunc i32 %x to i8 + store i8 %x.0, ptr %p + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1 + %gep.2 = getelementptr i8, ptr %p, i64 2 + store i8 %x.0, ptr %gep.2 + %gep.3 = getelementptr i8, ptr %p, i64 3 + store i8 %x.1, ptr %gep.3 + ret void +} + +!0 = !{!1} +!1 = !{!1, !2} +!2 = !{!2} + +!3 = !{!4} +!4 = !{!4, !5} +!5 = !{!5} + +!6 = !{!7, !7, i64 0} +!7 = !{!"short", !8, i64 0} +!8 = !{!"omnipotent char", !9, i64 0} +!9 = !{!"Simple C/C++ TBAA"} +;. +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]]} +; CHECK: [[META3]] = !{} +;. diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll index fad4acb..6719290 100644 --- a/llvm/test/Transforms/Attributor/heap_to_stack.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll @@ -393,26 +393,6 @@ bb: ret i32 %i2 } -define i32 @test_lifetime() { -; CHECK-LABEL: define {{[^@]+}}@test_lifetime() { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[I_H2S:%.*]] = alloca i8, i64 4, align 1 -; CHECK-NEXT: tail call void @no_sync_func(ptr noalias nofree captures(none) [[I_H2S]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 noundef 4, ptr noalias nofree nonnull align 4 captures(none) dereferenceable(4) [[I_H2S]]) -; CHECK-NEXT: store i32 10, ptr [[I_H2S]], align 4 -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I_H2S]], align 4 -; CHECK-NEXT: ret i32 [[I2]] -; -bb: - %i = tail call noalias ptr @malloc(i64 4) - tail call void @no_sync_func(ptr %i) - call void @llvm.lifetime.start.p0(i64 4, ptr %i) - store i32 10, ptr %i, align 4 - %i2 = load i32, ptr %i, align 4 - tail call void @free(ptr %i) - ret i32 %i2 -} - ; TEST 11 define void @test11() { diff --git a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll index c7a9ec8..0be9434 100644 --- a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll @@ -340,27 +340,6 @@ bb: ret i32 %i2 } -define i32 @test_lifetime() { -; CHECK-LABEL: define {{[^@]+}}@test_lifetime() { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = tail call noalias ptr @malloc(i64 noundef 4) -; CHECK-NEXT: tail call void @no_sync_func(ptr noalias nofree captures(none) [[I]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 noundef 4, ptr noalias nofree nonnull align 4 captures(none) dereferenceable(4) [[I]]) -; CHECK-NEXT: store i32 10, ptr [[I]], align 4 -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[I]], align 4 -; CHECK-NEXT: tail call void @free(ptr noalias nonnull align 4 captures(none) dereferenceable(4) [[I]]) -; CHECK-NEXT: ret i32 [[I2]] -; -bb: - %i = tail call noalias ptr @malloc(i64 4) - tail call void @no_sync_func(ptr %i) - call void @llvm.lifetime.start.p0(i64 4, ptr %i) - store i32 10, ptr %i, align 4 - %i2 = load i32, ptr %i, align 4 - tail call void @free(ptr %i) - ret i32 %i2 -} - ; TEST 11 define void @test11() { diff --git a/llvm/test/Transforms/Attributor/memory_locations.ll b/llvm/test/Transforms/Attributor/memory_locations.ll index 9c27fca..936b8a0 100644 --- 
a/llvm/test/Transforms/Attributor/memory_locations.ll +++ b/llvm/test/Transforms/Attributor/memory_locations.ll @@ -300,7 +300,6 @@ entry: declare ptr @unknown_ptr() readnone declare ptr @argmem_only(ptr %arg) argmemonly declare ptr @inaccesible_argmem_only_decl(ptr %arg) inaccessiblemem_or_argmemonly -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) nounwind argmemonly willreturn define void @callerA1(ptr %arg) { ; CHECK: Function Attrs: memory(argmem: readwrite) @@ -387,21 +386,10 @@ define void @callerD2() { ret void } -define void @callerE(ptr %arg) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define {{[^@]+}}@callerE -; CHECK-SAME: (ptr nofree readnone captures(none) [[ARG:%.*]]) #[[ATTR5:[0-9]+]] { -; CHECK-NEXT: ret void -; - call void @llvm.lifetime.start.p0(i64 4, ptr %arg) - ret void -} - - define void @write_global() { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; CHECK-LABEL: define {{[^@]+}}@write_global -; CHECK-SAME: () #[[ATTR6:[0-9]+]] { +; CHECK-SAME: () #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: store i32 0, ptr @G, align 4 ; CHECK-NEXT: ret void ; @@ -411,7 +399,7 @@ define void @write_global() { define void @write_global_via_arg(ptr %GPtr) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; CHECK-LABEL: define {{[^@]+}}@write_global_via_arg -; CHECK-SAME: (ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[GPTR:%.*]]) #[[ATTR7:[0-9]+]] { +; CHECK-SAME: (ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[GPTR:%.*]]) #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: store i32 0, ptr [[GPTR]], align 4 ; CHECK-NEXT: ret void ; @@ -421,7 +409,7 @@ define void @write_global_via_arg(ptr %GPtr) { define internal void @write_global_via_arg_internal(ptr %GPtr) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none) ; CHECK-LABEL: define {{[^@]+}}@write_global_via_arg_internal -; CHECK-SAME: () #[[ATTR8:[0-9]+]] { +; CHECK-SAME: () #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: store i32 0, ptr @G, align 4 ; CHECK-NEXT: ret void ; @@ -432,14 +420,14 @@ define internal void @write_global_via_arg_internal(ptr %GPtr) { define void @writeonly_global() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; TUNIT-LABEL: define {{[^@]+}}@writeonly_global -; TUNIT-SAME: () #[[ATTR6]] { -; TUNIT-NEXT: call void @write_global() #[[ATTR12:[0-9]+]] +; TUNIT-SAME: () #[[ATTR4]] { +; TUNIT-NEXT: call void @write_global() #[[ATTR10:[0-9]+]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write) ; CGSCC-LABEL: define {{[^@]+}}@writeonly_global -; CGSCC-SAME: () #[[ATTR9:[0-9]+]] { -; CGSCC-NEXT: call void @write_global() #[[ATTR13:[0-9]+]] +; CGSCC-SAME: () #[[ATTR7:[0-9]+]] { +; CGSCC-NEXT: call void @write_global() #[[ATTR11:[0-9]+]] ; CGSCC-NEXT: ret void ; call void @write_global() @@ -448,14 +436,14 @@ define void @writeonly_global() { define void @writeonly_global_via_arg() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; TUNIT-LABEL: define {{[^@]+}}@writeonly_global_via_arg -; TUNIT-SAME: () #[[ATTR6]] { -; TUNIT-NEXT: call void @write_global_via_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) @G) #[[ATTR12]] +; TUNIT-SAME: () #[[ATTR4]] { 
+; TUNIT-NEXT: call void @write_global_via_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) @G) #[[ATTR10]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write) ; CGSCC-LABEL: define {{[^@]+}}@writeonly_global_via_arg -; CGSCC-SAME: () #[[ATTR9]] { -; CGSCC-NEXT: call void @write_global_via_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) @G) #[[ATTR13]] +; CGSCC-SAME: () #[[ATTR7]] { +; CGSCC-NEXT: call void @write_global_via_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) @G) #[[ATTR11]] ; CGSCC-NEXT: ret void ; call void @write_global_via_arg(ptr @G) @@ -466,14 +454,14 @@ define void @writeonly_global_via_arg_internal() { ; ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; TUNIT-LABEL: define {{[^@]+}}@writeonly_global_via_arg_internal -; TUNIT-SAME: () #[[ATTR6]] { -; TUNIT-NEXT: call void @write_global_via_arg_internal() #[[ATTR12]] +; TUNIT-SAME: () #[[ATTR4]] { +; TUNIT-NEXT: call void @write_global_via_arg_internal() #[[ATTR10]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write) ; CGSCC-LABEL: define {{[^@]+}}@writeonly_global_via_arg_internal -; CGSCC-SAME: () #[[ATTR9]] { -; CGSCC-NEXT: call void @write_global_via_arg_internal() #[[ATTR13]] +; CGSCC-SAME: () #[[ATTR7]] { +; CGSCC-NEXT: call void @write_global_via_arg_internal() #[[ATTR11]] ; CGSCC-NEXT: ret void ; call void @write_global_via_arg_internal(ptr @G) @@ -483,11 +471,11 @@ define void @writeonly_global_via_arg_internal() { define i8 @recursive_not_readnone(ptr %ptr, i1 %c) { ; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: write) ; TUNIT-LABEL: define {{[^@]+}}@recursive_not_readnone -; TUNIT-SAME: (ptr nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR9:[0-9]+]] { +; TUNIT-SAME: (ptr nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR7:[0-9]+]] { ; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: -; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR13:[0-9]+]] +; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR11:[0-9]+]] ; TUNIT-NEXT: ret i8 1 ; TUNIT: f: ; TUNIT-NEXT: store i8 1, ptr [[PTR]], align 1 @@ -495,11 +483,11 @@ define i8 @recursive_not_readnone(ptr %ptr, i1 %c) { ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: write) ; CGSCC-LABEL: define {{[^@]+}}@recursive_not_readnone -; CGSCC-SAME: (ptr nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR10:[0-9]+]] { +; CGSCC-SAME: (ptr nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR8:[0-9]+]] { ; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR14:[0-9]+]] +; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef 
false) #[[ATTR12:[0-9]+]] ; CGSCC-NEXT: ret i8 1 ; CGSCC: f: ; CGSCC-NEXT: store i8 1, ptr [[PTR]], align 1 @@ -519,11 +507,11 @@ f: define internal i8 @recursive_not_readnone_internal(ptr %ptr, i1 %c) { ; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: write) ; TUNIT-LABEL: define {{[^@]+}}@recursive_not_readnone_internal -; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR9]] { +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: -; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR13]] +; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR11]] ; TUNIT-NEXT: ret i8 1 ; TUNIT: f: ; TUNIT-NEXT: store i8 1, ptr [[PTR]], align 1 @@ -531,11 +519,11 @@ define internal i8 @recursive_not_readnone_internal(ptr %ptr, i1 %c) { ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: write) ; CGSCC-LABEL: define {{[^@]+}}@recursive_not_readnone_internal -; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR10]] { +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR14]] +; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR12]] ; CGSCC-NEXT: ret i8 1 ; CGSCC: f: ; CGSCC-NEXT: store i8 1, ptr [[PTR]], align 1 @@ -555,16 +543,16 @@ f: define i8 @readnone_caller(i1 %c) { ; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none) ; TUNIT-LABEL: define {{[^@]+}}@readnone_caller -; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR10:[0-9]+]] { +; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR8:[0-9]+]] { ; TUNIT-NEXT: [[A:%.*]] = alloca i8, align 1 -; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[A]], i1 noundef [[C]]) #[[ATTR13]] +; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[A]], i1 noundef [[C]]) #[[ATTR11]] ; TUNIT-NEXT: ret i8 [[R]] ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(none) ; CGSCC-LABEL: define {{[^@]+}}@readnone_caller -; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR11:[0-9]+]] { +; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR9:[0-9]+]] { ; CGSCC-NEXT: [[A:%.*]] = alloca i8, align 1 -; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[A]], i1 noundef [[C]]) #[[ATTR15:[0-9]+]] +; CGSCC-NEXT: [[R:%.*]] = call i8 
@recursive_not_readnone_internal(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[A]], i1 noundef [[C]]) #[[ATTR13:[0-9]+]] ; CGSCC-NEXT: ret i8 [[R]] ; %a = alloca i8 @@ -575,11 +563,11 @@ define i8 @readnone_caller(i1 %c) { define internal i8 @recursive_readnone_internal2(ptr %ptr, i1 %c) { ; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: write) ; TUNIT-LABEL: define {{[^@]+}}@recursive_readnone_internal2 -; TUNIT-SAME: (ptr noalias nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR9]] { +; TUNIT-SAME: (ptr noalias nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: -; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_readnone_internal2(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR13]] +; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_readnone_internal2(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR11]] ; TUNIT-NEXT: ret i8 1 ; TUNIT: f: ; TUNIT-NEXT: store i8 1, ptr [[PTR]], align 1 @@ -587,11 +575,11 @@ define internal i8 @recursive_readnone_internal2(ptr %ptr, i1 %c) { ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: write) ; CGSCC-LABEL: define {{[^@]+}}@recursive_readnone_internal2 -; CGSCC-SAME: (ptr noalias nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR10]] { +; CGSCC-SAME: (ptr noalias nofree writeonly captures(none) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_readnone_internal2(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR14]] +; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_readnone_internal2(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR12]] ; CGSCC-NEXT: ret i8 1 ; CGSCC: f: ; CGSCC-NEXT: store i8 1, ptr [[PTR]], align 1 @@ -611,14 +599,14 @@ f: define i8 @readnone_caller2(i1 %c) { ; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none) ; TUNIT-LABEL: define {{[^@]+}}@readnone_caller2 -; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR10]] { -; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_readnone_internal2(ptr undef, i1 noundef [[C]]) #[[ATTR13]] +; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR8]] { +; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_readnone_internal2(ptr undef, i1 noundef [[C]]) #[[ATTR11]] ; TUNIT-NEXT: ret i8 [[R]] ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(none) ; CGSCC-LABEL: define {{[^@]+}}@readnone_caller2 -; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR11]] { -; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_readnone_internal2(ptr nofree undef, i1 noundef [[C]]) #[[ATTR15]] +; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR9]] { +; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_readnone_internal2(ptr nofree undef, i1 noundef [[C]]) #[[ATTR13]] ; CGSCC-NEXT: ret i8 [[R]] ; %r = call i8 @recursive_readnone_internal2(ptr undef, i1 %c) @@ -628,11 +616,11 @@ define i8 @readnone_caller2(i1 %c) { define internal i8 @recursive_not_readnone_internal3(ptr %ptr, i1 %c) { ; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: write) ; TUNIT-LABEL: define 
{{[^@]+}}@recursive_not_readnone_internal3 -; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR9]] { +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: -; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR13]] +; TUNIT-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR11]] ; TUNIT-NEXT: ret i8 1 ; TUNIT: f: ; TUNIT-NEXT: store i8 1, ptr [[PTR]], align 1 @@ -640,11 +628,11 @@ define internal i8 @recursive_not_readnone_internal3(ptr %ptr, i1 %c) { ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: write) ; CGSCC-LABEL: define {{[^@]+}}@recursive_not_readnone_internal3 -; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR10]] { +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR14]] +; CGSCC-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef false) #[[ATTR12]] ; CGSCC-NEXT: ret i8 1 ; CGSCC: f: ; CGSCC-NEXT: store i8 1, ptr [[PTR]], align 1 @@ -664,16 +652,16 @@ f: define i8 @readnone_caller3(i1 %c) { ; TUNIT: Function Attrs: nofree norecurse nosync nounwind memory(none) ; TUNIT-LABEL: define {{[^@]+}}@readnone_caller3 -; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR10]] { +; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR8]] { ; TUNIT-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 -; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef [[C]]) #[[ATTR13]] +; TUNIT-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef [[C]]) #[[ATTR11]] ; TUNIT-NEXT: ret i8 [[R]] ; ; CGSCC: Function Attrs: nofree nosync nounwind memory(none) ; CGSCC-LABEL: define {{[^@]+}}@readnone_caller3 -; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR11]] { +; CGSCC-SAME: (i1 noundef [[C:%.*]]) #[[ATTR9]] { ; CGSCC-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 -; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef [[C]]) #[[ATTR15]] +; CGSCC-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal3(ptr noalias nofree noundef nonnull writeonly captures(none) dereferenceable(1) [[ALLOC]], i1 noundef [[C]]) #[[ATTR13]] ; CGSCC-NEXT: ret i8 [[R]] ; %alloc = alloca i8 @@ -684,7 +672,7 @@ define i8 @readnone_caller3(i1 %c) { define internal void @argmemonly_before_ipconstprop(ptr %p) 
argmemonly { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none) ; CHECK-LABEL: define {{[^@]+}}@argmemonly_before_ipconstprop -; CHECK-SAME: () #[[ATTR8]] { +; CHECK-SAME: () #[[ATTR6]] { ; CHECK-NEXT: store i32 0, ptr @G, align 4 ; CHECK-NEXT: ret void ; @@ -695,14 +683,14 @@ define internal void @argmemonly_before_ipconstprop(ptr %p) argmemonly { define void @argmemonly_caller() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; TUNIT-LABEL: define {{[^@]+}}@argmemonly_caller -; TUNIT-SAME: () #[[ATTR6]] { -; TUNIT-NEXT: call void @argmemonly_before_ipconstprop() #[[ATTR12]] +; TUNIT-SAME: () #[[ATTR4]] { +; TUNIT-NEXT: call void @argmemonly_before_ipconstprop() #[[ATTR10]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write) ; CGSCC-LABEL: define {{[^@]+}}@argmemonly_caller -; CGSCC-SAME: () #[[ATTR9]] { -; CGSCC-NEXT: call void @argmemonly_before_ipconstprop() #[[ATTR13]] +; CGSCC-SAME: () #[[ATTR7]] { +; CGSCC-NEXT: call void @argmemonly_before_ipconstprop() #[[ATTR11]] ; CGSCC-NEXT: ret void ; call void @argmemonly_before_ipconstprop(ptr @G) @@ -714,10 +702,10 @@ declare ptr @no_mem_unknown_ptr(ptr %arg) memory(none) define void @argmem_and_unknown(i1 %c, ptr %arg) memory(argmem: readwrite) { ; TUNIT: Function Attrs: nosync memory(argmem: write) ; TUNIT-LABEL: define {{[^@]+}}@argmem_and_unknown -; TUNIT-SAME: (i1 noundef [[C:%.*]], ptr writeonly [[ARG:%.*]]) #[[ATTR11:[0-9]+]] { +; TUNIT-SAME: (i1 noundef [[C:%.*]], ptr writeonly [[ARG:%.*]]) #[[ATTR9:[0-9]+]] { ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: -; TUNIT-NEXT: [[P:%.*]] = call ptr @no_mem_unknown_ptr(ptr noalias readnone [[ARG]]) #[[ATTR14:[0-9]+]] +; TUNIT-NEXT: [[P:%.*]] = call ptr @no_mem_unknown_ptr(ptr noalias readnone [[ARG]]) #[[ATTR12:[0-9]+]] ; TUNIT-NEXT: store i32 0, ptr [[P]], align 4 ; TUNIT-NEXT: br label [[F]] ; TUNIT: f: @@ -725,10 +713,10 @@ define void @argmem_and_unknown(i1 %c, ptr %arg) memory(argmem: readwrite) { ; ; CGSCC: Function Attrs: nosync memory(argmem: write) ; CGSCC-LABEL: define {{[^@]+}}@argmem_and_unknown -; CGSCC-SAME: (i1 noundef [[C:%.*]], ptr writeonly [[ARG:%.*]]) #[[ATTR12:[0-9]+]] { +; CGSCC-SAME: (i1 noundef [[C:%.*]], ptr writeonly [[ARG:%.*]]) #[[ATTR10:[0-9]+]] { ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: [[P:%.*]] = call ptr @no_mem_unknown_ptr(ptr noalias readnone [[ARG]]) #[[ATTR16:[0-9]+]] +; CGSCC-NEXT: [[P:%.*]] = call ptr @no_mem_unknown_ptr(ptr noalias readnone [[ARG]]) #[[ATTR14:[0-9]+]] ; CGSCC-NEXT: store i32 0, ptr [[P]], align 4 ; CGSCC-NEXT: br label [[F]] ; CGSCC: f: @@ -747,33 +735,29 @@ f: ; TUNIT: attributes #[[ATTR1]] = { memory(argmem: readwrite, inaccessiblemem: readwrite) } ; TUNIT: attributes #[[ATTR2]] = { memory(none) } ; TUNIT: attributes #[[ATTR3]] = { memory(argmem: readwrite) } -; TUNIT: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -; TUNIT: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write) } -; TUNIT: attributes #[[ATTR7]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } -; TUNIT: attributes #[[ATTR8]] = { mustprogress nofree norecurse nosync nounwind willreturn 
memory(write, argmem: none) } -; TUNIT: attributes #[[ATTR9]] = { nofree nosync nounwind memory(argmem: write) } -; TUNIT: attributes #[[ATTR10]] = { nofree norecurse nosync nounwind memory(none) } -; TUNIT: attributes #[[ATTR11]] = { nosync memory(argmem: write) } -; TUNIT: attributes #[[ATTR12]] = { nofree nosync nounwind willreturn memory(write) } -; TUNIT: attributes #[[ATTR13]] = { nofree nosync nounwind memory(write) } -; TUNIT: attributes #[[ATTR14]] = { nosync } +; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write) } +; TUNIT: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } +; TUNIT: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none) } +; TUNIT: attributes #[[ATTR7]] = { nofree nosync nounwind memory(argmem: write) } +; TUNIT: attributes #[[ATTR8]] = { nofree norecurse nosync nounwind memory(none) } +; TUNIT: attributes #[[ATTR9]] = { nosync memory(argmem: write) } +; TUNIT: attributes #[[ATTR10]] = { nofree nosync nounwind willreturn memory(write) } +; TUNIT: attributes #[[ATTR11]] = { nofree nosync nounwind memory(write) } +; TUNIT: attributes #[[ATTR12]] = { nosync } ;. ; CGSCC: attributes #[[ATTR0]] = { memory(inaccessiblemem: readwrite) } ; CGSCC: attributes #[[ATTR1]] = { memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CGSCC: attributes #[[ATTR2]] = { memory(none) } ; CGSCC: attributes #[[ATTR3]] = { memory(argmem: readwrite) } -; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } -; CGSCC: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write) } -; CGSCC: attributes #[[ATTR7]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } -; CGSCC: attributes #[[ATTR8]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none) } -; CGSCC: attributes #[[ATTR9]] = { mustprogress nofree nosync nounwind willreturn memory(write) } -; CGSCC: attributes #[[ATTR10]] = { nofree nosync nounwind memory(argmem: write) } -; CGSCC: attributes #[[ATTR11]] = { nofree nosync nounwind memory(none) } -; CGSCC: attributes #[[ATTR12]] = { nosync memory(argmem: write) } -; CGSCC: attributes #[[ATTR13]] = { nofree nounwind willreturn memory(write) } -; CGSCC: attributes #[[ATTR14]] = { nofree nosync nounwind memory(write) } -; CGSCC: attributes #[[ATTR15]] = { nofree nounwind memory(write) } -; CGSCC: attributes #[[ATTR16]] = { nosync } +; CGSCC: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write) } +; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } +; CGSCC: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none) } +; CGSCC: attributes #[[ATTR7]] = { mustprogress nofree nosync nounwind willreturn memory(write) } +; CGSCC: attributes #[[ATTR8]] = { nofree nosync nounwind memory(argmem: write) } +; CGSCC: attributes #[[ATTR9]] = { nofree nosync nounwind memory(none) } +; CGSCC: attributes #[[ATTR10]] = { nosync memory(argmem: write) } +; CGSCC: attributes #[[ATTR11]] = { nofree nounwind willreturn memory(write) } +; CGSCC: attributes #[[ATTR12]] = { nofree nosync nounwind memory(write) } +; CGSCC: 
attributes #[[ATTR13]] = { nofree nounwind memory(write) } +; CGSCC: attributes #[[ATTR14]] = { nosync } ;. diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll index 005c021..54782c5 100644 --- a/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll @@ -18,11 +18,11 @@ bb: br i1 %tmp4, label %bb6, label %bb5 bb5: ; preds = %bb - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %tmp1) #2 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %tmp) #2 store i32 %tmp3, ptr %tmp, align 4, !tbaa !2 store i32 %tmp3, ptr @g, align 4, !tbaa !2 call void @bar(ptr nonnull %tmp) #2 - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %tmp1) #2 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %tmp) #2 br label %bb6 bb6: ; preds = %bb5, %bb diff --git a/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll b/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll index 03ff31b..e9d5fb6 100644 --- a/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll +++ b/llvm/test/Transforms/CodeExtractor/live_shrink_gep.ll @@ -9,8 +9,7 @@ define void @_Z3foov() local_unnamed_addr { bb: %tmp = alloca %class.A, align 1 - %tmp1 = getelementptr inbounds %class.A, ptr %tmp, i64 0, i32 0 - call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %tmp1) + call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %tmp) %tmp2 = load i32, ptr @cond, align 4, !tbaa !2 %tmp3 = icmp eq i32 %tmp2, 0 br i1 %tmp3, label %bb4, label %bb5 @@ -20,7 +19,7 @@ bb4: ; preds = %bb br label %bb5 bb5: ; preds = %bb4, %bb - call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %tmp1) + call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %tmp) ret void } @@ -38,7 +37,6 @@ define void @_Z3goov() local_unnamed_addr { bb: ; CHECK: bb: ; CHECK-NOT: alloca -; CHECK-NOT: getelementptr ; CHECK-NOT: llvm.lifetime ; CHECK: br i1 ; CHECK: codeRepl.i: @@ -50,7 +48,6 @@ bb: ; CHECK-LABEL: define internal void @_Z3foov.1. 
; CHECK: newFuncRoot: ; CHECK-NEXT: %tmp = alloca %class.A -; CHECK-NEXT: %tmp1 = getelementptr ; CHECK-NEXT: call void @llvm.lifetime.start.p0 ; CHECK: call void @llvm.lifetime.end.p0 ; CHECK-NEXT: br label %bb5.exitStub diff --git a/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll b/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll index 9b5362d..6bf268b 100644 --- a/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll +++ b/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-intrinsics.ll @@ -61,10 +61,11 @@ entry: declare i64 @llvm.aarch64.udiv.i64.i64(i64, i64) -define void @test_free_intrinsics(i64 %x, ptr %ptr) { +define void @test_free_intrinsics(i64 %x) { ; CHECK-LABEL: @test_free_intrinsics( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100000000032, ptr [[PTR:%.*]]) +; CHECK-NEXT: [[PTR:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100000000032, ptr [[PTR]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100000000064, ptr [[PTR]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 100000000128, ptr [[PTR]]) ; CHECK-NEXT: [[I:%.*]] = call ptr @llvm.invariant.start.p0(i64 100000000256, ptr [[PTR]]) @@ -72,6 +73,7 @@ define void @test_free_intrinsics(i64 %x, ptr %ptr) { ; CHECK-NEXT: ret void ; entry: + %ptr = alloca i8 call void @llvm.lifetime.start.p0(i64 100000000032, ptr %ptr) call void @llvm.lifetime.start.p0(i64 100000000064, ptr %ptr) call void @llvm.lifetime.end.p0(i64 100000000128, ptr %ptr) diff --git a/llvm/test/Transforms/DCE/basic.ll b/llvm/test/Transforms/DCE/basic.ll index 134994a..1a3b12e 100644 --- a/llvm/test/Transforms/DCE/basic.ll +++ b/llvm/test/Transforms/DCE/basic.ll @@ -26,47 +26,5 @@ define i32 @test_lifetime_alloca() { ret i32 0 } -; CHECK-LABEL: @test_lifetime_arg -define i32 @test_lifetime_arg(ptr) { -; Check that lifetime intrinsics are removed along with the pointer. -; CHECK-NEXT: #dbg_value -; CHECK-NEXT: ret i32 0 -; CHECK-NOT: llvm.lifetime.start -; CHECK-NOT: llvm.lifetime.end - call void @llvm.lifetime.start.p0(i64 -1, ptr %0) - call void @llvm.lifetime.end.p0(i64 -1, ptr %0) - ret i32 0 -} - -@glob = global i8 1 - -; CHECK-LABEL: @test_lifetime_global -define i32 @test_lifetime_global() { -; Check that lifetime intrinsics are removed along with the pointer. -; CHECK-NEXT: #dbg_value -; CHECK-NEXT: ret i32 0 -; CHECK-NOT: llvm.lifetime.start -; CHECK-NOT: llvm.lifetime.end - call void @llvm.lifetime.start.p0(i64 -1, ptr @glob) - call void @llvm.lifetime.end.p0(i64 -1, ptr @glob) - ret i32 0 -} - -; CHECK-LABEL: @test_lifetime_bitcast -define i32 @test_lifetime_bitcast(ptr %arg) { -; Check that lifetime intrinsics are NOT removed when the pointer is a bitcast. -; It's not uncommon for two bitcasts to be made: one for lifetime, one for use. -; TODO: Support the above case. 
-; CHECK-NEXT: bitcast -; CHECK-NEXT: #dbg_value -; CHECK-NEXT: llvm.lifetime.start.p0(i64 -1, ptr %cast) -; CHECK-NEXT: llvm.lifetime.end.p0(i64 -1, ptr %cast) -; CHECK-NEXT: ret i32 0 - %cast = bitcast ptr %arg to ptr - call void @llvm.lifetime.start.p0(i64 -1, ptr %cast) - call void @llvm.lifetime.end.p0(i64 -1, ptr %cast) - ret i32 0 -} - ; CHECK: [[add]] = !DILocalVariable ; CHECK: [[sub]] = !DILocalVariable diff --git a/llvm/test/Transforms/DeadStoreElimination/libcalls.ll b/llvm/test/Transforms/DeadStoreElimination/libcalls.ll index 4d9a767..27ad639 100644 --- a/llvm/test/Transforms/DeadStoreElimination/libcalls.ll +++ b/llvm/test/Transforms/DeadStoreElimination/libcalls.ll @@ -67,19 +67,6 @@ define void @test_strcat_with_lifetime(ptr %src) { ret void } -define void @test_strcat_with_lifetime_nonlocal(ptr %dest, ptr %src) { -; CHECK-LABEL: @test_strcat_with_lifetime_nonlocal( -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[DEST:%.*]]) -; CHECK-NEXT: [[CALL:%.*]] = call ptr @strcat(ptr [[DEST]], ptr [[SRC:%.*]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[DEST]]) -; CHECK-NEXT: ret void -; - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %dest) - %call = call ptr @strcat(ptr %dest, ptr %src) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %dest) - ret void -} - declare ptr @strncat(ptr %dest, ptr %src, i64 %n) nounwind define void @test4(ptr %src) { ; CHECK-LABEL: @test4( diff --git a/llvm/test/Transforms/DeadStoreElimination/lifetime.ll b/llvm/test/Transforms/DeadStoreElimination/lifetime.ll index 73b9903..19e7b0d 100644 --- a/llvm/test/Transforms/DeadStoreElimination/lifetime.ll +++ b/llvm/test/Transforms/DeadStoreElimination/lifetime.ll @@ -25,12 +25,12 @@ define void @test1() { define void @test2(ptr %P) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 1 +; CHECK-NEXT: [[Q:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[Q]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[Q]]) ; CHECK-NEXT: ret void ; - %Q = getelementptr i32, ptr %P, i32 1 + %Q = alloca i32 call void @llvm.lifetime.start.p0(i64 4, ptr %Q) store i32 0, ptr %Q ;; This store is dead. call void @llvm.lifetime.end.p0(i64 4, ptr %Q) @@ -114,19 +114,19 @@ exit: ; lifetime.end only marks the first two bytes of %A as dead. Make sure ; `store i8 20, ptr %A.2 is not removed. 
-define void @test5_lifetime_end_partial(ptr %A) { +define void @test5_lifetime_end_partial() { ; CHECK-LABEL: @test5_lifetime_end_partial( -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[A:%.*]]) +; CHECK-NEXT: [[A:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[A]]) ; CHECK-NEXT: [[A_1:%.*]] = getelementptr i8, ptr [[A]], i64 1 ; CHECK-NEXT: [[A_2:%.*]] = getelementptr i8, ptr [[A]], i64 2 ; CHECK-NEXT: store i8 20, ptr [[A_2]], align 1 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[A]]) ; CHECK-NEXT: call void @use(ptr [[A_1]]) -; CHECK-NEXT: store i8 30, ptr [[A_1]], align 1 -; CHECK-NEXT: store i8 40, ptr [[A_2]], align 1 ; CHECK-NEXT: ret void ; + %A = alloca [4 x i8] call void @llvm.lifetime.start.p0(i64 2, ptr %A) %A.1 = getelementptr i8, ptr %A, i64 1 %A.2 = getelementptr i8, ptr %A, i64 2 diff --git a/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll b/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll index 95bd859..588bdc0 100644 --- a/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll +++ b/llvm/test/Transforms/DeadStoreElimination/multiblock-multipath.ll @@ -398,7 +398,7 @@ bb5: @linenum = external local_unnamed_addr global i32, align 4 -define void @accessible_after_return11_loop() { +define void @accessible_after_return11_loop(ptr noalias %p) { ; CHECK-LABEL: @accessible_after_return11_loop( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY_I:%.*]] @@ -406,7 +406,7 @@ define void @accessible_after_return11_loop() { ; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C_1]], label [[FOR_BODY_I]], label [[INIT_PARSE_EXIT:%.*]] ; CHECK: init_parse.exit: -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull undef) +; CHECK-NEXT: store i32 1, ptr [[P:%.*]], align 4 ; CHECK-NEXT: store i32 0, ptr @linenum, align 4 ; CHECK-NEXT: br label [[FOR_BODY_I20:%.*]] ; CHECK: for.body.i20: @@ -424,7 +424,7 @@ for.body.i: ; preds = %for.body.i, %entry init_parse.exit: ; preds = %for.body.i store i32 0, ptr @linenum, align 4 - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull undef) #2 + store i32 1, ptr %p store i32 0, ptr @linenum, align 4 br label %for.body.i20 @@ -435,7 +435,6 @@ for.body.i20: ; preds = %for.body.i20, %init exit: ret void } -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) declare i1 @cond() readnone nounwind ; Tests where the pointer/object is *NOT* accessible after the function returns. diff --git a/llvm/test/Transforms/EarlyCSE/memoryssa.ll b/llvm/test/Transforms/EarlyCSE/memoryssa.ll index 942b6f8..ba4cce4 100644 --- a/llvm/test/Transforms/EarlyCSE/memoryssa.ll +++ b/llvm/test/Transforms/EarlyCSE/memoryssa.ll @@ -142,10 +142,12 @@ end: ;; Check that we respect lifetime.start/lifetime.end intrinsics when deleting ;; stores that, without the lifetime calls, would be writebacks. 
-define void @test_writeback_lifetimes(ptr %p) { +define void @test_writeback_lifetimes() { ; CHECK-NOMEMSSA-LABEL: @test_writeback_lifetimes( ; CHECK-NOMEMSSA-NEXT: entry: -; CHECK-NOMEMSSA-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1 +; CHECK-NOMEMSSA-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) +; CHECK-NOMEMSSA-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P]], i64 1 ; CHECK-NOMEMSSA-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NOMEMSSA-NEXT: [[QV:%.*]] = load i32, ptr [[Q]], align 4 ; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]]) @@ -156,7 +158,9 @@ define void @test_writeback_lifetimes(ptr %p) { ; ; CHECK-LABEL: @test_writeback_lifetimes( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) +; CHECK-NEXT: [[Q:%.*]] = getelementptr i32, ptr [[P]], i64 1 ; CHECK-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[QV:%.*]] = load i32, ptr [[Q]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]]) @@ -166,6 +170,8 @@ define void @test_writeback_lifetimes(ptr %p) { ; CHECK-NEXT: ret void ; entry: + %p = alloca i64 + call void @llvm.lifetime.start.p0(i64 8, ptr %p) %q = getelementptr i32, ptr %p, i64 1 %pv = load i32, ptr %p %qv = load i32, ptr %q @@ -178,10 +184,12 @@ entry: ;; Check that we respect lifetime.start/lifetime.end intrinsics when deleting ;; stores that, without the lifetime calls, would be writebacks. -define void @test_writeback_lifetimes_multi_arg(ptr %p, ptr %q) { +define void @test_writeback_lifetimes_multi_arg(ptr %q) { ; CHECK-NOMEMSSA-LABEL: @test_writeback_lifetimes_multi_arg( ; CHECK-NOMEMSSA-NEXT: entry: -; CHECK-NOMEMSSA-NEXT: [[PV:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NOMEMSSA-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) +; CHECK-NOMEMSSA-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NOMEMSSA-NEXT: [[QV:%.*]] = load i32, ptr [[Q:%.*]], align 4 ; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]]) ; CHECK-NOMEMSSA-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) @@ -191,15 +199,18 @@ define void @test_writeback_lifetimes_multi_arg(ptr %p, ptr %q) { ; ; CHECK-LABEL: @test_writeback_lifetimes_multi_arg( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[PV:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) +; CHECK-NEXT: [[PV:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[QV:%.*]] = load i32, ptr [[Q:%.*]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[P]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[P]]) ; CHECK-NEXT: store i32 [[PV]], ptr [[P]], align 4 -; CHECK-NEXT: store i32 [[QV]], ptr [[Q]], align 4 ; CHECK-NEXT: ret void ; entry: + %p = alloca i64 + call void @llvm.lifetime.start.p0(i64 8, ptr %p) %pv = load i32, ptr %p %qv = load i32, ptr %q call void @llvm.lifetime.end.p0(i64 8, ptr %p) diff --git a/llvm/test/Transforms/GVN/assume.ll b/llvm/test/Transforms/GVN/assume.ll index 1498aa4..5d3a23b 100644 --- a/llvm/test/Transforms/GVN/assume.ll +++ b/llvm/test/Transforms/GVN/assume.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=gvn 
-verify-analysis-invalidation -S | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -verify-analysis-invalidation -S | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt < %s -passes='gvn<memoryssa>' -verify-analysis-invalidation -S | FileCheck --check-prefixes=CHECK,MSSA %s declare void @llvm.assume(i1) declare void @use(i1) diff --git a/llvm/test/Transforms/GVN/basic.ll b/llvm/test/Transforms/GVN/basic.ll index c1a358a..2e360aa 100644 --- a/llvm/test/Transforms/GVN/basic.ll +++ b/llvm/test/Transforms/GVN/basic.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes=gvn -S | FileCheck %s --check-prefixes=CHECK,MDEP -; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -S | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt < %s -passes='gvn<memoryssa>' -S | FileCheck --check-prefixes=CHECK,MSSA %s define i32 @main() { ; CHECK-LABEL: define i32 @main() { diff --git a/llvm/test/Transforms/GVN/lifetime-simple.ll b/llvm/test/Transforms/GVN/lifetime-simple.ll index bf7a6ef..177f43f 100644 --- a/llvm/test/Transforms/GVN/lifetime-simple.ll +++ b/llvm/test/Transforms/GVN/lifetime-simple.ll @@ -1,13 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes=gvn -S | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -target triple = "i386-apple-darwin7" - -define i8 @test(ptr %P) nounwind { -; CHECK: lifetime.start -; CHECK-NOT: load -; CHECK: lifetime.end +define i8 @test() nounwind { +; CHECK-LABEL: define i8 @test( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P:%.*]] = alloca [32 x i8], align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[P]]) +; CHECK-NEXT: store i8 1, ptr [[P]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[P]]) +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: ret i8 [[TMP0]] +; entry: + %P = alloca [32 x i8] call void @llvm.lifetime.start.p0(i64 32, ptr %P) %0 = load i8, ptr %P store i8 1, ptr %P diff --git a/llvm/test/Transforms/GVN/nonescaping.ll b/llvm/test/Transforms/GVN/nonescaping.ll index 2913755..0866a27 100644 --- a/llvm/test/Transforms/GVN/nonescaping.ll +++ b/llvm/test/Transforms/GVN/nonescaping.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -passes=gvn 2>&1 | FileCheck %s --check-prefixes=CHECK,MDEP -; RUN: opt < %s -S -passes='gvn<memoryssa;no-memdep>' 2>&1 | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt < %s -S -passes='gvn<memoryssa>' 2>&1 | FileCheck --check-prefixes=CHECK,MSSA %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/llvm/test/Transforms/GVN/opt-remarks.ll b/llvm/test/Transforms/GVN/opt-remarks.ll index 8fb2d57..87cd54d 100644 --- a/llvm/test/Transforms/GVN/opt-remarks.ll +++ b/llvm/test/Transforms/GVN/opt-remarks.ll @@ -107,7 +107,8 @@ entry: ret i32 %add } -define i8 @lifetime_end(ptr %p, i8 %val) { +define i8 @lifetime_end(i8 %val) { + %p = alloca [32 x i8] call void @llvm.lifetime.start.p0(i64 32, ptr %p) store i8 %val, ptr %p call void @llvm.lifetime.end.p0(i64 32, ptr %p) diff --git a/llvm/test/Transforms/GVN/phi.ll b/llvm/test/Transforms/GVN/phi.ll index 
5b607f7..a0207cf 100644 --- a/llvm/test/Transforms/GVN/phi.ll +++ b/llvm/test/Transforms/GVN/phi.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=gvn < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -S -passes='gvn<memoryssa;no-memdep>' < %s | FileCheck %s +; RUN: opt -S -passes='gvn<memoryssa>' < %s | FileCheck %s define i64 @test1(i1 %c, i64 %a, i64 %b) { diff --git a/llvm/test/Transforms/GVN/pr14166.ll b/llvm/test/Transforms/GVN/pr14166.ll index bbc8c89..6e23bdc 100644 --- a/llvm/test/Transforms/GVN/pr14166.ll +++ b/llvm/test/Transforms/GVN/pr14166.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -disable-basic-aa -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK,MDEP -; RUN: opt -disable-basic-aa -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -disable-basic-aa -passes='gvn<memoryssa>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s target datalayout = "e-p:32:32:32" define <2 x i32> @test1() { ; MDEP-LABEL: define <2 x i32> @test1() { diff --git a/llvm/test/Transforms/GVN/pre-compare.ll b/llvm/test/Transforms/GVN/pre-compare.ll index 574d40d..c4f083b 100644 --- a/llvm/test/Transforms/GVN/pre-compare.ll +++ b/llvm/test/Transforms/GVN/pre-compare.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=gvn -S < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -passes='gvn<memoryssa>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s ; C source: ; diff --git a/llvm/test/Transforms/GVN/readattrs.ll b/llvm/test/Transforms/GVN/readattrs.ll index be018834..6e02dd3 100644 --- a/llvm/test/Transforms/GVN/readattrs.ll +++ b/llvm/test/Transforms/GVN/readattrs.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=gvn -S -o - < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -passes='gvn<memoryssa;no-memdep>' -S -o - < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -passes='gvn<memoryssa>' -S -o - < %s | FileCheck --check-prefixes=CHECK,MSSA %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/GVN/setjmp.ll b/llvm/test/Transforms/GVN/setjmp.ll index 7777038..53518784 100644 --- a/llvm/test/Transforms/GVN/setjmp.ll +++ b/llvm/test/Transforms/GVN/setjmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=gvn < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -S -passes='gvn<memoryssa;no-memdep>' -verify-analysis-invalidation < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -S -passes='gvn<memoryssa>' -verify-analysis-invalidation < %s | FileCheck --check-prefixes=CHECK,MSSA %s declare i32 @setjmp() returns_twice declare void @longjmp() declare ptr @malloc(i64) diff --git a/llvm/test/Transforms/GVN/tbaa.ll b/llvm/test/Transforms/GVN/tbaa.ll index 366dfec..59ace14 100644 --- a/llvm/test/Transforms/GVN/tbaa.ll +++ b/llvm/test/Transforms/GVN/tbaa.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=gvn -S < %s | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -passes='gvn<memoryssa>' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s define i32 @test1(ptr %p, ptr %q) { ; MDEP-LABEL: define i32 @test1( diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll index 646a67d..5d6c559 100644 --- a/llvm/test/Transforms/GVN/vscale.ll +++ b/llvm/test/Transforms/GVN/vscale.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S < %s -passes=gvn,dce | FileCheck --check-prefixes=CHECK,MDEP %s -; RUN: opt -S < %s -passes='gvn<memoryssa;no-memdep>',dce | FileCheck --check-prefixes=CHECK,MSSA %s +; RUN: opt -S < %s -passes='gvn<memoryssa>',dce | FileCheck --check-prefixes=CHECK,MSSA %s ; Analyze Load from clobbering Load. diff --git a/llvm/test/Transforms/GVNSink/lifetime.ll b/llvm/test/Transforms/GVNSink/lifetime.ll new file mode 100644 index 0000000..1a8a69b --- /dev/null +++ b/llvm/test/Transforms/GVNSink/lifetime.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=gvn-sink < %s | FileCheck %s + +; Make sure we do not sink lifetime markers if this would introduce a +; lifetime with non-alloca operand. + +define void @test_cant_sink(i1 %c) { +; CHECK-LABEL: define void @test_cant_sink( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[B]]) +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: store i64 1, ptr [[A]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: br label %[[JOIN:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: store i64 1, ptr [[B]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[B]]) +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: ret void +; + %a = alloca i8 + %b = alloca i8 + call void @llvm.lifetime.start(i64 1, ptr %a) + call void @llvm.lifetime.start(i64 1, ptr %b) + br i1 %c, label %if, label %else + +if: + store i64 1, ptr %a + call void @llvm.lifetime.end(i64 1, ptr %a) + br label %join + +else: + store i64 1, ptr %b + call void @llvm.lifetime.end(i64 1, ptr %b) + br label %join + +join: + ret void +} + +define void @test_can_sink(i1 %c) { +; CHECK-LABEL: define void @test_can_sink( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: br label %[[JOIN:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: store i64 1, ptr [[A]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[A]]) +; CHECK-NEXT: ret void +; + %a = alloca i8 + call void @llvm.lifetime.start(i64 1, ptr %a) + br i1 %c, label %if, label %else + +if: + store i64 1, ptr %a + call void @llvm.lifetime.end(i64 1, ptr %a) + br label %join + +else: + store i64 1, ptr %a + call void @llvm.lifetime.end(i64 1, ptr %a) + br label %join + +join: + ret void +} diff --git 
a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-0.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-0.ll new file mode 100644 index 0000000..258bcfb --- /dev/null +++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-0.ll @@ -0,0 +1,15 @@ +; REQUIRES: amdgpu-registered-target +; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \ +; RUN: %s 2>&1 | FileCheck %s + +; CHECK: error: The first element in the Indirection Table must be an integer; %struct.anon.1 = type { ptr, ptr } is incorrect. +%struct.anon.1 = type { ptr, ptr } +%class.anon = type { %struct.anon.1, ptr, %struct.anon.1 } +@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8 +@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8 + +define amdgpu_kernel void @store(ptr %p) { +entry: + store ptr %p, ptr addrspace(1) @a, align 8 + ret void +} diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-1.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-1.ll new file mode 100644 index 0000000..331f4bf9 --- /dev/null +++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-1.ll @@ -0,0 +1,15 @@ +; REQUIRES: amdgpu-registered-target +; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \ +; RUN: %s 2>&1 | FileCheck %s + +; CHECK: error: The second element in the Indirection Table must be a pointer; %struct.anon.1 = type { ptr, ptr } is incorrect. +%struct.anon.1 = type { ptr, ptr } +%class.anon = type { i64, %struct.anon.1, %struct.anon.1 } +@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8 +@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8 + +define amdgpu_kernel void @store(ptr %p) { +entry: + store ptr %p, ptr addrspace(1) @a, align 8 + ret void +} diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-2.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-2.ll new file mode 100644 index 0000000..6bdedcb --- /dev/null +++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-2.ll @@ -0,0 +1,15 @@ +; REQUIRES: amdgpu-registered-target +; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \ +; RUN: %s 2>&1 | FileCheck %s + +; CHECK: error: The third element in the Indirection Table must be a struct type; i64 is incorrect. 
+%struct.anon.1 = type { ptr, ptr } +%class.anon = type { i64, ptr, i64 } +@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8 +@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8 + +define amdgpu_kernel void @store(ptr %p) { +entry: + store ptr %p, ptr addrspace(1) @a, align 8 + ret void +} diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-count.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-count.ll new file mode 100644 index 0000000..cf0efa0 --- /dev/null +++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-member-count.ll @@ -0,0 +1,14 @@ +; REQUIRES: amdgpu-registered-target +; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \ +; RUN: %s 2>&1 | FileCheck %s + +; CHECK: error: The Indirection Table must have 3 elements; 2 is incorrect. +%class.anon = type { i64, ptr } +@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8 +@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8 + +define amdgpu_kernel void @store(ptr %p) { +entry: + store ptr %p, ptr addrspace(1) @a, align 8 + ret void +} diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-type.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-type.ll new file mode 100644 index 0000000..f32e378 --- /dev/null +++ b/llvm/test/Transforms/HipStdPar/global-var-indirection-wrong-table-type.ll @@ -0,0 +1,13 @@ +; REQUIRES: amdgpu-registered-target +; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \ +; RUN: %s 2>&1 | FileCheck %s + +; CHECK: error: The Indirection Table must be a struct type; ptr is incorrect. 
+@a = external hidden local_unnamed_addr addrspace(1) global ptr, align 8 +@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant ptr zeroinitializer, align 8 + +define amdgpu_kernel void @store(ptr %p) { +entry: + store ptr %p, ptr addrspace(1) @a, align 8 + ret void +} diff --git a/llvm/test/Transforms/HipStdPar/global-var-indirection.ll b/llvm/test/Transforms/HipStdPar/global-var-indirection.ll new file mode 100644 index 0000000..98cace6 --- /dev/null +++ b/llvm/test/Transforms/HipStdPar/global-var-indirection.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --check-globals all --version 5 +; REQUIRES: amdgpu-registered-target +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \ +; RUN: %s | FileCheck %s + +%class.anon = type { i64, ptr, %struct.anon.1 } +%struct.anon.1 = type { ptr, ptr } +%struct.A = type { i32, i32, i32, i32, i32, double, [205 x double], [2000 x i32], [52000 x i32], [156000 x double], [14823 x double] } + +@do_not_indirect = protected addrspace(4) externally_initialized constant [4 x double] [double 1.000000e+00, double 1.000000e+00, double 2.000000e+00, double 6.000000e+00], align 16 +@a = external hidden local_unnamed_addr addrspace(1) global %struct.A, align 8 +@b = external hidden local_unnamed_addr addrspace(1) global ptr, align 8 +@c = internal addrspace(1) global { i32 } zeroinitializer, align 4 +@d = external hidden local_unnamed_addr addrspace(1) global ptr addrspace(1), align 8 +@__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon zeroinitializer, align 8 + +declare i64 @fn(i64 %x, i32 %y, i64 %z, i64 %w) + +;. +; CHECK: @do_not_indirect = protected addrspace(4) externally_initialized constant [4 x double] [double 1.000000e+00, double 1.000000e+00, double 2.000000e+00, double 6.000000e+00], align 16 +; CHECK: @[[GLOB0:[0-9]+]] = private addrspace(1) constant [2 x i8] c"a\00" +; CHECK: @[[GLOB1:[0-9]+]] = private addrspace(1) externally_initialized constant ptr addrspace(1) poison +; CHECK: @[[GLOB2:[0-9]+]] = private addrspace(1) constant [2 x i8] c"b\00" +; CHECK: @[[GLOB3:[0-9]+]] = private addrspace(1) externally_initialized constant ptr addrspace(1) poison +; CHECK: @[[GLOB4:[0-9]+]] = private addrspace(1) constant [2 x i8] c"c\00" +; CHECK: @[[GLOB5:[0-9]+]] = private addrspace(1) externally_initialized constant ptr addrspace(1) poison +; CHECK: @[[GLOB6:[0-9]+]] = private addrspace(1) constant [2 x i8] c"d\00" +; CHECK: @[[GLOB7:[0-9]+]] = private addrspace(1) externally_initialized constant ptr addrspace(1) poison +; CHECK: @[[GLOB8:[0-9]+]] = private addrspace(1) constant [4 x %struct.anon.1] [%struct.anon.1 { ptr addrspacecast (ptr addrspace(1) @[[GLOB0]] to ptr), ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr) }, %struct.anon.1 { ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr) }, %struct.anon.1 { ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), ptr addrspacecast (ptr addrspace(1) @[[GLOB5]] to ptr) }, %struct.anon.1 { ptr addrspacecast (ptr addrspace(1) @[[GLOB6]] to ptr), ptr addrspacecast (ptr addrspace(1) @[[GLOB7]] to ptr) }] +; CHECK: @__hipstdpar_symbol_indirection_table = weak_odr protected addrspace(4) externally_initialized constant %class.anon { i64 4, ptr addrspacecast (ptr addrspace(1) @[[GLOB8]] to ptr), %struct.anon.1 poison }, align 8 +;. 
+define double @gep(i64 %idx) { +; CHECK-LABEL: define double @gep( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(1) @[[GLOB1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[TMP0]], i64 217672 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [156000 x double], ptr addrspace(1) [[TMP1]], i64 0, i64 [[IDX]] +; CHECK-NEXT: [[R:%.*]] = load double, ptr addrspace(1) [[ARRAYIDX]], align 8 +; CHECK-NEXT: ret double [[R]] +; +entry: + %arrayidx = getelementptr inbounds [156000 x double], ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @a, i64 217672), i64 0, i64 %idx + %r = load double, ptr addrspace(1) %arrayidx, align 8 + ret double %r +} + +define void @store(ptr %p) { +; CHECK-LABEL: define void @store( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(1) @[[GLOB3]], align 8 +; CHECK-NEXT: store ptr [[P]], ptr addrspace(1) [[TMP0]], align 8 +; CHECK-NEXT: ret void +; +entry: + store ptr %p, ptr addrspace(1) @b, align 8 + ret void +} + +define i64 @chain(i64 %x, i32 %y, i64 %z) { +; CHECK-LABEL: define i64 @chain( +; CHECK-SAME: i64 [[X:%.*]], i32 [[Y:%.*]], i64 [[Z:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(1) @[[GLOB5]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @fn(i64 [[X]], i32 [[Y]], i64 [[TMP2]], i64 [[Z]]) +; CHECK-NEXT: ret i64 [[TMP3]] +; +entry: + %0 = call i64 @fn(i64 %x, i32 %y, i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @c to ptr) to i64), i64 %z) + ret i64 %0 +} + +define void @direct(ptr %p, i64 %n) { +; CHECK-LABEL: define void @direct( +; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(1) @[[GLOB7]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP0]], align 8 +; CHECK-NEXT: tail call void @llvm.memcpy.p0.p1.i64(ptr align 4 [[P]], ptr addrspace(1) align 4 [[TMP1]], i64 [[N]], i1 false) +; CHECK-NEXT: ret void +; +entry: + %0 = load ptr addrspace(1), ptr addrspace(1) @d, align 8 + tail call void @llvm.memcpy.p0.p1.i64(ptr align 4 %p, ptr addrspace(1) align 4 %0, i64 %n, i1 false) + ret void +} + +define amdgpu_kernel void @ensure_reachable(ptr %p, i64 %idx, i64 %x, i32 %y, i64 %z) { +; CHECK-LABEL: define amdgpu_kernel void @ensure_reachable( +; CHECK-SAME: ptr [[P:%.*]], i64 [[IDX:%.*]], i64 [[X:%.*]], i32 [[Y:%.*]], i64 [[Z:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @store(ptr [[P]]) +; CHECK-NEXT: [[TMP0:%.*]] = call double @gep(i64 [[IDX]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @chain(i64 [[X]], i32 [[Y]], i64 [[Z]]) +; CHECK-NEXT: call void @direct(ptr [[P]], i64 [[X]]) +; CHECK-NEXT: ret void +; +entry: + call void @store(ptr %p) + %0 = call double @gep(i64 %idx) + %1 = call i64 @chain(i64 %x, i32 %y, i64 %z) + call void @direct(ptr %p, i64 %x) + ret void +} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +;. 
diff --git a/llvm/test/Transforms/HipStdPar/global-var.ll b/llvm/test/Transforms/HipStdPar/global-var.ll index 860c30e..3a22a7b 100644 --- a/llvm/test/Transforms/HipStdPar/global-var.ll +++ b/llvm/test/Transforms/HipStdPar/global-var.ll @@ -2,8 +2,8 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \ ; RUN: %s | FileCheck %s -; CHECK: @var = extern_weak addrspace(1) externally_initialized global i32, align 4 -@var = addrspace(1) global i32 0, align 4 +; CHECK: @var = addrspace(1) global i32 poison, align 4 +@var = external addrspace(1) global i32, align 4 define amdgpu_kernel void @kernel() { entry: diff --git a/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll b/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll index e4e68ae..e5bab0c 100644 --- a/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll +++ b/llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs-1.ll @@ -36,11 +36,10 @@ outlinedPath: ; These two uses of stack slots are overlapping. This should prevent ; merging of stack slots. CodeExtractor must replicate the effects of ; these markers in the caller to inhibit stack coloring. - %gep1 = getelementptr inbounds i8, ptr %local1, i64 1 - call void @llvm.lifetime.start.p0(i64 1, ptr %gep1) + call void @llvm.lifetime.start.p0(i64 1, ptr %local1) call void @llvm.lifetime.start.p0(i64 1, ptr %local2) call void @cold_use2(ptr %local1, ptr %local2) - call void @llvm.lifetime.end.p0(i64 1, ptr %gep1) + call void @llvm.lifetime.end.p0(i64 1, ptr %local1) call void @llvm.lifetime.end.p0(i64 1, ptr %local2) br i1 undef, label %outlinedPath2, label %outlinedPathExit diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/lifetime.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/lifetime.ll index d39a0b3..053d073 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/lifetime.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/lifetime.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s define i32 @lifetime_flat_pointer() { @@ -5,18 +6,15 @@ define i32 @lifetime_flat_pointer() { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) ; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[ALLOCA]]) ; CHECK-NEXT: store i32 1, ptr addrspace(5) [[ALLOCA]], align 4 -; CHECK-NEXT: %ret = load i32, ptr addrspace(5) [[ALLOCA]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(5) [[ALLOCA]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[ALLOCA]]) -; CHECK-NEXT: ret i32 %ret +; CHECK-NEXT: ret i32 [[RET]] ; %alloca = alloca i32, align 4, addrspace(5) %flat = addrspacecast ptr addrspace(5) %alloca to ptr - call void @llvm.lifetime.start.p0(i64 4 , ptr %flat) + call void @llvm.lifetime.start(i64 4, ptr addrspace(5) %alloca) store i32 1, ptr %flat, align 4 %ret = load i32, ptr %flat, align 4 - call void @llvm.lifetime.end.p0(i64 4 , ptr %flat) + call void @llvm.lifetime.end(i64 4, ptr addrspace(5) %alloca) ret i32 %ret } - -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll index 8bf6312..31e914a 100644 --- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll +++ 
b/llvm/test/Transforms/InferAddressSpaces/NVPTX/lifetime.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=infer-address-spaces %s | FileCheck %s target triple = "nvptx64-nvidia-cuda" @@ -6,20 +7,18 @@ define i32 @lifetime_flat_pointer() { ; CHECK-LABEL: define i32 @lifetime_flat_pointer() { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[ALLOCA]] to ptr addrspace(5) -; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[ALLOCA]]) ; CHECK-NEXT: store i32 1, ptr addrspace(5) [[TMP1]], align 4 -; CHECK-NEXT: %ret = load i32, ptr addrspace(5) [[TMP1]], align 4 -; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) -; CHECK-NEXT: ret i32 %ret +; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[ALLOCA]]) +; CHECK-NEXT: ret i32 [[RET]] ; %alloca = alloca i32, align 4 %1 = addrspacecast ptr %alloca to ptr addrspace(5) - %2 = addrspacecast ptr addrspace(5) %1 to ptr - %3 = addrspacecast ptr addrspace(5) %1 to ptr - call void @llvm.lifetime.start.p0(i64 4, ptr %2) + call void @llvm.lifetime.start.p0(i64 4, ptr %alloca) store i32 1, ptr addrspace(5) %1, align 4 %ret = load i32, ptr addrspace(5) %1, align 4 - call void @llvm.lifetime.end.p0(i64 4, ptr %3) + call void @llvm.lifetime.end.p0(i64 4, ptr %alloca) ret i32 %ret } diff --git a/llvm/test/Transforms/Inline/alloca-bonus.ll b/llvm/test/Transforms/Inline/alloca-bonus.ll index 1dec660..45ff527 100644 --- a/llvm/test/Transforms/Inline/alloca-bonus.ll +++ b/llvm/test/Transforms/Inline/alloca-bonus.ll @@ -3,8 +3,6 @@ target datalayout = "p:32:32" -declare void @llvm.lifetime.start.p0(i64 %size, ptr nocapture %ptr) - @glbl = external global i32 define void @outer1() { @@ -20,7 +18,6 @@ define void @inner1(ptr %ptr) { store i32 0, ptr %ptr %D = getelementptr inbounds i32, ptr %ptr, i32 1 %F = select i1 false, ptr %ptr, ptr @glbl - call void @llvm.lifetime.start.p0(i64 0, ptr %ptr) call void @extern() ret void } @@ -39,7 +36,6 @@ define void @inner2(ptr %ptr) { store i32 0, ptr %ptr %D = getelementptr inbounds i32, ptr %ptr, i32 %A %F = select i1 false, ptr %ptr, ptr @glbl - call void @llvm.lifetime.start.p0(i64 0, ptr %ptr) call void @extern() ret void } @@ -146,7 +142,6 @@ define void @inner5(i1 %flag, ptr %ptr) { if.then: %D = getelementptr inbounds i32, ptr %ptr, i32 %A %F = select i1 false, ptr %ptr, ptr @glbl - call void @llvm.lifetime.start.p0(i64 0, ptr %ptr) ret void exit: diff --git a/llvm/test/Transforms/Inline/inlined-mustprogress-loop-metadata.ll b/llvm/test/Transforms/Inline/inlined-mustprogress-loop-metadata.ll index 12a328d..4e13ff4 100644 --- a/llvm/test/Transforms/Inline/inlined-mustprogress-loop-metadata.ll +++ b/llvm/test/Transforms/Inline/inlined-mustprogress-loop-metadata.ll @@ -1,7 +1,22 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --force-update +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5 ; RUN: opt < %s -S -passes="inline" | FileCheck %s define void @callee(i32 %a, i32 %b) #0 { +; CHECK: Function Attrs: mustprogress +; CHECK-LABEL: define void @callee( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] 
+; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: br label %[[WHILE_BODY]] +; entry: br label %for.cond for.cond: @@ -17,20 +32,20 @@ while.body: define void @caller(i32 %a, i32 %b) #1 { ; CHECK: Function Attrs: noinline -; CHECK-LABEL: define {{[^@]+}}@caller -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1:#.*]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: +; CHECK-LABEL: define void @caller( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br label [[FOR_COND]] -; CHECK: for.end: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: - ; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: callee.exit: +; CHECK-NEXT: br i1 [[CMP]], label %[[CALLEE_EXIT:.*]], label %[[FOR_END:.*]] +; CHECK: [[CALLEE_EXIT]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: br label %[[FOR_COND_I:.*]] +; CHECK: [[FOR_COND_I]]: +; CHECK-NEXT: br label %[[FOR_COND_I]], !llvm.loop [[LOOP0]] +; CHECK: [[CALLEE_EXIT1:.*:]] ; CHECK-NEXT: ret void ; entry: @@ -46,6 +61,20 @@ for.end: } define void @callee_no_metadata(i32 %a, i32 %b) { +; CHECK-LABEL: define void @callee_no_metadata( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: br label %[[WHILE_BODY]] +; entry: br label %for.cond for.cond: @@ -60,20 +89,20 @@ while.body: } define void @caller_no_metadata(i32 %a, i32 %b) { -; CHECK-LABEL: define {{[^@]+}}@caller_no_metadata -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: +; CHECK-LABEL: define void @caller_no_metadata( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br label [[FOR_COND]] -; CHECK: for.end: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: -; CHECK-NEXT: br label [[FOR_COND_I]] -; CHECK: callee_no_metadata.exit: +; CHECK-NEXT: br i1 [[CMP]], label %[[CALLEE_NO_METADATA_EXIT:.*]], label %[[FOR_END:.*]] +; CHECK: [[CALLEE_NO_METADATA_EXIT]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: br label %[[FOR_COND_I:.*]] +; CHECK: [[FOR_COND_I]]: +; CHECK-NEXT: br label %[[FOR_COND_I]] +; CHECK: [[CALLEE_NO_METADATA_EXIT1:.*:]] ; CHECK-NEXT: ret void ; entry: @@ -89,6 +118,21 @@ for.end: } define void @callee_mustprogress(i32 %a, 
i32 %b) #0 { +; CHECK: Function Attrs: mustprogress +; CHECK-LABEL: define void @callee_mustprogress( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: br label %[[WHILE_BODY]] +; entry: br label %for.cond for.cond: @@ -104,20 +148,20 @@ while.body: define void @caller_mustprogress(i32 %a, i32 %b) #0 { ; CHECK: Function Attrs: mustprogress -; CHECK-LABEL: define {{[^@]+}}@caller_mustprogress -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR0:#[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: +; CHECK-LABEL: define void @caller_mustprogress( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br label [[FOR_COND]] -; CHECK: for.end: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: -; CHECK-NEXT: br label [[FOR_COND_I]] -; CHECK: callee_mustprogress.exit: +; CHECK-NEXT: br i1 [[CMP]], label %[[CALLEE_MUSTPROGRESS_EXIT:.*]], label %[[FOR_END:.*]] +; CHECK: [[CALLEE_MUSTPROGRESS_EXIT]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: br label %[[FOR_COND_I:.*]] +; CHECK: [[FOR_COND_I]]: +; CHECK-NEXT: br label %[[FOR_COND_I]] +; CHECK: [[CALLEE_MUSTPROGRESS_EXIT1:.*:]] ; CHECK-NEXT: ret void ; entry: @@ -133,20 +177,20 @@ for.end: } define void @caller_mustprogress_callee_no_metadata(i32 %a, i32 %b) #0 { -; CHECK-LABEL: define {{[^@]+}}@caller_mustprogress_callee_no_metadata -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: +; CHECK-LABEL: define void @caller_mustprogress_callee_no_metadata( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br label [[FOR_COND]] -; CHECK: for.end: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: -; CHECK-NEXT: br label [[FOR_COND_I]] -; CHECK: callee_no_metadata.exit: +; CHECK-NEXT: br i1 [[CMP]], label %[[CALLEE_NO_METADATA_EXIT:.*]], label %[[FOR_END:.*]] +; CHECK: [[CALLEE_NO_METADATA_EXIT]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: br label %[[FOR_COND_I:.*]] +; CHECK: [[FOR_COND_I]]: +; CHECK-NEXT: br label %[[FOR_COND_I]] +; CHECK: [[CALLEE_NO_METADATA_EXIT1:.*:]] ; CHECK-NEXT: ret void ; entry: @@ -162,6 +206,42 @@ for.end: } define void @callee_multiple(i32 %a, i32 %b) #0 { +; CHECK: Function Attrs: mustprogress +; CHECK-LABEL: define void @callee_multiple( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[A]], ptr 
[[A_ADDR]], align 4 +; CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: store i32 0, ptr [[I]], align 4 +; CHECK-NEXT: br label %[[FOR_COND1:.*]] +; CHECK: [[FOR_COND1]]: +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 10 +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3:.*]], label %[[FOR_END4:.*]] +; CHECK: [[FOR_BODY3]]: +; CHECK-NEXT: br label %[[FOR_INC:.*]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 +; CHECK-NEXT: br label %[[FOR_COND1]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[FOR_END4]]: +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: br label %[[WHILE_BODY]] +; entry: %a.addr = alloca i32, align 4 %b.addr = alloca i32, align 4 @@ -198,9 +278,9 @@ while.body: define void @caller_multiple(i32 %a, i32 %b) #1 { ; CHECK: Function Attrs: noinline -; CHECK-LABEL: define {{[^@]+}}@caller_multiple -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @caller_multiple( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[I_I:%.*]] = alloca i32, align 4 @@ -209,59 +289,59 @@ define void @caller_multiple(i32 %a, i32 %b) #1 { ; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 ; CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br label [[FOR_COND]] -; CHECK: for.end: +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: ; CHECK-NEXT: store i32 0, ptr [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND1:%.*]] -; CHECK: for.cond1: +; CHECK-NEXT: br label %[[FOR_COND1:.*]] +; CHECK: [[FOR_COND1]]: ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 10 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3:%.*]], label [[FOR_END4:%.*]] -; CHECK: for.body3: -; CHECK-NEXT: br label [[FOR_INC:%.*]] -; CHECK: for.inc: +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3:.*]], label %[[FOR_END4:.*]] +; CHECK: [[FOR_BODY3]]: +; CHECK-NEXT: br label %[[FOR_INC:.*]] +; CHECK: [[FOR_INC]]: ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 ; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND1]] 
-; CHECK: for.end4: +; CHECK-NEXT: br label %[[FOR_COND1]] +; CHECK: [[FOR_END4]]: ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[A_ADDR_I]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[B_ADDR_I]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I_I]]) ; CHECK-NEXT: store i32 0, ptr [[A_ADDR_I]], align 4 ; CHECK-NEXT: store i32 5, ptr [[B_ADDR_I]], align 4 -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 -; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP7]], [[TMP8]] -; CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[FOR_END_I:%.*]] -; CHECK: for.body.i: - ; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP2:![0-9]+]] -; CHECK: for.end.i: +; CHECK-NEXT: br label %[[FOR_COND_I:.*]] +; CHECK: [[FOR_COND_I]]: +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: br i1 [[CMP_I]], label %[[FOR_BODY_I:.*]], label %[[FOR_END_I:.*]] +; CHECK: [[FOR_BODY_I]]: +; CHECK-NEXT: br label %[[FOR_COND_I]], !llvm.loop [[LOOP2]] +; CHECK: [[FOR_END_I]]: ; CHECK-NEXT: store i32 0, ptr [[I_I]], align 4 -; CHECK-NEXT: br label [[FOR_COND1_I:%.*]] -; CHECK: for.cond1.i: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I_I]], align 4 -; CHECK-NEXT: [[CMP2_I:%.*]] = icmp slt i32 [[TMP9]], 10 -; CHECK-NEXT: br i1 [[CMP2_I]], label [[FOR_BODY3_I:%.*]], label [[FOR_END4_I:%.*]] -; CHECK: for.body3.i: -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I_I]], align 4 -; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP10]], 1 +; CHECK-NEXT: br label %[[FOR_COND1_I:.*]] +; CHECK: [[FOR_COND1_I]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[I_I]], align 4 +; CHECK-NEXT: [[CMP2_I:%.*]] = icmp slt i32 [[TMP6]], 10 +; CHECK-NEXT: br i1 [[CMP2_I]], label %[[FOR_BODY3_I:.*]], label %[[CALLEE_MULTIPLE_EXIT:.*]] +; CHECK: [[FOR_BODY3_I]]: +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I_I]], align 4 +; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP7]], 1 ; CHECK-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4 -; CHECK-NEXT: br label [[FOR_COND1_I]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: for.end4.i: -; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]] -; CHECK: while.body.i: -; CHECK-NEXT: br label [[WHILE_BODY_I]] -; CHECK: callee_multiple.exit: +; CHECK-NEXT: br label %[[FOR_COND1_I]], !llvm.loop [[LOOP3]] +; CHECK: [[CALLEE_MULTIPLE_EXIT]]: +; CHECK-NEXT: br label %[[WHILE_BODY_I:.*]] +; CHECK: [[WHILE_BODY_I]]: +; CHECK-NEXT: br label %[[WHILE_BODY_I]] +; CHECK: [[CALLEE_MULTIPLE_EXIT1:.*:]] ; CHECK-NEXT: ret void ; entry: @@ -298,6 +378,51 @@ for.end4: } define void @callee_nested(i32 %a, i32 %b) #0 { +; CHECK: Function Attrs: mustprogress +; CHECK-LABEL: define void @callee_nested( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +; CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]] +; 
CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP0]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: store i32 0, ptr [[I]], align 4 +; CHECK-NEXT: br label %[[FOR_COND1:.*]] +; CHECK: [[FOR_COND1]]: +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 10 +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3:.*]], label %[[FOR_END8:.*]] +; CHECK: [[FOR_BODY3]]: +; CHECK-NEXT: br label %[[FOR_COND4:.*]] +; CHECK: [[FOR_COND4]]: +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: br i1 [[CMP5]], label %[[FOR_BODY6:.*]], label %[[FOR_END7:.*]] +; CHECK: [[FOR_BODY6]]: +; CHECK-NEXT: br label %[[FOR_COND4]], !llvm.loop [[LOOP2]] +; CHECK: [[FOR_END7]]: +; CHECK-NEXT: br label %[[FOR_INC:.*]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4 +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1 +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 +; CHECK-NEXT: br label %[[FOR_COND1]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[FOR_END8]]: +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: br label %[[WHILE_BODY]] +; entry: %a.addr = alloca i32, align 4 %b.addr = alloca i32, align 4 @@ -343,9 +468,9 @@ while.body: define void @caller_nested(i32 %a, i32 %b) #1 { ; CHECK: Function Attrs: noinline -; CHECK-LABEL: define {{[^@]+}}@caller_nested -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]] { -; CHECK-NEXT: entry: +; CHECK-LABEL: define void @caller_nested( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[I_I:%.*]] = alloca i32, align 4 @@ -355,91 +480,91 @@ define void @caller_nested(i32 %a, i32 %b) #1 { ; CHECK-NEXT: [[I9:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 ; CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END8:%.*]] -; CHECK: for.body: +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END8:.*]] +; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: store i32 0, ptr [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND1:%.*]] -; CHECK: for.cond1: +; CHECK-NEXT: br label %[[FOR_COND1:.*]] +; CHECK: [[FOR_COND1]]: ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 10 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3:%.*]], label [[FOR_END7:%.*]] -; CHECK: for.body3: -; CHECK-NEXT: br label [[FOR_COND4:%.*]] -; CHECK: for.cond4: +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3:.*]], label %[[FOR_END7:.*]] +; CHECK: [[FOR_BODY3]]: +; CHECK-NEXT: br label %[[FOR_COND4:.*]] +; CHECK: [[FOR_COND4]]: ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4 ; CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: br i1 
[[CMP5]], label [[FOR_BODY6:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body6: -; CHECK-NEXT: br label [[FOR_COND4]] -; CHECK: for.end: -; CHECK-NEXT: br label [[FOR_INC:%.*]] -; CHECK: for.inc: +; CHECK-NEXT: br i1 [[CMP5]], label %[[FOR_BODY6:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY6]]: +; CHECK-NEXT: br label %[[FOR_COND4]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: br label %[[FOR_INC:.*]] +; CHECK: [[FOR_INC]]: ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1 ; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND1]] -; CHECK: for.end7: -; CHECK-NEXT: br label [[FOR_COND]] -; CHECK: for.end8: +; CHECK-NEXT: br label %[[FOR_COND1]] +; CHECK: [[FOR_END7]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END8]]: ; CHECK-NEXT: store i32 0, ptr [[I9]], align 4 -; CHECK-NEXT: br label [[FOR_COND10:%.*]] -; CHECK: for.cond10: +; CHECK-NEXT: br label %[[FOR_COND10:.*]] +; CHECK: [[FOR_COND10]]: ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[I9]], align 4 ; CHECK-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP6]], 10 -; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY12:%.*]], label [[FOR_END15:%.*]] -; CHECK: for.body12: -; CHECK-NEXT: br label [[FOR_INC13:%.*]] -; CHECK: for.inc13: +; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY12:.*]], label %[[FOR_END15:.*]] +; CHECK: [[FOR_BODY12]]: +; CHECK-NEXT: br label %[[FOR_INC13:.*]] +; CHECK: [[FOR_INC13]]: ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I9]], align 4 ; CHECK-NEXT: [[INC14:%.*]] = add nsw i32 [[TMP7]], 1 ; CHECK-NEXT: store i32 [[INC14]], ptr [[I9]], align 4 -; CHECK-NEXT: br label [[FOR_COND10]] -; CHECK: for.end15: +; CHECK-NEXT: br label %[[FOR_COND10]] +; CHECK: [[FOR_END15]]: ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[A_ADDR_I]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[B_ADDR_I]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I_I]]) ; CHECK-NEXT: store i32 0, ptr [[A_ADDR_I]], align 4 ; CHECK-NEXT: store i32 5, ptr [[B_ADDR_I]], align 4 -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 -; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] -; CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[FOR_END_I:%.*]] -; CHECK: for.body.i: -; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP0]] -; CHECK: for.end.i: +; CHECK-NEXT: br label %[[FOR_COND_I:.*]] +; CHECK: [[FOR_COND_I]]: +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]] +; CHECK-NEXT: br i1 [[CMP_I]], label %[[FOR_BODY_I:.*]], label %[[FOR_END_I:.*]] +; CHECK: [[FOR_BODY_I]]: +; CHECK-NEXT: br label %[[FOR_COND_I]], !llvm.loop [[LOOP0]] +; CHECK: [[FOR_END_I]]: ; CHECK-NEXT: store i32 0, ptr [[I_I]], align 4 -; CHECK-NEXT: br label [[FOR_COND1_I:%.*]] -; CHECK: for.cond1.i: +; CHECK-NEXT: br label %[[FOR_COND1_I:.*]] +; CHECK: [[FOR_COND1_I]]: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I_I]], align 4 +; CHECK-NEXT: [[CMP2_I:%.*]] = icmp slt i32 [[TMP10]], 10 +; CHECK-NEXT: br i1 [[CMP2_I]], label %[[FOR_BODY3_I:.*]], label %[[CALLEE_NESTED_EXIT:.*]] +; CHECK: [[FOR_BODY3_I]]: +; CHECK-NEXT: br label %[[FOR_COND4_I:.*]] +; CHECK: [[FOR_COND4_I]]: +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +; 
CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +; CHECK-NEXT: [[CMP5_I:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] +; CHECK-NEXT: br i1 [[CMP5_I]], label %[[FOR_BODY6_I:.*]], label %[[FOR_END7_I:.*]] +; CHECK: [[FOR_BODY6_I]]: +; CHECK-NEXT: br label %[[FOR_COND4_I]], !llvm.loop [[LOOP2]] +; CHECK: [[FOR_END7_I]]: ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4 -; CHECK-NEXT: [[CMP2_I:%.*]] = icmp slt i32 [[TMP13]], 10 -; CHECK-NEXT: br i1 [[CMP2_I]], label [[FOR_BODY3_I:%.*]], label [[FOR_END8_I:%.*]] -; CHECK: for.body3.i: -; CHECK-NEXT: br label [[FOR_COND4_I:%.*]] -; CHECK: for.cond4.i: -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 -; CHECK-NEXT: [[CMP5_I:%.*]] = icmp slt i32 [[TMP14]], [[TMP15]] -; CHECK-NEXT: br i1 [[CMP5_I]], label [[FOR_BODY6_I:%.*]], label [[FOR_END7_I:%.*]] -; CHECK: for.body6.i: - ; CHECK-NEXT: br label [[FOR_COND4_I]], !llvm.loop [[LOOP2:![0-9]+]] -; CHECK: for.end7.i: -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[I_I]], align 4 -; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP16]], 1 +; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP13]], 1 ; CHECK-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4 -; CHECK-NEXT: br label [[FOR_COND1_I]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.end8.i: -; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]] -; CHECK: while.body.i: -; CHECK-NEXT: br label [[WHILE_BODY_I]] -; CHECK: callee_nested.exit: +; CHECK-NEXT: br label %[[FOR_COND1_I]], !llvm.loop [[LOOP4]] +; CHECK: [[CALLEE_NESTED_EXIT]]: +; CHECK-NEXT: br label %[[WHILE_BODY_I:.*]] +; CHECK: [[WHILE_BODY_I]]: +; CHECK-NEXT: br label %[[WHILE_BODY_I]] +; CHECK: [[CALLEE_NESTED_EXIT1:.*:]] ; CHECK-NEXT: ret void ; entry: @@ -499,14 +624,7 @@ for.end15: ret void } -; CHECK: attributes [[ATTR0]] = { mustprogress } -; CHECK: attributes [[ATTR1]] = { noinline } -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[GEN1:!.*]]} -; CHECK: [[GEN1]] = !{!"llvm.loop.mustprogress"} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[GEN1:!.*]]} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[GEN1:!.*]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[GEN1:!.*]]} attributes #0 = { mustprogress } attributes #1 = { noinline } @@ -520,3 +638,10 @@ attributes #2 = { noinline mustprogress } !5 = distinct !{!5, !1} !6 = distinct !{!6, !1} !7 = distinct !{!7, !1} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +;. diff --git a/llvm/test/Transforms/Inline/redundant-loads.ll b/llvm/test/Transforms/Inline/redundant-loads.ll index 773be78..3b066ef 100644 --- a/llvm/test/Transforms/Inline/redundant-loads.ll +++ b/llvm/test/Transforms/Inline/redundant-loads.ll @@ -104,11 +104,8 @@ define void @outer6(ptr %a, ptr %ptr) { ret void } -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) argmemonly nounwind - define void @inner6(ptr %a, ptr %ptr) { %1 = load i32, ptr %a - call void @llvm.lifetime.start.p0(i64 32, ptr %ptr) ; This intrinsic does not clobber the first load. 
%2 = load i32, ptr %a call void @pad() %3 = load i32, ptr %a diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll b/llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll new file mode 100644 index 0000000..d255eb0 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/wmma-f8f6f4.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s + +; ------------------------------------------------------------------------------------ +; Incorrect signature for format cases (IR vector too large) wmma.f32.16x16x128.f8f6f4 +; ------------------------------------------------------------------------------------ + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 2, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> [[TMP0]], i32 1, <16 x i32> [[B]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 3, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 3, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4( +; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <16 x i32> %B, i16 0, <8 x 
float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8( +; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4( +; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6( +; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> [[TMP0]], i32 2, <12 x i32> [[TMP1]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; 
CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4( +; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> [[TMP0]], i32 4, <8 x i32> [[TMP1]], i16 0, <8 x float> [[C]]) +; CHECK-NEXT: store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/Transforms/InstCombine/deadcode.ll b/llvm/test/Transforms/InstCombine/deadcode.ll index e65f0ab..f3e1ba6 100644 --- a/llvm/test/Transforms/InstCombine/deadcode.ll +++ b/llvm/test/Transforms/InstCombine/deadcode.ll @@ -26,8 +26,9 @@ declare void @llvm.lifetime.start.p0(i64, ptr) declare void @llvm.lifetime.end.p0(i64, ptr) define void @test3() { - call void @llvm.lifetime.start.p0(i64 -1, ptr undef) - call void @llvm.lifetime.end.p0(i64 -1, ptr undef) + %a = alloca i32 + call void @llvm.lifetime.start.p0(i64 -1, ptr %a) + call void @llvm.lifetime.end.p0(i64 -1, ptr %a) ret void } diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index 752ff0c..bb0a94c 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -682,15 +682,15 @@ define i32 @test28() nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ORIENTATIONS:%.*]] = alloca [1 x [1 x %struct.x]], align 8 ; CHECK-NEXT: [[T3:%.*]] = call i32 @puts(ptr noundef nonnull dereferenceable(1) @.str) #[[ATTR0]] -; CHECK-NEXT: [[T45:%.*]] = getelementptr inbounds nuw i8, ptr [[ORIENTATIONS]], i64 1 ; CHECK-NEXT: br label [[BB10:%.*]] ; CHECK: bb10: ; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[BB10]] ] ; CHECK-NEXT: [[T12_REC:%.*]] = xor i32 [[INDVAR]], -1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[T12_REC]] to i64 -; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i8, ptr [[T45]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i8, ptr [[ORIENTATIONS]], i64 [[TMP1]] ; CHECK-NEXT: [[T16:%.*]] = call i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) @.str1, ptr nonnull [[T12]]) #[[ATTR0]] -; CHECK-NEXT: [[T84:%.*]] = icmp eq i32 [[INDVAR]], 0 +; CHECK-NEXT: [[T84:%.*]] = icmp eq i64 [[TMP1]], 0 ; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 ; CHECK-NEXT: br i1 [[T84]], label [[BB17:%.*]], label [[BB10]] ; CHECK: bb17: diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll index 3f10405..aede844 100644 --- a/llvm/test/Transforms/InstCombine/icmp-gep.ll +++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll @@ -849,3 +849,279 @@ define i1 @gep_mugtiple_ugt_inbounds_nusw(ptr %base, i64 %idx, i64 %idx2) { %cmp = icmp ugt ptr %gep2, %base ret i1 %cmp } + +define i1 @gep_multiple_multi_use_below_limit(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3) { +; CHECK-LABEL: @gep_multiple_multi_use_below_limit( +; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl i64 [[IDX3:%.*]], 2 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2:%.*]], i64 [[GEP3_IDX]] +; CHECK-NEXT: call void @use(ptr [[GEP3]]) +; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl i64 [[IDX2:%.*]], 2 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[GEP2_IDX]] +; CHECK-NEXT: call void @use(ptr [[GEP4]]) +; CHECK-NEXT: [[GEP3_IDX1:%.*]] = shl i64 [[IDX4:%.*]], 2 +; CHECK-NEXT: [[GEP5:%.*]] = getelementptr i8, ptr [[GEP4]], i64 [[GEP3_IDX1]] +; CHECK-NEXT: call void @use(ptr [[GEP5]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[GEP3_IDX]], [[GEP2_IDX]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 0, [[GEP3_IDX1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr i32, ptr %base, i64 %idx1 + call void @use(ptr %gep1) + %gep2 = getelementptr i32, ptr %gep1, i64 %idx2 + call void @use(ptr %gep2) + %gep3 = getelementptr i32, ptr %gep2, i64 %idx3 + call void @use(ptr %gep3) + %cmp = icmp eq ptr %gep3, %base + ret i1 %cmp +} + +define i1 @gep_multiple_multi_use_below_limit_extra_one_use_gep1(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_multiple_multi_use_below_limit_extra_one_use_gep1( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl i64 [[IDX1:%.*]], 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[GEP1_IDX]] +; CHECK-NEXT: call void @use(ptr [[GEP1]]) +; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl i64 [[IDX2:%.*]], 2 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 [[GEP2_IDX]] +; CHECK-NEXT: call void @use(ptr [[GEP2]]) +; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl i64 [[IDX3:%.*]], 2 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[GEP3_IDX]] +; CHECK-NEXT: call void @use(ptr [[GEP3]]) +; CHECK-NEXT: [[GEP4_IDX_NEG:%.*]] = mul i64 [[IDX4:%.*]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[GEP1_IDX]], [[GEP2_IDX]] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[GEP3_IDX]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP2]], [[GEP4_IDX_NEG]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr i32, ptr %base, i64 %idx1 + call void @use(ptr %gep1) + %gep2 = getelementptr i32, ptr %gep1, i64 %idx2 + call void @use(ptr %gep2) + %gep3 = getelementptr i32, ptr %gep2, i64 %idx3 + call void @use(ptr %gep3) + %gep4 = getelementptr i32, ptr %gep3, i64 %idx4 + %cmp = icmp eq ptr %gep4, %base + ret i1 %cmp +} + +define i1 @gep_multiple_multi_use_below_limit_extra_one_use_gep2(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_multiple_multi_use_below_limit_extra_one_use_gep2( +; CHECK-NEXT: [[GEP1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]] +; 
CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[GEP1_IDX1]], 2 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +; CHECK-NEXT: call void @use(ptr [[GEP2]]) +; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl i64 [[IDX3:%.*]], 2 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[GEP3_IDX]] +; CHECK-NEXT: call void @use(ptr [[GEP3]]) +; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl i64 [[IDX4:%.*]], 2 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[GEP4_IDX]] +; CHECK-NEXT: call void @use(ptr [[GEP4]]) +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[GEP3_IDX]] +; CHECK-NEXT: [[GEP4_IDX_NEG:%.*]] = sub i64 0, [[GEP4_IDX]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP2]], [[GEP4_IDX_NEG]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr i32, ptr %base, i64 %idx1 + %gep2 = getelementptr i32, ptr %gep1, i64 %idx2 + call void @use(ptr %gep2) + %gep3 = getelementptr i32, ptr %gep2, i64 %idx3 + call void @use(ptr %gep3) + %gep4 = getelementptr i32, ptr %gep3, i64 %idx4 + call void @use(ptr %gep4) + %cmp = icmp eq ptr %gep4, %base + ret i1 %cmp +} + +define i1 @gep_multiple_multi_above_below_limit_consts(ptr %base, i64 %idx1, i64 %idx2) { +; CHECK-LABEL: @gep_multiple_multi_above_below_limit_consts( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 16 +; CHECK-NEXT: call void @use(ptr [[GEP1]]) +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[GEP1]], i64 [[IDX1:%.*]] +; CHECK-NEXT: call void @use(ptr [[GEP2]]) +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 16 +; CHECK-NEXT: call void @use(ptr [[GEP3]]) +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i32, ptr [[GEP3]], i64 [[IDX2:%.*]] +; CHECK-NEXT: call void @use(ptr [[GEP4]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[GEP4]], [[BASE]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr i32, ptr %base, i64 4 + call void @use(ptr %gep1) + %gep2 = getelementptr i32, ptr %gep1, i64 %idx1 + call void @use(ptr %gep2) + %gep3 = getelementptr i32, ptr %gep2, i64 4 + call void @use(ptr %gep3) + %gep4 = getelementptr i32, ptr %gep3, i64 %idx2 + call void @use(ptr %gep4) + %cmp = icmp eq ptr %gep4, %base + ret i1 %cmp +} + +define i1 @gep_multiple_multi_use_above_limit(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_multiple_multi_use_above_limit( +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[IDX1:%.*]] +; CHECK-NEXT: call void @use(ptr [[GEP4]]) +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i32, ptr [[GEP4]], i64 [[IDX2:%.*]] +; CHECK-NEXT: call void @use(ptr [[GEP3]]) +; CHECK-NEXT: [[GEP5:%.*]] = getelementptr i32, ptr [[GEP3]], i64 [[IDX3:%.*]] +; CHECK-NEXT: call void @use(ptr [[GEP5]]) +; CHECK-NEXT: [[GEP6:%.*]] = getelementptr i32, ptr [[GEP5]], i64 [[IDX4:%.*]] +; CHECK-NEXT: call void @use(ptr [[GEP6]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[GEP6]], [[BASE]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr i32, ptr %base, i64 %idx1 + call void @use(ptr %gep1) + %gep2 = getelementptr i32, ptr %gep1, i64 %idx2 + call void @use(ptr %gep2) + %gep3 = getelementptr i32, ptr %gep2, i64 %idx3 + call void @use(ptr %gep3) + %gep4 = getelementptr i32, ptr %gep3, i64 %idx4 + call void @use(ptr %gep4) + %cmp = icmp eq ptr %gep4, %base + ret i1 %cmp +} + +define i1 @gep_gep_multiple_eq(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_gep_multiple_eq( +; CHECK-NEXT: [[GEP1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]] +; CHECK-NEXT: [[GEP3_IDX2:%.*]] = add i64 [[IDX3:%.*]], 
[[IDX4:%.*]] +; CHECK-NEXT: [[CMP_UNSHIFTED:%.*]] = xor i64 [[GEP1_IDX1]], [[GEP3_IDX2]] +; CHECK-NEXT: [[CMP_MASK:%.*]] = and i64 [[CMP_UNSHIFTED]], 4611686018427387903 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[CMP_MASK]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr i32, ptr %base, i64 %idx1 + %gep2 = getelementptr i32, ptr %gep1, i64 %idx2 + %gep3 = getelementptr i32, ptr %base, i64 %idx3 + %gep4 = getelementptr i32, ptr %gep3, i64 %idx4 + %cmp = icmp eq ptr %gep2, %gep4 + ret i1 %cmp +} + +define i1 @gep_gep_multiple_eq_nuw(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_gep_multiple_eq_nuw( +; CHECK-NEXT: [[GEP1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]] +; CHECK-NEXT: [[GEP3_IDX2:%.*]] = add i64 [[IDX3:%.*]], [[IDX4:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[GEP1_IDX1]], [[GEP3_IDX2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1 + %gep2 = getelementptr nuw i32, ptr %gep1, i64 %idx2 + %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3 + %gep4 = getelementptr nuw i32, ptr %gep3, i64 %idx4 + %cmp = icmp eq ptr %gep2, %gep4 + ret i1 %cmp +} + +define i1 @gep_gep_multiple_eq_nuw_different_scales(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_gep_multiple_eq_nuw_different_scales( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i64 [[IDX1:%.*]], 2 +; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl nuw i64 [[IDX2:%.*]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[GEP1_IDX]], [[GEP2_IDX]] +; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl nuw i64 [[IDX3:%.*]], 2 +; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl nuw i64 [[IDX4:%.*]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[GEP3_IDX]], [[GEP4_IDX]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1 + %gep2 = getelementptr nuw i64, ptr %gep1, i64 %idx2 + %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3 + %gep4 = getelementptr nuw i64, ptr %gep3, i64 %idx4 + %cmp = icmp eq ptr %gep2, %gep4 + ret i1 %cmp +} + +define i1 @gep_gep_multiple_eq_partial_nuw_different_scales(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_gep_multiple_eq_partial_nuw_different_scales( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i64 [[IDX1:%.*]], 2 +; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl nuw i64 [[IDX2:%.*]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[GEP1_IDX]], [[GEP2_IDX]] +; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl nuw i64 [[IDX3:%.*]], 2 +; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl i64 [[IDX4:%.*]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[GEP3_IDX]], [[GEP4_IDX]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1 + %gep2 = getelementptr nuw i64, ptr %gep1, i64 %idx2 + %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3 + %gep4 = getelementptr i64, ptr %gep3, i64 %idx4 + %cmp = icmp eq ptr %gep2, %gep4 + ret i1 %cmp +} + +define i1 @gep_gep_multiple_eq_partial_inbounds_different_scales(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_gep_multiple_eq_partial_inbounds_different_scales( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[IDX1:%.*]], 2 +; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl nsw i64 [[IDX2:%.*]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[GEP1_IDX]], [[GEP2_IDX]] +; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl nsw i64 [[IDX3:%.*]], 2 +; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl i64 [[IDX4:%.*]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 
[[GEP3_IDX]], [[GEP4_IDX]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr inbounds i32, ptr %base, i64 %idx1 + %gep2 = getelementptr inbounds i64, ptr %gep1, i64 %idx2 + %gep3 = getelementptr inbounds i32, ptr %base, i64 %idx3 + %gep4 = getelementptr i64, ptr %gep3, i64 %idx4 + %cmp = icmp eq ptr %gep2, %gep4 + ret i1 %cmp +} + +define i1 @gep_gep_multiple_ult_nuw(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_gep_multiple_ult_nuw( +; CHECK-NEXT: [[GEP1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]] +; CHECK-NEXT: [[GEP3_IDX2:%.*]] = add i64 [[IDX3:%.*]], [[IDX4:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[GEP1_IDX1]], [[GEP3_IDX2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1 + %gep2 = getelementptr nuw i32, ptr %gep1, i64 %idx2 + %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3 + %gep4 = getelementptr nuw i32, ptr %gep3, i64 %idx4 + %cmp = icmp ult ptr %gep2, %gep4 + ret i1 %cmp +} + +define i1 @gep_gep_multiple_ult_missing_nuw(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_gep_multiple_ult_missing_nuw( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr nuw i32, ptr [[BASE:%.*]], i64 [[IDX1:%.*]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr nuw i32, ptr [[GEP1]], i64 [[IDX2:%.*]] +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr nuw i32, ptr [[BASE]], i64 [[IDX3:%.*]] +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i32, ptr [[GEP3]], i64 [[IDX4:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[GEP2]], [[GEP4]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1 + %gep2 = getelementptr nuw i32, ptr %gep1, i64 %idx2 + %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3 + %gep4 = getelementptr i32, ptr %gep3, i64 %idx4 + %cmp = icmp ult ptr %gep2, %gep4 + ret i1 %cmp +} + +define i1 @gep_gep_multiple_ult_nuw_multi_use(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @gep_gep_multiple_ult_nuw_multi_use( +; CHECK-NEXT: [[IDX3:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]] +; CHECK-NEXT: [[GEP3_IDX:%.*]] = shl nuw i64 [[IDX3]], 2 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr nuw i8, ptr [[BASE:%.*]], i64 [[GEP3_IDX]] +; CHECK-NEXT: [[IDX4:%.*]] = add i64 [[IDX5:%.*]], [[IDX6:%.*]] +; CHECK-NEXT: [[GEP4_IDX:%.*]] = shl nuw i64 [[IDX4]], 2 +; CHECK-NEXT: [[GEP5:%.*]] = getelementptr nuw i8, ptr [[BASE]], i64 [[GEP4_IDX]] +; CHECK-NEXT: call void @use(ptr [[GEP3]]) +; CHECK-NEXT: call void @use(ptr [[GEP5]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[GEP3_IDX]], [[GEP4_IDX]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nuw i32, ptr %base, i64 %idx1 + %gep2 = getelementptr nuw i32, ptr %gep1, i64 %idx2 + %gep3 = getelementptr nuw i32, ptr %base, i64 %idx3 + %gep4 = getelementptr nuw i32, ptr %gep3, i64 %idx4 + call void @use(ptr %gep2) + call void @use(ptr %gep4) + %cmp = icmp ult ptr %gep2, %gep4 + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstCombine/malloc-free.ll b/llvm/test/Transforms/InstCombine/malloc-free.ll index 989074f..d8a1c07 100644 --- a/llvm/test/Transforms/InstCombine/malloc-free.ll +++ b/llvm/test/Transforms/InstCombine/malloc-free.ll @@ -109,8 +109,6 @@ define void @test3(ptr %src) { ; CHECK-NEXT: ret void ; %a = call noalias ptr @malloc(i32 10) - call void @llvm.lifetime.start.p0(i64 10, ptr %a) - call void @llvm.lifetime.end.p0(i64 10, ptr %a) %size = call i64 @llvm.objectsize.i64(ptr %a, i1 true) store i8 42, ptr %a call void 
@llvm.memcpy.p0.p0.i32(ptr %a, ptr %src, i32 32, i1 false) diff --git a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll index 9a0a6ae..95753a2 100644 --- a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll +++ b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll @@ -174,16 +174,12 @@ define { <16 x i8>, <32 x i8> } @differenttypes({ <4 x i32>, <8 x i32> } %a, ptr ; CHECK-LABEL: define { <16 x i8>, <32 x i8> } @differenttypes ; CHECK-SAME: ({ <4 x i32>, <8 x i32> } [[A:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[P]]) ; CHECK-NEXT: store { <4 x i32>, <8 x i32> } [[A]], ptr [[P]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = load { <16 x i8>, <32 x i8> }, ptr [[P]], align 16 -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[P]]) ; CHECK-NEXT: ret { <16 x i8>, <32 x i8> } [[TMP0]] ; entry: - call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %p) #5 store { <4 x i32>, <8 x i32> } %a, ptr %p, align 16 %2 = load { <16 x i8>, <32 x i8> }, ptr %p, align 16 - call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %p) #5 ret { <16 x i8>, <32 x i8> } %2 } diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index 11af6b4..45e5686 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -945,19 +945,15 @@ define i64 @multiple_geps_two_chains_gep_base(ptr %base, i64 %base.idx, i64 %idx define i64 @multiple_geps_two_chains_multi_use(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { ; CHECK-LABEL: @multiple_geps_two_chains_multi_use( -; CHECK-NEXT: [[P2_IDX:%.*]] = shl nsw i64 [[IDX2:%.*]], 2 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 [[P2_IDX]] -; CHECK-NEXT: [[P4_IDX:%.*]] = shl nsw i64 [[IDX4:%.*]], 2 -; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P4_IDX]] -; CHECK-NEXT: [[P3_IDX:%.*]] = shl nsw i64 [[IDX3:%.*]], 2 -; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[P3_IDX]] -; CHECK-NEXT: [[P4_IDX1:%.*]] = shl nsw i64 [[IDX5:%.*]], 2 -; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P3]], i64 [[P4_IDX1]] +; CHECK-NEXT: [[P1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]] +; CHECK-NEXT: [[P4_IDX:%.*]] = shl i64 [[P1_IDX1]], 2 +; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 [[P4_IDX]] +; CHECK-NEXT: [[P3_IDX2:%.*]] = add i64 [[IDX3:%.*]], [[IDX4:%.*]] +; CHECK-NEXT: [[P4_IDX1:%.*]] = shl i64 [[P3_IDX2]], 2 +; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P4_IDX1]] ; CHECK-NEXT: call void @use(ptr [[P5]]) ; CHECK-NEXT: call void @use(ptr [[P4]]) -; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[P2_IDX]], [[P4_IDX]] -; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[P3_IDX]], [[P4_IDX1]] -; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[P4_IDX]], [[P4_IDX1]] ; CHECK-NEXT: ret i64 [[GEPDIFF]] ; %p1 = getelementptr inbounds i32, ptr %base, i64 %idx1 @@ -974,23 +970,18 @@ define i64 @multiple_geps_two_chains_multi_use(ptr %base, i64 %idx1, i64 %idx2, define i64 @multiple_geps_two_chains_partial_multi_use(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4, i64 %idx5, i64 %idx6) { ; CHECK-LABEL: @multiple_geps_two_chains_partial_multi_use( -; CHECK-NEXT: [[P2_IDX:%.*]] = shl nsw i64 [[IDX2:%.*]], 2 -; CHECK-NEXT: [[P2:%.*]] = getelementptr 
inbounds i8, ptr [[P1:%.*]], i64 [[P2_IDX]] -; CHECK-NEXT: [[P4_IDX:%.*]] = shl nsw i64 [[IDX4:%.*]], 2 -; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P4_IDX]] -; CHECK-NEXT: [[P3_IDX:%.*]] = shl nsw i64 [[IDX3:%.*]], 2 -; CHECK-NEXT: [[P4_IDX1:%.*]] = shl nsw i64 [[IDX7:%.*]], 2 -; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[P4_IDX1]] -; CHECK-NEXT: [[P5_IDX:%.*]] = shl nsw i64 [[IDX5:%.*]], 2 -; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P5]], i64 [[P5_IDX]] -; CHECK-NEXT: [[P6_IDX:%.*]] = shl nsw i64 [[IDX6:%.*]], 2 +; CHECK-NEXT: [[P1_IDX1:%.*]] = add i64 [[IDX1:%.*]], [[IDX2:%.*]] +; CHECK-NEXT: [[P4_IDX:%.*]] = shl i64 [[P1_IDX1]], 2 +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 [[P4_IDX]] +; CHECK-NEXT: [[P4_IDX2:%.*]] = add i64 [[IDX4:%.*]], [[IDX5:%.*]] +; CHECK-NEXT: [[P5_IDX:%.*]] = shl i64 [[P4_IDX2]], 2 +; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[P5_IDX]] ; CHECK-NEXT: call void @use(ptr [[P3]]) ; CHECK-NEXT: call void @use(ptr [[P4]]) -; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[P2_IDX]], [[P4_IDX]] -; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP1]], [[P3_IDX]] -; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[P4_IDX1]], [[P5_IDX]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], [[P6_IDX]] -; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[P1_IDX1]], [[IDX3:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[P4_IDX2]], [[IDX6:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[GEPDIFF:%.*]] = shl i64 [[TMP5]], 2 ; CHECK-NEXT: ret i64 [[GEPDIFF]] ; %p1 = getelementptr inbounds i32, ptr %base, i64 %idx1 @@ -1007,6 +998,29 @@ define i64 @multiple_geps_two_chains_partial_multi_use(ptr %base, i64 %idx1, i64 ret i64 %d } +define i64 @multiple_geps_two_chains_partial_multi_use_insert_point(ptr %p, i64 %idx1, i64 %idx2, i64 %idx3) { +; CHECK-LABEL: @multiple_geps_two_chains_partial_multi_use_insert_point( +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8 +; CHECK-NEXT: call void @use(ptr [[GEP2]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[IDX2:%.*]], [[IDX3:%.*]] +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[TMP1]] +; CHECK-NEXT: call void @use(ptr [[GEP4]]) +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 8 +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub i64 [[IDX1:%.*]], [[TMP2]] +; CHECK-NEXT: ret i64 [[GEPDIFF]] +; + %gep1 = getelementptr i8, ptr %p, i64 %idx1 + %gep2 = getelementptr i8, ptr %p, i64 8 + call void @use(ptr %gep2) + %gep3 = getelementptr i8, ptr %gep2, i64 %idx2 + %gep4 = getelementptr i8, ptr %gep3, i64 %idx3 + call void @use(ptr %gep4) + %gep1.int = ptrtoint ptr %gep1 to i64 + %gep4.int = ptrtoint ptr %gep4 to i64 + %sub = sub i64 %gep1.int, %gep4.int + ret i64 %sub +} + define i64 @multiple_geps_inbounds(ptr %base, i64 %idx, i64 %idx2) { ; CHECK-LABEL: @multiple_geps_inbounds( ; CHECK-NEXT: [[D:%.*]] = add nsw i64 [[IDX:%.*]], [[IDX2:%.*]] @@ -1158,3 +1172,65 @@ define i64 @nuw_ptrdiff_mul_nsw_nneg_scale_multiuse(ptr %base, i64 %idx) { %diff = sub nuw i64 %lhs, %rhs ret i64 %diff } + +define i64 @multiple_geps_multi_use_below_limit(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4) { +; CHECK-LABEL: @multiple_geps_multi_use_below_limit( +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 [[IDX2:%.*]] +; CHECK-NEXT: call void @use(ptr [[P2]]) +; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds nuw i8, ptr [[P2]], 
i64 [[IDX5:%.*]] +; CHECK-NEXT: call void @use(ptr [[P4]]) +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1]], i64 [[IDX3:%.*]] +; CHECK-NEXT: call void @use(ptr [[P3]]) +; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds nuw i8, ptr [[P3]], i64 [[IDX4:%.*]] +; CHECK-NEXT: call void @use(ptr [[P5]]) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[IDX2]], [[IDX5]] +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[IDX3]], [[IDX4]] +; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i64 [[GEPDIFF]] +; + %p1 = getelementptr inbounds nuw i8, ptr %base, i64 %idx1 + call void @use(ptr %p1) + %p2 = getelementptr inbounds nuw i8, ptr %p1, i64 %idx2 + call void @use(ptr %p2) + %p3 = getelementptr inbounds nuw i8, ptr %base, i64 %idx3 + call void @use(ptr %p3) + %p4 = getelementptr inbounds nuw i8, ptr %p3, i64 %idx4 + call void @use(ptr %p4) + %i1 = ptrtoint ptr %p4 to i64 + %i2 = ptrtoint ptr %p2 to i64 + %d = sub i64 %i2, %i1 + ret i64 %d +} + +define i64 @multiple_geps_multi_use_above_limit(ptr %base, i64 %idx1, i64 %idx2, i64 %idx3, i64 %idx4, i64 %idx5) { +; CHECK-LABEL: @multiple_geps_multi_use_above_limit( +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 [[IDX2:%.*]] +; CHECK-NEXT: call void @use(ptr [[P2]]) +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds nuw i8, ptr [[P2]], i64 [[IDX6:%.*]] +; CHECK-NEXT: call void @use(ptr [[P3]]) +; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds nuw i8, ptr [[P1]], i64 [[TMP3:%.*]] +; CHECK-NEXT: call void @use(ptr [[P5]]) +; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds nuw i8, ptr [[P5]], i64 [[IDX7:%.*]] +; CHECK-NEXT: call void @use(ptr [[P6]]) +; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds nuw i8, ptr [[P6]], i64 [[IDX5:%.*]] +; CHECK-NEXT: call void @use(ptr [[P7]]) +; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[P7]] to i64 +; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[P3]] to i64 +; CHECK-NEXT: [[D:%.*]] = sub i64 [[I2]], [[I1]] +; CHECK-NEXT: ret i64 [[D]] +; + %p1 = getelementptr inbounds nuw i8, ptr %base, i64 %idx1 + call void @use(ptr %p1) + %p2 = getelementptr inbounds nuw i8, ptr %p1, i64 %idx2 + call void @use(ptr %p2) + %p3 = getelementptr inbounds nuw i8, ptr %base, i64 %idx3 + call void @use(ptr %p3) + %p4 = getelementptr inbounds nuw i8, ptr %p3, i64 %idx4 + call void @use(ptr %p4) + %p5 = getelementptr inbounds nuw i8, ptr %p4, i64 %idx5 + call void @use(ptr %p5) + %i1 = ptrtoint ptr %p5 to i64 + %i2 = ptrtoint ptr %p2 to i64 + %d = sub i64 %i2, %i1 + ret i64 %d +} diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll new file mode 100644 index 0000000..75b8509 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll @@ -0,0 +1,646 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s + +; Test constant-folding for various NVVM unary arithmetic intrinsics. 
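+;
+; A note on the intrinsic naming used below (stated here as background, not as
+; an autogenerated check): the .ftz.f ("flush to zero") variants treat
+; subnormal float inputs and results as zero, so on subnormal arguments they
+; may fold to a different constant than the corresponding non-ftz intrinsic,
+; or (as for the rcp.*.ftz.f calls below) not fold at all. For example, the
+; subnormal operand in
+;   %res = call float @llvm.nvvm.fabs.ftz(float 0xB80FFFFFC0000000)
+; is flushed, so the call folds to 0.0 rather than to the operand's
+; magnitude. The .rm/.rn/.rp/.rz infixes select rounding toward negative
+; infinity, to nearest even, toward positive infinity, and toward zero,
+; respectively.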
+ +;############################################################### +;# Ceil # +;############################################################### + +define double @test_ceil_d_1_25() { +; CHECK-LABEL: define double @test_ceil_d_1_25() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.ceil.d(double 1.25) + ret double %res +} + +define float @test_ceil_f_1_25() { +; CHECK-LABEL: define float @test_ceil_f_1_25() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.ceil.f(float 1.25) + ret float %res +} + +define float @test_ceil_ftz_f_1_25() { +; CHECK-LABEL: define float @test_ceil_ftz_f_1_25() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.ceil.ftz.f(float 1.25) + ret float %res +} + +define double @test_ceil_d_pos_subnorm() { +; CHECK-LABEL: define double @test_ceil_d_pos_subnorm() { +; CHECK-NEXT: ret double 1.000000e+00 +; + %res = call double @llvm.nvvm.ceil.d(double 0x380FFFFFC0000000) + ret double %res +} + +define float @test_ceil_f_pos_subnorm() { +; CHECK-LABEL: define float @test_ceil_f_pos_subnorm() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.ceil.f(float 0x380FFFFFC0000000) + ret float %res +} + +define float @test_ceil_ftz_f_pos_subnorm() { +; CHECK-LABEL: define float @test_ceil_ftz_f_pos_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.ceil.ftz.f(float 0x380FFFFFC0000000) + ret float %res +} + +;############################################################### +;# FAbs # +;############################################################### + +define float @test_fabs_neg_1_5() { +; CHECK-LABEL: define float @test_fabs_neg_1_5() { +; CHECK-NEXT: ret float 1.500000e+00 +; + %res = call float @llvm.nvvm.fabs(float -1.5) + ret float %res +} + +define float @test_fabs_ftz_neg_1_5() { +; CHECK-LABEL: define float @test_fabs_ftz_neg_1_5() { +; CHECK-NEXT: ret float 1.500000e+00 +; + %res = call float @llvm.nvvm.fabs.ftz(float -1.5) + ret float %res +} + +define float @test_fabs_1_25() { +; CHECK-LABEL: define float @test_fabs_1_25() { +; CHECK-NEXT: ret float 1.250000e+00 +; + %res = call float @llvm.nvvm.fabs(float 1.25) + ret float %res +} + +define float @test_fabs_ftz_1_25() { +; CHECK-LABEL: define float @test_fabs_ftz_1_25() { +; CHECK-NEXT: ret float 1.250000e+00 +; + %res = call float @llvm.nvvm.fabs.ftz(float 1.25) + ret float %res +} + +define float @test_fabs_neg_subnorm() { +; CHECK-LABEL: define float @test_fabs_neg_subnorm() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fabs(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fabs_ftz_neg_subnorm() { +; CHECK-LABEL: define float @test_fabs_ftz_neg_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fabs.ftz(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fabs_pos_subnorm() { +; CHECK-LABEL: define float @test_fabs_pos_subnorm() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fabs(float 0x380FFFFFC0000000) + ret float %res +} + +define float @test_fabs_ftz_pos_subnorm() { +; CHECK-LABEL: define float @test_fabs_ftz_pos_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fabs.ftz(float 0x380FFFFFC0000000) + ret float %res +} + + +;############################################################### +;# Floor # +;############################################################### + +define double @test_floor_d_1_25() { +; CHECK-LABEL: 
define double @test_floor_d_1_25() { +; CHECK-NEXT: ret double 1.000000e+00 +; + %res = call double @llvm.nvvm.floor.d(double 1.25) + ret double %res +} + +define float @test_floor_f_1_25() { +; CHECK-LABEL: define float @test_floor_f_1_25() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.floor.f(float 1.25) + ret float %res +} + +define float @test_floor_ftz_f_1_25() { +; CHECK-LABEL: define float @test_floor_ftz_f_1_25() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.floor.ftz.f(float 1.25) + ret float %res +} + +define double @test_floor_d_neg_subnorm() { +; CHECK-LABEL: define double @test_floor_d_neg_subnorm() { +; CHECK-NEXT: ret double -1.000000e+00 +; + %res = call double @llvm.nvvm.floor.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_floor_f_neg_subnorm() { +; CHECK-LABEL: define float @test_floor_f_neg_subnorm() { +; CHECK-NEXT: ret float -1.000000e+00 +; + %res = call float @llvm.nvvm.floor.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_floor_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_floor_ftz_f_neg_subnorm() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.floor.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;############################################################### +;# Rcp # +;############################################################### + +;+-------------------------------------------------------------+ +;| rcp_rm | +;+-------------------------------------------------------------+ +define double @test_rcp_rm_d_0_5() { +; CHECK-LABEL: define double @test_rcp_rm_d_0_5() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.rcp.rm.d(double 0.5) + ret double %res +} + +define float @test_rcp_rm_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rm_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rm.f(float 0.5) + ret float %res +} + +define float @test_rcp_rm_ftz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rm_ftz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rm.ftz.f(float 0.5) + ret float %res +} + +define double @test_rcp_rm_d_neg_subnorm() { +; CHECK-LABEL: define double @test_rcp_rm_d_neg_subnorm() { +; CHECK-NEXT: ret double 0xC7D0000020000041 +; + %res = call double @llvm.nvvm.rcp.rm.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_rcp_rm_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rm_f_neg_subnorm() { +; CHECK-NEXT: ret float 0xC7D0000040000000 +; + %res = call float @llvm.nvvm.rcp.rm.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_rcp_rm_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rm_ftz_f_neg_subnorm() { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rm.ftz.f(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret float [[RES]] +; + %res = call float @llvm.nvvm.rcp.rm.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;+-------------------------------------------------------------+ +;| rcp_rn | +;+-------------------------------------------------------------+ +define double @test_rcp_rn_d_0_5() { +; CHECK-LABEL: define double @test_rcp_rn_d_0_5() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.rcp.rn.d(double 0.5) + ret double %res +} + +define float @test_rcp_rn_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rn_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rn.f(float 0.5) 
+ ret float %res +} + +define float @test_rcp_rn_ftz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rn_ftz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rn.ftz.f(float 0.5) + ret float %res +} + +define double @test_rcp_rn_d_neg_subnorm() { +; CHECK-LABEL: define double @test_rcp_rn_d_neg_subnorm() { +; CHECK-NEXT: ret double 0xC7D0000020000040 +; + %res = call double @llvm.nvvm.rcp.rn.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_rcp_rn_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rn_f_neg_subnorm() { +; CHECK-NEXT: ret float 0xC7D0000020000000 +; + %res = call float @llvm.nvvm.rcp.rn.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_rcp_rn_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rn_ftz_f_neg_subnorm() { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rn.ftz.f(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret float [[RES]] +; + %res = call float @llvm.nvvm.rcp.rn.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;+-------------------------------------------------------------+ +;| rcp_rp | +;+-------------------------------------------------------------+ +define double @test_rcp_rp_d_0_5() { +; CHECK-LABEL: define double @test_rcp_rp_d_0_5() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.rcp.rp.d(double 0.5) + ret double %res +} + +define float @test_rcp_rp_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rp_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rp.f(float 0.5) + ret float %res +} + +define float @test_rcp_rp_ftz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rp_ftz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rp.ftz.f(float 0.5) + ret float %res +} + +define double @test_rcp_rp_d_neg_subnorm() { +; CHECK-LABEL: define double @test_rcp_rp_d_neg_subnorm() { +; CHECK-NEXT: ret double 0xC7D0000020000040 +; + %res = call double @llvm.nvvm.rcp.rp.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_rcp_rp_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rp_f_neg_subnorm() { +; CHECK-NEXT: ret float 0xC7D0000020000000 +; + %res = call float @llvm.nvvm.rcp.rp.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_rcp_rp_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rp_ftz_f_neg_subnorm() { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rp.ftz.f(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret float [[RES]] +; + %res = call float @llvm.nvvm.rcp.rp.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;+-------------------------------------------------------------+ +;| rcp_rz | +;+-------------------------------------------------------------+ +define double @test_rcp_rz_d_0_5() { +; CHECK-LABEL: define double @test_rcp_rz_d_0_5() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.rcp.rz.d(double 0.5) + ret double %res +} + +define float @test_rcp_rz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rz.f(float 0.5) + ret float %res +} + +define float @test_rcp_rz_ftz_f_0_5() { +; CHECK-LABEL: define float @test_rcp_rz_ftz_f_0_5() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.rcp.rz.ftz.f(float 0.5) + ret float %res +} + +define double @test_rcp_rz_d_neg_subnorm() { +; CHECK-LABEL: define double @test_rcp_rz_d_neg_subnorm() { +; CHECK-NEXT: ret double 
0xC7D0000020000040 +; + %res = call double @llvm.nvvm.rcp.rz.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_rcp_rz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rz_f_neg_subnorm() { +; CHECK-NEXT: ret float 0xC7D0000020000000 +; + %res = call float @llvm.nvvm.rcp.rz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_rcp_rz_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_rcp_rz_ftz_f_neg_subnorm() { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.nvvm.rcp.rz.ftz.f(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret float [[RES]] +; + %res = call float @llvm.nvvm.rcp.rz.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;############################################################### +;# Round # +;############################################################### + +define double @test_round_d_neg_1_5() { +; CHECK-LABEL: define double @test_round_d_neg_1_5() { +; CHECK-NEXT: ret double -2.000000e+00 +; + %res = call double @llvm.nvvm.round.d(double -1.5) + ret double %res +} + +define float @test_round_f_neg_1_5() { +; CHECK-LABEL: define float @test_round_f_neg_1_5() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.round.f(float -1.5) + ret float %res +} + +define float @test_round_ftz_f_neg_1_5() { +; CHECK-LABEL: define float @test_round_ftz_f_neg_1_5() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.round.ftz.f(float -1.5) + ret float %res +} + +define double @test_round_d_neg_subnorm() { +; CHECK-LABEL: define double @test_round_d_neg_subnorm() { +; CHECK-NEXT: ret double -0.000000e+00 +; + %res = call double @llvm.nvvm.round.d(double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_round_f_neg_subnorm() { +; CHECK-LABEL: define float @test_round_f_neg_subnorm() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.round.f(float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_round_ftz_f_neg_subnorm() { +; CHECK-LABEL: define float @test_round_ftz_f_neg_subnorm() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.round.ftz.f(float 0xB80FFFFFC0000000) + ret float %res +} + +;############################################################### +;# Saturate # +;############################################################### + +define double @test_saturate_d_1_25() { +; CHECK-LABEL: define double @test_saturate_d_1_25() { +; CHECK-NEXT: ret double 1.000000e+00 +; + %res = call double @llvm.nvvm.saturate.d(double 1.25) + ret double %res +} + +define float @test_saturate_f_1_25() { +; CHECK-LABEL: define float @test_saturate_f_1_25() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.saturate.f(float 1.25) + ret float %res +} + +define float @test_saturate_ftz_f_1_25() { +; CHECK-LABEL: define float @test_saturate_ftz_f_1_25() { +; CHECK-NEXT: ret float 1.000000e+00 +; + %res = call float @llvm.nvvm.saturate.ftz.f(float 1.25) + ret float %res +} + +define double @test_saturate_d_neg_1_25() { +; CHECK-LABEL: define double @test_saturate_d_neg_1_25() { +; CHECK-NEXT: ret double 0.000000e+00 +; + %res = call double @llvm.nvvm.saturate.d(double -1.25) + ret double %res +} + +define float @test_saturate_f_neg_1_25() { +; CHECK-LABEL: define float @test_saturate_f_neg_1_25() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.saturate.f(float -1.25) + ret float %res +} + +define float @test_saturate_ftz_f_neg_1_25() { +; CHECK-LABEL: define float @test_saturate_ftz_f_neg_1_25() { 
+; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.saturate.ftz.f(float -1.25) + ret float %res +} + +define double @test_saturate_d_0_5() { +; CHECK-LABEL: define double @test_saturate_d_0_5() { +; CHECK-NEXT: ret double 5.000000e-01 +; + %res = call double @llvm.nvvm.saturate.d(double 0.5) + ret double %res +} + +define float @test_saturate_f_0_5() { +; CHECK-LABEL: define float @test_saturate_f_0_5() { +; CHECK-NEXT: ret float 5.000000e-01 +; + %res = call float @llvm.nvvm.saturate.f(float 0.5) + ret float %res +} + +define float @test_saturate_ftz_f_0_5() { +; CHECK-LABEL: define float @test_saturate_ftz_f_0_5() { +; CHECK-NEXT: ret float 5.000000e-01 +; + %res = call float @llvm.nvvm.saturate.ftz.f(float 0.5) + ret float %res +} + +define double @test_saturate_d_pos_subnorm() { +; CHECK-LABEL: define double @test_saturate_d_pos_subnorm() { +; CHECK-NEXT: ret double 0x380FFFFFC0000000 +; + %res = call double @llvm.nvvm.saturate.d(double 0x380FFFFFC0000000) + ret double %res +} + +define float @test_saturate_f_pos_subnorm() { +; CHECK-LABEL: define float @test_saturate_f_pos_subnorm() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.saturate.f(float 0x380FFFFFC0000000) + ret float %res +} + +define float @test_saturate_ftz_f_pos_subnorm() { +; CHECK-LABEL: define float @test_saturate_ftz_f_pos_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.saturate.ftz.f(float 0x380FFFFFC0000000) + ret float %res +} + +;############################################################### +;# Sqrt # +;############################################################### + +define float @test_sqrt_f_4() { +; CHECK-LABEL: define float @test_sqrt_f_4() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.sqrt.f(float 4.0) + ret float %res +} + +define float @test_sqrt_rn_f_4() { +; CHECK-LABEL: define float @test_sqrt_rn_f_4() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.sqrt.rn.f(float 4.0) + ret float %res +} + +define double @test_sqrt_rn_d_4() { +; CHECK-LABEL: define double @test_sqrt_rn_d_4() { +; CHECK-NEXT: ret double 2.000000e+00 +; + %res = call double @llvm.nvvm.sqrt.rn.d(double 4.0) + ret double %res +} + +define float @test_sqrt_rn_ftz_f_4() { +; CHECK-LABEL: define float @test_sqrt_rn_ftz_f_4() { +; CHECK-NEXT: ret float 2.000000e+00 +; + %res = call float @llvm.nvvm.sqrt.rn.ftz.f(float 4.0) + ret float %res +} + +define float @test_sqrt_f_pos_subnorm() { +; CHECK-LABEL: define float @test_sqrt_f_pos_subnorm() { +; CHECK-NEXT: ret float 0x3BFFFFFFE0000000 +; + %res = call float @llvm.nvvm.sqrt.f(float 0x380FFFFFC0000000) + ret float %res +} + +define float @test_sqrt_rn_f_pos_subnorm() { +; CHECK-LABEL: define float @test_sqrt_rn_f_pos_subnorm() { +; CHECK-NEXT: ret float 0x3BFFFFFFE0000000 +; + %res = call float @llvm.nvvm.sqrt.rn.f(float 0x380FFFFFC0000000) + ret float %res +} + +define double @test_sqrt_rn_d_pos_subnorm() { +; CHECK-LABEL: define double @test_sqrt_rn_d_pos_subnorm() { +; CHECK-NEXT: ret double 0x3BFFFFFFDFFFFFF0 +; + %res = call double @llvm.nvvm.sqrt.rn.d(double 0x380FFFFFC0000000) + ret double %res +} + +define float @test_sqrt_rn_ftz_f_pos_subnorm() { +; CHECK-LABEL: define float @test_sqrt_rn_ftz_f_pos_subnorm() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.sqrt.rn.ftz.f(float 0x380FFFFFC0000000) + ret float %res +} + +declare double @llvm.nvvm.ceil.d(double) +declare float @llvm.nvvm.ceil.f(float) +declare float 
@llvm.nvvm.ceil.ftz.f(float) + +declare float @llvm.nvvm.fabs(float) +declare float @llvm.nvvm.fabs.ftz(float) + +declare double @llvm.nvvm.floor.d(double) +declare float @llvm.nvvm.floor.f(float) +declare float @llvm.nvvm.floor.ftz.f(float) + +declare double @llvm.nvvm.rcp.rm.d(double) +declare float @llvm.nvvm.rcp.rm.f(float) +declare float @llvm.nvvm.rcp.rm.ftz.f(float) +declare double @llvm.nvvm.rcp.rn.d(double) +declare float @llvm.nvvm.rcp.rn.f(float) +declare float @llvm.nvvm.rcp.rn.ftz.f(float) +declare double @llvm.nvvm.rcp.rp.d(double) +declare float @llvm.nvvm.rcp.rp.f(float) +declare float @llvm.nvvm.rcp.rp.ftz.f(float) +declare double @llvm.nvvm.rcp.rz.d(double) +declare float @llvm.nvvm.rcp.rz.f(float) +declare float @llvm.nvvm.rcp.rz.ftz.f(float) + +declare double @llvm.nvvm.round.d(double) +declare float @llvm.nvvm.round.f(float) +declare float @llvm.nvvm.round.ftz.f(float) + +declare double @llvm.nvvm.saturate.d(double) +declare float @llvm.nvvm.saturate.f(float) +declare float @llvm.nvvm.saturate.ftz.f(float) + +declare float @llvm.nvvm.sqrt.f(float) +declare double @llvm.nvvm.sqrt.rn.d(double) +declare float @llvm.nvvm.sqrt.rn.f(float) +declare float @llvm.nvvm.sqrt.rn.ftz.f(float) diff --git a/llvm/test/Transforms/InstSimplify/exp10.ll b/llvm/test/Transforms/InstSimplify/exp10.ll index c415c41..17c0811 100644 --- a/llvm/test/Transforms/InstSimplify/exp10.ll +++ b/llvm/test/Transforms/InstSimplify/exp10.ll @@ -57,8 +57,7 @@ define <vscale x 2 x float> @exp10_exp10_scalable_vector(<vscale x 2 x float> %x define float @exp10_poison() { ; CHECK-LABEL: define float @exp10_poison() { -; CHECK-NEXT: [[RET:%.*]] = call float @llvm.exp10.f32(float poison) -; CHECK-NEXT: ret float [[RET]] +; CHECK-NEXT: ret float poison ; %ret = call float @llvm.exp10.f32(float poison) ret float %ret @@ -66,8 +65,7 @@ define float @exp10_poison() { define <2 x float> @exp10_poison_vector() { ; CHECK-LABEL: define <2 x float> @exp10_poison_vector() { -; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.exp10.v2f32(<2 x float> poison) -; CHECK-NEXT: ret <2 x float> [[RET]] +; CHECK-NEXT: ret <2 x float> poison ; %ret = call <2 x float> @llvm.exp10.v2f32(<2 x float> poison) ret <2 x float> %ret @@ -75,8 +73,7 @@ define <2 x float> @exp10_poison_vector() { define <vscale x 2 x float> @exp10_poison_scaleable_vector() { ; CHECK-LABEL: define <vscale x 2 x float> @exp10_poison_scaleable_vector() { -; CHECK-NEXT: [[RET:%.*]] = call <vscale x 2 x float> @llvm.exp10.nxv2f32(<vscale x 2 x float> poison) -; CHECK-NEXT: ret <vscale x 2 x float> [[RET]] +; CHECK-NEXT: ret <vscale x 2 x float> poison ; %ret = call <vscale x 2 x float> @llvm.exp10.nxv2f32(<vscale x 2 x float> poison) ret <vscale x 2 x float> %ret diff --git a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll index e4cfa46..45f5e37 100644 --- a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll +++ b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll @@ -286,3 +286,327 @@ define void @tanh_poison(ptr %P) { ret void } + + +define void @exp_poison(ptr %P) { +; CHECK-LABEL: @exp_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 
+; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %exp_f32 = call float @llvm.exp(float poison) + store volatile float %exp_f32, ptr %P + + %exp_2xf32 = call <2 x float> @llvm.exp(<2 x float> poison) + store volatile <2 x float> %exp_2xf32, ptr %P + + %exp_4xf64 = call <4 x double> @llvm.exp(<4 x double> poison) + store volatile <4 x double> %exp_4xf64, ptr %P + + %exp2_f32 = call float @llvm.exp2(float poison) + store volatile float %exp2_f32, ptr %P + + %exp2_2xf32 = call <2 x float> @llvm.exp2(<2 x float> poison) + store volatile <2 x float> %exp2_2xf32, ptr %P + + %exp2_4xf64 = call <4 x double> @llvm.exp2(<4 x double> poison) + store volatile <4 x double> %exp2_4xf64, ptr %P + + %exp10_f32 = call float @llvm.exp10(float poison) + store volatile float %exp10_f32, ptr %P + + %exp10_2xf32 = call <2 x float> @llvm.exp10(<2 x float> poison) + store volatile <2 x float> %exp10_2xf32, ptr %P + + %exp10_4xf64 = call <4 x double> @llvm.exp10(<4 x double> poison) + store volatile <4 x double> %exp10_4xf64, ptr %P + ret void +} + + +define void @log_poison(ptr %P) { +; CHECK-LABEL: @log_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %log_f32 = call float @llvm.log(float poison) + store volatile float %log_f32, ptr %P + + %log_2xf32 = call <2 x float> @llvm.log(<2 x float> poison) + store volatile <2 x float> %log_2xf32, ptr %P + + %log_4xf64 = call <4 x double> @llvm.log(<4 x double> poison) + store volatile <4 x double> %log_4xf64, ptr %P + + %log2_f32 = call float @llvm.log2(float poison) + store volatile float %log2_f32, ptr %P + + %log2_2xf32 = call <2 x float> @llvm.log2(<2 x float> poison) + store volatile <2 x float> %log2_2xf32, ptr %P + + %log2_4xf64 = call <4 x double> @llvm.log2(<4 x double> poison) + store volatile <4 x double> %log2_4xf64, ptr %P + + %log10_f32 = call float @llvm.log10(float poison) + store volatile float %log10_f32, ptr %P + + %log10_2xf32 = call <2 x float> @llvm.log10(<2 x float> poison) + store volatile <2 x float> %log10_2xf32, ptr %P + + %log10_4xf64 = call <4 x double> @llvm.log10(<4 x double> poison) + store volatile <4 x double> %log10_4xf64, ptr %P + ret void +} + + +define void @modf_poison(ptr %P) { +; CHECK-LABEL: @modf_poison( +; CHECK-NEXT: store volatile { float, float } poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile { <2 x float>, <2 x float> } poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile { <4 x double>, <4 x double> } poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %modf_f32 = call { float, float } @llvm.modf(float poison) + store volatile { float, float } %modf_f32, ptr %P + + %modf_2xf32 = call { <2 x float>, <2 x float> } @llvm.modf(<2 x float> poison) + store volatile { <2 x float>, <2 x float> } %modf_2xf32, ptr %P + + %modf_4xf64 = call { 
<4 x double>, <4 x double> } @llvm.modf(<4 x double> poison) + store volatile { <4 x double>, <4 x double> } %modf_4xf64, ptr %P + + ret void +} + + +define void @floor_poison(ptr %P) { +; CHECK-LABEL: @floor_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %floor_f32 = call float @llvm.floor(float poison) + store volatile float %floor_f32, ptr %P + + %floor_2xf32 = call <2 x float> @llvm.floor(<2 x float> poison) + store volatile <2 x float> %floor_2xf32, ptr %P + + %floor_4xf64 = call <4 x double> @llvm.floor(<4 x double> poison) + store volatile <4 x double> %floor_4xf64, ptr %P + + ret void +} + + +define void @ceil_poison(ptr %P) { +; CHECK-LABEL: @ceil_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %ceil_f32 = call float @llvm.ceil(float poison) + store volatile float %ceil_f32, ptr %P + + %ceil_2xf32 = call <2 x float> @llvm.ceil(<2 x float> poison) + store volatile <2 x float> %ceil_2xf32, ptr %P + + %ceil_4xf64 = call <4 x double> @llvm.ceil(<4 x double> poison) + store volatile <4 x double> %ceil_4xf64, ptr %P + + ret void +} + + +define void @trunc_poison(ptr %P) { +; CHECK-LABEL: @trunc_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %trunc_f32 = call float @llvm.trunc(float poison) + store volatile float %trunc_f32, ptr %P + + %trunc_2xf32 = call <2 x float> @llvm.trunc(<2 x float> poison) + store volatile <2 x float> %trunc_2xf32, ptr %P + + %trunc_4xf64 = call <4 x double> @llvm.trunc(<4 x double> poison) + store volatile <4 x double> %trunc_4xf64, ptr %P + + ret void +} + +define void @rint_poison(ptr %P) { +; CHECK-LABEL: @rint_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %rint_f32 = call float @llvm.rint(float poison) + store volatile float %rint_f32, ptr %P + + %rint_2xf32 = call <2 x float> @llvm.rint(<2 x float> poison) + store volatile <2 x float> %rint_2xf32, ptr %P + + %rint_4xf64 = call <4 x double> @llvm.rint(<4 x double> poison) + store volatile <4 x double> %rint_4xf64, ptr %P + + ret void +} + +define void @nearbyint_poison(ptr %P) { +; CHECK-LABEL: @nearbyint_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %nearbyint_f32 = call float @llvm.nearbyint(float poison) + store volatile float %nearbyint_f32, ptr %P + + %nearbyint_2xf32 = call <2 x float> @llvm.nearbyint(<2 x float> poison) + store volatile <2 x float> %nearbyint_2xf32, ptr %P + + %nearbyint_4xf64 = call <4 x double> @llvm.nearbyint(<4 x double> poison) + store volatile <4 x double> %nearbyint_4xf64, ptr %P + + ret void +} + + +define void @round_poison(ptr %P) { +; CHECK-LABEL: @round_poison( +; CHECK-NEXT: store volatile float poison, 
ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %round_f32 = call float @llvm.round(float poison) + store volatile float %round_f32, ptr %P + + %round_2xf32 = call <2 x float> @llvm.round(<2 x float> poison) + store volatile <2 x float> %round_2xf32, ptr %P + + %round_4xf64 = call <4 x double> @llvm.round(<4 x double> poison) + store volatile <4 x double> %round_4xf64, ptr %P + + ret void +} + + +define void @roundeven_poison(ptr %P) { +; CHECK-LABEL: @roundeven_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %roundeven_f32 = call float @llvm.roundeven(float poison) + store volatile float %roundeven_f32, ptr %P + + %roundeven_2xf32 = call <2 x float> @llvm.roundeven(<2 x float> poison) + store volatile <2 x float> %roundeven_2xf32, ptr %P + + %roundeven_4xf64 = call <4 x double> @llvm.roundeven(<4 x double> poison) + store volatile <4 x double> %roundeven_4xf64, ptr %P + + ret void +} + + +define void @lrint_poison(ptr %P) { +; CHECK-LABEL: @lrint_poison( +; CHECK-NEXT: store volatile i32 poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x i32> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x i64> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %lrint_f32 = call i32 @llvm.lrint(float poison) + store volatile i32 %lrint_f32, ptr %P + + %lrint_2xf32 = call <2 x i32> @llvm.lrint(<2 x float> poison) + store volatile <2 x i32> %lrint_2xf32, ptr %P + + %lrint_4xf64 = call <4 x i64> @llvm.lrint(<4 x double> poison) + store volatile <4 x i64> %lrint_4xf64, ptr %P + + ret void +} + + +define void @llrint_poison(ptr %P) { +; CHECK-LABEL: @llrint_poison( +; CHECK-NEXT: store volatile i32 poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x i32> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x i64> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %llrint_f32 = call i32 @llvm.llrint(float poison) + store volatile i32 %llrint_f32, ptr %P + + %llrint_2xf32 = call <2 x i32> @llvm.llrint(<2 x float> poison) + store volatile <2 x i32> %llrint_2xf32, ptr %P + + %llrint_4xf64 = call <4 x i64> @llvm.llrint(<4 x double> poison) + store volatile <4 x i64> %llrint_4xf64, ptr %P + + ret void +} + + +define void @umul_fix_poison(ptr %P) { +; CHECK-LABEL: @umul_fix_poison( +; CHECK-NEXT: store volatile i16 poison, ptr [[P:%.*]], align 2 +; CHECK-NEXT: store volatile i32 poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <4 x i32> poison, ptr [[P]], align 16 +; CHECK-NEXT: ret void +; + %umul_fix_i16 = call i16 @llvm.umul.fix(i16 poison, i16 poison, i32 2) + store volatile i16 %umul_fix_i16, ptr %P + + %umul_fix_i32 = call i32 @llvm.umul.fix(i32 poison, i32 poison, i32 2) + store volatile i32 %umul_fix_i32, ptr %P + + %umul_fix_4xi32 = call <4 x i32> @llvm.umul.fix(<4 x i32> poison, <4 x i32> poison, i32 2) + store volatile <4 x i32> %umul_fix_4xi32, ptr %P + + ret void +} + + +define void @umul_fix_sat_poison(ptr %P) { +; CHECK-LABEL: @umul_fix_sat_poison( +; CHECK-NEXT: store volatile i16 poison, ptr [[P:%.*]], align 2 +; CHECK-NEXT: store volatile i32 poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <4 x i32> poison, ptr [[P]], align 16 +; CHECK-NEXT: ret void +; + %umul_fix_sati16 = call i16 
@llvm.umul.fix.sat(i16 poison, i16 poison, i32 2) + store volatile i16 %umul_fix_sati16, ptr %P + + %umul_fix_sati32 = call i32 @llvm.umul.fix.sat(i32 poison, i32 poison, i32 2) + store volatile i32 %umul_fix_sati32, ptr %P + + %umul_fix_sat4xi32 = call <4 x i32> @llvm.umul.fix.sat(<4 x i32> poison, <4 x i32> poison, i32 2) + store volatile <4 x i32> %umul_fix_sat4xi32, ptr %P + + ret void +} diff --git a/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll b/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll new file mode 100644 index 0000000..dd524ab --- /dev/null +++ b/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s + +define void @test(ptr %addr) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[ADDR:%.*]]) { +; CHECK-NEXT: indirectbr ptr [[ADDR]], [label %[[A:.*]], label %C] +; CHECK: [[A]]: +; CHECK-NEXT: br i1 true, label %[[B:.*]], label %[[C_LOOPEXIT:.*]] +; CHECK: [[B]]: +; CHECK-NEXT: br i1 true, label %[[A]], label %[[C_LOOPEXIT]] +; CHECK: [[C_LOOPEXIT]]: +; CHECK-NEXT: br label %[[C:.*]] +; CHECK: [[C]]: +; CHECK-NEXT: unreachable +; + + indirectbr ptr %addr, [label %A, label %C] + +A: + br i1 true, label %B, label %C + +B: + br i1 true, label %A, label %C + +C: + unreachable +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll index 1f61989..812bca9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll @@ -46,27 +46,17 @@ define void @_Z3foov() { ; CHECK-V2-IC4-LABEL: define void @_Z3foov( ; CHECK-V2-IC4-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY1:.*:]] -; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]] -; CHECK-V2-IC4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-V2-IC4: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]] +; CHECK-V2-IC4: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]] ; CHECK-V2-IC4: [[VECTOR_PH]]: ; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]] ; CHECK-V2-IC4: [[VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP12:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-V2-IC4: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK-V2-IC4: [[MIDDLE_BLOCK]]: -; CHECK-V2-IC4: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]] -; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]] -; CHECK-V2-IC4: [[VEC_EPILOG_PH]]: -; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP23:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label 
%[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]] -; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-V2-IC4: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]] +; CHECK-V2-IC4: [[SCALAR_PH]]: ; CHECK-V2-IC4: br label %[[FOR_BODY:.*]] ; CHECK-V2-IC4: [[FOR_BODY]]: -; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]: ; entry: @@ -111,9 +101,6 @@ for.cond.cleanup: ; preds = %for.body ; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15} -; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 2, i32 0} -; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]} -; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 1} -; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0} -; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]} +; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 0, i32 0} +; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll new file mode 100644 index 0000000..5b8acee --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -0,0 +1,395 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s + +target triple = "aarch64-linux-gnu" + +; Original loop has trip count 17, but contains interleave groups with gaps, so +; the last iteration must execute in the scalar loop. Thus the vector loop can +; only execute up to 16 iterations.
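+;
+; As an illustrative sketch (not part of the original test; the struct and
+; function names here are hypothetical), the loop below corresponds roughly
+; to the following C. Only byte 3 of each 4-byte element is read, so the
+; widened load becomes an interleave group with a 3-byte gap per member, and
+; loading a full group for the last element would read past the end of %src:
+;
+;   struct S { unsigned char b[4]; };
+;   unsigned long long f(struct S *src, unsigned char *dst, int x) {
+;     unsigned long long red = 0;
+;     for (int i = 0; i < 17; i++) {
+;       unsigned v = src[i].b[3];             /* bytes b[0..2] are the gap */
+;       unsigned a = x < 0 ? -(unsigned)x : (unsigned)x;    /* llvm.abs   */
+;       unsigned m = a < v ? a : v;           /* llvm.umin, applied twice  */
+;       m = a < m ? a : m;
+;       dst[i] = 0;
+;       red |= m;
+;     }
+;     return red;
+;   }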
+define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { +; CHECK-LABEL: define i64 @vector_loop_with_remaining_iterations( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i32> [[TMP7]] to <16 x i64> +; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; 
CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 17, [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 17, [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = mul <vscale x 2 x i64> [[TMP25]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> [[BROADCAST_SPLAT4]], [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 1, [[TMP21]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP27]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <vscale x 2 x i64> [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison) +; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]]) +; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i32 0 +; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP33]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = zext <vscale x 2 x i32> [[TMP31]] to <vscale x 2 x i64> +; CHECK-NEXT: [[TMP35]] = or <vscale x 2 x i64> 
[[VEC_PHI8]], [[TMP34]] +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP21]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP37]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3 +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]]) +; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]]) +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64 +; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3 + %l = load i8, ptr %gep.src.i.i, align 1 + %l.ext = zext i8 %l to i32 + %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext) + %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0) + %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 0, ptr %gep.dst, align 1 + %min.ext = zext i32 %min.1 to i64 + %red.next = or i64 %red, %min.ext + %iv.next = add i64 %iv, 1 + %exitcond.not.i.i = icmp eq i64 %iv.next, 17 + br i1 %exitcond.not.i.i, label %exit, label %loop + +exit: + ret i64 %red.next +} + +; Original loop has trip count 17, but contains interleave groups with gaps, so +; the last iteration must execute in the scalar loop. Thus the vector loop can +; only execute up to 16 iterations. 
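+;
+; Worked out (an editorial sketch of the remainder arithmetic visible in the
+; checks above): with 17 iterations and one iteration reserved for the scalar
+; loop, the fixed-width main loop executes (17 - 1) & ~(16 - 1) = 16
+; iterations. The scalable epilogue then computes its own bound as
+;
+;   r = 17 urem (vscale * 2)
+;   n.vec = 17 - (r == 0 ? vscale * 2 : r)
+;
+; so at least one iteration always remains for the scalar loop.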
+define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { +; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP27]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i32> [[TMP7]] to <16 x i64> +; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, 
%[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 17, [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 17, [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = mul <vscale x 2 x i64> [[TMP25]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> [[BROADCAST_SPLAT4]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = mul i64 1, [[TMP21]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP39]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <vscale x 2 x i64> [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison) +; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]]) +; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i32 0 +; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP33]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = zext <vscale x 2 x i32> [[TMP31]] to <vscale x 2 x i64> +; CHECK-NEXT: 
[[TMP35]] = or <vscale x 2 x i64> [[VEC_PHI8]], [[TMP34]] +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP21]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP37]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3 +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]]) +; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]]) +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64 +; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3 + %l = load i8, ptr %gep.src.i.i, align 1 + %l.ext = zext i8 %l to i32 + %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext) + %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0) + %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 0, ptr %gep.dst, align 1 + %min.ext = zext i32 %min.1 to i64 + %red.next = or i64 %red, %min.ext + %iv.next = add i64 %iv, 1 + %exitcond.not.i.i = icmp eq i64 %iv.next, 17 + br i1 %exitcond.not.i.i, label %exit, label %loop + +exit: + ret i64 %red.next +} + +; Test case for https://github.com/llvm/llvm-project/issues/149726. 
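+;
+; A rough C equivalent of the loop below (illustrative; the function name is
+; hypothetical, A..L are the pointer arguments): the induction variable steps
+; by 2, so the group loaded from J is an <8 x i64> in which every odd element
+; is a gap. Vectorization is forced via llvm.loop.vectorize.enable, and with
+; VF 4 the main loop runs exactly one vector iteration (covering i = 0, 2, 4,
+; 6) before the scalar loop resumes at i = 8:
+;
+;   void f(long *A, long *B, long *C, long *D, long *E, long *F, long *G,
+;          long *H, long *I, long *J, short *K, long *L) {
+;     for (long i = 0; i <= 14; i += 2) {
+;       K[i] = (short)J[i];             /* strided load: odd i64s unused */
+;       *A = 0; *B = 0; *C = 0; *D = 0; *E = 0;
+;       *F = 0; *G = 0; *H = 0; *I = 0; *L = 0;
+;     }
+;   }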
+define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, ptr noalias %J, ptr noalias %K, ptr %L) #1 { +; CHECK-LABEL: define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]], ptr noalias [[E:%.*]], ptr noalias [[F:%.*]], ptr noalias [[G:%.*]], ptr noalias [[H:%.*]], ptr noalias [[I:%.*]], ptr noalias [[J:%.*]], ptr noalias [[K:%.*]], ptr [[L:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0 +; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP6]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1 +; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP7]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[B]], align 8 +; CHECK-NEXT: store i64 0, ptr [[C]], align 8 +; CHECK-NEXT: store i64 0, ptr [[D]], align 8 +; CHECK-NEXT: store i64 0, ptr [[E]], align 8 +; CHECK-NEXT: store i64 0, ptr [[F]], align 8 +; CHECK-NEXT: store i64 0, ptr [[G]], align 8 +; CHECK-NEXT: store i64 0, ptr [[H]], align 8 +; CHECK-NEXT: store i64 0, ptr [[I]], align 8 +; CHECK-NEXT: store i64 0, ptr [[L]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4 +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_J1:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV1]] +; CHECK-NEXT: 
[[L_J:%.*]] = load i64, ptr [[GEP_J1]], align 8 +; CHECK-NEXT: [[L_TRUNC:%.*]] = trunc i64 [[L_J]] to i16 +; CHECK-NEXT: [[GEP_K:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV1]] +; CHECK-NEXT: store i16 [[L_TRUNC]], ptr [[GEP_K]], align 2 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[B]], align 8 +; CHECK-NEXT: store i64 0, ptr [[C]], align 8 +; CHECK-NEXT: store i64 0, ptr [[D]], align 8 +; CHECK-NEXT: store i64 0, ptr [[E]], align 8 +; CHECK-NEXT: store i64 0, ptr [[F]], align 8 +; CHECK-NEXT: store i64 0, ptr [[G]], align 8 +; CHECK-NEXT: store i64 0, ptr [[H]], align 8 +; CHECK-NEXT: store i64 0, ptr [[I]], align 8 +; CHECK-NEXT: store i64 0, ptr [[L]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.J = getelementptr i64, ptr %J, i64 %iv + %l.J = load i64, ptr %gep.J, align 8 + %l.trunc = trunc i64 %l.J to i16 + %gep.K = getelementptr i16, ptr %K, i64 %iv + store i16 %l.trunc, ptr %gep.K, align 2 + store i64 0, ptr %A, align 8 + store i64 0, ptr %B, align 8 + store i64 0, ptr %C, align 8 + store i64 0, ptr %D, align 8 + store i64 0, ptr %E, align 8 + store i64 0, ptr %F, align 8 + store i64 0, ptr %G, align 8 + store i64 0, ptr %H, align 8 + store i64 0, ptr %I, align 8 + store i64 0, ptr %L, align 8 + %iv.next = add i64 %iv, 2 + %ec = icmp ult i64 %iv, 14 + br i1 %ec, label %loop, label %exit, !llvm.loop !0 + +exit: + ret void +} + +declare i32 @llvm.umin.i32(i32, i32) + +declare i32 @llvm.abs.i32(i32, i1 immarg) + +attributes #0 = { "target-cpu"="neoverse-512tvb" } +attributes #1 = { "target-cpu"="grace" } + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index 400b031..7090ae8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -7,11 +7,7 @@ target triple = "aarch64-none-unknown-elf" define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-LABEL: define i32 @dotp( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -33,64 +29,8 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4 -; CHECK-NEXT: 
[[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP15]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX2]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX2]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]] -; CHECK-NEXT: [[TMP27]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]] -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]] -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 -; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr 
[[GEP_B]], align 1 -; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 -; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] -; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ADD_LCSSA]] +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK: scalar.ph: ; entry: br label %for.body @@ -142,7 +82,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]] @@ -174,7 +114,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]] ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] @@ -198,7 +138,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0 ; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1 ; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: while.end.loopexit: ; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret void @@ -557,7 +497,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) 
; CHECK-NEXT: br label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 14725e0..14a73db 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1816,13 +1816,12 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1845,7 +1844,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: ; @@ -1854,13 +1853,13 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]] ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1897,7 +1896,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]] ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 
@llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: ; @@ -1906,19 +1905,19 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1 @@ -1927,15 +1926,15 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP19]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP17]], [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP19]]) -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 
[[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: ; @@ -1954,7 +1953,7 @@ for.body: ; preds = %entry, %for.body %conv3 = zext i8 %1 to i64 %mul = mul nuw nsw i64 %conv3, %conv %add = add i64 %sum, %mul - %exitcond.not = icmp eq i64 %i.iv.next, 16 + %exitcond.not = icmp eq i64 %i.iv.next, 41 br i1 %exitcond.not, label %exit, label %for.body exit: ; preds = %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll deleted file mode 100644 index d85bc48..0000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll +++ /dev/null @@ -1,146 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 -; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s - -target triple = "aarch64-linux-gnu" - -define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { -; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations( -; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ITER_CHECK:.*]]: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP3]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3 -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <64 x i8> [[WIDE_VEC2]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> -; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[STRIDED_VEC3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP0]], <16 x i32> [[TMP6]]) -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP12]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = zext <16 x i32> [[TMP10]] to <16 x i64> -; CHECK-NEXT: [[TMP17]] = or <16 x i64> [[VEC_PHI1]], [[TMP15]] -; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP17]]) -; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP14]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] -; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP31]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP16]] -; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP32]], i64 [[TMP16]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16, [[TMP36]] -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 2 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false) -; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false) -; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = mul <vscale x 2 x i64> [[TMP24]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP25]] -; CHECK-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP20]] -; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP37]], i64 0 -; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer -; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i64> [ [[TMP21]], %[[VEC_EPILOG_PH]] ], [ [[TMP34:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3 -; CHECK-NEXT: 
[[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP38]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison) -; CHECK-NEXT: [[TMP28:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP22]], <vscale x 2 x i32> [[TMP28]]) -; CHECK-NEXT: [[TMP39:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]]) -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX6]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0 -; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = zext <vscale x 2 x i32> [[TMP39]] to <vscale x 2 x i64> -; CHECK-NEXT: [[TMP34]] = or <vscale x 2 x i64> [[VEC_PHI6]], [[TMP33]] -; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], [[TMP20]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT5]] -; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP35]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP34]]) -; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] -; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i64 [ [[TMP30]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL8]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3 -; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1 -; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 -; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) -; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]]) -; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) -; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]]) -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1 -; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64 -; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] -; -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] - %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3 - %l = load i8, ptr %gep.src.i.i, 
align 1 - %l.ext = zext i8 %l to i32 - %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false) - %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext) - %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false) - %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0) - %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv - store i8 0, ptr %gep.dst, align 1 - %min.ext = zext i32 %min.1 to i64 - %red.next = or i64 %red, %min.ext - %iv.next = add i64 %iv, 1 - %exitcond.not.i.i = icmp eq i64 %iv.next, 16 - br i1 %exitcond.not.i.i, label %exit, label %loop - -exit: - ret i64 %red.next -} - -declare i32 @llvm.umin.i32(i32, i32) - -declare i32 @llvm.abs.i32(i32, i1 immarg) - -attributes #0 = { "target-cpu"="neoverse-512tvb" } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll index 45357dd..dbe6f27 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 2 ; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=scalar-epilogue %s 2>&1 | FileCheck %s -check-prefix=SCALAR_EPILOGUE -; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_TAIL_FOLDING -; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data-with-evl %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_EVL +; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_DATA +; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data-with-evl %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_DATA-WITH-EVL target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -55,105 +55,105 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_EPILOGUE: scalar.ph: ; -; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided_factor2 -; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] { -; PREDICATED_TAIL_FOLDING-NEXT: entry: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; PREDICATED_TAIL_FOLDING: vector.ph: -; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 -; PREDICATED_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] -; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 
@llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] -; PREDICATED_TAIL_FOLDING: vector.body: -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = 
getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; PREDICATED_TAIL_FOLDING: middle.block: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] -; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_DATA-LABEL: define void @masked_strided_factor2 +; PREDICATED_DATA-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] { +; PREDICATED_DATA-NEXT: entry: +; PREDICATED_DATA-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_DATA: vector.ph: +; PREDICATED_DATA-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_DATA-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_DATA-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 +; PREDICATED_DATA-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 +; PREDICATED_DATA-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] +; PREDICATED_DATA-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; PREDICATED_DATA-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_DATA-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 +; PREDICATED_DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0 +; PREDICATED_DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer +; PREDICATED_DATA-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32() +; PREDICATED_DATA-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0 +; PREDICATED_DATA-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer +; PREDICATED_DATA-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_DATA: vector.body: +; PREDICATED_DATA-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024) +; PREDICATED_DATA-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_DATA-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer +; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1) +; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]] +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> 
@llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) +; PREDICATED_DATA-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1) +; PREDICATED_DATA-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]] +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) +; PREDICATED_DATA-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]]) +; PREDICATED_DATA-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]] +; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]]) +; PREDICATED_DATA-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]] +; PREDICATED_DATA-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]] +; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]]) +; PREDICATED_DATA-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] +; PREDICATED_DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_DATA-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDICATED_DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_DATA: middle.block: +; PREDICATED_DATA-NEXT: br label [[FOR_END:%.*]] +; PREDICATED_DATA: scalar.ph: ; -; PREDICATED_EVL-LABEL: define void @masked_strided_factor2 -; PREDICATED_EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] { -; PREDICATED_EVL-NEXT: entry: -; PREDICATED_EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; PREDICATED_EVL: vector.ph: -; PREDICATED_EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_EVL-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 -; PREDICATED_EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 -; PREDICATED_EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] -; PREDICATED_EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; PREDICATED_EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_EVL-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 -; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0 -; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer -; PREDICATED_EVL-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32() -; PREDICATED_EVL-NEXT: br label [[VECTOR_BODY:%.*]] -; PREDICATED_EVL: vector.body: -; 
PREDICATED_EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_EVL-NEXT: [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]] -; PREDICATED_EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) -; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0 -; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer -; PREDICATED_EVL-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1) -; PREDICATED_EVL-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]] -; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP9]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1) -; PREDICATED_EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]] -; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]]) -; PREDICATED_EVL-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]] -; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> align 1 [[TMP15]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]] -; PREDICATED_EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]] -; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]] -; PREDICATED_EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] -; PREDICATED_EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; PREDICATED_EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; PREDICATED_EVL: middle.block: -; PREDICATED_EVL-NEXT: br label [[FOR_END:%.*]] -; PREDICATED_EVL: scalar.ph: +; 
PREDICATED_DATA-WITH-EVL-LABEL: define void @masked_strided_factor2 +; PREDICATED_DATA-WITH-EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] { +; PREDICATED_DATA-WITH-EVL-NEXT: entry: +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_DATA-WITH-EVL: vector.ph: +; PREDICATED_DATA-WITH-EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 +; PREDICATED_DATA-WITH-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 +; PREDICATED_DATA-WITH-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 +; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0 +; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32() +; PREDICATED_DATA-WITH-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_DATA-WITH-EVL: vector.body: +; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-WITH-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) +; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0 +; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP9]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> 
@llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]] +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> align 1 [[TMP15]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]] +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_DATA-WITH-EVL: middle.block: +; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]] +; PREDICATED_DATA-WITH-EVL: scalar.ph: ; entry: %conv = zext i8 %guard to i32 @@ -256,137 +256,137 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_EPILOGUE: scalar.ph: ; -; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided_factor4 -; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] { -; PREDICATED_TAIL_FOLDING-NEXT: entry: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; PREDICATED_TAIL_FOLDING: vector.ph: -; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 -; PREDICATED_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] -; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> 
@llvm.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] -; PREDICATED_TAIL_FOLDING: vector.body: -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] 
= sub <vscale x 16 x i8> zeroinitializer, [[TMP19]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; PREDICATED_TAIL_FOLDING: middle.block: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] -; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_DATA-LABEL: define void @masked_strided_factor4 +; PREDICATED_DATA-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] { +; PREDICATED_DATA-NEXT: entry: +; PREDICATED_DATA-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_DATA: vector.ph: +; PREDICATED_DATA-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_DATA-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_DATA-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 +; PREDICATED_DATA-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 +; PREDICATED_DATA-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] +; PREDICATED_DATA-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; PREDICATED_DATA-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_DATA-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 +; PREDICATED_DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0 +; PREDICATED_DATA-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer +; PREDICATED_DATA-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32() +; PREDICATED_DATA-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0 +; PREDICATED_DATA-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer +; PREDICATED_DATA-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_DATA: vector.body: +; PREDICATED_DATA-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024) +; PREDICATED_DATA-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_DATA-NEXT: [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer +; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2) +; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1) +; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2) +; PREDICATED_DATA-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3) +; PREDICATED_DATA-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]] +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) +; PREDICATED_DATA-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]] +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) +; PREDICATED_DATA-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]] +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) +; PREDICATED_DATA-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]] +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison) +; PREDICATED_DATA-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]]) +; PREDICATED_DATA-NEXT: [[TMP20:%.*]] 
= sub <vscale x 16 x i8> zeroinitializer, [[TMP19]] +; PREDICATED_DATA-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]]) +; PREDICATED_DATA-NEXT: [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]] +; PREDICATED_DATA-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]] +; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]]) +; PREDICATED_DATA-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]] +; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]]) +; PREDICATED_DATA-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]] +; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]]) +; PREDICATED_DATA-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> +; PREDICATED_DATA-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]] +; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]]) +; PREDICATED_DATA-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] +; PREDICATED_DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_DATA-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDICATED_DATA-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDICATED_DATA: middle.block: +; PREDICATED_DATA-NEXT: br label [[FOR_END:%.*]] +; PREDICATED_DATA: scalar.ph: ; -; PREDICATED_EVL-LABEL: define void @masked_strided_factor4 -; PREDICATED_EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] { -; PREDICATED_EVL-NEXT: entry: -; PREDICATED_EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; PREDICATED_EVL: vector.ph: -; PREDICATED_EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_EVL-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 -; PREDICATED_EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 -; PREDICATED_EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] -; PREDICATED_EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; PREDICATED_EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_EVL-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 -; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0 -; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer -; PREDICATED_EVL-NEXT: 
[[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32() -; PREDICATED_EVL-NEXT: br label [[VECTOR_BODY:%.*]] -; PREDICATED_EVL: vector.body: -; PREDICATED_EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_EVL-NEXT: [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]] -; PREDICATED_EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) -; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0 -; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer -; PREDICATED_EVL-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2) -; PREDICATED_EVL-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1) -; PREDICATED_EVL-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2) -; PREDICATED_EVL-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3) -; PREDICATED_EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]] -; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]] -; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]] -; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP16]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]] -; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]]) -; PREDICATED_EVL-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]] -; PREDICATED_EVL-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]]) -; PREDICATED_EVL-NEXT: 
[[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]] -; PREDICATED_EVL-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]] -; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> align 1 [[TMP24]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]] -; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> align 1 [[TMP26]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]] -; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> align 1 [[TMP28]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> -; PREDICATED_EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]] -; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> align 1 [[TMP30]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) -; PREDICATED_EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]] -; PREDICATED_EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] -; PREDICATED_EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; PREDICATED_EVL-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; PREDICATED_EVL: middle.block: -; PREDICATED_EVL-NEXT: br label [[FOR_END:%.*]] -; PREDICATED_EVL: scalar.ph: +; PREDICATED_DATA-WITH-EVL-LABEL: define void @masked_strided_factor4 +; PREDICATED_DATA-WITH-EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] { +; PREDICATED_DATA-WITH-EVL-NEXT: entry: +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_DATA-WITH-EVL: vector.ph: +; PREDICATED_DATA-WITH-EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 +; PREDICATED_DATA-WITH-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 +; PREDICATED_DATA-WITH-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 +; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0 +; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer +; 
PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32() +; PREDICATED_DATA-WITH-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_DATA-WITH-EVL: vector.body: +; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-WITH-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) +; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0 +; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP16]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> 
[[WIDE_MASKED_GATHER3]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]] +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> align 1 [[TMP24]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]] +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> align 1 [[TMP26]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]] +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> align 1 [[TMP28]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64> +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]] +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> align 1 [[TMP30]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PREDICATED_DATA-WITH-EVL: middle.block: +; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]] +; PREDICATED_DATA-WITH-EVL: scalar.ph: ; entry: %conv = zext i8 %guard to i32 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 6c57d2f..e2641ab 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -133,15 +133,15 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 8, i32 4, i1 true) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr 
inbounds i8, ptr [[TMP8:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr align 1 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_END:%.*]]
@@ -358,3 +358,64 @@ for.end: ; preds = %for.body
 attributes #0 = { "target-features"="+v,+d" vscale_range(2, 1024) }
+; This loop has a non-power-of-2 low trip count, so we will try to tail-fold it.
+; However, the reduction is a multiply, which is only legal for fixed-length VFs,
+; and fixed-length VFs aren't legal for the default tail-folding style
+; data-with-evl, so make sure we gracefully fall back to data-without-lane-mask.
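+;
+; Illustrative sketch only (not part of the checked output): the
+; data-without-lane-mask style guards each vector iteration with an
+; IV-derived lane mask, e.g.
+;   %vec.iv = add <16 x i64> %broadcast.splat, <i64 0, i64 1, ..., i64 15>
+;   %active.lane.mask = icmp ule <16 x i64> %vec.iv, splat (i64 9)
+; whereas the data-with-evl style would instead request an explicit vector
+; length each iteration, e.g.
+;   %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 16, i1 true)
+; (the %avl name here is hypothetical). The mask-based form is what the
+; checks below expect.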
+ +define i8 @mul_non_pow_2_low_trip_count(ptr noalias %a) { +; CHECK-LABEL: @mul_non_pow_2_low_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule <16 x i64> [[VEC_IV]], splat (i64 9) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP2]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP2]], <16 x i8> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> [[TMP3]]) +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 2, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[GEP]], align 1 +; CHECK-NEXT: [[MUL]] = mul i8 [[TMP5]], [[RDX]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i8 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[MUL_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i8 [ 2, %entry ], [ %mul, %for.body ] + %gep = getelementptr i8, ptr %a, i64 %iv + %0 = load i8, ptr %gep + %mul = mul i8 %0, %rdx + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 10 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret i8 %mul +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll index a1201dcf..0228811 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll @@ -7,29 +7,49 @@ define void @test(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = shl <16 x i64> [[BROADCAST_SPLAT2]], splat (i64 48) -; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i64> [[TMP0]], splat (i64 52) -; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 9, [[TMP2]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[B]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shl <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 48) +; CHECK-NEXT: [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[TMP5]], splat (i64 52) +; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[TMP6]] to <vscale x 2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 2 x i8> [[BROADCAST_SPLAT]] to <vscale x 2 x i32> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i32> [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP10]] ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 9) -; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <16 x i32> [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i1> [[TMP4]], <16 x i1> 
zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = shl <16 x i32> [[PREDPHI]], splat (i32 8) -; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[TMP8]], i32 15 -; CHECK-NEXT: store i8 [[TMP40]], ptr [[P]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16) -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_COND]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i32 9, [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = mul i32 1, [[TMP11]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ule <vscale x 2 x i32> [[VEC_IND]], splat (i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = icmp sge <vscale x 2 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i32> [[TMP7]], <vscale x 2 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP16:%.*]] = shl <vscale x 2 x i32> [[PREDPHI]], splat (i32 8) +; CHECK-NEXT: [[TMP17:%.*]] = trunc <vscale x 2 x i32> [[TMP16]] to <vscale x 2 x i8> +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> [[TMP17]], <vscale x 2 x ptr> align 1 [[BROADCAST_SPLAT4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]]) +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP11]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT1:%.*]] ; CHECK: scalar.ph: @@ -52,7 +72,7 @@ define void @test(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8 ; CHECK-NEXT: store i8 [[TRUNC]], ptr [[P]], align 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV]], 8 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND1]], label [[EXIT1]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND1]], label [[EXIT1]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -84,8 +104,9 @@ exit: ; preds = %for.body ret void } ;. 
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll deleted file mode 100644 index 4844c2f..0000000 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ /dev/null @@ -1,690 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 -;; This is the loop in c++ being vectorize in this file with -;; vector.reverse -;; #pragma clang loop vectorize_width(4, scalable) -;; for (int i = N-1; i >= 0; --i) -;; a[i] = b[i] + 1.0; - -; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S < %s \ -; RUN: | FileCheck --check-prefix=RV64 %s - -; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v -S < %s \ -; RUN: | FileCheck --check-prefix=RV32 %s - -; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -force-vector-interleave=2 -S < %s \ -; RUN: | FileCheck --check-prefix=RV64-UF2 %s - -define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { -; RV64-LABEL: define void @vector_reverse_i32( -; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; RV64: [[VECTOR_PH]]: -; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] -; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] -; RV64-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64: [[VECTOR_BODY]]: -; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; 
RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP19]], align 4 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 -; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], 1 -; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] -; RV64-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 -; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 -; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; RV64: [[EXIT]]: -; RV64-NEXT: ret void -; -; RV32-LABEL: define void @vector_reverse_i32( -; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; RV32-NEXT: [[ENTRY:.*]]: -; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; RV32: [[VECTOR_PH]]: -; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] -; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] -; RV32-NEXT: br label %[[VECTOR_BODY:.*]] -; RV32: [[VECTOR_BODY]]: -; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] -; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1 -; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> 
@llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 -; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]]) -; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP21]], align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] -; RV32-NEXT: br label %[[FOR_BODY:.*]] -; RV32: [[FOR_BODY]]: -; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 -; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], 1 -; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] -; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 -; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 -; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; RV32: [[EXIT]]: -; RV32-NEXT: ret void -; -; RV64-UF2-LABEL: define void @vector_reverse_i32( -; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; RV64-UF2-NEXT: [[ENTRY:.*]]: -; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; RV64-UF2: [[VECTOR_PH]]: -; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] -; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 -; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]] -; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64-UF2: [[VECTOR_BODY]]: -; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 
[[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]] -; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]]) -; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]]) -; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP29]], align 4 -; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; RV64-UF2: [[MIDDLE_BLOCK]]: -; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; RV64-UF2: [[SCALAR_PH]]: -; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] -; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] -; RV64-UF2: [[FOR_BODY]]: -; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 -; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP31]], 1 -; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 -; 
RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 -; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; RV64-UF2: [[EXIT]]: -; RV64-UF2-NEXT: ret void -; -entry: - br label %for.body - -for.body: - %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ] - %iv.next = add nsw i64 %dec.iv, -1 - %arrayidx.b = getelementptr inbounds i32, ptr %B, i64 %iv.next - %0 = load i32, ptr %arrayidx.b, align 4 - %add = add i32 %0, 1 - %arrayidx.a = getelementptr inbounds i32, ptr %A, i64 %iv.next - store i32 %add, ptr %arrayidx.a, align 4 - %cmp = icmp ugt i64 %dec.iv, 1 - br i1 %cmp, label %for.body, label %exit, !llvm.loop !0 - -exit: - ret void -} - -define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { -; RV64-LABEL: define void @vector_reverse_f32( -; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; RV64: [[VECTOR_PH]]: -; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] -; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] -; RV64-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64: [[VECTOR_BODY]]: -; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP22]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], 1 -; RV64-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP23]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]]) -; RV64-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP19]], align 4 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; 
RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 -; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00 -; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] -; RV64-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 -; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 -; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] -; RV64: [[EXIT]]: -; RV64-NEXT: ret void -; -; RV32-LABEL: define void @vector_reverse_f32( -; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { -; RV32-NEXT: [[ENTRY:.*]]: -; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; RV32: [[VECTOR_PH]]: -; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] -; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] -; RV32-NEXT: br label %[[VECTOR_BODY:.*]] -; RV32: [[VECTOR_BODY]]: -; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] -; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP10]], 1 -; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP24]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 -; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]]) -; RV32-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP21]], 
align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] -; RV32-NEXT: br label %[[FOR_BODY:.*]] -; RV32: [[FOR_BODY]]: -; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 -; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00 -; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] -; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 -; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 -; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] -; RV32: [[EXIT]]: -; RV32-NEXT: ret void -; -; RV64-UF2-LABEL: define void @vector_reverse_f32( -; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { -; RV64-UF2-NEXT: [[ENTRY:.*]]: -; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; RV64-UF2: [[VECTOR_PH]]: -; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] -; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; RV64-UF2-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 -; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]] -; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64-UF2: [[VECTOR_BODY]]: -; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP32]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP33:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP16:%.*]] = mul i64 -1, [[TMP33]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> 
@llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP34:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP34]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP35:%.*]] = sub i64 [[TMP5]], 1 -; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP35]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]] -; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP19]]) -; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]]) -; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP29]], align 4 -; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; RV64-UF2: [[MIDDLE_BLOCK]]: -; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; RV64-UF2: [[SCALAR_PH]]: -; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] -; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] -; RV64-UF2: [[FOR_BODY]]: -; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 -; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00 -; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 -; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 -; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] -; RV64-UF2: [[EXIT]]: -; RV64-UF2-NEXT: ret void -; -entry: - br label %for.body - -for.body: - %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ] - %iv.next = add nsw i64 %dec.iv, -1 - %arrayidx.b = getelementptr inbounds float, ptr %B, i64 %iv.next - %0 = load float, ptr %arrayidx.b, align 4 - %fadd = fadd float %0, 1.000000e+00 - %arrayidx.a = getelementptr inbounds float, ptr %A, i64 %iv.next - store float %fadd, ptr %arrayidx.a, align 4 - %cmp = icmp ugt i64 %dec.iv, 1 - br i1 %cmp, 
label %for.body, label %exit, !llvm.loop !0 - -exit: - ret void -} - -define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { -; RV64-LABEL: define void @vector_reverse_irregular_type( -; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; RV64: [[VECTOR_PH]]: -; RV64-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64: [[VECTOR_BODY]]: -; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-NEXT: [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0 -; RV64-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1 -; RV64-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2 -; RV64-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3 -; RV64-NEXT: [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1 -; RV64-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1 -; RV64-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1 -; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1 -; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]] -; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]] -; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]] -; RV64-NEXT: [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1 -; RV64-NEXT: [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1 -; RV64-NEXT: [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1 -; RV64-NEXT: [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1 -; RV64-NEXT: [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0 -; RV64-NEXT: [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1 -; RV64-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2 -; RV64-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3 -; RV64-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1) -; RV64-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]] -; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]] -; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]] -; RV64-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 -; RV64-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1 -; RV64-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 -; RV64-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1 -; RV64-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 -; RV64-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1 -; RV64-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 -; RV64-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; RV64-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 -; RV64-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] -; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[IV_NEXT1]] = 
add nsw i64 [[DEC_IV1]], -1 -; RV64-NEXT: [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]] -; RV64-NEXT: [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1 -; RV64-NEXT: [[ADD:%.*]] = add i7 [[TMP30]], 1 -; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]] -; RV64-NEXT: store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1 -; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1 -; RV64-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] -; RV64: [[EXIT]]: -; RV64-NEXT: ret void -; -; RV32-LABEL: define void @vector_reverse_irregular_type( -; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { -; RV32-NEXT: [[ENTRY:.*]]: -; RV32-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; RV32: [[VECTOR_PH]]: -; RV32-NEXT: br label %[[VECTOR_BODY:.*]] -; RV32: [[VECTOR_BODY]]: -; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV32-NEXT: [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0 -; RV32-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1 -; RV32-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2 -; RV32-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3 -; RV32-NEXT: [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1 -; RV32-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1 -; RV32-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1 -; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1 -; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]] -; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]] -; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]] -; RV32-NEXT: [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1 -; RV32-NEXT: [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1 -; RV32-NEXT: [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1 -; RV32-NEXT: [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1 -; RV32-NEXT: [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0 -; RV32-NEXT: [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1 -; RV32-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2 -; RV32-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3 -; RV32-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1) -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]] -; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]] -; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]] -; RV32-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 -; RV32-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1 -; RV32-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 -; RV32-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1 -; RV32-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 -; RV32-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1 -; RV32-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 -; RV32-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; RV32-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 -; RV32-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP6:![0-9]+]] -; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] -; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] -; RV32-NEXT: br label %[[FOR_BODY:.*]] -; RV32: [[FOR_BODY]]: -; RV32-NEXT: [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ] -; RV32-NEXT: [[IV_NEXT1]] = add nsw i64 [[DEC_IV1]], -1 -; RV32-NEXT: [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]] -; RV32-NEXT: [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1 -; RV32-NEXT: [[ADD:%.*]] = add i7 [[TMP30]], 1 -; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]] -; RV32-NEXT: store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1 -; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1 -; RV32-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] -; RV32: [[EXIT]]: -; RV32-NEXT: ret void -; -; RV64-UF2-LABEL: define void @vector_reverse_irregular_type( -; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { -; RV64-UF2-NEXT: [[ENTRY:.*]]: -; RV64-UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; RV64-UF2: [[VECTOR_PH]]: -; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] -; RV64-UF2: [[VECTOR_BODY]]: -; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0 -; RV64-UF2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -2 -; RV64-UF2-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], -3 -; RV64-UF2-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], -4 -; RV64-UF2-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], -5 -; RV64-UF2-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], -6 -; RV64-UF2-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], -7 -; RV64-UF2-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP16]], -1 -; RV64-UF2-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP0]], -1 -; RV64-UF2-NEXT: [[TMP51:%.*]] = add nsw i64 [[TMP17]], -1 -; RV64-UF2-NEXT: [[TMP11:%.*]] = add nsw i64 [[TMP24]], -1 -; RV64-UF2-NEXT: [[TMP59:%.*]] = add nsw i64 [[TMP25]], -1 -; RV64-UF2-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP42]], -1 -; RV64-UF2-NEXT: [[TMP14:%.*]] = add nsw i64 [[TMP43]], -1 -; RV64-UF2-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP50]], -1 -; RV64-UF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP1]] -; RV64-UF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP2]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP51]] -; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP59]] -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP13]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP14]] -; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP5:%.*]] = load i7, ptr [[TMP3]], align 1 -; RV64-UF2-NEXT: [[TMP6:%.*]] = load i7, ptr [[TMP4]], align 1 -; RV64-UF2-NEXT: [[TMP26:%.*]] = load i7, ptr [[TMP18]], align 1 -; RV64-UF2-NEXT: [[TMP27:%.*]] = load i7, ptr [[TMP19]], align 1 -; RV64-UF2-NEXT: [[TMP28:%.*]] = insertelement <4 x i7> poison, i7 [[TMP5]], i32 0 -; RV64-UF2-NEXT: 
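Both the RV64 and RV32 prefixes expect no wide memory operations at all for @vector_reverse_irregular_type: i7 is not a legal, byte-sized machine type, so the loads and stores stay scalar per lane and only the arithmetic is widened to <4 x i7>. A reduced sketch of the gather side of that pattern (hypothetical function, assuming the four lane pointers were already computed from the reversed induction variable):

define <4 x i7> @i7_lanes_sketch(ptr %p0, ptr %p1, ptr %p2, ptr %p3) {
  ; one scalar i7 load per lane: an i7 access cannot be widened
  %l0 = load i7, ptr %p0, align 1
  %l1 = load i7, ptr %p1, align 1
  %l2 = load i7, ptr %p2, align 1
  %l3 = load i7, ptr %p3, align 1
  ; the lanes are assembled so the add itself still runs as one vector op
  %v0 = insertelement <4 x i7> poison, i7 %l0, i32 0
  %v1 = insertelement <4 x i7> %v0, i7 %l1, i32 1
  %v2 = insertelement <4 x i7> %v1, i7 %l2, i32 2
  %v3 = insertelement <4 x i7> %v2, i7 %l3, i32 3
  %r = add <4 x i7> %v3, splat (i7 1)
  ret <4 x i7> %r
}

The store side mirrors this with four extractelement/store pairs, as in [[TMP25]]..[[TMP28]] above; note also that the plan falls back to a fixed <4 x i7> VF with a constant vector trip count of 1020 rather than a scalable VF.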
[[TMP29:%.*]] = insertelement <4 x i7> [[TMP28]], i7 [[TMP6]], i32 1 -; RV64-UF2-NEXT: [[TMP30:%.*]] = insertelement <4 x i7> [[TMP29]], i7 [[TMP26]], i32 2 -; RV64-UF2-NEXT: [[TMP31:%.*]] = insertelement <4 x i7> [[TMP30]], i7 [[TMP27]], i32 3 -; RV64-UF2-NEXT: [[TMP32:%.*]] = load i7, ptr [[TMP20]], align 1 -; RV64-UF2-NEXT: [[TMP33:%.*]] = load i7, ptr [[TMP21]], align 1 -; RV64-UF2-NEXT: [[TMP34:%.*]] = load i7, ptr [[TMP22]], align 1 -; RV64-UF2-NEXT: [[TMP35:%.*]] = load i7, ptr [[TMP23]], align 1 -; RV64-UF2-NEXT: [[TMP36:%.*]] = insertelement <4 x i7> poison, i7 [[TMP32]], i32 0 -; RV64-UF2-NEXT: [[TMP37:%.*]] = insertelement <4 x i7> [[TMP36]], i7 [[TMP33]], i32 1 -; RV64-UF2-NEXT: [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2 -; RV64-UF2-NEXT: [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3 -; RV64-UF2-NEXT: [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1) -; RV64-UF2-NEXT: [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1) -; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP1]] -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP2]] -; RV64-UF2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP51]] -; RV64-UF2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP59]] -; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]] -; RV64-UF2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]] -; RV64-UF2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP7:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0 -; RV64-UF2-NEXT: store i7 [[TMP7]], ptr [[TMP9]], align 1 -; RV64-UF2-NEXT: [[TMP8:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1 -; RV64-UF2-NEXT: store i7 [[TMP8]], ptr [[TMP10]], align 1 -; RV64-UF2-NEXT: [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2 -; RV64-UF2-NEXT: store i7 [[TMP52]], ptr [[TMP44]], align 1 -; RV64-UF2-NEXT: [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3 -; RV64-UF2-NEXT: store i7 [[TMP53]], ptr [[TMP45]], align 1 -; RV64-UF2-NEXT: [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0 -; RV64-UF2-NEXT: store i7 [[TMP54]], ptr [[TMP46]], align 1 -; RV64-UF2-NEXT: [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1 -; RV64-UF2-NEXT: store i7 [[TMP55]], ptr [[TMP47]], align 1 -; RV64-UF2-NEXT: [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2 -; RV64-UF2-NEXT: store i7 [[TMP56]], ptr [[TMP48]], align 1 -; RV64-UF2-NEXT: [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3 -; RV64-UF2-NEXT: store i7 [[TMP57]], ptr [[TMP49]], align 1 -; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; RV64-UF2-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016 -; RV64-UF2-NEXT: br i1 [[TMP58]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; RV64-UF2: [[MIDDLE_BLOCK]]: -; RV64-UF2-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] -; RV64-UF2: [[SCALAR_PH]]: -; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] -; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] -; RV64-UF2: [[FOR_BODY]]: -; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 -; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, 
ptr [[B]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: [[TMP12:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1 -; RV64-UF2-NEXT: [[ADD:%.*]] = add i7 [[TMP12]], 1 -; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1 -; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 -; RV64-UF2-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] -; RV64-UF2: [[EXIT]]: -; RV64-UF2-NEXT: ret void -; -entry: - br label %for.body - -for.body: - %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ] - %iv.next = add nsw i64 %dec.iv, -1 - %arrayidx.b = getelementptr inbounds i7, ptr %B, i64 %iv.next - %0 = load i7, ptr %arrayidx.b, align 1 - %add = add i7 %0, 1 - %arrayidx.a = getelementptr inbounds i7, ptr %A, i64 %iv.next - store i7 %add, ptr %arrayidx.a, align 1 - %cmp = icmp ugt i64 %dec.iv, 1 - br i1 %cmp, label %for.body, label %exit, !llvm.loop !4 - -exit: - ret void -} - -!0 = distinct !{!0, !1, !2, !3} -!1 = !{!"llvm.loop.vectorize.width", i32 4} -!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} -!3 = !{!"llvm.loop.vectorize.enable", i1 true} -!4 = distinct !{!4, !1, !3} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index ad445c8..f59ab56 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -1,400 +1,455 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 -; This is the loop in c++ being vectorize in this file with -;vector.reverse -; #pragma clang loop vectorize_width(4, scalable) -; for (int i = N-1; i >= 0; --i) -; a[i] = b[i] + 1.0; +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "for.body:" --version 5 +;; This is the C++ loop being vectorized in this file with +;; vector.reverse +;; #pragma clang loop vectorize_width(4, scalable) +;; for (int i = N-1; i >= 0; --i) +;; a[i] = b[i] + 1.0; -; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple riscv64-linux-gnu \ -; RUN: -mattr=+v -debug-only=loop-vectorize,vplan -scalable-vectorization=on \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S < %s \ +; RUN: | FileCheck --check-prefix=RV64 %s + +; RUN: opt -passes=loop-vectorize -mtriple=riscv32 -mattr=+v -S < %s \ +; RUN: | FileCheck --check-prefix=RV32 %s + +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \ +; RUN: -force-vector-interleave=2 -S < %s \ +; RUN: | FileCheck --check-prefix=RV64-UF2 %s + +define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { +; RV64-LABEL: define void @vector_reverse_i32( +; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] +; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV64: [[VECTOR_PH]]: +; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +;
RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] +; RV64-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64: [[VECTOR_BODY]]: +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]] +; RV64-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP10:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP10]] +; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP9]] +; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]] +; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4 +; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]]) +; RV64-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1) +; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] +; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP17]] +; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]] +; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP18]] +; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]]) +; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP20]], align 4 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; RV64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64: [[MIDDLE_BLOCK]]: +; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV64-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV64: [[SCALAR_PH]]: +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_BODY]]: +; +; RV32-LABEL: define void @vector_reverse_i32( +; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; RV32-NEXT: [[ENTRY:.*]]: +; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] +; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV32: [[VECTOR_PH]]: +; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] +; RV32-NEXT: br label %[[VECTOR_BODY:.*]] +; RV32: [[VECTOR_BODY]]: +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]] +; RV32-NEXT: 
[[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP10:%.*]] = mul i32 0, [[TMP9]] +; RV32-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], 1 +; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP11]] +; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 [[TMP10]] +; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]] +; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4 +; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1) +; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] +; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] +; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP17]], 1 +; RV32-NEXT: [[TMP20:%.*]] = mul i32 -1, [[TMP19]] +; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]] +; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 [[TMP20]] +; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]]) +; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP22]], align 4 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; RV32-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV32: [[MIDDLE_BLOCK]]: +; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV32: [[SCALAR_PH]]: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV32-NEXT: br label %[[FOR_BODY:.*]] +; RV32: [[FOR_BODY]]: +; +; RV64-UF2-LABEL: define void @vector_reverse_i32( +; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; RV64-UF2-NEXT: [[ENTRY:.*]]: +; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 +; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] +; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV64-UF2: [[VECTOR_PH]]: +; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]] +; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64-UF2: [[VECTOR_BODY]]: +; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP11:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP11]] +; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]] +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, 
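The RV32 body differs from RV64 only in the index width of the end-pointer arithmetic: the runtime VF is computed in i64, truncated to i32, and the negative offsets are then formed and applied as i32 getelementptr indices ([[TMP9]]..[[TMP14]] above), matching the 32-bit pointer-index type of the target. Just the base-address computation, as a sketch with a hypothetical name:

declare i64 @llvm.vscale.i64()

define ptr @reverse_base_rv32_sketch(ptr %last.elem) {
  %vs = call i64 @llvm.vscale.i64()
  %vf64 = mul nuw i64 %vs, 4
  %vf = trunc i64 %vf64 to i32       ; VF narrowed to the 32-bit index type
  %vf.m1 = sub i32 %vf, 1
  %back = mul i32 -1, %vf.m1         ; -(VF-1), now an i32 offset
  %base = getelementptr inbounds i32, ptr %last.elem, i32 %back
  ret ptr %base
}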
ptr [[TMP13]], i64 [[TMP12]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP16]] +; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP15]] +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]] +; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]]) +; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD1]]) +; RV64-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP21:%.*]] = add <vscale x 4 x i32> [[REVERSE2]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP24]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP23]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP25]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP29:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP29]] +; RV64-UF2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP28]] +; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP30]] +; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]]) +; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE3]], ptr [[TMP27]], align 4 +; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP21]]) +; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP32]], align 4 +; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; RV64-UF2-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-UF2: [[MIDDLE_BLOCK]]: +; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV64-UF2: [[SCALAR_PH]]: +; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] +; RV64-UF2: [[FOR_BODY]]: +; +entry: + br label %for.body + +for.body: + %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ] + %iv.next = add nsw i64 %dec.iv, -1 + %arrayidx.b = getelementptr inbounds i32, ptr %B, i64 %iv.next + %0 = load i32, ptr %arrayidx.b, align 4 + %add = add i32 %0, 1 + %arrayidx.a = getelementptr inbounds i32, ptr %A, i64 %iv.next + store i32 %add, ptr %arrayidx.a, align 4 + %cmp = icmp ugt i64 %dec.iv, 1 + br i1 %cmp, label %for.body, label %exit, !llvm.loop !0 + +exit: + ret void +} define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) { -; CHECK-LABEL: 'vector_reverse_i64' -; CHECK-NEXT: LV: Loop hints: force=enabled width=vscale x 4 interleave=0 -; CHECK-NEXT: LV: Found a loop: for.body -; 
CHECK-NEXT: LV: Found an induction variable. -; CHECK-NEXT: LV: Found an induction variable. -; CHECK-NEXT: LV: Did not find one integer induction var. -; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)! -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Found trip count: 0 -; CHECK-NEXT: LV: Found maximum trip count: 4294967295 -; CHECK-NEXT: LV: Scalable vectorization is available -; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. -; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Using user VF vscale x 4. 
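That last deleted line records the vectorizer honoring the user hint (force=enabled, width=vscale x 4). The hint itself is the llvm.loop metadata kept at the end of the previous test file; a minimal loop skeleton (a sketch, not taken from the test) showing where those nodes attach:

define void @hinted_loop_sketch(ptr %a, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep = getelementptr inbounds i32, ptr %a, i64 %iv
  store i32 0, ptr %gep, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %done = icmp eq i64 %iv.next, %n
  ; the hints ride on the latch branch; this sketch assumes %n >= 1
  br i1 %done, label %exit, label %loop, !llvm.loop !0

exit:
  ret void
}

!0 = distinct !{!0, !1, !2, !3}
!1 = !{!"llvm.loop.vectorize.width", i32 4}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}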
-; CHECK-NEXT: Creating VPBasicBlock for for.body -; CHECK-NEXT: VPlan 'Plain CFG -; CHECK-NEXT: for UF>=1' { -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body.preheader>: -; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: Successor(s): for.body -; CHECK-EMPTY: -; CHECK-NEXT: for.body: -; CHECK-NEXT: WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1> -; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom> -; CHECK-NEXT: EMIT ir<%1> = load ir<%arrayidx> -; CHECK-NEXT: EMIT ir<%add9> = add ir<%1>, ir<1> -; CHECK-NEXT: EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom> -; CHECK-NEXT: EMIT store ir<%add9>, ir<%arrayidx3> -; CHECK-NEXT: EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1> -; CHECK-NEXT: EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1> -; CHECK-NEXT: EMIT branch-on-cond ir<%cmp> -; CHECK-NEXT: Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count -; CHECK-NEXT: vp<%3> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body.preheader>: -; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64) -; CHECK-NEXT: Successor(s): scalar.ph, vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1> -; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1> -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: <x1> vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1> -; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0> -; CHECK-NEXT: WIDEN ir<%1> = load vp<%9> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0> -; CHECK-NEXT: WIDEN store vp<%10>, ir<%add9> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT 
vp<%cmp.n> = icmp eq vp<%3>, vp<%2> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: Successor(s): ir-bb<for.body> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph) -; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: IR %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: IR %add9 = add i32 %1, 1 -; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: IR store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV(REG): Calculating max register usage: -; CHECK-NEXT: LV(REG): At #0 Interval # 0 -; CHECK-NEXT: LV(REG): At #1 Interval # 1 -; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 2 -; CHECK-NEXT: LV(REG): At #4 Interval # 2 -; CHECK-NEXT: LV(REG): At #5 Interval # 2 -; CHECK-NEXT: LV(REG): At #6 Interval # 3 -; CHECK-NEXT: LV(REG): At #7 Interval # 3 -; CHECK-NEXT: LV(REG): At #8 Interval # 3 -; CHECK-NEXT: LV(REG): At #9 
Interval # 3 -; CHECK-NEXT: LV(REG): At #10 Interval # 3 -; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 2 -; CHECK-NEXT: LV(REG): At #13 Interval # 2 -; CHECK-NEXT: LV(REG): VF = vscale x 4 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers -; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class -; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 24 -; CHECK-NEXT: LV: IC is 1 -; CHECK-NEXT: LV: VF is vscale x 4 -; CHECK-NEXT: LV: Not Interleaving. -; CHECK-NEXT: LV: Interleaving is not beneficial. -; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin> -; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 -; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { -; CHECK-NEXT: Live-in ir<%18> = VF -; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF -; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count -; CHECK-NEXT: Live-in ir<%0> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body.preheader>: -; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.scevcheck>: -; CHECK-NEXT: IR %3 = add nsw i64 %0, -1 -; CHECK-NEXT: IR %4 = add i32 %n, -1 -; CHECK-NEXT: IR %5 = trunc i64 %3 to i32 -; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) -; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0 -; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1 -; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result -; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4 -; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow -; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295 -; CHECK-NEXT: IR %10 = or i1 %8, %9 -; CHECK-NEXT: EMIT branch-on-cond ir<%10> -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.memcheck>: -; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %12 = mul nuw i64 %11, 4 -; CHECK-NEXT: IR %13 = mul i64 %12, 4 -; CHECK-NEXT: IR %14 = sub i64 %B1, %A2 -; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13 -; CHECK-NEXT: EMIT branch-on-cond ir<%diff.check> -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.ph>: -; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %16 = mul nuw i64 %15, 4 -; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16 -; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf -; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4 -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1> -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%index> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw 
vp<%5>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%6> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18> -; CHECK-NEXT: WIDEN ir<%19> = load vp<%6> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%7> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18> -; CHECK-NEXT: WIDEN store vp<%7>, ir<%add9> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1 -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%4>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ] -; CHECK-NEXT: Successor(s): ir-bb<for.body> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: IR %19 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: IR %add9 = add i32 %19, 1 -; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: IR store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.body.preheader> in BB: for.body.preheader -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: for.body.preheader: ; preds = %entry -; CHECK-NEXT: %0 = zext i32 %n to i64 -; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %2 = mul nuw i64 %1, 4 -; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2 -; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.ph -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: vector.scevcheck: ; No predecessors! 
-; CHECK-NEXT: %3 = add nsw i64 %0, -1 -; CHECK-NEXT: %4 = add i32 %n, -1 -; CHECK-NEXT: %5 = trunc i64 %3 to i32 -; CHECK-NEXT: %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) -; CHECK-NEXT: %mul.result = extractvalue { i32, i1 } %mul, 0 -; CHECK-NEXT: %mul.overflow = extractvalue { i32, i1 } %mul, 1 -; CHECK-NEXT: %6 = sub i32 %4, %mul.result -; CHECK-NEXT: %7 = icmp ugt i32 %6, %4 -; CHECK-NEXT: %8 = or i1 %7, %mul.overflow -; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295 -; CHECK-NEXT: %10 = or i1 %8, %9 -; CHECK-NEXT: br i1 %10, <null operand!>, <null operand!> -; CHECK-NEXT: LV: draw edge from for.body.preheader -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: vector.memcheck: ; No predecessors! -; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %12 = mul nuw i64 %11, 4 -; CHECK-NEXT: %13 = mul i64 %12, 4 -; CHECK-NEXT: %14 = sub i64 %B1, %A2 -; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13 -; CHECK-NEXT: br i1 %diff.check, <null operand!>, <null operand!> -; CHECK-NEXT: LV: draw edge from vector.scevcheck -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: vector.ph: ; No predecessors! -; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %16 = mul nuw i64 %15, 4 -; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16 -; CHECK-NEXT: %n.vec = sub i64 %0, %n.mod.vf -; CHECK-NEXT: %17 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %18 = mul nuw i64 %17, 4 -; CHECK-NEXT: %19 = sub i64 %0, %n.vec -; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 -; CHECK-NEXT: %20 = sub i32 %n, %.cast -; CHECK-NEXT: br -; CHECK-NEXT: LV: draw edge from vector.memcheck -; CHECK-NEXT: LV: created vector.body -; CHECK-NEXT: LV: draw edge from vector.ph -; CHECK-NEXT: LV: vectorizing VPBB: vector.body in BB: vector.body -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph -; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] -; CHECK-NEXT: %.cast3 = trunc i64 %index to i32 -; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3 -; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 -; CHECK-NEXT: %22 = zext i32 %21 to i64 -; CHECK-NEXT: %23 = getelementptr inbounds i32, ptr %B, i64 %22 -; CHECK-NEXT: %24 = mul i64 0, %18 -; CHECK-NEXT: %25 = sub i64 %18, 1 -; CHECK-NEXT: %26 = mul i64 -1, %25 -; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %23, i64 %24 -; CHECK-NEXT: %28 = getelementptr inbounds i32, ptr %27, i64 %26 -; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %28, align 4 -; CHECK-NEXT: %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load) -; CHECK-NEXT: %29 = add <vscale x 4 x i32> %reverse, splat (i32 1) -; CHECK-NEXT: %30 = getelementptr inbounds i32, ptr %A, i64 %22 -; CHECK-NEXT: %31 = mul i64 0, %18 -; CHECK-NEXT: %32 = sub i64 %18, 1 -; CHECK-NEXT: %33 = mul i64 -1, %32 -; CHECK-NEXT: %34 = getelementptr inbounds i32, ptr %30, i64 %31 -; CHECK-NEXT: %35 = getelementptr inbounds i32, ptr %34, i64 %33 -; CHECK-NEXT: %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %29) -; CHECK-NEXT: store <vscale x 4 x i32> %reverse4, ptr %35, align 4 -; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body -; CHECK-NEXT: LV: created middle.block -; CHECK-NEXT: LV: draw edge from vector.body -; CHECK-NEXT: LV: vectorizing 
VPBB: middle.block in BB: middle.block -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: middle.block: ; preds = %vector.body -; CHECK-NEXT: %cmp.n = icmp eq i64 %0, %n.vec -; CHECK-NEXT: br i1 %cmp.n, <null operand!>, <null operand!> -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.cond.cleanup.loopexit> in BB: for.cond.cleanup.loopexit -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: for.cond.cleanup.loopexit: ; preds = %for.body -; CHECK-NEXT: br label %for.cond.cleanup -; CHECK-NEXT: LV: draw edge from middle.block -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<scalar.ph> in BB: scalar.ph -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: scalar.ph: ; preds = %for.body.preheader -; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] -; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] -; CHECK-NEXT: br label %for.body -; CHECK-NEXT: LV: draw edge from middle.block -; CHECK-NEXT: LV: draw edge from for.body.preheader -; CHECK-NEXT: LV: draw edge from vector.scevcheck -; CHECK-NEXT: LV: draw edge from vector.memcheck -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.body> in BB: for.body -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph -; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] -; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: %37 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: %add9 = add i32 %37, 1 -; CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: draw edge from scalar.ph -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: LV: Vectorizing: innermost loop. 
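The replacement checks that follow capture, as generated IR rather than debug output, the two runtime guards in front of the vectorized @vector_reverse_i64 loop: vector.scevcheck proves the i32 induction variable derived from %n cannot wrap over the full trip count (a unit-stride umul.with.overflow plus a range test of the backedge-taken count against 4294967295), and vector.memcheck sends the A/B pointer pair to the scalar loop unless they are at least VF i32 elements apart. Folded into a single hypothetical predicate, a sketch that returns true when the scalar fallback must be taken (%tc is the zero-extended trip count, %a.int/%b.int the ptrtoint values, %vf the runtime VF):

declare { i32, i1 } @llvm.umul.with.overflow.i32(i32, i32)

define i1 @runtime_guards_sketch(i64 %tc, i32 %n, i64 %a.int, i64 %b.int, i64 %vf) {
  ; vector.scevcheck: stepping the i32 IV down by stride 1 for the whole
  ; backedge-taken count must neither overflow the multiply nor wrap
  %btc = add nsw i64 %tc, -1
  %last = add i32 %n, -1
  %btc32 = trunc i64 %btc to i32
  %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %btc32)
  %step = extractvalue { i32, i1 } %mul, 0
  %ovf = extractvalue { i32, i1 } %mul, 1
  %end = sub i32 %last, %step
  %wrapped = icmp ugt i32 %end, %last
  %scev.bad = or i1 %wrapped, %ovf
  %too.big = icmp ugt i64 %btc, 4294967295
  %scev.fail = or i1 %scev.bad, %too.big
  ; vector.memcheck: A is only written, B only read, both at the same index,
  ; so a distance of at least VF * 4 bytes rules out the hazard
  %bytes = mul i64 %vf, 4
  %dist = sub i64 %b.int, %a.int
  %mem.fail = icmp ult i64 %dist, %bytes
  %fail = or i1 %scev.fail, %mem.fail
  ret i1 %fail
}

On the RV32 run the same memcheck is carried out in i32 after ptrtoint to i32, and no vector.scevcheck block is emitted at all, as the checks below show.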
-; CHECK-EMPTY: +; RV64-LABEL: define void @vector_reverse_i64( +; RV64-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*:]] +; RV64-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; RV64-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV64: [[FOR_BODY_PREHEADER]]: +; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; RV64: [[VECTOR_SCEVCHECK]]: +; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 +; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) +; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] +; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] +; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] +; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 +; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; RV64: [[VECTOR_MEMCHECK]]: +; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 +; RV64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]] +; RV64-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]] +; RV64-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64: [[VECTOR_PH]]: +; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV64-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 +; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV64-NEXT: [[TMP20:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64: [[VECTOR_BODY]]: +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 +; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] +; RV64-NEXT: [[TMP21:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 +; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP22]] +; RV64-NEXT: [[TMP24:%.*]] = mul i64 0, [[TMP18]] +; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP18]], 1 +; RV64-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP25]] +; RV64-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP24]] +; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP26]] +; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP28]], align 4 +; RV64-NEXT: [[REVERSE:%.*]] = 
call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]]) +; RV64-NEXT: [[TMP29:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1) +; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] +; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP18]] +; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP18]], 1 +; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]] +; RV64-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i64 [[TMP31]] +; RV64-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i64 [[TMP33]] +; RV64-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP29]]) +; RV64-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP35]], align 4 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; RV64-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64: [[MIDDLE_BLOCK]]: +; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64: [[SCALAR_PH]]: +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV64-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV64: [[FOR_COND_CLEANUP]]: +; RV64-NEXT: ret void +; RV64: [[FOR_BODY]]: +; +; RV32-LABEL: define void @vector_reverse_i64( +; RV32-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +; RV32-NEXT: [[ENTRY:.*:]] +; RV32-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i32 +; RV32-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i32 +; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV32: [[FOR_BODY_PREHEADER]]: +; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; RV32: [[VECTOR_MEMCHECK]]: +; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4 +; RV32-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4 +; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]] +; RV32-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]] +; RV32-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV32: [[VECTOR_PH]]: +; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP8]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV32-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4 +; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV32-NEXT: [[TMP12:%.*]] = sub i32 
[[N]], [[DOTCAST]] +; RV32-NEXT: br label %[[VECTOR_BODY:.*]] +; RV32: [[VECTOR_BODY]]: +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 +; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] +; RV32-NEXT: [[TMP13:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 +; RV32-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 +; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP14]] +; RV32-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32 +; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP16]] +; RV32-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], 1 +; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP18]] +; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 [[TMP17]] +; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] +; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4 +; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1) +; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] +; RV32-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP10]] to i32 +; RV32-NEXT: [[TMP25:%.*]] = mul i32 0, [[TMP24]] +; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], 1 +; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]] +; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP25]] +; RV32-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 [[TMP27]] +; RV32-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP22]]) +; RV32-NEXT: store <vscale x 4 x i32> [[REVERSE4]], ptr [[TMP29]], align 4 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV32: [[MIDDLE_BLOCK]]: +; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV32: [[SCALAR_PH]]: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV32-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV32-NEXT: br label %[[FOR_BODY:.*]] +; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV32: [[FOR_COND_CLEANUP]]: +; RV32-NEXT: ret void +; RV32: [[FOR_BODY]]: +; +; RV64-UF2-LABEL: define void @vector_reverse_i64( +; RV64-UF2-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +; RV64-UF2-NEXT: [[ENTRY:.*:]] +; RV64-UF2-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; RV64-UF2-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV64-UF2: [[FOR_BODY_PREHEADER]]: +; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8 +; RV64-UF2-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; RV64-UF2: [[VECTOR_SCEVCHECK]]: +; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 +; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) +; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] +; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] +; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 +; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; RV64-UF2: [[VECTOR_MEMCHECK]]: +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 +; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]] +; RV64-UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]] +; RV64-UF2-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64-UF2: [[VECTOR_PH]]: +; RV64-UF2-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8 +; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] +; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV64-UF2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 +; RV64-UF2-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; RV64-UF2-NEXT: [[TMP20:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64-UF2: [[VECTOR_BODY]]: +; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-UF2-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 +; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 +; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP23]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP18]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP18]], 1 +; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP25]] +; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] +; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP18]] +; RV64-UF2-NEXT: [[TMP31:%.*]] = sub i64 [[TMP18]], 1 +; RV64-UF2-NEXT: [[TMP32:%.*]] = mul i64 -1, [[TMP31]] +; RV64-UF2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP30]] +; RV64-UF2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i64 [[TMP32]] +; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]]) +; RV64-UF2-NEXT: 
[[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, ptr [[TMP34]], align 4 +; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD4]]) +; RV64-UF2-NEXT: [[TMP35:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP36:%.*]] = add <vscale x 4 x i32> [[REVERSE5]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP23]] +; RV64-UF2-NEXT: [[TMP38:%.*]] = mul i64 0, [[TMP18]] +; RV64-UF2-NEXT: [[TMP39:%.*]] = sub i64 [[TMP18]], 1 +; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i64 -1, [[TMP39]] +; RV64-UF2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i64 [[TMP38]] +; RV64-UF2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i64 [[TMP40]] +; RV64-UF2-NEXT: [[TMP43:%.*]] = mul i64 -1, [[TMP18]] +; RV64-UF2-NEXT: [[TMP44:%.*]] = sub i64 [[TMP18]], 1 +; RV64-UF2-NEXT: [[TMP45:%.*]] = mul i64 -1, [[TMP44]] +; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i64 [[TMP43]] +; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP46]], i64 [[TMP45]] +; RV64-UF2-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP35]]) +; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE6]], ptr [[TMP42]], align 4 +; RV64-UF2-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP36]]) +; RV64-UF2-NEXT: store <vscale x 4 x i32> [[REVERSE7]], ptr [[TMP47]], align 4 +; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV64-UF2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-UF2: [[MIDDLE_BLOCK]]: +; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-UF2: [[SCALAR_PH]]: +; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV64-UF2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] +; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV64-UF2: [[FOR_COND_CLEANUP]]: +; RV64-UF2-NEXT: ret void +; RV64-UF2: [[FOR_BODY]]: ; entry: %cmp7 = icmp sgt i32 %n, 0 @@ -423,390 +478,259 @@ for.body: ; preds = %for.body.preheader, } define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) { -; CHECK-LABEL: 'vector_reverse_f32' -; CHECK-NEXT: LV: Loop hints: force=enabled width=vscale x 4 interleave=0 -; CHECK-NEXT: LV: Found a loop: for.body -; CHECK-NEXT: LV: Found an induction variable. -; CHECK-NEXT: LV: Found an induction variable. -; CHECK-NEXT: LV: Found FP op with unsafe algebra. -; CHECK-NEXT: LV: Did not find one integer induction var. -; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)! 
-; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Found trip count: 0 -; CHECK-NEXT: LV: Found maximum trip count: 4294967295 -; CHECK-NEXT: LV: Scalable vectorization is available -; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. -; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Using user VF vscale x 4. 
-; CHECK-NEXT: Creating VPBasicBlock for for.body -; CHECK-NEXT: VPlan 'Plain CFG -; CHECK-NEXT: for UF>=1' { -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body.preheader>: -; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: Successor(s): for.body -; CHECK-EMPTY: -; CHECK-NEXT: for.body: -; CHECK-NEXT: WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1> -; CHECK-NEXT: EMIT-SCALAR ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom> -; CHECK-NEXT: EMIT ir<%1> = load ir<%arrayidx> -; CHECK-NEXT: EMIT ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> -; CHECK-NEXT: EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom> -; CHECK-NEXT: EMIT store ir<%conv1>, ir<%arrayidx3> -; CHECK-NEXT: EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1> -; CHECK-NEXT: EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1> -; CHECK-NEXT: EMIT branch-on-cond ir<%cmp> -; CHECK-NEXT: Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count -; CHECK-NEXT: vp<%3> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body.preheader>: -; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64) -; CHECK-NEXT: Successor(s): scalar.ph, vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1> -; CHECK-NEXT: vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1> -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: <x1> vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1> -; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%8>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0> -; CHECK-NEXT: WIDEN ir<%1> = load vp<%9> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0> -; CHECK-NEXT: WIDEN store vp<%10>, ir<%conv1> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK-EMPTY: -; CHECK-NEXT: 
middle.block: -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ] -; CHECK-NEXT: Successor(s): ir-bb<for.body> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph) -; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: IR %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: IR %conv1 = fadd float %1, 1.000000e+00 -; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: IR store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV(REG): Calculating max register usage: -; CHECK-NEXT: LV(REG): At #0 Interval # 0 -; CHECK-NEXT: LV(REG): At #1 Interval # 1 -; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 2 -; CHECK-NEXT: LV(REG): At #4 Interval # 2 -; CHECK-NEXT: LV(REG): At #5 Interval # 2 -; CHECK-NEXT: LV(REG): At #6 Interval # 3 -; CHECK-NEXT: LV(REG): At #7 
Interval # 3 -; CHECK-NEXT: LV(REG): At #8 Interval # 3 -; CHECK-NEXT: LV(REG): At #9 Interval # 3 -; CHECK-NEXT: LV(REG): At #10 Interval # 3 -; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 2 -; CHECK-NEXT: LV(REG): At #13 Interval # 2 -; CHECK-NEXT: LV(REG): VF = vscale x 4 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers -; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class -; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 26 -; CHECK-NEXT: LV: IC is 1 -; CHECK-NEXT: LV: VF is vscale x 4 -; CHECK-NEXT: LV: Not Interleaving. -; CHECK-NEXT: LV: Interleaving is not beneficial. -; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin> -; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 -; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { -; CHECK-NEXT: Live-in ir<%18> = VF -; CHECK-NEXT: Live-in ir<%18>.1 = VF * UF -; CHECK-NEXT: Live-in ir<%n.vec> = vector-trip-count -; CHECK-NEXT: Live-in ir<%0> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body.preheader>: -; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.scevcheck>: -; CHECK-NEXT: IR %3 = add nsw i64 %0, -1 -; CHECK-NEXT: IR %4 = add i32 %n, -1 -; CHECK-NEXT: IR %5 = trunc i64 %3 to i32 -; CHECK-NEXT: IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) -; CHECK-NEXT: IR %mul.result = extractvalue { i32, i1 } %mul, 0 -; CHECK-NEXT: IR %mul.overflow = extractvalue { i32, i1 } %mul, 1 -; CHECK-NEXT: IR %6 = sub i32 %4, %mul.result -; CHECK-NEXT: IR %7 = icmp ugt i32 %6, %4 -; CHECK-NEXT: IR %8 = or i1 %7, %mul.overflow -; CHECK-NEXT: IR %9 = icmp ugt i64 %3, 4294967295 -; CHECK-NEXT: IR %10 = or i1 %8, %9 -; CHECK-NEXT: EMIT branch-on-cond ir<%10> -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.memcheck>: -; CHECK-NEXT: IR %11 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %12 = mul nuw i64 %11, 4 -; CHECK-NEXT: IR %13 = mul i64 %12, 4 -; CHECK-NEXT: IR %14 = sub i64 %B1, %A2 -; CHECK-NEXT: IR %diff.check = icmp ult i64 %14, %13 -; CHECK-NEXT: EMIT branch-on-cond ir<%diff.check> -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.ph>: -; CHECK-NEXT: IR %15 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %16 = mul nuw i64 %15, 4 -; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, %16 -; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf -; CHECK-NEXT: IR %17 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: IR %18 = mul nuw i64 %17, 4 -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1> -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: 
vp<%5> = DERIVED-IV ir<%n> + vp<%index> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<%6> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18> -; CHECK-NEXT: WIDEN ir<%19> = load vp<%6> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<%7> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18> -; CHECK-NEXT: WIDEN store vp<%7>, ir<%conv1> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1 -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%4>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ] -; CHECK-NEXT: Successor(s): ir-bb<for.body> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: IR %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: IR %19 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: IR %conv1 = fadd float %19, 1.000000e+00 -; CHECK-NEXT: IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: IR store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: IR %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.body.preheader> in BB: for.body.preheader -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: for.body.preheader: ; preds = %entry -; CHECK-NEXT: %0 = zext i32 %n to i64 -; CHECK-NEXT: %1 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %2 = mul nuw i64 %1, 4 -; CHECK-NEXT: %min.iters.check = icmp ult i64 %0, %2 -; CHECK-NEXT: br i1 %min.iters.check, label %scalar.ph, label %vector.ph -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.scevcheck> in BB: vector.scevcheck -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: vector.scevcheck: ; No predecessors! 
-; CHECK-NEXT: %3 = add nsw i64 %0, -1 -; CHECK-NEXT: %4 = add i32 %n, -1 -; CHECK-NEXT: %5 = trunc i64 %3 to i32 -; CHECK-NEXT: %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5) -; CHECK-NEXT: %mul.result = extractvalue { i32, i1 } %mul, 0 -; CHECK-NEXT: %mul.overflow = extractvalue { i32, i1 } %mul, 1 -; CHECK-NEXT: %6 = sub i32 %4, %mul.result -; CHECK-NEXT: %7 = icmp ugt i32 %6, %4 -; CHECK-NEXT: %8 = or i1 %7, %mul.overflow -; CHECK-NEXT: %9 = icmp ugt i64 %3, 4294967295 -; CHECK-NEXT: %10 = or i1 %8, %9 -; CHECK-NEXT: br i1 %10, <null operand!>, <null operand!> -; CHECK-NEXT: LV: draw edge from for.body.preheader -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.memcheck> in BB: vector.memcheck -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: vector.memcheck: ; No predecessors! -; CHECK-NEXT: %11 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %12 = mul nuw i64 %11, 4 -; CHECK-NEXT: %13 = mul i64 %12, 4 -; CHECK-NEXT: %14 = sub i64 %B1, %A2 -; CHECK-NEXT: %diff.check = icmp ult i64 %14, %13 -; CHECK-NEXT: br i1 %diff.check, <null operand!>, <null operand!> -; CHECK-NEXT: LV: draw edge from vector.scevcheck -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<vector.ph> in BB: vector.ph -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: vector.ph: ; No predecessors! -; CHECK-NEXT: %15 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %16 = mul nuw i64 %15, 4 -; CHECK-NEXT: %n.mod.vf = urem i64 %0, %16 -; CHECK-NEXT: %n.vec = sub i64 %0, %n.mod.vf -; CHECK-NEXT: %17 = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %18 = mul nuw i64 %17, 4 -; CHECK-NEXT: %19 = sub i64 %0, %n.vec -; CHECK-NEXT: %.cast = trunc i64 %n.vec to i32 -; CHECK-NEXT: %20 = sub i32 %n, %.cast -; CHECK-NEXT: br -; CHECK-NEXT: LV: draw edge from vector.memcheck -; CHECK-NEXT: LV: created vector.body -; CHECK-NEXT: LV: draw edge from vector.ph -; CHECK-NEXT: LV: vectorizing VPBB: vector.body in BB: vector.body -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: vector.body: ; preds = %vector.body, %vector.ph -; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ] -; CHECK-NEXT: %.cast3 = trunc i64 %index to i32 -; CHECK-NEXT: %offset.idx = sub i32 %n, %.cast3 -; CHECK-NEXT: %21 = add nsw i32 %offset.idx, -1 -; CHECK-NEXT: %22 = zext i32 %21 to i64 -; CHECK-NEXT: %23 = getelementptr inbounds float, ptr %B, i64 %22 -; CHECK-NEXT: %24 = mul i64 0, %18 -; CHECK-NEXT: %25 = sub i64 %18, 1 -; CHECK-NEXT: %26 = mul i64 -1, %25 -; CHECK-NEXT: %27 = getelementptr inbounds float, ptr %23, i64 %24 -; CHECK-NEXT: %28 = getelementptr inbounds float, ptr %27, i64 %26 -; CHECK-NEXT: %wide.load = load <vscale x 4 x float>, ptr %28, align 4 -; CHECK-NEXT: %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load) -; CHECK-NEXT: %29 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00) -; CHECK-NEXT: %30 = getelementptr inbounds float, ptr %A, i64 %22 -; CHECK-NEXT: %31 = mul i64 0, %18 -; CHECK-NEXT: %32 = sub i64 %18, 1 -; CHECK-NEXT: %33 = mul i64 -1, %32 -; CHECK-NEXT: %34 = getelementptr inbounds float, ptr %30, i64 %31 -; CHECK-NEXT: %35 = getelementptr inbounds float, ptr %34, i64 %33 -; CHECK-NEXT: %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %29) -; CHECK-NEXT: store <vscale x 4 x float> %reverse4, ptr %35, align 4 -; CHECK-NEXT: %index.next = add nuw i64 %index, %18 -; CHECK-NEXT: %36 = icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 %36, <null operand!>, label %vector.body -; CHECK-NEXT: LV: created middle.block -; CHECK-NEXT: LV: draw edge from 
vector.body -; CHECK-NEXT: LV: vectorizing VPBB: middle.block in BB: middle.block -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: middle.block: ; preds = %vector.body -; CHECK-NEXT: %cmp.n = icmp eq i64 %0, %n.vec -; CHECK-NEXT: br i1 %cmp.n, <null operand!>, <null operand!> -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.cond.cleanup.loopexit> in BB: for.cond.cleanup.loopexit -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: for.cond.cleanup.loopexit: ; preds = %for.body -; CHECK-NEXT: br label %for.cond.cleanup -; CHECK-NEXT: LV: draw edge from middle.block -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<scalar.ph> in BB: scalar.ph -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: scalar.ph: ; preds = %for.body.preheader -; CHECK-NEXT: %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ] -; CHECK-NEXT: %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ] -; CHECK-NEXT: br label %for.body -; CHECK-NEXT: LV: draw edge from middle.block -; CHECK-NEXT: LV: draw edge from for.body.preheader -; CHECK-NEXT: LV: draw edge from vector.scevcheck -; CHECK-NEXT: LV: draw edge from vector.memcheck -; CHECK-NEXT: LV: vectorizing VPBB: ir-bb<for.body> in BB: for.body -; CHECK-NEXT: LV: filled BB: -; CHECK-NEXT: for.body: ; preds = %for.body, %scalar.ph -; CHECK-NEXT: %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ] -; CHECK-NEXT: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: %37 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: %conv1 = fadd float %37, 1.000000e+00 -; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: draw edge from scalar.ph -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: LV: Vectorizing: innermost loop. 
+; RV64-LABEL: define void @vector_reverse_f32( +; RV64-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*:]] +; RV64-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; RV64-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; RV64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV64: [[FOR_BODY_PREHEADER]]: +; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; RV64: [[VECTOR_SCEVCHECK]]: +; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 +; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; RV64-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; RV64-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) +; RV64-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV64-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV64-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] +; RV64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] +; RV64-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] +; RV64-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 +; RV64-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; RV64-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; RV64: [[VECTOR_MEMCHECK]]: +; RV64-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 +; RV64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; RV64-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]] +; RV64-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]] +; RV64-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64: [[VECTOR_PH]]: +; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV64-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 +; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV64-NEXT: [[TMP20:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64: [[VECTOR_BODY]]: +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 +; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] +; RV64-NEXT: [[TMP21:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 +; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP22]] +; RV64-NEXT: [[TMP24:%.*]] = mul i64 0, [[TMP18]] +; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP18]], 1 +; RV64-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP25]] +; RV64-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP24]] +; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP26]] +; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP28]], align 4 +; RV64-NEXT: [[REVERSE:%.*]] = call 
<vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]]) +; RV64-NEXT: [[TMP29:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00) +; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] +; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP18]] +; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP18]], 1 +; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]] +; RV64-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP30]], i64 [[TMP31]] +; RV64-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP33]] +; RV64-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP29]]) +; RV64-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP35]], align 4 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; RV64-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; RV64: [[MIDDLE_BLOCK]]: +; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64: [[SCALAR_PH]]: +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV64-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV64: [[FOR_COND_CLEANUP]]: +; RV64-NEXT: ret void +; RV64: [[FOR_BODY]]: +; +; RV32-LABEL: define void @vector_reverse_f32( +; RV32-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +; RV32-NEXT: [[ENTRY:.*:]] +; RV32-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i32 +; RV32-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i32 +; RV32-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV32: [[FOR_BODY_PREHEADER]]: +; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; RV32: [[VECTOR_MEMCHECK]]: +; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4 +; RV32-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4 +; RV32-NEXT: [[TMP6:%.*]] = sub i32 [[B1]], [[A2]] +; RV32-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]] +; RV32-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV32: [[VECTOR_PH]]: +; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP8]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV32-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4 +; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; 
RV32-NEXT: [[TMP12:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV32-NEXT: br label %[[VECTOR_BODY:.*]] +; RV32: [[VECTOR_BODY]]: +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 +; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] +; RV32-NEXT: [[TMP13:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 +; RV32-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 +; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] +; RV32-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32 +; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP16]] +; RV32-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], 1 +; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP18]] +; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 [[TMP17]] +; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] +; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP21]], align 4 +; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP22:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00) +; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] +; RV32-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP10]] to i32 +; RV32-NEXT: [[TMP25:%.*]] = mul i32 0, [[TMP24]] +; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], 1 +; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]] +; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP25]] +; RV32-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i32 [[TMP27]] +; RV32-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP22]]) +; RV32-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP29]], align 4 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; RV32: [[MIDDLE_BLOCK]]: +; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV32: [[SCALAR_PH]]: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV32-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV32-NEXT: br label %[[FOR_BODY:.*]] +; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV32: [[FOR_COND_CLEANUP]]: +; RV32-NEXT: ret void +; RV32: [[FOR_BODY]]: +; +; RV64-UF2-LABEL: define void @vector_reverse_f32( +; RV64-UF2-SAME: ptr noundef writeonly captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +; RV64-UF2-NEXT: [[ENTRY:.*:]] +; RV64-UF2-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; RV64-UF2-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; RV64-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0 +; RV64-UF2-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; RV64-UF2: [[FOR_BODY_PREHEADER]]: +; RV64-UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; RV64-UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: 
[[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8 +; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; RV64-UF2: [[VECTOR_SCEVCHECK]]: +; RV64-UF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 +; RV64-UF2-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; RV64-UF2-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; RV64-UF2-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP5]]) +; RV64-UF2-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV64-UF2-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV64-UF2-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[MUL_RESULT]] +; RV64-UF2-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP4]] +; RV64-UF2-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] +; RV64-UF2-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 +; RV64-UF2-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; RV64-UF2-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; RV64-UF2: [[VECTOR_MEMCHECK]]: +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 +; RV64-UF2-NEXT: [[TMP14:%.*]] = sub i64 [[B1]], [[A2]] +; RV64-UF2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]] +; RV64-UF2-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; RV64-UF2: [[VECTOR_PH]]: +; RV64-UF2-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8 +; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] +; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV64-UF2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 +; RV64-UF2-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; RV64-UF2-NEXT: [[TMP20:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; RV64-UF2-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i32 [[N]], [[DOTCAST]] +; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64-UF2: [[VECTOR_BODY]]: +; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-UF2-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 +; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 +; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP23]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP18]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP18]], 1 +; RV64-UF2-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP25]] +; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] +; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP18]] +; RV64-UF2-NEXT: [[TMP31:%.*]] = sub i64 [[TMP18]], 1 +; RV64-UF2-NEXT: [[TMP32:%.*]] = mul i64 -1, [[TMP31]] +; RV64-UF2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP30]] +; RV64-UF2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP33]], i64 [[TMP32]] +; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> 
@llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]]) +; RV64-UF2-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP34]], align 4 +; RV64-UF2-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD4]]) +; RV64-UF2-NEXT: [[TMP35:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP36:%.*]] = fadd <vscale x 4 x float> [[REVERSE5]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; RV64-UF2-NEXT: [[TMP38:%.*]] = mul i64 0, [[TMP18]] +; RV64-UF2-NEXT: [[TMP39:%.*]] = sub i64 [[TMP18]], 1 +; RV64-UF2-NEXT: [[TMP40:%.*]] = mul i64 -1, [[TMP39]] +; RV64-UF2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP37]], i64 [[TMP38]] +; RV64-UF2-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP40]] +; RV64-UF2-NEXT: [[TMP43:%.*]] = mul i64 -1, [[TMP18]] +; RV64-UF2-NEXT: [[TMP44:%.*]] = sub i64 [[TMP18]], 1 +; RV64-UF2-NEXT: [[TMP45:%.*]] = mul i64 -1, [[TMP44]] +; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP37]], i64 [[TMP43]] +; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP46]], i64 [[TMP45]] +; RV64-UF2-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP35]]) +; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE6]], ptr [[TMP42]], align 4 +; RV64-UF2-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP36]]) +; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE7]], ptr [[TMP47]], align 4 +; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV64-UF2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; RV64-UF2: [[MIDDLE_BLOCK]]: +; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-UF2: [[SCALAR_PH]]: +; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV64-UF2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] +; RV64-UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; RV64-UF2-NEXT: br label %[[FOR_COND_CLEANUP]] +; RV64-UF2: [[FOR_COND_CLEANUP]]: +; RV64-UF2-NEXT: ret void +; RV64-UF2: [[FOR_BODY]]: ; entry: %cmp7 = icmp sgt i32 %n, 0 @@ -834,8 +758,397 @@ for.body: ; preds = %for.body.preheader, br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 } -!0 = distinct !{!0, !1, !2, !3, !4} -!1 = !{!"llvm.loop.mustprogress"} -!2 = !{!"llvm.loop.vectorize.width", i32 4} -!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} -!4 = !{!"llvm.loop.vectorize.enable", i1 true} +define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) { +; RV64-LABEL: define void @vector_reverse_f32_simplify( +; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = 
icmp ult i64 1023, [[TMP1]] +; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV64: [[VECTOR_PH]]: +; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] +; RV64-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64: [[VECTOR_BODY]]: +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; RV64-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP10:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP10]] +; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP9]] +; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]] +; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4 +; RV64-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]]) +; RV64-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00) +; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP17]] +; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] +; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP18]] +; RV64-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP14]]) +; RV64-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP20]], align 4 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; RV64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV64: [[MIDDLE_BLOCK]]: +; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV64-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV64: [[SCALAR_PH]]: +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_BODY]]: +; +; RV32-LABEL: define void @vector_reverse_f32_simplify( +; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; RV32-NEXT: [[ENTRY:.*]]: +; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] +; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV32: [[VECTOR_PH]]: +; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; RV32-NEXT: [[TMP6:%.*]] = sub i64 
1023, [[N_VEC]] +; RV32-NEXT: br label %[[VECTOR_BODY:.*]] +; RV32: [[VECTOR_BODY]]: +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; RV32-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP10:%.*]] = mul i32 0, [[TMP9]] +; RV32-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], 1 +; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP11]] +; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 [[TMP10]] +; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]] +; RV32-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4 +; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00) +; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] +; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP17]], 1 +; RV32-NEXT: [[TMP20:%.*]] = mul i32 -1, [[TMP19]] +; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] +; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 [[TMP20]] +; RV32-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP15]]) +; RV32-NEXT: store <vscale x 4 x float> [[REVERSE1]], ptr [[TMP22]], align 4 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; RV32-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32: [[MIDDLE_BLOCK]]: +; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV32: [[SCALAR_PH]]: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV32-NEXT: br label %[[FOR_BODY:.*]] +; RV32: [[FOR_BODY]]: +; +; RV64-UF2-LABEL: define void @vector_reverse_f32_simplify( +; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; RV64-UF2-NEXT: [[ENTRY:.*]]: +; RV64-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 +; RV64-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] +; RV64-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV64-UF2: [[VECTOR_PH]]: +; RV64-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; RV64-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] +; RV64-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV64-UF2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; RV64-UF2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; RV64-UF2-NEXT: [[TMP7:%.*]] = sub i64 1023, [[N_VEC]] +; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64-UF2: [[VECTOR_BODY]]: +; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-UF2-NEXT: 
[[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP11:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP11]] +; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP16]] +; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP15]] +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]] +; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4 +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]]) +; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP19]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1]]) +; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP21:%.*]] = fadd <vscale x 4 x float> [[REVERSE2]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP25:%.*]] = mul i64 -1, [[TMP24]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP23]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP25]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP29:%.*]] = sub i64 [[TMP5]], 1 +; RV64-UF2-NEXT: [[TMP30:%.*]] = mul i64 -1, [[TMP29]] +; RV64-UF2-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; RV64-UF2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP31]], i64 [[TMP30]] +; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP20]]) +; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE3]], ptr [[TMP27]], align 4 +; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP21]]) +; RV64-UF2-NEXT: store <vscale x 4 x float> [[REVERSE4]], ptr [[TMP32]], align 4 +; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; RV64-UF2-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV64-UF2: [[MIDDLE_BLOCK]]: +; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV64-UF2: [[SCALAR_PH]]: +; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] +; RV64-UF2: [[FOR_BODY]]: +; +entry: + br label %for.body + +for.body: + %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ] + %iv.next = add nsw i64 %dec.iv, -1 + %arrayidx.b = getelementptr inbounds float, ptr %B, i64 %iv.next + %0 = load float, ptr %arrayidx.b, align 4 + 
%fadd = fadd float %0, 1.000000e+00 + %arrayidx.a = getelementptr inbounds float, ptr %A, i64 %iv.next + store float %fadd, ptr %arrayidx.a, align 4 + %cmp = icmp ugt i64 %dec.iv, 1 + br i1 %cmp, label %for.body, label %exit, !llvm.loop !0 + +exit: + ret void +} + +define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { +; RV64-LABEL: define void @vector_reverse_irregular_type( +; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV64: [[VECTOR_PH]]: +; RV64-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64: [[VECTOR_BODY]]: +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; RV64-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1 +; RV64-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2 +; RV64-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3 +; RV64-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP0]], -1 +; RV64-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1 +; RV64-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1 +; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1 +; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP4]] +; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]] +; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]] +; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]] +; RV64-NEXT: [[TMP12:%.*]] = load i7, ptr [[TMP8]], align 1 +; RV64-NEXT: [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1 +; RV64-NEXT: [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1 +; RV64-NEXT: [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1 +; RV64-NEXT: [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP12]], i32 0 +; RV64-NEXT: [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1 +; RV64-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2 +; RV64-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3 +; RV64-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1) +; RV64-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP4]] +; RV64-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]] +; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]] +; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]] +; RV64-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 +; RV64-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1 +; RV64-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 +; RV64-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1 +; RV64-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 +; RV64-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1 +; RV64-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 +; RV64-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; RV64-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 +; RV64-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; RV64: [[MIDDLE_BLOCK]]: +; RV64-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV64: [[SCALAR_PH]]: +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV64-NEXT: br label 
%[[FOR_BODY:.*]] +; RV64: [[FOR_BODY]]: +; +; RV32-LABEL: define void @vector_reverse_irregular_type( +; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; RV32-NEXT: [[ENTRY:.*]]: +; RV32-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV32: [[VECTOR_PH]]: +; RV32-NEXT: br label %[[VECTOR_BODY:.*]] +; RV32: [[VECTOR_BODY]]: +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV32-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; RV32-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1 +; RV32-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2 +; RV32-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3 +; RV32-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP0]], -1 +; RV32-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1 +; RV32-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1 +; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1 +; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP4]] +; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]] +; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]] +; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]] +; RV32-NEXT: [[TMP12:%.*]] = load i7, ptr [[TMP8]], align 1 +; RV32-NEXT: [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1 +; RV32-NEXT: [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1 +; RV32-NEXT: [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1 +; RV32-NEXT: [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP12]], i32 0 +; RV32-NEXT: [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1 +; RV32-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2 +; RV32-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3 +; RV32-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1) +; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP4]] +; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]] +; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]] +; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]] +; RV32-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 +; RV32-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1 +; RV32-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 +; RV32-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1 +; RV32-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 +; RV32-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1 +; RV32-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 +; RV32-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; RV32-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 +; RV32-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; RV32: [[MIDDLE_BLOCK]]: +; RV32-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV32: [[SCALAR_PH]]: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV32-NEXT: br label %[[FOR_BODY:.*]] +; RV32: [[FOR_BODY]]: +; +; RV64-UF2-LABEL: define void @vector_reverse_irregular_type( +; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { +; RV64-UF2-NEXT: [[ENTRY:.*]]: +; RV64-UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV64-UF2: 
[[VECTOR_PH]]: +; RV64-UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; RV64-UF2: [[VECTOR_BODY]]: +; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; RV64-UF2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; RV64-UF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2 +; RV64-UF2-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3 +; RV64-UF2-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], -4 +; RV64-UF2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], -5 +; RV64-UF2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -6 +; RV64-UF2-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -7 +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP0]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[TMP1]], -1 +; RV64-UF2-NEXT: [[TMP10:%.*]] = add nsw i64 [[TMP2]], -1 +; RV64-UF2-NEXT: [[TMP11:%.*]] = add nsw i64 [[TMP3]], -1 +; RV64-UF2-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP4]], -1 +; RV64-UF2-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP5]], -1 +; RV64-UF2-NEXT: [[TMP14:%.*]] = add nsw i64 [[TMP6]], -1 +; RV64-UF2-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP7]], -1 +; RV64-UF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP9]] +; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP10]] +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP11]] +; RV64-UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP12]] +; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP14]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP15]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = load i7, ptr [[TMP16]], align 1 +; RV64-UF2-NEXT: [[TMP25:%.*]] = load i7, ptr [[TMP17]], align 1 +; RV64-UF2-NEXT: [[TMP26:%.*]] = load i7, ptr [[TMP18]], align 1 +; RV64-UF2-NEXT: [[TMP27:%.*]] = load i7, ptr [[TMP19]], align 1 +; RV64-UF2-NEXT: [[TMP28:%.*]] = insertelement <4 x i7> poison, i7 [[TMP24]], i32 0 +; RV64-UF2-NEXT: [[TMP29:%.*]] = insertelement <4 x i7> [[TMP28]], i7 [[TMP25]], i32 1 +; RV64-UF2-NEXT: [[TMP30:%.*]] = insertelement <4 x i7> [[TMP29]], i7 [[TMP26]], i32 2 +; RV64-UF2-NEXT: [[TMP31:%.*]] = insertelement <4 x i7> [[TMP30]], i7 [[TMP27]], i32 3 +; RV64-UF2-NEXT: [[TMP32:%.*]] = load i7, ptr [[TMP20]], align 1 +; RV64-UF2-NEXT: [[TMP33:%.*]] = load i7, ptr [[TMP21]], align 1 +; RV64-UF2-NEXT: [[TMP34:%.*]] = load i7, ptr [[TMP22]], align 1 +; RV64-UF2-NEXT: [[TMP35:%.*]] = load i7, ptr [[TMP23]], align 1 +; RV64-UF2-NEXT: [[TMP36:%.*]] = insertelement <4 x i7> poison, i7 [[TMP32]], i32 0 +; RV64-UF2-NEXT: [[TMP37:%.*]] = insertelement <4 x i7> [[TMP36]], i7 [[TMP33]], i32 1 +; RV64-UF2-NEXT: [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2 +; RV64-UF2-NEXT: [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3 +; RV64-UF2-NEXT: [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1) +; RV64-UF2-NEXT: [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1) +; RV64-UF2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP9]] +; RV64-UF2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP10]] +; RV64-UF2-NEXT: [[TMP45:%.*]] = 
getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]] +; RV64-UF2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP12]] +; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]] +; RV64-UF2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]] +; RV64-UF2-NEXT: [[TMP50:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0 +; RV64-UF2-NEXT: store i7 [[TMP50]], ptr [[TMP42]], align 1 +; RV64-UF2-NEXT: [[TMP51:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1 +; RV64-UF2-NEXT: store i7 [[TMP51]], ptr [[TMP43]], align 1 +; RV64-UF2-NEXT: [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2 +; RV64-UF2-NEXT: store i7 [[TMP52]], ptr [[TMP44]], align 1 +; RV64-UF2-NEXT: [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3 +; RV64-UF2-NEXT: store i7 [[TMP53]], ptr [[TMP45]], align 1 +; RV64-UF2-NEXT: [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0 +; RV64-UF2-NEXT: store i7 [[TMP54]], ptr [[TMP46]], align 1 +; RV64-UF2-NEXT: [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1 +; RV64-UF2-NEXT: store i7 [[TMP55]], ptr [[TMP47]], align 1 +; RV64-UF2-NEXT: [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2 +; RV64-UF2-NEXT: store i7 [[TMP56]], ptr [[TMP48]], align 1 +; RV64-UF2-NEXT: [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3 +; RV64-UF2-NEXT: store i7 [[TMP57]], ptr [[TMP49]], align 1 +; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; RV64-UF2-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016 +; RV64-UF2-NEXT: br i1 [[TMP58]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; RV64-UF2: [[MIDDLE_BLOCK]]: +; RV64-UF2-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV64-UF2: [[SCALAR_PH]]: +; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]] +; RV64-UF2: [[FOR_BODY]]: +; +entry: + br label %for.body + +for.body: + %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ] + %iv.next = add nsw i64 %dec.iv, -1 + %arrayidx.b = getelementptr inbounds i7, ptr %B, i64 %iv.next + %0 = load i7, ptr %arrayidx.b, align 1 + %add = add i7 %0, 1 + %arrayidx.a = getelementptr inbounds i7, ptr %A, i64 %iv.next + store i7 %add, ptr %arrayidx.a, align 1 + %cmp = icmp ugt i64 %dec.iv, 1 + br i1 %cmp, label %for.body, label %exit, !llvm.loop !4 + +exit: + ret void +} + +!0 = distinct !{!0, !1, !2, !3} +!1 = !{!"llvm.loop.vectorize.width", i32 4} +!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} +!4 = distinct !{!4, !1, !3} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll index ff9c585..b046f61 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll @@ -24,12 +24,16 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 
0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -46,7 +50,7 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -87,15 +91,19 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> 
[[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: @@ -109,7 +117,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: store i64 [[V]], ptr [[AADDR]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -146,20 +154,24 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) -; CHECK-NEXT: [[TMP11]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[VEC_PHI]], 
[[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP11]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[VEC_PHI]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP11]]) ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] @@ -175,7 +187,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]] ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] @@ -217,13 +229,17 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: @@ -235,7 +251,7 @@ define void 
@splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -272,14 +288,18 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: @@ -292,7 +312,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -363,15 +383,19 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1024) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 1024, 
[[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: @@ -385,7 +409,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll index b4afdd7..cd53ea0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll @@ -1,17 +1,17 @@ ; REQUIRES: asserts -; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ -; RUN: -mtriple riscv64-linux-gnu -mattr=+v,+f -S -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data \ +; RUN: -mtriple riscv64-linux-gnu -mattr=+v,+f -S -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=DATA ; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ ; RUN: -mtriple riscv64-linux-gnu -force-tail-folding-style=data-with-evl -mattr=+v,+f -S \ ; RUN: -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=EVL -; CHECK: Cost of 2 for VF 2: EMIT{{.*}} = active lane mask -; CHECK: Cost of 4 for VF 4: EMIT{{.*}} = active lane mask -; CHECK: Cost of 8 for VF 8: EMIT{{.*}} = active lane mask -; CHECK: Cost of 2 for VF vscale x 1: EMIT{{.*}} = active lane mask -; CHECK: Cost of 4 for VF vscale x 2: 
EMIT{{.*}} = active lane mask -; CHECK: Cost of 8 for VF vscale x 4: EMIT{{.*}} = active lane mask +; DATA: Cost of 2 for VF 2: EMIT{{.*}} = active lane mask +; DATA: Cost of 4 for VF 4: EMIT{{.*}} = active lane mask +; DATA: Cost of 8 for VF 8: EMIT{{.*}} = active lane mask +; DATA: Cost of 2 for VF vscale x 1: EMIT{{.*}} = active lane mask +; DATA: Cost of 4 for VF vscale x 2: EMIT{{.*}} = active lane mask +; DATA: Cost of 8 for VF vscale x 4: EMIT{{.*}} = active lane mask ; EVL: Cost of 1 for VF vscale x 1: EMIT{{.*}} = EXPLICIT-VECTOR-LENGTH ; EVL: Cost of 1 for VF vscale x 2: EMIT{{.*}} = EXPLICIT-VECTOR-LENGTH diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 528cec0..b56e712 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -170,15 +170,11 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP13]] -; CHECK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT4]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i32> [[VEC_IV]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP15]], i32 9) -; CHECK-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> zeroinitializer -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[BROADCAST_SPLAT2]], i32 1, <vscale x 4 x i1> [[TMP11]]) +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i32 9, [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> align 1 [[BROADCAST_SPLAT2]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]]) +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP6]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -199,7 +195,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[ADD]] = add i8 [[F_039]], 1 ; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[F_039]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 8 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; 
CHECK-NEXT: ret void ; @@ -298,7 +294,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> [[BROADCAST_SPLAT6]], i32 8, <vscale x 2 x i1> [[TMP8]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -319,7 +315,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[V]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -359,8 +355,9 @@ attributes #1 = { "target-features"="+64bit,+v" } ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META7:![0-9]+]], [[META2]]} +; CHECK: [[META7]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 8baf9d9..c6955f1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -2,9 +2,6 @@ ; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=0 -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=SCALABLE ; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=FIXEDLEN ; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=0 -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=TF-SCALABLE -; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=TF-FIXEDLEN - - target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" target triple = "riscv64" @@ -103,15 +100,19 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) +; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; TF-SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B]], align 8 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]]) +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = zext i32 [[TMP6]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: @@ -126,44 +127,10 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; TF-SCALABLE-NEXT: 
store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: ret void ; -; TF-FIXEDLEN-LABEL: define void @uniform_load( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; TF-FIXEDLEN: [[VECTOR_PH]]: -; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]] -; TF-FIXEDLEN: [[VECTOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025) -; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = load i64, ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body @@ -277,22 +244,6 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ] ; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]] ; -; TF-FIXEDLEN-LABEL: define i64 @uniform_load_outside_use( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[V:%.*]] 
= load i64, ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ] -; TF-FIXEDLEN-NEXT: ret i64 [[V_LCSSA]] -; entry: br label %for.body @@ -437,25 +388,31 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64() ; TF-SCALABLE-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], splat (i64 1) ; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 1025) +; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], splat (i64 1024) ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], splat (i64 10) ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer -; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> poison) +; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.vp.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP7]]) ; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, 
ptr [[TMP12]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr [[TMP13]], i32 8, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr align 8 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]]) +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]] +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[SCALAR_PH]]: @@ -474,55 +431,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: ret void ; -; TF-FIXEDLEN-LABEL: define void @conditional_uniform_load( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; TF-FIXEDLEN: [[VECTOR_PH]]: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]] -; TF-FIXEDLEN: [[VECTOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025) -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], splat (i64 10) -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer -; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison) -; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP4]], i32 8, <4 x i1> 
[[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] -; TF-FIXEDLEN-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10 -; TF-FIXEDLEN-NEXT: br i1 [[CMP]], label %[[DO_LOAD:.*]], label %[[LATCH]] -; TF-FIXEDLEN: [[DO_LOAD]]: -; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: br label %[[LATCH]] -; TF-FIXEDLEN: [[LATCH]]: -; TF-FIXEDLEN-NEXT: [[PHI:%.*]] = phi i64 [ 0, %[[FOR_BODY]] ], [ [[V]], %[[DO_LOAD]] ] -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body @@ -640,17 +552,21 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) +; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; TF-SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B]], align 1 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]]) +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = zext i32 [[TMP6]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] +; 
TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[SCALAR_PH]]: @@ -663,44 +579,10 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: ret void ; -; TF-FIXEDLEN-LABEL: define void @uniform_load_unaligned( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; TF-FIXEDLEN: [[VECTOR_PH]]: -; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]] -; TF-FIXEDLEN: [[VECTOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025) -; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = load i64, ptr [[B]], align 1 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 1 -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body @@ -813,15 +695,19 @@ 
define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) +; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8 ; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP5]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]] +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[SCALAR_PH]]: @@ -834,44 +720,10 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: ret void ; -; TF-FIXEDLEN-LABEL: define void @uniform_store( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; TF-FIXEDLEN: [[VECTOR_PH]]: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]] -; TF-FIXEDLEN: [[VECTOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025) -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body @@ -1003,22 +855,27 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1) ; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) -; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64 +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = 
mul i64 1, [[TMP13]] +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer +; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]]) ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]]) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]] +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[SCALAR_PH]]: @@ -1031,71 +888,10 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: ret void ; -; TF-FIXEDLEN-LABEL: define void @uniform_store_of_loop_varying( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; TF-FIXEDLEN: [[VECTOR_PH]]: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]] -; TF-FIXEDLEN: [[VECTOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025) -; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; TF-FIXEDLEN-NEXT: br i1 [[TMP0]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; TF-FIXEDLEN: [[PRED_STORE_IF]]: -; 
TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: store i64 [[TMP1]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: br label %[[PRED_STORE_CONTINUE]] -; TF-FIXEDLEN: [[PRED_STORE_CONTINUE]]: -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; TF-FIXEDLEN-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; TF-FIXEDLEN: [[PRED_STORE_IF1]]: -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 1 -; TF-FIXEDLEN-NEXT: store i64 [[TMP3]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; TF-FIXEDLEN: [[PRED_STORE_CONTINUE2]]: -; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; TF-FIXEDLEN: [[PRED_STORE_IF3]]: -; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: store i64 [[TMP5]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; TF-FIXEDLEN: [[PRED_STORE_CONTINUE4]]: -; TF-FIXEDLEN-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; TF-FIXEDLEN-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] -; TF-FIXEDLEN: [[PRED_STORE_IF5]]: -; TF-FIXEDLEN-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 -; TF-FIXEDLEN-NEXT: store i64 [[TMP7]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; TF-FIXEDLEN: [[PRED_STORE_CONTINUE6]]: -; TF-FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-FIXEDLEN-NEXT: store i64 [[IV]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body @@ -1240,24 +1036,28 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1) ; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 
0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) +; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]] +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], splat (i64 10) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> zeroinitializer -; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> [[BROADCAST_SPLAT2]], i32 8, <vscale x 2 x i1> [[TMP9]]) +; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 2 x i1> [[TMP10]], i32 [[TMP9]]) ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]]) +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]] +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[SCALAR_PH]]: @@ -1275,55 +1075,10 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] 
= add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: ret void ; -; TF-FIXEDLEN-LABEL: define void @conditional_uniform_store( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; TF-FIXEDLEN: [[VECTOR_PH]]: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]] -; TF-FIXEDLEN: [[VECTOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025) -; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], splat (i64 10) -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer -; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP1]]) -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] -; TF-FIXEDLEN-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10 -; TF-FIXEDLEN-NEXT: br i1 [[CMP]], label %[[DO_STORE:.*]], label %[[LATCH]] -; TF-FIXEDLEN: [[DO_STORE]]: -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: br label %[[LATCH]] -; TF-FIXEDLEN: [[LATCH]]: -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add 
nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body @@ -1442,15 +1197,19 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) +; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1 ; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP5]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]] +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[SCALAR_PH]]: @@ -1463,44 +1222,10 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: ret void ; -; TF-FIXEDLEN-LABEL: define void @uniform_store_unaligned( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; TF-FIXEDLEN: [[VECTOR_PH]]: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement 
<4 x i64> poison, i64 [[V]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]] -; TF-FIXEDLEN: [[VECTOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025) -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1 -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1 -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll index fe6a693..acfcf90 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll @@ -16,48 +16,39 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 +; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer ; 
IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64() ; IF-EVL-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 1) ; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]] ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0 +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP13]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer -; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; IF-EVL-NEXT: [[TMP20:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0 -; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison) -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1 -; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison) -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP24]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison) -; IF-EVL-NEXT: [[TMP25:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER]] +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP23]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP26:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]] -; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; 
IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0 -; IF-EVL-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 4 -; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP31]] -; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP25]], ptr [[TMP29]], i32 4, <vscale x 4 x i1> [[TMP19]]) -; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr [[TMP32]], i32 4, <vscale x 4 x i1> [[TMP20]]) -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] -; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr align 4 [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: @@ -76,50 +67,36 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: for.cond.cleanup: ; IF-EVL-NEXT: ret void ; ; NO-VP-LABEL: @interleave( ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 +; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 ; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP4]], 4 -; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP8]], 2 ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP: vector.body: ; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 -; NO-VP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 -; NO-VP-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] -; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[INDEX]], i32 0 -; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP11]], i32 0 -; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4 -; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } 
@llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; NO-VP-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; NO-VP-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[INDEX]], i32 0
 ; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4
 ; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
 ; NO-VP-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
 ; NO-VP-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
-; NO-VP-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP15]]
 ; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP18]]
 ; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
-; NO-VP-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT: [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 4
-; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP26]]
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP27]], align 4
-; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP24]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; NO-VP: middle.block:
@@ -163,6 +140,5 @@ for.cond.cleanup:
   ret void
 }
 
-!0 = distinct !{!0, !1, !2}
-!1 = !{!"llvm.loop.interleave.count", i32 2}
-!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
new file mode 100644
index 0000000..d7c9ce4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
@@ -0,0 +1,80 @@
+; This is the C++ loop being vectorized in this file with
+; vector.reverse:
+;   #pragma clang loop vectorize_width(4, scalable)
+;   for (int i = N-1; i >= 0; --i)
+;     a[i] = b[i] + 1;
+
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v \
+; RUN: -debug-only=loop-vectorize -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+
+define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) {
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: vp<[[OTC:%.+]]> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[OTC]]> = EXPAND SCEV (1 + (-1 * (1 umin %n))<nuw><nsw> + %n)
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: vp<[[RESUME_IV_A:%.+]]> = DERIVED-IV ir<%n> + 
vp<[[VTC]]> * ir<-1> +; CHECK-NEXT: vp<[[RESUME_IV_B:%.+]]> = DERIVED-IV ir<%n> + vp<[[VTC]]> * ir<-1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[INDUCTION:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[INDUCTION]]> * ir<-1> +; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>, vp<[[VF]]> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[SCALAR_STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX_PROM:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_B:%.+]]> = getelementptr inbounds ir<[[B:%.+]]>, ir<[[IDX_PROM]]> +; CHECK-NEXT: vp<[[VEC_END_PTR_B:%.+]]> = vector-end-pointer inbounds ir<[[ARRAY_IDX_B]]>, vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[VAL_B:%.+]]> = load vp<[[VEC_END_PTR_B]]> +; CHECK-NEXT: WIDEN ir<[[ADD_RESULT:%.+]]> = add ir<[[VAL_B]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_A:%.+]]> = getelementptr inbounds ir<[[A:%.+]]>, ir<[[IDX_PROM]]> +; CHECK-NEXT: vp<[[VEC_END_PTR_A:%.+]]> = vector-end-pointer inbounds ir<[[ARRAY_IDX_A]]>, vp<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[VEC_END_PTR_A]]>, ir<[[ADD_RESULT]]> +; CHECK-NEXT: EMIT vp<[[INDEX_NEXT]]> = add nuw vp<[[INDUCTION]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[INDEX_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[OTC]]>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<for.cond.cleanup>: +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[RESUME_IV_A]]>, middle.block ], [ ir<%n>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<[[RESUME_IV_B]]>, middle.block ], [ ir<%n>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<for.body> +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i32 [ %n, %entry ], [ %indvars.iv.next, %for.body ] + %i.0.in8 = phi i32 [ %n, %entry ], [ %i.0, %for.body ] + %i.0 = add nsw i32 %i.0.in8, -1 + %idxprom = zext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom + %1 = load i32, ptr %arrayidx, align 4 + %add9 = add i32 %1, 1 + %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom + store i32 %add9, ptr %arrayidx3, align 4 + %cmp = icmp ugt i32 %indvars.iv, 1 + %indvars.iv.next = add nsw i32 %indvars.iv, -1 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll index bb61f431d..9652351 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll @@ -22,7 +22,7 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = 
EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]] ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -78,7 +78,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -134,7 +134,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -190,7 +190,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]] ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -246,7 +246,7 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -297,7 +297,7 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS 
vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -348,7 +348,7 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -403,7 +403,7 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -458,7 +458,7 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll index 2e1bcaa..3ec48ef 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll @@ -21,7 +21,7 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -74,7 +74,7 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub 
ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -125,7 +125,7 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -176,7 +176,7 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -227,7 +227,7 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -278,7 +278,7 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -329,7 +329,7 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH 
vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]] ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -380,7 +380,7 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -431,7 +431,7 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -482,7 +482,7 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -534,7 +534,7 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[INDEX_EVL:%.+]]> = phi ir<0>, vp<[[INDEX_EVL_NEXT:%.+]]> ; IF-EVL-NEXT: ir<[[IV:%.+]]> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<[[N]]>, vp<[[INDEX_EVL]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[INDEX_EVL]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: WIDEN-GEP Inv[Var] ir<[[GEP:%.+]]> = getelementptr inbounds ir<%b>, ir<[[IV]]> ; IF-EVL-NEXT: WIDEN-CAST ir<[[PTRTOINT:%.+]]> = ptrtoint ir<[[GEP]]> to i64 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll index 7540b77..7f29213 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll @@ -29,7 +29,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<[[FOR_PHI:%.+]]> = phi ir<33>, ir<[[LD:%.+]]> ; IF-EVL-NEXT: EMIT-SCALAR vp<[[PREV_EVL:%.+]]> = phi [ vp<[[VF32]]>, vector.ph ], [ vp<[[EVL:%.+]]>, vector.body ] ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%TC>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds nuw ir<%A>, vp<[[ST]] ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index aa15a20..baf546b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -45,7 +45,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_SELECT:%.+]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-OUTLOOP-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> @@ -84,7 +84,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]> ; IF-EVL-INLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]> -; IF-EVL-INLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-INLOOP-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index 563e515..97a6130 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -27,7 +27,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH 
vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll index bdd0c6f..7cc8458 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll @@ -431,195 +431,26 @@ exit: ret void } -define void @lifetime_for_ptr_first_arg_before_multiply(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) { -; CHECK-LABEL: @lifetime_for_ptr_first_arg_before_multiply( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] -; CHECK: then: -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0 -; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 -; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 -; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 -; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 -; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] -; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]]) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], 
[[SPLAT_SPLAT11]] -; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]]) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]] -; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]]) -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]] -; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]]) -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x 
i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0 -; CHECK-NEXT: store <2 x double> [[TMP13]], ptr [[TMP26]], align 8 -; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8 -; CHECK-NEXT: ret void -; -entry: - %a = load <4 x double>, ptr %A, align 8 - %b = load <4 x double>, ptr %B, align 8 - br i1 %c.0, label %then, label %exit - -then: - call void @llvm.lifetime.end(i64 -1, ptr %A) - br label %exit - -exit: - %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2) - store <4 x double> %m, ptr %C, align 8 - ret void -} - -define void @lifetime_for_both_ptr_args_before_multiply(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) { -; CHECK-LABEL: @lifetime_for_both_ptr_args_before_multiply( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] -; CHECK: then: -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0 -; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 -; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 -; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 -; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 -; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] -; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]]) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0 -; CHECK-NEXT: 
[[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] -; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]]) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]] -; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]]) -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]] -; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = call contract <1 x double> 
@llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]]) -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0 -; CHECK-NEXT: store <2 x double> [[TMP13]], ptr [[TMP26]], align 8 -; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8 -; CHECK-NEXT: ret void -; -entry: - %a = load <4 x double>, ptr %A, align 8 - %b = load <4 x double>, ptr %B, align 8 - br i1 %c.0, label %then, label %exit - -then: - call void @llvm.lifetime.end(i64 -1, ptr %B) - call void @llvm.lifetime.end(i64 -1, ptr %A) - br label %exit - -exit: - %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2) - store <4 x double> %m, ptr %C, align 8 - ret void -} - -define void @multiple_unrelated_lifetimes(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @multiple_unrelated_lifetimes(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: @multiple_unrelated_lifetimes( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOC_1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[ALLOC_2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_1]]) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_2]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 @@ -682,6 +513,10 @@ define void @multiple_unrelated_lifetimes(ptr noalias %A, ptr noalias %B, ptr no entry: %alloc.1 = alloca i32 %alloc.2 = alloca i32 + %A = alloca <4 x double> + %B = alloca <4 x double> + call void @init(ptr %A) + call void @init(ptr %B) %a = load <4 x double>, ptr %A, align 8 %b = load <4 x double>, ptr %B, align 8 br i1 %c.0, label %then, label %exit @@ -699,106 +534,20 @@ exit: ret void } -define void @lifetime_for_ptr_select_before_multiply(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0, i1 %c.1) { -; CHECK-LABEL: @lifetime_for_ptr_select_before_multiply( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P:%.*]] = select i1 [[C_0:%.*]], ptr [[A:%.*]], ptr [[B:%.*]] -; CHECK-NEXT: br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] -; CHECK: then: -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[P]], i64 0 -; CHECK-NEXT: 
[[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 -; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 -; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 -; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 -; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] -; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]]) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] -; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]]) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: 
[[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]] -; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]]) -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1> -; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]] -; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1> -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]]) -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C:%.*]], i64 0 -; CHECK-NEXT: store <2 x double> [[TMP13]], ptr [[TMP26]], align 8 -; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8 -; CHECK-NEXT: ret void -; -entry: - %P = select i1 %c.0, ptr %A, ptr %B - %a = load <4 x double>, ptr %P, align 8 - %b = load <4 x double>, ptr %B, align 8 - br i1 %c.1, label %then, label %exit - -then: - call void @llvm.lifetime.end(i64 -1, ptr %P) - br label %exit - -exit: - %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2) - store <4 x double> %m, ptr %C, align 8 - ret void -} - -define void @lifetimes_for_args_in_different_blocks(ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @lifetimes_for_args_in_different_blocks(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: 
@lifetimes_for_args_in_different_blocks( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32 ; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 @@ -864,7 +613,9 @@ define void @lifetimes_for_args_in_different_blocks(ptr noalias %B, ptr noalias ; entry: %A = alloca <4 x double> + %B = alloca <4 x double> call void @init(ptr %A) + call void @init(ptr %B) br i1 %c.0, label %then, label %exit then: @@ -880,15 +631,17 @@ exit: ret void } -define void @lifetimes_for_args_in_different_blocks2(ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @lifetimes_for_args_in_different_blocks2(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: @lifetimes_for_args_in_different_blocks2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32 ; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[B:%.*]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 @@ -957,7 +710,9 @@ define void @lifetimes_for_args_in_different_blocks2(ptr noalias %B, ptr noalias ; entry: %A = alloca <4 x double> + %B = alloca <4 x double> call void @init(ptr %A) + call void @init(ptr %B) br i1 %c.0, label %then, label %exit then: @@ -973,18 +728,20 @@ exit: ret void } -define void @lifetimes_for_args_load0_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @lifetimes_for_args_load0_in_different_block(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: @lifetimes_for_args_load0_in_different_block( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32 ; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr 
double, ptr [[TMP1]], i64 2 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 @@ -1048,7 +805,9 @@ define void @lifetimes_for_args_load0_in_different_block(ptr noalias %B, ptr noa ; entry: %A = alloca <4 x double> + %B = alloca <4 x double> call void @init(ptr %A) + call void @init(ptr %B) %a = load <4 x double>, ptr %A, align 8 call void @llvm.lifetime.end(i64 -1, ptr %A) br i1 %c.0, label %then, label %exit @@ -1064,18 +823,20 @@ exit: ret void } -define void @lifetimes_for_args_load1_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) { +define void @lifetimes_for_args_load1_in_different_block(ptr noalias %C, i1 %c.0) { ; CHECK-LABEL: @lifetimes_for_args_load1_in_different_block( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca <4 x double>, align 32 +; CHECK-NEXT: [[B:%.*]] = alloca <4 x double>, align 32 ; CHECK-NEXT: call void @init(ptr [[A]]) +; CHECK-NEXT: call void @init(ptr [[B]]) ; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] ; CHECK: then: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0 ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 @@ -1139,7 +900,9 @@ define void @lifetimes_for_args_load1_in_different_block(ptr noalias %B, ptr noa ; entry: %A = alloca <4 x double> + %B = alloca <4 x double> call void @init(ptr %A) + call void @init(ptr %B) %b = load <4 x double>, ptr %B, align 8 call void @llvm.lifetime.end(i64 -1, ptr %B) br i1 %c.0, label %then, label %exit diff --git a/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll b/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll index 03c86bc..87ff922 100644 --- a/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll +++ b/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll @@ -10,9 +10,6 @@ define amdgpu_kernel void @addressspace_alloca() { ; CHECK-NEXT: ret void ; %alloca = alloca i8, align 8, addrspace(5) - %cast = addrspacecast ptr addrspace(5) %alloca to ptr - call void @llvm.lifetime.start.p0(i64 2, ptr %cast) + call void @llvm.lifetime.start(i64 2, ptr addrspace(5) %alloca) ret void } - -declare void @llvm.lifetime.start.p0(i64 %size, ptr nocapture %ptr) diff --git a/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll b/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll index e9f40b5..d4bc097 100644 --- a/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll +++ b/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll @@ -54,10 +54,10 @@ define void @positive_gep_assume_uses() { ; %A = alloca {i8, i16} %B = getelementptr {i8, i16}, ptr %A, i32 0, i32 0 - call void @llvm.lifetime.start.p0(i64 2, ptr %B) + call void @llvm.lifetime.start.p0(i64 2, ptr %A) call void @llvm.assume(i1 true) ["align"(ptr %B, i64 8), "align"(ptr %B, i64 16)] store {i8, i16} zeroinitializer, ptr %A - call void @llvm.lifetime.end.p0(i64 2, ptr %B) + call void @llvm.lifetime.end.p0(i64 2, ptr %A) call void @llvm.assume(i1 true) ["nonnull"(ptr %B), "align"(ptr %B, i64 2)] ret void } diff --git a/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll 
b/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll index 3773d41..bcc9693 100644 --- a/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll +++ b/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll @@ -17,9 +17,8 @@ define void @test2() { ; CHECK: test2 ; CHECK-NOT: alloca %A = alloca {i8, i16} - %B = getelementptr {i8, i16}, ptr %A, i32 0, i32 0 - call void @llvm.lifetime.start.p0(i64 2, ptr %B) + call void @llvm.lifetime.start.p0(i64 2, ptr %A) store {i8, i16} zeroinitializer, ptr %A - call void @llvm.lifetime.end.p0(i64 2, ptr %B) + call void @llvm.lifetime.end.p0(i64 2, ptr %A) ret void } diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll index 6158874..e9fc06b 100644 --- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll +++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll @@ -116,22 +116,3 @@ define i32 @call_slot_clobber_before_lifetime_start() { %v = load i32, ptr %dst ret i32 %v } - -define void @call_slot_lifetime_bitcast(ptr %ptr) { -; CHECK-LABEL: @call_slot_lifetime_bitcast( -; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 4 [[PTR:%.*]], i64 4, i1 false) -; CHECK-NEXT: [[TMP1_CAST:%.*]] = bitcast ptr [[TMP1]] to ptr -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP1_CAST]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP1]], ptr align 4 [[PTR]], i64 4, i1 false) -; CHECK-NEXT: ret void -; - %tmp1 = alloca i32 - %tmp2 = alloca i32 - call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp2, ptr align 4 %ptr, i64 4, i1 false) - %tmp1.cast = bitcast ptr %tmp1 to ptr - call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %tmp1.cast) - call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp1.cast, ptr align 4 %tmp2, i64 4, i1 false) - ret void -} diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll index 2f1ce37..816e103 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll @@ -26,35 +26,41 @@ define i32 @test1(ptr nocapture %foobie) nounwind noinline ssp uwtable { } ; Check that the memcpy is removed. -define void @test2(ptr sret(i8) noalias nocapture %out, ptr %in) nounwind noinline ssp uwtable { +define void @test2(ptr sret(i8) noalias nocapture %out) nounwind noinline ssp uwtable { ; CHECK-LABEL: @test2( -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[IN:%.*]]) +; CHECK-NEXT: [[IN:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[IN]]) ; CHECK-NEXT: ret void ; + %in = alloca i64 call void @llvm.lifetime.start.p0(i64 8, ptr %in) call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %in, i64 8, i1 false) ret void } ; Check that the memcpy is not removed. 
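; (Editorial comment, inferred from the sizes in this test: unlike @test2 above,
; the lifetime.start here covers only 4 of the 8 bytes the memcpy reads, so the
; source cannot be proven entirely undef and the copy has to stay.)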
-define void @test3(ptr sret(i8) noalias nocapture %out, ptr %in) nounwind noinline ssp uwtable {
+define void @test3(ptr sret(i8) noalias nocapture %out) nounwind noinline ssp uwtable {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IN:%.*]])
+; CHECK-NEXT: [[IN:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[IN]])
 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[OUT:%.*]], ptr [[IN]], i64 8, i1 false)
 ; CHECK-NEXT: ret void
 ;
+ %in = alloca i64
 call void @llvm.lifetime.start.p0(i64 4, ptr %in)
 call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %in, i64 8, i1 false)
 ret void
 }

 ; Check that the memcpy is not removed.
-define void @test_lifetime_may_alias(ptr %lifetime, ptr %src, ptr %dst) {
+define void @test_lifetime_may_alias(ptr %src, ptr %dst) {
 ; CHECK-LABEL: @test_lifetime_may_alias(
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[LIFETIME:%.*]])
+; CHECK-NEXT: [[LIFETIME:%.*]] = alloca i64, align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[LIFETIME]])
 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 8, i1 false)
 ; CHECK-NEXT: ret void
 ;
+ %lifetime = alloca i64
 call void @llvm.lifetime.start.p0(i64 8, ptr %lifetime)
 call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
 ret void
diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
index 0c16f34..7ea63bb 100644
--- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
@@ -37,29 +37,10 @@ define void @test_alloca_with_lifetimes(ptr %result) {
 ret void
 }

-define void @test_malloc_with_lifetimes(ptr %result) {
-; CHECK-LABEL: @test_malloc_with_lifetimes(
-; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 16)
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[A]])
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[RESULT:%.*]], i8 0, i64 12, i1 false)
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[A]])
-; CHECK-NEXT: call void @free(ptr [[A]])
-; CHECK-NEXT: ret void
-;
- %a = call ptr @malloc(i64 16)
- call void @llvm.lifetime.start.p0(i64 16, ptr %a)
- call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false)
- call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false)
- call void @llvm.lifetime.end.p0(i64 16, ptr %a)
- call void @free(ptr %a)
- ret void
-}
-
 ; memcpy size is larger than lifetime, don't optimize.
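; (Editorial comment, inferred from the sizes below: the memset initializes only
; 12 bytes, matching the lifetime marker, while the memcpy reads 16; forwarding
; the memset to %result would have to invent the trailing 4 bytes, so MemCpyOpt
; leaves the copy in place.)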
define void @test_copy_larger_than_lifetime_size(ptr %result) { ; CHECK-LABEL: @test_copy_larger_than_lifetime_size( -; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 16) +; CHECK-NEXT: [[A:%.*]] = alloca [[T:%.*]], align 8 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[A]]) ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A]], i64 16, i1 false) @@ -67,7 +48,7 @@ define void @test_copy_larger_than_lifetime_size(ptr %result) { ; CHECK-NEXT: call void @free(ptr [[A]]) ; CHECK-NEXT: ret void ; - %a = call ptr @malloc(i64 16) + %a = alloca %T, align 8 call void @llvm.lifetime.start.p0(i64 12, ptr %a) call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false) diff --git a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll index b654319..ff36bf0 100644 --- a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll +++ b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll @@ -94,21 +94,6 @@ entry: ret void } -define i8 @test6(ptr %ptr, ptr noalias %ptr.1) { -; CHECK-LABEL: @test6( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr [[PTR:%.*]]) -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PTR]], align 8 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[PTR]], ptr [[PTR_1:%.*]], i64 24, i1 false) -; CHECK-NEXT: ret i8 [[TMP0]] -; -entry: - call void @llvm.lifetime.start.p0(i64 24, ptr %ptr) - %0 = load i8, ptr %ptr, align 8 - call void @llvm.memmove.p0.p0.i64(ptr %ptr, ptr %ptr.1, i64 24, i1 false) - ret i8 %0 -} - define void @test7(ptr %ptr) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll index 323df12..1784c2f 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll @@ -121,13 +121,14 @@ attributes #6 = { builtin } !12 = !{i64 789, i64 300} !13 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !14, producer: "clang version 21.0.0git (git@github.com:llvm/llvm-project.git e391301e0e4d9183fe06e69602e87b0bc889aeda)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) !14 = !DIFile(filename: "basic.cc", directory: "", checksumkind: CSK_MD5, checksum: "8636c46e81402013b9d54e8307d2f149") -!15 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !14, file: !14, line: 1, type: !16, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !13) +!15 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !14, file: !14, line: 1, type: !16, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !13, declaration: !22) !16 = !DISubroutineType(types: !17) !17 = !{!18} !18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64) !19 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) !20 = !{i32 7, !"Dwarf Version", i32 5} !21 = !{i32 2, !"Debug Info Version", i32 3} +!22 = !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !14, file: !14, line: 1, type: !16, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) ; DUMP: CCG before cloning: ; DUMP: Callsite Context Graph: @@ -290,7 +291,8 @@ attributes #6 = { builtin } ; IR: 
attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } ; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } ;; Make sure the clone's linkageName was updated. -; IR: ![[SP]] = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv.memprof.1" +; IR: ![[SP]] = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv.memprof.1", {{.*}} declaration: ![[SP2:[0-9]+]]) +; IR: ![[SP2]] = !DISubprogram(name: "bar", linkageName: "_Z3barv.memprof.1" ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) diff --git a/llvm/test/Transforms/MoveAutoInit/clobber.ll b/llvm/test/Transforms/MoveAutoInit/clobber.ll index 09084b6..08ffb13 100644 --- a/llvm/test/Transforms/MoveAutoInit/clobber.ll +++ b/llvm/test/Transforms/MoveAutoInit/clobber.ll @@ -10,14 +10,14 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = alloca [100 x i8], align 16 ; CHECK-NEXT: [[TMP5:%.*]] = alloca [2 x i8], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [100 x i8], ptr [[TMP4]], i64 0, i64 0 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100, ptr nonnull [[TMP6]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 100, ptr nonnull [[TMP4]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8], ptr [[TMP5]], i64 0, i64 0 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[TMP7]]) #[[ATTR3]] +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[TMP5]]) #[[ATTR3]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8], ptr [[TMP5]], i64 0, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP1:%.*]], 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[TMP15:%.*]], label [[TMP10:%.*]] ; CHECK: 10: -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) [[TMP6]], i8 -86, i64 100, i1 false), !annotation !0 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) [[TMP6]], i8 -86, i64 100, i1 false), !annotation [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP0:%.*]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [100 x i8], ptr [[TMP4]], i64 0, i64 [[TMP11]] ; CHECK-NEXT: store i8 12, ptr [[TMP12]], align 1 @@ -28,8 +28,8 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 { ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP2:%.*]], 0 ; CHECK-NEXT: br i1 [[TMP16]], label [[TMP22]], label [[TMP17:%.*]] ; CHECK: 17: -; CHECK-NEXT: store i8 -86, ptr [[TMP7]], align 1, !annotation !0 -; CHECK-NEXT: store i8 -86, ptr [[TMP8]], align 1, !annotation !0 +; CHECK-NEXT: store i8 -86, ptr [[TMP7]], align 1, !annotation [[META0]] +; CHECK-NEXT: store i8 -86, ptr [[TMP8]], align 1, !annotation [[META0]] ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8], ptr [[TMP5]], i64 0, i64 [[TMP18]] ; CHECK-NEXT: store i8 12, ptr [[TMP19]], align 1 @@ -38,19 +38,19 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 { ; CHECK-NEXT: br label [[TMP22]] ; CHECK: 22: ; CHECK-NEXT: [[TMP23:%.*]] = phi i32 [ [[TMP14]], [[TMP10]] ], [ [[TMP21]], [[TMP17]] ], [ 0, [[TMP15]] ] -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[TMP7]]) #[[ATTR3]] -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 100, ptr nonnull [[TMP6]]) #[[ATTR3]] +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[TMP5]]) #[[ATTR3]] +; CHECK-NEXT: call void 
@llvm.lifetime.end.p0(i64 100, ptr nonnull [[TMP4]]) #[[ATTR3]] ; CHECK-NEXT: ret i32 [[TMP23]] ; %4 = alloca [100 x i8], align 16 %5 = alloca [2 x i8], align 1 %6 = getelementptr inbounds [100 x i8], ptr %4, i64 0, i64 0 - call void @llvm.lifetime.start.p0(i64 100, ptr nonnull %6) #3 + call void @llvm.lifetime.start.p0(i64 100, ptr nonnull %4) #3 ; This memset must move. call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) %6, i8 -86, i64 100, i1 false), !annotation !0 %7 = getelementptr inbounds [2 x i8], ptr %5, i64 0, i64 0 - call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %7) #3 + call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %5) #3 ; This store must move. store i8 -86, ptr %7, align 1, !annotation !0 %8 = getelementptr inbounds [2 x i8], ptr %5, i64 0, i64 1 @@ -81,8 +81,8 @@ define i32 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) #0 { 22: %23 = phi i32 [ %14, %10 ], [ %21, %17 ], [ 0, %15 ] - call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %7) #3 - call void @llvm.lifetime.end.p0(i64 100, ptr nonnull %6) #3 + call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %5) #3 + call void @llvm.lifetime.end.p0(i64 100, ptr nonnull %4) #3 ret i32 %23 } diff --git a/llvm/test/Transforms/NewGVN/lifetime-simple.ll b/llvm/test/Transforms/NewGVN/lifetime-simple.ll index 55e4611..0a7bd33 100644 --- a/llvm/test/Transforms/NewGVN/lifetime-simple.ll +++ b/llvm/test/Transforms/NewGVN/lifetime-simple.ll @@ -4,10 +4,11 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin7" -define i8 @test(ptr %P) nounwind { +define i8 @test() nounwind { ; CHECK-LABEL: define i8 @test( -; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = alloca [32 x i8], align 1 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[P]]) ; CHECK-NEXT: store i8 1, ptr [[P]], align 1 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[P]]) @@ -15,6 +16,7 @@ define i8 @test(ptr %P) nounwind { ; CHECK-NEXT: ret i8 [[TMP0]] ; entry: + %P = alloca [32 x i8] call void @llvm.lifetime.start.p0(i64 32, ptr %P) %0 = load i8, ptr %P store i8 1, ptr %P diff --git a/llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll b/llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll new file mode 100644 index 0000000..d1da7ea --- /dev/null +++ b/llvm/test/Transforms/NewGVN/salvage-eliminate-instruction.ll @@ -0,0 +1,45 @@ +; RUN: opt -S -passes=newgvn %s | FileCheck %s + +; Check that eliminateInstruction() replaces the debug uses of the instructions +; marked for deletion with the dominating leader. 
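+; (Editorial comment: %add1 and %add2 below are congruent since integer add
+; commutes, so NewGVN keeps the class leader %add1 and deletes %add2; the CHECK
+; lines verify the #dbg_value for %add2 is rewritten to %add1 rather than dropped.)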
+ +define void @binop(i32 %x, i32 %y) !dbg !5 { +; CHECK: #dbg_value(i32 %add1, [[META9:![0-9]+]], !DIExpression(), [[META12:![0-9]+]]) +; CHECK-NEXT: #dbg_value(i32 %add1, [[META11:![0-9]+]], !DIExpression(), [[META13:![0-9]+]]) +; + %add1 = add i32 %x, %y, !dbg !12 + #dbg_value(i32 %add1, !9, !DIExpression(), !12) + %add2 = add i32 %y, %x, !dbg !13 + #dbg_value(i32 %add2, !11, !DIExpression(), !13) + call void @use(i32 %add1, i32 %add2), !dbg !14 + ret void, !dbg !15 +} + +declare void @use(i32, i32) + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "/app/example.ll", directory: "/") +!2 = !{i32 4} +!3 = !{i32 2} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "binop", linkageName: "binop", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!9, !11} +!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10) +!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) +!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !10) +!12 = !DILocation(line: 1, column: 1, scope: !5) +!13 = !DILocation(line: 2, column: 1, scope: !5) +!14 = !DILocation(line: 3, column: 1, scope: !5) +!15 = !DILocation(line: 4, column: 1, scope: !5) +;. +; CHECK: [[META9]] = !DILocalVariable(name: "1", +; CHECK: [[META11]] = !DILocalVariable(name: "2", +; CHECK: [[META12]] = !DILocation(line: 1, +; CHECK: [[META13]] = !DILocation(line: 2, +;. diff --git a/llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll b/llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll new file mode 100644 index 0000000..cc69541 --- /dev/null +++ b/llvm/test/Transforms/NewGVN/salvage-trivially-dead-inst.ll @@ -0,0 +1,33 @@ +; RUN: opt -passes=newgvn -S %s | FileCheck %s + +; Check that assignDFSNumbers() in NewGVN salvages the debug values of the +; trivially dead instructions that are marked for deletion. + +; CHECK: #dbg_value(i8 %tmp, [[META11:![0-9]+]], !DIExpression(DW_OP_constu, 8, DW_OP_eq, DW_OP_stack_value), [[META26:![0-9]+]]) +; CHECK: [[META11]] = !DILocalVariable(name: "2" +; CHECK: [[META26]] = !DILocation(line: 2 + +define void @test13() !dbg !5 { +entry: + %tmp = load i8, ptr null, align 1 + %tmp2 = icmp eq i8 %tmp, 8, !dbg !13 + #dbg_value(i1 %tmp2, !11, !DIExpression(), !13) + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "/app/example.ll", directory: "/") +!2 = !{i32 3} +!3 = !{i32 2} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "test13", linkageName: "test13", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!11} +!10 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned) +!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !10) +!13 = !DILocation(line: 2, column: 1, scope: !5)
\ No newline at end of file
diff --git a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
index 2a1fcf3..a19a2a6 100644
--- a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
+++ b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
@@ -10,6 +10,7 @@ declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 define void @tinkywinky() {
 ; CHECK-LABEL: define void @tinkywinky() {
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1
 ; CHECK-NEXT: br i1 false, label [[BODY:%.*]], label [[END:%.*]]
 ; CHECK: body:
 ; CHECK-NEXT: store i8 poison, ptr null, align 1
 ; CHECK-NEXT: br label [[END]]
 ; CHECK: end:
 ; CHECK-NEXT: ret void
 ;
 entry:
- call void @llvm.lifetime.start.p0(i64 4, ptr undef)
+ %a = alloca i8
+ call void @llvm.lifetime.start.p0(i64 4, ptr %a)
 br i1 false, label %body, label %end

 body:
- call void @llvm.lifetime.start.p0(i64 4, ptr undef)
+ call void @llvm.lifetime.start.p0(i64 4, ptr %a)
 br label %end

 end:
diff --git a/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll b/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll
index 60180c4..180fd0a 100644
--- a/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll
+++ b/llvm/test/Transforms/ObjCARC/inlined-autorelease-return-value.ll
@@ -80,12 +80,14 @@ entry:
 ; CHECK-LABEL: define ptr @elide_with_retainRV_splitByLifetime(
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: %x = alloca ptr
 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr %x)
 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr %x)
 ; CHECK-NEXT: ret ptr %x
-define ptr @elide_with_retainRV_splitByLifetime(ptr %x) nounwind {
+define ptr @elide_with_retainRV_splitByLifetime() nounwind {
 entry:
 ; Cleanup should skip over lifetime intrinsics.
+ %x = alloca ptr
 call void @llvm.lifetime.start(i64 8, ptr %x)
 %b = call ptr @llvm.objc.autoreleaseReturnValue(ptr %x) nounwind
 call void @llvm.lifetime.end(i64 8, ptr %x)
@@ -218,13 +220,15 @@ entry:
 ; CHECK-LABEL: define ptr @elide_with_claimRV_splitByLifetime(
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: %x = alloca ptr
 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr %x)
 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr %x)
 ; CHECK-NEXT: tail call void @llvm.objc.release(ptr %x)
 ; CHECK-NEXT: ret ptr %x
-define ptr @elide_with_claimRV_splitByLifetime(ptr %x) nounwind {
+define ptr @elide_with_claimRV_splitByLifetime() nounwind {
 entry:
 ; Cleanup should skip over lifetime intrinsics.
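; (Editorial comment: unlike the retainRV case above, claimRV carries release
; semantics, so once the autoreleaseRV/claimRV pair is elided an objc.release
; of %x must remain, as the CHECK lines above show.)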
+ %x = alloca ptr call void @llvm.lifetime.start(i64 8, ptr %x) %b = call ptr @llvm.objc.autoreleaseReturnValue(ptr %x) nounwind call void @llvm.lifetime.end(i64 8, ptr %x) diff --git a/llvm/test/Transforms/ObjCARC/test_autorelease_pool.ll b/llvm/test/Transforms/ObjCARC/test_autorelease_pool.ll new file mode 100644 index 0000000..896717f --- /dev/null +++ b/llvm/test/Transforms/ObjCARC/test_autorelease_pool.ll @@ -0,0 +1,319 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; Test for autorelease pool optimizations +; RUN: opt -passes=objc-arc < %s -S | FileCheck %s + +declare ptr @llvm.objc.autoreleasePoolPush() +declare void @llvm.objc.autoreleasePoolPop(ptr) +declare ptr @llvm.objc.autorelease(ptr) +declare ptr @llvm.objc.retain(ptr) +declare ptr @create_object() +declare void @use_object(ptr) +declare ptr @object_with_thing() +declare void @opaque_callee() + +; Empty autorelease pool should be eliminated +define void @test_empty_pool() { +; CHECK-LABEL: define void @test_empty_pool() { +; CHECK-NEXT: ret void +; + %pool = call ptr @llvm.objc.autoreleasePoolPush() + call void @llvm.objc.autoreleasePoolPop(ptr %pool) + ret void +} + +; Pool with only release should be removed +define void @test_autorelease_to_release() { +; CHECK-LABEL: define void @test_autorelease_to_release() { +; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object() +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ]]) #[[ATTR0:[0-9]+]], !clang.imprecise_release [[META0:![0-9]+]] +; CHECK-NEXT: ret void +; + %obj = call ptr @create_object() + %pool = call ptr @llvm.objc.autoreleasePoolPush() + call ptr @llvm.objc.autorelease(ptr %obj) + call void @llvm.objc.autoreleasePoolPop(ptr %pool) + ret void +} + +; Pool with autoreleases should not be optimized +define void @test_multiple_autoreleases() { +; CHECK-LABEL: define void @test_multiple_autoreleases() { +; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @create_object() +; CHECK-NEXT: [[OBJ2:%.*]] = call ptr @create_object() +; CHECK-NEXT: [[POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]] +; CHECK-NEXT: call void @use_object(ptr [[OBJ1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.objc.autorelease(ptr [[OBJ1]]) #[[ATTR0]] +; CHECK-NEXT: call void @use_object(ptr [[OBJ2]]) +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.objc.autorelease(ptr [[OBJ2]]) #[[ATTR0]] +; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[POOL]]) #[[ATTR0]] +; CHECK-NEXT: ret void +; + %obj1 = call ptr @create_object() + %obj2 = call ptr @create_object() + %pool = call ptr @llvm.objc.autoreleasePoolPush() + call void @use_object(ptr %obj1) + call ptr @llvm.objc.autorelease(ptr %obj1) + call void @use_object(ptr %obj2) + call ptr @llvm.objc.autorelease(ptr %obj2) + call void @llvm.objc.autoreleasePoolPop(ptr %pool) + ret void +} + +; Pool with calls should not be optimized +define void @test_calls() { +; CHECK-LABEL: define void @test_calls() { +; CHECK-NEXT: [[POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]] +; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @object_with_thing() +; CHECK-NEXT: call void @use_object(ptr [[OBJ1]]) +; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[POOL]]) #[[ATTR0]] +; CHECK-NEXT: ret void +; + %pool = call ptr @llvm.objc.autoreleasePoolPush() + %obj1 = call ptr @object_with_thing() + call void @use_object(ptr %obj1) + call void @llvm.objc.autoreleasePoolPop(ptr %pool) + ret void +} + +; Pool with opaque call should not be optimized +define void @test_opaque_call() { +; CHECK-LABEL: 
define void @test_opaque_call() { +; CHECK-NEXT: [[POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]] +; CHECK-NEXT: call void @opaque_callee() +; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[POOL]]) #[[ATTR0]] +; CHECK-NEXT: ret void +; + %pool = call ptr @llvm.objc.autoreleasePoolPush() + call void @opaque_callee() + call void @llvm.objc.autoreleasePoolPop(ptr %pool) + ret void +} + +; Nested empty pools should be eliminated +define void @test_nested_empty_pools() { +; CHECK-LABEL: define void @test_nested_empty_pools() { +; CHECK-NEXT: ret void +; + %pool1 = call ptr @llvm.objc.autoreleasePoolPush() + %pool2 = call ptr @llvm.objc.autoreleasePoolPush() + call void @llvm.objc.autoreleasePoolPop(ptr %pool2) + call void @llvm.objc.autoreleasePoolPop(ptr %pool1) + ret void +} + +; Empty pool with cast should be eliminated +define void @test_empty_pool_with_cast() { +; CHECK-LABEL: define void @test_empty_pool_with_cast() { +; CHECK-NEXT: [[CAST:%.*]] = bitcast ptr poison to ptr +; CHECK-NEXT: ret void +; + %pool = call ptr @llvm.objc.autoreleasePoolPush() + %cast = bitcast ptr %pool to ptr + call void @llvm.objc.autoreleasePoolPop(ptr %cast) + ret void +} + +; Autorelease shadowing - autorelease in inner pool doesn't prevent outer optimization +define void @test_autorelease_shadowing_basic() { +; CHECK-LABEL: define void @test_autorelease_shadowing_basic() { +; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object() +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ]]) #[[ATTR0]], !clang.imprecise_release [[META0]] +; CHECK-NEXT: ret void +; + %obj = call ptr @create_object() + %outer_pool = call ptr @llvm.objc.autoreleasePoolPush() + + ; Inner pool with autorelease - this should be shadowed + %inner_pool = call ptr @llvm.objc.autoreleasePoolPush() + call ptr @llvm.objc.autorelease(ptr %obj) + call void @llvm.objc.autoreleasePoolPop(ptr %inner_pool) + + call void @llvm.objc.autoreleasePoolPop(ptr %outer_pool) + ret void +} + +; Multiple nested levels with shadowing +define void @test_multiple_nested_shadowing() { +; CHECK-LABEL: define void @test_multiple_nested_shadowing() { +; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @create_object() +; CHECK-NEXT: [[OBJ2:%.*]] = call ptr @create_object() +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ1]]) #[[ATTR0]], !clang.imprecise_release [[META0]] +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ2]]) #[[ATTR0]], !clang.imprecise_release [[META0]] +; CHECK-NEXT: ret void +; + %obj1 = call ptr @create_object() + %obj2 = call ptr @create_object() + %outer_pool = call ptr @llvm.objc.autoreleasePoolPush() + + ; First inner pool + %inner1_pool = call ptr @llvm.objc.autoreleasePoolPush() + call ptr @llvm.objc.autorelease(ptr %obj1) + call void @llvm.objc.autoreleasePoolPop(ptr %inner1_pool) + + ; Second inner pool with nested level + %inner2_pool = call ptr @llvm.objc.autoreleasePoolPush() + %inner3_pool = call ptr @llvm.objc.autoreleasePoolPush() + call ptr @llvm.objc.autorelease(ptr %obj2) + call void @llvm.objc.autoreleasePoolPop(ptr %inner3_pool) + call void @llvm.objc.autoreleasePoolPop(ptr %inner2_pool) + + call void @llvm.objc.autoreleasePoolPop(ptr %outer_pool) + ret void +} + +; Autorelease outside inner pool prevents optimization +define void @test_autorelease_outside_inner_pool() { +; CHECK-LABEL: define void @test_autorelease_outside_inner_pool() { +; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @create_object() +; CHECK-NEXT: [[OBJ2:%.*]] = call ptr @create_object() +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ1]]) 
#[[ATTR0]], !clang.imprecise_release [[META0]] +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ2]]) #[[ATTR0]], !clang.imprecise_release [[META0]] +; CHECK-NEXT: ret void +; + %obj1 = call ptr @create_object() + %obj2 = call ptr @create_object() + %outer_pool = call ptr @llvm.objc.autoreleasePoolPush() + + ; This autorelease is NOT in an inner pool, so outer pool can't be optimized + call ptr @llvm.objc.autorelease(ptr %obj1) + + ; Inner pool with autorelease (shadowed) + %inner_pool = call ptr @llvm.objc.autoreleasePoolPush() + call ptr @llvm.objc.autorelease(ptr %obj2) + call void @llvm.objc.autoreleasePoolPop(ptr %inner_pool) + + call void @llvm.objc.autoreleasePoolPop(ptr %outer_pool) + ret void +} + +; Known ObjC functions don't prevent optimization +define void @test_known_objc_functions() { +; CHECK-LABEL: define void @test_known_objc_functions() { +; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object() +; CHECK-NEXT: ret void +; + %obj = call ptr @create_object() + %pool = call ptr @llvm.objc.autoreleasePoolPush() + + ; These are all known ObjC runtime functions that don't produce autoreleases + %retained = call ptr @llvm.objc.retain(ptr %obj) + call void @llvm.objc.release(ptr %obj) + + call void @llvm.objc.autoreleasePoolPop(ptr %pool) + ret void +} + +; Complex shadowing with mixed autoreleases +define void @test_complex_shadowing() { +; CHECK-LABEL: define void @test_complex_shadowing() { +; CHECK-NEXT: [[OBJ1:%.*]] = call ptr @create_object() +; CHECK-NEXT: [[OBJ2:%.*]] = call ptr @create_object() +; CHECK-NEXT: [[OBJ3:%.*]] = call ptr @create_object() +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ1]]) #[[ATTR0]], !clang.imprecise_release [[META0]] +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ2]]) #[[ATTR0]], !clang.imprecise_release [[META0]] +; CHECK-NEXT: [[INNER2_POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]] +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.objc.autorelease(ptr [[OBJ3]]) #[[ATTR0]] +; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[INNER2_POOL]]) #[[ATTR0]] +; CHECK-NEXT: ret void +; + %obj1 = call ptr @create_object() + %obj2 = call ptr @create_object() + %obj3 = call ptr @create_object() + %outer_pool = call ptr @llvm.objc.autoreleasePoolPush() + + ; This autorelease is outside inner pools - prevents optimization + call ptr @llvm.objc.autorelease(ptr %obj1) + + ; Inner pool 1 with shadowed autorelease + %inner1_pool = call ptr @llvm.objc.autoreleasePoolPush() + call ptr @llvm.objc.autorelease(ptr %obj2) + call void @llvm.objc.autoreleasePoolPop(ptr %inner1_pool) + + ; Some safe ObjC operations + %retained = call ptr @llvm.objc.retain(ptr %obj3) + call void @llvm.objc.release(ptr %retained) + + ; Inner pool 2 with shadowed autorelease + %inner2_pool = call ptr @llvm.objc.autoreleasePoolPush() + call ptr @llvm.objc.autorelease(ptr %obj3) + call void @llvm.objc.autoreleasePoolPop(ptr %inner2_pool) + + call void @llvm.objc.autoreleasePoolPop(ptr %outer_pool) + ret void +} + +; Non-ObjC function that may autorelease prevents optimization +define void @test_non_objc_may_autorelease() { +; CHECK-LABEL: define void @test_non_objc_may_autorelease() { +; CHECK-NEXT: [[POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]] +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @function_that_might_autorelease() +; CHECK-NEXT: call void @llvm.objc.autoreleasePoolPop(ptr [[POOL]]) #[[ATTR0]] +; CHECK-NEXT: ret void +; + %pool = call ptr @llvm.objc.autoreleasePoolPush() + call ptr @function_that_might_autorelease() + call void 
@llvm.objc.autoreleasePoolPop(ptr %pool) + ret void +} + +; Non-ObjC function that doesn't autorelease allows optimization +define void @test_non_objc_no_autorelease() { +; CHECK-LABEL: define void @test_non_objc_no_autorelease() { +; CHECK-NEXT: call void @safe_function() +; CHECK-NEXT: ret void +; + %pool = call ptr @llvm.objc.autoreleasePoolPush() + call void @safe_function() + call void @llvm.objc.autoreleasePoolPop(ptr %pool) + ret void +} + +; Incomplete push/pop pairs across blocks - only inner pairs count +define void @test_incomplete_pairs_inner_shadowing() { +; CHECK-LABEL: define void @test_incomplete_pairs_inner_shadowing() { +; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object() +; CHECK-NEXT: [[OUTER_POOL:%.*]] = call ptr @llvm.objc.autoreleasePoolPush() #[[ATTR0]] +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ]]) #[[ATTR0]], !clang.imprecise_release [[META0]] +; CHECK-NEXT: ret void +; + %obj = call ptr @create_object() + %outer_pool = call ptr @llvm.objc.autoreleasePoolPush() + + ; Inner complete pair - autorelease should be shadowed by this + %inner_pool = call ptr @llvm.objc.autoreleasePoolPush() + call ptr @llvm.objc.autorelease(ptr %obj) ; This SHOULD be shadowed by inner pair + call void @llvm.objc.autoreleasePoolPop(ptr %inner_pool) ; Completes the inner pair + + ; Note: %outer_pool pop is in a different block (common pattern) + ; But the autorelease was shadowed by the complete inner pair + ret void +} + +; Helper functions for testing interprocedural analysis + +; Safe function that doesn't call autorelease +define void @safe_function() { + ; Just some computation, no autoreleases +; CHECK-LABEL: define void @safe_function() { +; CHECK-NEXT: [[X:%.*]] = add i32 1, 2 +; CHECK-NEXT: ret void +; + %x = add i32 1, 2 + ret void +} + +; Function that may produce autoreleases (simulated by calling autorelease) +define ptr @function_that_might_autorelease() { +; CHECK-LABEL: define ptr @function_that_might_autorelease() { +; CHECK-NEXT: [[OBJ:%.*]] = call ptr @create_object() +; CHECK-NEXT: [[AUTORELEASED:%.*]] = call ptr @llvm.objc.autorelease(ptr [[OBJ]]) #[[ATTR0]] +; CHECK-NEXT: ret ptr [[AUTORELEASED]] +; + %obj = call ptr @create_object() + %autoreleased = call ptr @llvm.objc.autorelease(ptr %obj) + ret ptr %autoreleased +} + +;. +; CHECK: [[META0]] = !{} +;. 
diff --git a/llvm/test/Transforms/SCCP/uscmp.ll b/llvm/test/Transforms/SCCP/uscmp.ll new file mode 100644 index 0000000..d010c06 --- /dev/null +++ b/llvm/test/Transforms/SCCP/uscmp.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sccp -S < %s | FileCheck %s + +define i32 @scmp_to_sub(i32 range(i32 -1, 2) %a) { +; CHECK-LABEL: define i32 @scmp_to_sub( +; CHECK-SAME: i32 range(i32 -1, 2) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = sub nsw i32 [[A]], 0 +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i32 %a, i32 0) + ret i32 %scmp +} + +define i32 @scmp_zext_to_sub(i1 %a, i1 %b) { +; CHECK-LABEL: define i32 @scmp_zext_to_sub( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[ZEXT_A:%.*]] = zext i1 [[A]] to i32 +; CHECK-NEXT: [[ZEXT_B:%.*]] = zext i1 [[B]] to i32 +; CHECK-NEXT: [[SCMP:%.*]] = sub nsw i32 [[ZEXT_A]], [[ZEXT_B]] +; CHECK-NEXT: ret i32 [[SCMP]] +; + %zext_a = zext i1 %a to i32 + %zext_b = zext i1 %b to i32 + %scmp = call i32 @llvm.scmp(i32 %zext_a, i32 %zext_b) + ret i32 %scmp +} + +define i8 @scmp_to_sub_trunc(i32 range(i32 -1, 2) %a) { +; CHECK-LABEL: define i8 @scmp_to_sub_trunc( +; CHECK-SAME: i32 range(i32 -1, 2) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP1:%.*]] = sub nsw i32 [[A]], 0 +; CHECK-NEXT: [[SCMP:%.*]] = trunc i32 [[SCMP1]] to i8 +; CHECK-NEXT: ret i8 [[SCMP]] +; + %scmp = call i8 @llvm.scmp(i32 %a, i32 0) + ret i8 %scmp +} + +define i64 @scmp_to_sub_sext(i32 range(i32 -1, 2) %a) { +; CHECK-LABEL: define i64 @scmp_to_sub_sext( +; CHECK-SAME: i32 range(i32 -1, 2) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP1:%.*]] = sub nsw i32 [[A]], 0 +; CHECK-NEXT: [[SCMP:%.*]] = sext i32 [[SCMP1]] to i64 +; CHECK-NEXT: ret i64 [[SCMP]] +; + %scmp = call i64 @llvm.scmp(i32 %a, i32 0) + ret i64 %scmp +} + +define i32 @scmp_to_sub_small_range(i32 range(i32 -1, 1) %a) { +; CHECK-LABEL: define i32 @scmp_to_sub_small_range( +; CHECK-SAME: i32 range(i32 -1, 1) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = sub nsw i32 [[A]], 0 +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i32 %a, i32 0) + ret i32 %scmp +} + +define i32 @ucmp_to_sub(i32 range(i32 0, 3) %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub( +; CHECK-SAME: i32 range(i32 0, 3) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i32 %a, i32 1) + ret i32 %ucmp +} + +define i8 @ucmp_to_sub_trunc(i32 range(i32 0, 3) %a) { +; CHECK-LABEL: define i8 @ucmp_to_sub_trunc( +; CHECK-SAME: i32 range(i32 0, 3) [[A:%.*]]) { +; CHECK-NEXT: [[UCMP1:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: [[UCMP:%.*]] = trunc i32 [[UCMP1]] to i8 +; CHECK-NEXT: ret i8 [[UCMP]] +; + %ucmp = call i8 @llvm.ucmp(i32 %a, i32 1) + ret i8 %ucmp +} + +define i64 @ucmp_to_sub_sext(i32 range(i32 0, 3) %a) { +; CHECK-LABEL: define i64 @ucmp_to_sub_sext( +; CHECK-SAME: i32 range(i32 0, 3) [[A:%.*]]) { +; CHECK-NEXT: [[UCMP1:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: [[UCMP:%.*]] = sext i32 [[UCMP1]] to i64 +; CHECK-NEXT: ret i64 [[UCMP]] +; + %ucmp = call i64 @llvm.ucmp(i32 %a, i32 1) + ret i64 %ucmp +} + +; TODO: we can fold this into %a. 
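+; For %a in [0, 2), ucmp(%a, 0) is 0 when %a == 0 and 1 when %a == 1,
+; i.e. the result is always equal to %a itself.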
+define i32 @ucmp_to_sub_small_range(i32 range(i32 0, 2) %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_small_range( +; CHECK-SAME: i32 range(i32 0, 2) [[A:%.*]]) { +; CHECK-NEXT: [[UCMP:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[A]], i32 0) +; CHECK-NEXT: ret i32 [[UCMP]] +; + %ucmp = call i32 @llvm.ucmp(i32 %a, i32 0) + ret i32 %ucmp +} + +define i32 @scmp_to_sub_large_range(i32 range(i32 -1, 3) %a) { +; CHECK-LABEL: define i32 @scmp_to_sub_large_range( +; CHECK-SAME: i32 range(i32 -1, 3) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i32 %a, i32 0) + ret i32 %scmp +} + +define i32 @ucmp_to_sub_large_range(i32 range(i32 -1, 3) %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_large_range( +; CHECK-SAME: i32 range(i32 -1, 3) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[A]], i32 0) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i32 %a, i32 0) + ret i32 %ucmp +} + +define i32 @scmp_to_sub_wrap(i8 range(i8 127, -126) %a) { +; CHECK-LABEL: define i32 @scmp_to_sub_wrap( +; CHECK-SAME: i8 range(i8 127, -126) [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i8(i8 [[A]], i8 -128) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i8 %a, i8 -128) + ret i32 %scmp +} + +define i32 @ucmp_to_sub_wrap(i8 range(i8 -1, 2) %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_wrap( +; CHECK-SAME: i8 range(i8 -1, 2) [[A:%.*]]) { +; CHECK-NEXT: [[UCMP:%.*]] = call i32 @llvm.ucmp.i32.i8(i8 [[A]], i8 0) +; CHECK-NEXT: ret i32 [[UCMP]] +; + %ucmp = call i32 @llvm.ucmp(i8 %a, i8 0) + ret i32 %ucmp +} + +; It is incorrect to convert a ucmp into sub when the input type is i1. +define i32 @ucmp_to_sub_i1_rhs_const(i1 %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_i1_rhs_const( +; CHECK-SAME: i1 [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i1(i1 [[A]], i1 false) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i1 %a, i1 false) + ret i32 %ucmp +} + +; It is incorrect to convert a ucmp into sub when the input type is i1. +define i32 @ucmp_to_sub_i1_lhs_const(i1 %a) { +; CHECK-LABEL: define i32 @ucmp_to_sub_i1_lhs_const( +; CHECK-SAME: i1 [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i1(i1 false, i1 [[A]]) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i1 false, i1 %a) + ret i32 %ucmp +} + +; It is incorrect to convert a ucmp into sub when the input type is i1. +define i32 @ucmp_to_sub_i1(i1 %a, i1 %b) { +; CHECK-LABEL: define i32 @ucmp_to_sub_i1( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.ucmp.i32.i1(i1 [[A]], i1 [[B]]) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %ucmp = call i32 @llvm.ucmp(i1 %a, i1 %b) + ret i32 %ucmp +} + +; It is incorrect to convert a scmp into sub when the input type is i1. 
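+; (An i1 subtraction can only produce two distinct values, while scmp/ucmp
+; must distinguish three results -1, 0 and 1, so the "less than" and
+; "greater than" cases would collapse to the single nonzero i1 value.)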
+define i32 @scmp_to_sub_i1_rhs_const(i1 %a) { +; CHECK-LABEL: define i32 @scmp_to_sub_i1_rhs_const( +; CHECK-SAME: i1 [[A:%.*]]) { +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i1(i1 [[A]], i1 false) +; CHECK-NEXT: ret i32 [[SCMP]] +; + %scmp = call i32 @llvm.scmp(i1 %a, i1 false) + ret i32 %scmp +} diff --git a/llvm/test/Transforms/SROA/alloca-address-space.ll b/llvm/test/Transforms/SROA/alloca-address-space.ll index 4c638a9..31305c8 100644 --- a/llvm/test/Transforms/SROA/alloca-address-space.ll +++ b/llvm/test/Transforms/SROA/alloca-address-space.ll @@ -140,12 +140,10 @@ define void @addressspace_alloca_lifetime() { ; CHECK-NEXT: ret void ; %alloca = alloca i8, align 8, addrspace(2) - %cast = addrspacecast ptr addrspace(2) %alloca to ptr - call void @llvm.lifetime.start.p0(i64 2, ptr %cast) + call void @llvm.lifetime.start(i64 2, ptr addrspace(2) %alloca) ret void } -declare void @llvm.lifetime.start.p0(i64 %size, ptr nocapture %ptr) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} ; CHECK-PRESERVE-CFG: {{.*}} diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll index 145da52..3034aaa 100644 --- a/llvm/test/Transforms/SROA/basictest.ll +++ b/llvm/test/Transforms/SROA/basictest.ll @@ -1834,8 +1834,7 @@ define void @PR27999() unnamed_addr { entry-block: %0 = alloca [2 x i64], align 8 call void @llvm.lifetime.start.p0(i64 16, ptr %0) - %1 = getelementptr inbounds [2 x i64], ptr %0, i32 0, i32 1 - call void @llvm.lifetime.end.p0(i64 8, ptr %1) + call void @llvm.lifetime.end.p0(i64 8, ptr %0) ret void } diff --git a/llvm/test/Transforms/SROA/ignore-droppable.ll b/llvm/test/Transforms/SROA/ignore-droppable.ll index 0b9a036b..9c95dc0 100644 --- a/llvm/test/Transforms/SROA/ignore-droppable.ll +++ b/llvm/test/Transforms/SROA/ignore-droppable.ll @@ -55,10 +55,10 @@ define void @positive_gep_assume_uses() { ; %A = alloca {i8, i16} %B = getelementptr {i8, i16}, ptr %A, i32 0, i32 0 - call void @llvm.lifetime.start.p0(i64 2, ptr %B) + call void @llvm.lifetime.start.p0(i64 2, ptr %A) call void @llvm.assume(i1 true) ["align"(ptr %B, i64 8), "align"(ptr %B, i64 16)] store {i8, i16} zeroinitializer, ptr %A - call void @llvm.lifetime.end.p0(i64 2, ptr %B) + call void @llvm.lifetime.end.p0(i64 2, ptr %A) call void @llvm.assume(i1 true) ["nonnull"(ptr %B), "align"(ptr %B, i64 2)] ret void } diff --git a/llvm/test/Transforms/SafeStack/X86/coloring2.ll b/llvm/test/Transforms/SafeStack/X86/coloring2.ll index 2e02ea6..ae5f375 100644 --- a/llvm/test/Transforms/SafeStack/X86/coloring2.ll +++ b/llvm/test/Transforms/SafeStack/X86/coloring2.ll @@ -478,43 +478,6 @@ l2: br label %l2 } -; This test checks for a bug where the stack coloring algorithm was not tracking -; the live range of allocas through phi instructions, so it did not consider -; alloca and alloca2 to be live at the same time. As a result it was using -; the same stack slot for both allocas. To ensure this bug isn't present, we -; check that there are 64 bytes allocated for the unsafe stack which is enough -; space for both allocas. 
-; CHECK-LABEL: @stack_coloring_liveness_bug -define void @stack_coloring_liveness_bug(i32 %arg0) #0 { -entry: -; CHECK: %[[USP:.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr -; CHECK-NEXT: getelementptr i8, ptr %[[USP]], i32 -64 - %alloca = alloca [32 x i8], align 16 - %alloca2 = alloca [32 x i8], align 16 - %cond = icmp eq i32 %arg0, 0 - br i1 %cond, label %if, label %else - -if: - br label %end - -else: -; CHECK: getelementptr i8, ptr %[[USP]], i32 -32 - call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloca) - call void @capture8(ptr %alloca) - call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloca) - br label %end - -end: -; CHECK: getelementptr i8, ptr %[[USP]], i32 -64 - %alloca.end = phi ptr [ %alloca, %if], [%alloca, %else] - call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloca2) - call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloca.end) - call void @capture2_8(ptr %alloca2, ptr %alloca.end) - call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloca2) - call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloca.end) - ret void -} - attributes #0 = { safestack } declare void @llvm.lifetime.start.p0(i64, ptr nocapture) diff --git a/llvm/test/Transforms/Scalarizer/extractvalue-struct-of-vectors.ll b/llvm/test/Transforms/Scalarizer/extractvalue-struct-of-vectors.ll new file mode 100644 index 0000000..b8d1b92 --- /dev/null +++ b/llvm/test/Transforms/Scalarizer/extractvalue-struct-of-vectors.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='function(scalarizer)' -S < %s | FileCheck %s + +define void @func(<2 x i32> noundef %a, <2 x i32> noundef %b) { +; CHECK-LABEL: define void @func( +; CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) { +; CHECK-NEXT: [[A_I0:%.*]] = extractelement <2 x i32> [[A]], i64 0 +; CHECK-NEXT: [[B_I0:%.*]] = extractelement <2 x i32> [[B]], i64 0 +; CHECK-NEXT: [[UADDC_I0:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A_I0]], i32 [[B_I0]]) +; CHECK-NEXT: [[A_I1:%.*]] = extractelement <2 x i32> [[A]], i64 1 +; CHECK-NEXT: [[B_I1:%.*]] = extractelement <2 x i32> [[B]], i64 1 +; CHECK-NEXT: [[UADDC_I1:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A_I1]], i32 [[B_I1]]) +; CHECK-NEXT: [[CARRY_ELEM1:%.*]] = extractvalue { i32, i1 } [[UADDC_I0]], 1 +; CHECK-NEXT: [[CARRY_ELEM11:%.*]] = extractvalue { i32, i1 } [[UADDC_I1]], 1 +; CHECK-NEXT: [[CARRY_ZEXT_I0:%.*]] = zext i1 [[CARRY_ELEM1]] to i32 +; CHECK-NEXT: [[CARRY_ZEXT_I1:%.*]] = zext i1 [[CARRY_ELEM11]] to i32 +; CHECK-NEXT: ret void +; + %uaddc = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) + %carry = extractvalue { <2 x i32>, <2 x i1> } %uaddc, 1 + %carry_zext = zext <2 x i1> %carry to <2 x i32> + ret void +} diff --git a/llvm/test/Transforms/SimplifyCFG/X86/empty-cleanuppad.ll b/llvm/test/Transforms/SimplifyCFG/X86/empty-cleanuppad.ll index 17ce141..162a3ab 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/empty-cleanuppad.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/empty-cleanuppad.ll @@ -502,6 +502,7 @@ cleanupret2: define void @f11() personality ptr @__CxxFrameHandler3 { ; CHECK-LABEL: @f11( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = alloca i8, align 1 ; CHECK-NEXT: invoke void @g() ; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: invoke.cont: @@ -519,6 +520,7 @@ define void @f11() personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: ret void ; entry: 
+ %x = alloca i8 invoke void @g() to label %invoke.cont unwind label %ehcleanup @@ -531,7 +533,6 @@ invoke.cont2: ; preds = %invoke.cont to label %return unwind label %catch.dispatch ehcleanup: ; preds = %invoke.cont, %entry - %x = phi ptr [ undef, %invoke.cont ], [ undef, %entry ] %0 = cleanuppad within none [] call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %x) cleanupret from %0 unwind label %catch.dispatch diff --git a/llvm/test/Transforms/SimplifyCFG/invoke_unwind_lifetime.ll b/llvm/test/Transforms/SimplifyCFG/invoke_unwind_lifetime.ll index ff031e9..ea14b17 100644 --- a/llvm/test/Transforms/SimplifyCFG/invoke_unwind_lifetime.ll +++ b/llvm/test/Transforms/SimplifyCFG/invoke_unwind_lifetime.ll @@ -67,17 +67,17 @@ invoke.cont: lpad.v0: %i8 = landingpad { ptr, i32 } cleanup call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i0) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i4) br label %end lpad.v1: %i9 = landingpad { ptr, i32 } cleanup call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i2) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i6) br label %end end: %i10 = phi { ptr, i32 } [ %i8, %lpad.v0 ], [ %i9, %lpad.v1 ] - %i11 = phi ptr [ %i4, %lpad.v0 ], [ %i6, %lpad.v1 ] - call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i11) resume { ptr, i32 } %i10 } ;. diff --git a/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll new file mode 100644 index 0000000..220556c --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll @@ -0,0 +1,262 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- | FileCheck %s + +; Negative test: bitcast from float to int (optimization should not apply) +define <4 x i32> @and_bitcast_v4f32_to_v4i32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @and_bitcast_v4f32_to_v4i32( +; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +; CHECK-NEXT: [[BC2:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[BC1]], [[BC2]] +; CHECK-NEXT: ret <4 x i32> [[AND]] +; + %bc1 = bitcast <4 x float> %a to <4 x i32> + %bc2 = bitcast <4 x float> %b to <4 x i32> + %and = and <4 x i32> %bc1, %bc2 + ret <4 x i32> %and +} + +; Test bitwise operations with integer-to-integer bitcast +define <2 x i32> @or_bitcast_v4i16_to_v2i32(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: @or_bitcast_v4i16_to_v2i32( +; CHECK-NEXT: [[B:%.*]] = or <4 x i16> [[A:%.*]], [[B1:%.*]] +; CHECK-NEXT: [[BC2:%.*]] = bitcast <4 x i16> [[B]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[BC2]] +; + %bc1 = bitcast <4 x i16> %a to <2 x i32> + %bc2 = bitcast <4 x i16> %b to <2 x i32> + %or = or <2 x i32> %bc1, %bc2 + ret <2 x i32> %or +} + +define <16 x i8> @xor_bitcast_v2i64_to_v16i8(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @xor_bitcast_v2i64_to_v16i8( +; CHECK-NEXT: [[B:%.*]] = xor <2 x i64> [[A:%.*]], [[B1:%.*]] +; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +; CHECK-NEXT: ret <16 x i8> [[BC2]] +; + %bc1 = bitcast <2 x i64> %a to <16 x i8> + %bc2 = bitcast <2 x i64> %b to <16 x i8> + %xor = xor <16 x i8> %bc1, %bc2 + ret <16 x i8> %xor +} + +; Test bitwise operations with truncate +define <4 x i16> @and_trunc_v4i32_to_v4i16(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @and_trunc_v4i32_to_v4i16( +; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i16> +; 
CHECK-NEXT: ret <4 x i16> [[AND]] +; + %t1 = trunc <4 x i32> %a to <4 x i16> + %t2 = trunc <4 x i32> %b to <4 x i16> + %and = and <4 x i16> %t1, %t2 + ret <4 x i16> %and +} + +define <8 x i8> @or_trunc_v8i16_to_v8i8(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: @or_trunc_v8i16_to_v8i8( +; CHECK-NEXT: [[OR_INNER:%.*]] = or <8 x i16> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = trunc <8 x i16> [[OR_INNER]] to <8 x i8> +; CHECK-NEXT: ret <8 x i8> [[OR]] +; + %t1 = trunc <8 x i16> %a to <8 x i8> + %t2 = trunc <8 x i16> %b to <8 x i8> + %or = or <8 x i8> %t1, %t2 + ret <8 x i8> %or +} + +define <2 x i32> @xor_trunc_v2i64_to_v2i32(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @xor_trunc_v2i64_to_v2i32( +; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <2 x i64> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = trunc <2 x i64> [[XOR_INNER]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[XOR]] +; + %t1 = trunc <2 x i64> %a to <2 x i32> + %t2 = trunc <2 x i64> %b to <2 x i32> + %xor = xor <2 x i32> %t1, %t2 + ret <2 x i32> %xor +} + +; Test bitwise operations with zero extend +define <4 x i32> @and_zext_v4i16_to_v4i32(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: @and_zext_v4i16_to_v4i32( +; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i16> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = zext <4 x i16> [[AND_INNER]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[AND]] +; + %z1 = zext <4 x i16> %a to <4 x i32> + %z2 = zext <4 x i16> %b to <4 x i32> + %and = and <4 x i32> %z1, %z2 + ret <4 x i32> %and +} + +define <8 x i16> @or_zext_v8i8_to_v8i16(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @or_zext_v8i8_to_v8i16( +; CHECK-NEXT: [[OR_INNER:%.*]] = or <8 x i8> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = zext <8 x i8> [[OR_INNER]] to <8 x i16> +; CHECK-NEXT: ret <8 x i16> [[OR]] +; + %z1 = zext <8 x i8> %a to <8 x i16> + %z2 = zext <8 x i8> %b to <8 x i16> + %or = or <8 x i16> %z1, %z2 + ret <8 x i16> %or +} + +define <2 x i64> @xor_zext_v2i32_to_v2i64(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @xor_zext_v2i32_to_v2i64( +; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = zext <2 x i32> [[XOR_INNER]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[XOR]] +; + %z1 = zext <2 x i32> %a to <2 x i64> + %z2 = zext <2 x i32> %b to <2 x i64> + %xor = xor <2 x i64> %z1, %z2 + ret <2 x i64> %xor +} + +; Test bitwise operations with sign extend +define <4 x i32> @and_sext_v4i16_to_v4i32(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: @and_sext_v4i16_to_v4i32( +; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i16> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = sext <4 x i16> [[AND_INNER]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[AND]] +; + %s1 = sext <4 x i16> %a to <4 x i32> + %s2 = sext <4 x i16> %b to <4 x i32> + %and = and <4 x i32> %s1, %s2 + ret <4 x i32> %and +} + +define <8 x i16> @or_sext_v8i8_to_v8i16(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @or_sext_v8i8_to_v8i16( +; CHECK-NEXT: [[OR_INNER:%.*]] = or <8 x i8> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = sext <8 x i8> [[OR_INNER]] to <8 x i16> +; CHECK-NEXT: ret <8 x i16> [[OR]] +; + %s1 = sext <8 x i8> %a to <8 x i16> + %s2 = sext <8 x i8> %b to <8 x i16> + %or = or <8 x i16> %s1, %s2 + ret <8 x i16> %or +} + +define <2 x i64> @xor_sext_v2i32_to_v2i64(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @xor_sext_v2i32_to_v2i64( +; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = sext <2 x i32> [[XOR_INNER]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[XOR]] +; + %s1 = sext <2 
x i32> %a to <2 x i64> + %s2 = sext <2 x i32> %b to <2 x i64> + %xor = xor <2 x i64> %s1, %s2 + ret <2 x i64> %xor +} + +; Negative test: mismatched cast types (zext and sext) +define <4 x i32> @and_zext_sext_mismatch(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: @and_zext_sext_mismatch( +; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32> +; CHECK-NEXT: [[S2:%.*]] = sext <4 x i16> [[B:%.*]] to <4 x i32> +; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[Z1]], [[S2]] +; CHECK-NEXT: ret <4 x i32> [[AND]] +; + %z1 = zext <4 x i16> %a to <4 x i32> + %s2 = sext <4 x i16> %b to <4 x i32> + %and = and <4 x i32> %z1, %s2 + ret <4 x i32> %and +} + +; Negative test: mismatched source types +define <4 x i32> @or_zext_different_src_types(<4 x i16> %a, <4 x i8> %b) { +; CHECK-LABEL: @or_zext_different_src_types( +; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32> +; CHECK-NEXT: [[Z2:%.*]] = zext <4 x i8> [[B:%.*]] to <4 x i32> +; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[Z1]], [[Z2]] +; CHECK-NEXT: ret <4 x i32> [[OR]] +; + %z1 = zext <4 x i16> %a to <4 x i32> + %z2 = zext <4 x i8> %b to <4 x i32> + %or = or <4 x i32> %z1, %z2 + ret <4 x i32> %or +} + +; Negative test: scalar types (not vectors) +define i32 @xor_zext_scalar(i16 %a, i16 %b) { +; CHECK-LABEL: @xor_zext_scalar( +; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[A:%.*]] to i32 +; CHECK-NEXT: [[Z2:%.*]] = zext i16 [[B:%.*]] to i32 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Z1]], [[Z2]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %z1 = zext i16 %a to i32 + %z2 = zext i16 %b to i32 + %xor = xor i32 %z1, %z2 + ret i32 %xor +} + +; Test multi-use: one cast has multiple uses +define <4 x i32> @and_zext_multiuse(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: @and_zext_multiuse( +; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32> +; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i16> [[A]], [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = zext <4 x i16> [[AND_INNER]] to <4 x i32> +; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[Z1]], [[AND]] +; CHECK-NEXT: ret <4 x i32> [[ADD]] +; + %z1 = zext <4 x i16> %a to <4 x i32> + %z2 = zext <4 x i16> %b to <4 x i32> + %and = and <4 x i32> %z1, %z2 + %add = add <4 x i32> %z1, %and ; z1 has multiple uses + ret <4 x i32> %add +} + +; Test with different vector sizes +define <16 x i16> @or_zext_v16i8_to_v16i16(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @or_zext_v16i8_to_v16i16( +; CHECK-NEXT: [[OR_INNER:%.*]] = or <16 x i8> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = zext <16 x i8> [[OR_INNER]] to <16 x i16> +; CHECK-NEXT: ret <16 x i16> [[OR]] +; + %z1 = zext <16 x i8> %a to <16 x i16> + %z2 = zext <16 x i8> %b to <16 x i16> + %or = or <16 x i16> %z1, %z2 + ret <16 x i16> %or +} + +; Test bitcast with different element counts +define <8 x i16> @xor_bitcast_v4i32_to_v8i16(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @xor_bitcast_v4i32_to_v8i16( +; CHECK-NEXT: [[XOR_INNER:%.*]] = xor <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = bitcast <4 x i32> [[XOR_INNER]] to <8 x i16> +; CHECK-NEXT: ret <8 x i16> [[XOR]] +; + %bc1 = bitcast <4 x i32> %a to <8 x i16> + %bc2 = bitcast <4 x i32> %b to <8 x i16> + %xor = xor <8 x i16> %bc1, %bc2 + ret <8 x i16> %xor +} + +; Test truncate with flag preservation +define <4 x i16> @and_trunc_nuw_nsw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @and_trunc_nuw_nsw( +; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = trunc nuw nsw <4 x i32> [[AND_INNER]] to <4 x i16> +; CHECK-NEXT: ret <4 x i16> [[AND]] +; + %t1 = trunc nuw 
nsw <4 x i32> %a to <4 x i16>
+  %t2 = trunc nuw nsw <4 x i32> %b to <4 x i16>
+  %and = and <4 x i16> %t1, %t2
+  ret <4 x i16> %and
+}
+
+; Test zero extend with nneg flag
+define <4 x i32> @or_zext_nneg(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @or_zext_nneg(
+; CHECK-NEXT:    [[OR_INNER:%.*]] = or <4 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = zext nneg <4 x i16> [[OR_INNER]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %z1 = zext nneg <4 x i16> %a to <4 x i32>
+  %z2 = zext nneg <4 x i16> %b to <4 x i32>
+  %or = or <4 x i32> %z1, %z2
+  ret <4 x i32> %or
+}