; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,AVX

; Each function below is run through vector-combine four times: with sse2 and
; with avx2, each under the "e" and the "E" data layout string. Expectations
; shared by all four runs use the common prefix; the per-feature prefixes are
; only used where the sse2 and avx2 results differ.

;-------------------------------------------------------------------------------
; Here we know we can load 128 bits as per dereferenceability and alignment.

; We don't widen scalar loads per-se.
; NOTE(review): despite its name, this function loads <1 x float> and is
; otherwise identical to @vec_with_1elt below - confirm whether a plain
; 'float' load was intended here.
define <1 x float> @scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @scalar(
; CHECK-NEXT:    [[R:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <1 x float> [[R]]
;
  %r = load <1 x float>, ptr %p, align 16
  ret <1 x float> %r
}

; We don't widen single-element loads, these get scalarized.
define <1 x float> @vec_with_1elt(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_1elt(
; CHECK-NEXT:    [[R:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <1 x float> [[R]]
;
  %r = load <1 x float>, ptr %p, align 16
  ret <1 x float> %r
}

; Two of four possible elements; the load is expected to stay as it is.
define <2 x float> @vec_with_2elts(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_2elts(
; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %r = load <2 x float>, ptr %p, align 16
  ret <2 x float> %r
}

; Three of four possible elements; the load is expected to stay as it is.
define <3 x float> @vec_with_3elts(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_3elts(
; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <3 x float> [[R]]
;
  %r = load <3 x float>, ptr %p, align 16
  ret <3 x float> %r
}

; Full-vector load. All good already.
define <4 x float> @vec_with_4elts(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_4elts(
; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %r = load <4 x float>, ptr %p, align 16
  ret <4 x float> %r
}

; We don't know we can load 256 bits though.
define <5 x float> @vec_with_5elts(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_5elts(
; CHECK-NEXT:    [[R:%.*]] = load <5 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <5 x float> [[R]]
;
  %r = load <5 x float>, ptr %p, align 16
  ret <5 x float> %r
}

;-------------------------------------------------------------------------------

; We can load 128 bits, and the fact that it's underaligned isn't relevant.
define <3 x float> @vec_with_3elts_underaligned(ptr align 8 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_3elts_underaligned(
; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 8
; CHECK-NEXT:    ret <3 x float> [[R]]
;
  %r = load <3 x float>, ptr %p, align 8
  ret <3 x float> %r
}

; We don't know we can load 128 bits from the dereferenceable attribute alone
; (only 12 bytes are promised), but since the pointer is 16-byte aligned, we
; can still do a wide load.
; FIXME: this should still get widened.
define <3 x float> @vec_with_3elts_underdereferenceable(ptr align 16 dereferenceable(12) %p) {
; CHECK-LABEL: @vec_with_3elts_underdereferenceable(
; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <3 x float> [[R]]
;
  %r = load <3 x float>, ptr %p, align 16
  ret <3 x float> %r
}

; We can't tell if we can load 128 bits.
define <3 x float> @vec_with_3elts_underaligned_underdereferenceable(ptr align 8 dereferenceable(12) %p) {
; CHECK-LABEL: @vec_with_3elts_underaligned_underdereferenceable(
; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 8
; CHECK-NEXT:    ret <3 x float> [[R]]
;
  %r = load <3 x float>, ptr %p, align 8
  ret <3 x float> %r
}

;-------------------------------------------------------------------------------
; Here we know we can load 256 bits as per dereferenceability and alignment.
; The functions below step through 1 to 9 float elements against a pointer
; that is 32-byte aligned and dereferenceable for 32 bytes; every load is
; expected to be left unchanged.

define <1 x float> @vec_with_1elt_256bits(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @vec_with_1elt_256bits(
; CHECK-NEXT:    [[R:%.*]] = load <1 x float>, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret <1 x float> [[R]]
;
  %r = load <1 x float>, ptr %p, align 32
  ret <1 x float> %r
}

define <2 x float> @vec_with_2elts_256bits(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @vec_with_2elts_256bits(
; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %r = load <2 x float>, ptr %p, align 32
  ret <2 x float> %r
}

define <3 x float> @vec_with_3elts_256bits(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @vec_with_3elts_256bits(
; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret <3 x float> [[R]]
;
  %r = load <3 x float>, ptr %p, align 32
  ret <3 x float> %r
}

define <4 x float> @vec_with_4elts_256bits(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @vec_with_4elts_256bits(
; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %r = load <4 x float>, ptr %p, align 32
  ret <4 x float> %r
}

define <5 x float> @vec_with_5elts_256bits(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @vec_with_5elts_256bits(
; CHECK-NEXT:    [[R:%.*]] = load <5 x float>, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret <5 x float> [[R]]
;
  %r = load <5 x float>, ptr %p, align 32
  ret <5 x float> %r
}

define <6 x float> @vec_with_6elts_256bits(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @vec_with_6elts_256bits(
; CHECK-NEXT:    [[R:%.*]] = load <6 x float>, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret <6 x float> [[R]]
;
  %r = load <6 x float>, ptr %p, align 32
  ret <6 x float> %r
}

define <7 x float> @vec_with_7elts_256bits(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @vec_with_7elts_256bits(
; CHECK-NEXT:    [[R:%.*]] = load <7 x float>, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret <7 x float> [[R]]
;
  %r = load <7 x float>, ptr %p, align 32
  ret <7 x float> %r
}

; Full-vector load. All good already.
define <8 x float> @vec_with_8elts_256bits(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @vec_with_8elts_256bits(
; CHECK-NEXT:    [[R:%.*]] = load <8 x float>, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret <8 x float> [[R]]
;
  %r = load <8 x float>, ptr %p, align 32
  ret <8 x float> %r
}

; We can't tell if we can load more than 256 bits.
define <9 x float> @vec_with_9elts_256bits(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @vec_with_9elts_256bits(
; CHECK-NEXT:    [[R:%.*]] = load <9 x float>, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret <9 x float> [[R]]
;
  %r = load <9 x float>, ptr %p, align 32
  ret <9 x float> %r
}

;-------------------------------------------------------------------------------

; Weird types we don't deal with
define <2 x i7> @vec_with_two_subbyte_elts(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_two_subbyte_elts(
; CHECK-NEXT:    [[R:%.*]] = load <2 x i7>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <2 x i7> [[R]]
;
  %r = load <2 x i7>, ptr %p, align 16
  ret <2 x i7> %r
}

define <2 x i9> @vec_with_two_nonbyte_sized_elts(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_two_nonbyte_sized_elts(
; CHECK-NEXT:    [[R:%.*]] = load <2 x i9>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <2 x i9> [[R]]
;
  %r = load <2 x i9>, ptr %p, align 16
  ret <2 x i9> %r
}

define <2 x i24> @vec_with_two_nonpoweroftwo_sized_elts(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_two_nonpoweroftwo_sized_elts(
; CHECK-NEXT:    [[R:%.*]] = load <2 x i24>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <2 x i24> [[R]]
;
  %r = load <2 x i24>, ptr %p, align 16
  ret <2 x i24> %r
}

; Same as the two-element float case, but through a pointer in address
; space 2; the load is expected to keep both its type and its address space.
define <2 x float> @vec_with_2elts_addressspace(ptr addrspace(2) align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_2elts_addressspace(
; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, ptr addrspace(2) [[P:%.*]], align 16
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %r = load <2 x float>, ptr addrspace(2) %p, align 16
  ret <2 x float> %r
}

;-------------------------------------------------------------------------------

; Widening these would change the legalized type, so leave them alone.
define <2 x i1> @vec_with_2elts_128bits_i1(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_2elts_128bits_i1(
; CHECK-NEXT:    [[R:%.*]] = load <2 x i1>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <2 x i1> [[R]]
;
  %r = load <2 x i1>, ptr %p, align 16
  ret <2 x i1> %r
}

define <2 x i2> @vec_with_2elts_128bits_i2(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_2elts_128bits_i2(
; CHECK-NEXT:    [[R:%.*]] = load <2 x i2>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <2 x i2> [[R]]
;
  %r = load <2 x i2>, ptr %p, align 16
  ret <2 x i2> %r
}

define <2 x i4> @vec_with_2elts_128bits_i4(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @vec_with_2elts_128bits_i4(
; CHECK-NEXT:    [[R:%.*]] = load <2 x i4>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <2 x i4> [[R]]
;
  %r = load <2 x i4>, ptr %p, align 16
  ret <2 x i4> %r
}

; NOTE(review): the shufflevector below has lost its mask operand (the
; constant vector that must follow the '<4 x i32>' type), so this function
; does not parse as written. Restore the mask from the upstream copy of this
; test; it is deliberately not guessed here.

; Load the 128-bit vector because there is no additional cost.
define <4 x float> @load_v1f32_v4f32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v1f32_v4f32(
; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %l = load <1 x float>, ptr %p, align 16
  %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32>
  ret <4 x float> %s
}

; Load the 128-bit vector because there is no additional cost.
; Alignment is taken from param attr.
; (The load itself only claims align 1; the expected wide load uses the
; align 16 promised on the parameter.)
;
; NOTE(review): every shufflevector in this group has lost its mask operand
; (the constant vector that must follow the '<4 x i32>' type), so these
; functions do not parse as written. Restore the masks from the upstream
; copy of this test; they are deliberately not guessed here.
define <4 x float> @load_v2f32_v4f32(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2f32_v4f32(
; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %l = load <2 x float>, ptr %p, align 1
  %s = shufflevector <2 x float> %l, <2 x float> poison, <4 x i32>
  ret <4 x float> %s
}

; Load the 128-bit vector because there is no additional cost.
define <4 x float> @load_v3f32_v4f32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v3f32_v4f32(
; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %l = load <3 x float>, ptr %p, align 1
  %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32>
  ret <4 x float> %s
}

; Negative test - the shuffle must be a simple subvector insert.
define <4 x float> @load_v3f32_v4f32_wrong_mask(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v3f32_v4f32_wrong_mask(
; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32>
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %l = load <3 x float>, ptr %p, align 1
  %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32>
  ret <4 x float> %s
}

; Negative test - must be dereferenceable to vector width.
; (Only 15 bytes are promised here, one short of the 16 a <4 x float> needs.)
define <4 x float> @load_v3f32_v4f32_not_deref(ptr dereferenceable(15) %p) {
; CHECK-LABEL: @load_v3f32_v4f32_not_deref(
; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32>
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %l = load <3 x float>, ptr %p, align 16
  %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32>
  ret <4 x float> %s
}

; Without AVX, the cost of loading 256-bits would be greater.
; NOTE(review): every shufflevector in this group has lost its mask operand
; (the constant vector that must follow the result index type), so these
; functions do not parse as written. Restore the masks from the upstream
; copy of this test; they are deliberately not guessed here.
define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) {
; SSE-LABEL: @load_v2f32_v8f32(
; SSE-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
; SSE-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32>
; SSE-NEXT:    ret <8 x float> [[S]]
;
; AVX-LABEL: @load_v2f32_v8f32(
; AVX-NEXT:    [[S:%.*]] = load <8 x float>, ptr [[P:%.*]], align 1
; AVX-NEXT:    ret <8 x float> [[S]]
;
  %l = load <2 x float>, ptr %p, align 1
  %s = shufflevector <2 x float> %l, <2 x float> poison, <8 x i32>
  ret <8 x float> %s
}

; Integer type is ok too.
define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32(
; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    ret <4 x i32> [[S]]
;
  %l = load <2 x i32>, ptr %p, align 1
  %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32>
  ret <4 x i32> %s
}

; TODO: We assumed the shuffle mask is canonical.
; (As recorded below, this form is currently left untouched.)
define <4 x i32> @load_v2i32_v4i32_non_canonical_mask(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask(
; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32>
; CHECK-NEXT:    ret <4 x i32> [[S]]
;
  %l = load <2 x i32>, ptr %p, align 1
  %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32>
  ret <4 x i32> %s
}

; Allow non-canonical commuted shuffle.
; (Here the loaded value is the second shuffle operand and poison the first.)
define <4 x i32> @load_v2i32_v4i32_non_canonical_mask_commute(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask_commute(
; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    ret <4 x i32> [[S]]
;
  %l = load <2 x i32>, ptr %p, align 1
  %s = shufflevector <2 x i32> poison, <2 x i32> %l, <4 x i32>
  ret <4 x i32> %s
}

; The wide load must be in the same addrspace as the original load.
; The parameter lives in address space 5 and is cast to address space 42
; before the load; the expected output keeps the cast and performs the wide
; load through the address-space-42 pointer, using the parameter's align 16.
;
; NOTE(review): every shufflevector in this group has lost its mask operand
; (the constant vector that must follow the '<4 x i32>' type), so these
; functions do not parse as written. Restore the masks from the upstream
; copy of this test; they are deliberately not guessed here.
define <4 x i32> @load_v2i32_v4i32_addrspacecast(ptr addrspace(5) align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32_addrspacecast(
; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[P:%.*]] to ptr addrspace(42)
; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr addrspace(42) [[TMP1]], align 16
; CHECK-NEXT:    ret <4 x i32> [[S]]
;
  %asc = addrspacecast ptr addrspace(5) %p to ptr addrspace(42)
  %l = load <2 x i32>, ptr addrspace(42) %asc, align 4
  %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32>
  ret <4 x i32> %s
}

; Negative-negative tests with msan, which should be OK with widening.
define <4 x float> @load_v1f32_v4f32_msan(ptr dereferenceable(16) %p) sanitize_memory {
; CHECK-LABEL: @load_v1f32_v4f32_msan(
; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %l = load <1 x float>, ptr %p, align 16
  %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32>
  ret <4 x float> %s
}

; Negative tests with sanitizers.
; These functions carry sanitize_address, sanitize_hwaddress or
; sanitize_thread; none of them is expected to end up with a widened load.
;
; NOTE(review): every shufflevector in this group has lost its mask operand
; (the constant vector that must follow the result index type), so these
; functions do not parse as written. Restore the masks from the upstream
; copy of this test; they are deliberately not guessed here.
define <4 x float> @load_v1f32_v4f32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v1f32_v4f32_asan(
; CHECK-NEXT:    [[L:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x float> [[L]], <1 x float> poison, <4 x i32>
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %l = load <1 x float>, ptr %p, align 16
  %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32>
  ret <4 x float> %s
}

define <4 x float> @load_v2f32_v4f32_hwasan(ptr align 16 dereferenceable(16) %p) sanitize_hwaddress {
; CHECK-LABEL: @load_v2f32_v4f32_hwasan(
; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <4 x i32>
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %l = load <2 x float>, ptr %p, align 1
  %s = shufflevector <2 x float> %l, <2 x float> poison, <4 x i32>
  ret <4 x float> %s
}

define <4 x float> @load_v3f32_v4f32_tsan(ptr dereferenceable(16) %p) sanitize_thread {
; CHECK-LABEL: @load_v3f32_v4f32_tsan(
; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32>
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %l = load <3 x float>, ptr %p, align 1
  %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32>
  ret <4 x float> %s
}

; Unlike the unsanitized 256-bit case, the expectation here is shared by the
; sse2 and avx2 runs: the narrow load stays in both.
define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize_hwaddress {
; CHECK-LABEL: @load_v2f32_v8f32_hwasan(
; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32>
; CHECK-NEXT:    ret <8 x float> [[S]]
;
  %l = load <2 x float>, ptr %p, align 1
  %s = shufflevector <2 x float> %l, <2 x float> poison, <8 x i32>
  ret <8 x float> %s
}

; Not widened. The recorded output instead has a narrower <1 x i32> load
; feeding the shuffle in place of the original <2 x i32> load.
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32>
; CHECK-NEXT:    ret <4 x i32> [[S]]
;
  %l = load <2 x i32>, ptr %p, align 1
  %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32>
  ret <4 x i32> %s
}

define <4 x i32> @load_v2i32_v4i32_non_canonical_mask_commute_hwasan(ptr dereferenceable(16) %p) sanitize_hwaddress {
; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask_commute_hwasan(
; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> poison, <2 x i32> [[L]], <4 x i32>
; CHECK-NEXT:    ret <4 x i32> [[S]]
;
  %l = load <2 x i32>, ptr %p, align 1
  %s = shufflevector <2 x i32> poison, <2 x i32> %l, <4 x i32>
  ret <4 x i32> %s
}