author     Craig Topper <craig.topper@intel.com>  2017-07-26 04:31:04 +0000
committer  Craig Topper <craig.topper@intel.com>  2017-07-26 04:31:04 +0000
commit     050c9c8f83b4fc78a0057629109a1830c834fa4b (patch)
tree       66ca48fcd6f0052bfc13c355b1969d5ec79361f6
parent     7b05a2712a673fe7032189a142f299ded3885491 (diff)
[X86] Prevent selecting masked aligned load instructions if the load should be non-temporal
Summary:
The aligned load predicates don't suppress themselves if the load is
non-temporal the way the unaligned predicates do. For the most part this
isn't a problem because the aligned predicates are mostly used for
instructions that only load, and there the non-temporal load patterns have
priority over those. The exception is masked loads.

Reviewers: RKSimon, zvi

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D35712

llvm-svn: 309079
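For context, a minimal IR sketch of the kind of load this affects (the
function and value names here are illustrative, not part of the patch): a
masked load that is 64-byte aligned but tagged !nontemporal. With the new
predicate checks the masked aligned-load patterns no longer match it, so
selection falls back to a non-temporal load plus a blend or masked move, as
the test added below verifies.

define <16 x i32> @nt_masked_load_sketch(<16 x i32>* %p, <16 x i32> %old, <16 x i32> %mask1) {
  ; build a vector mask from the incoming comparison operand
  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
  ; 64-byte aligned load carrying the !nontemporal hint
  %v = load <16 x i32>, <16 x i32>* %p, align 64, !nontemporal !0
  ; masked merge with the old value; this should select movntdqa/vmovntdqa
  ; followed by a blend or masked move, not a masked aligned load that
  ; silently drops the non-temporal hint
  %r = select <16 x i1> %mask, <16 x i32> %v, <16 x i32> %old
  ret <16 x i32> %r
}
!0 = !{i32 1}

This mirrors the test_masked_v16i32 test in the diff below, whose expected
AVX512 output is a vmovntdqa followed by a masked vmovdqa32.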
-rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td |   9
-rw-r--r--  llvm/test/CodeGen/X86/nontemporal-loads.ll   | 115
2 files changed, 121 insertions, 3 deletions
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 8b5bbf2..e7b2e6b 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -698,17 +698,20 @@ def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
// Like 'load', but always requires 128-bit vector alignment.
def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() >= 16;
+ return cast<LoadSDNode>(N)->getAlignment() >= 16 &&
+ (!Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal());
}]>;
// Like 'load', but always requires 256-bit vector alignment.
def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() >= 32;
+ return cast<LoadSDNode>(N)->getAlignment() >= 32 &&
+ (!Subtarget->hasAVX2() || !cast<LoadSDNode>(N)->isNonTemporal());
}]>;
// Like 'load', but always requires 512-bit vector alignment.
def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() >= 64;
+ return cast<LoadSDNode>(N)->getAlignment() >= 64 &&
+ (!Subtarget->hasAVX512() || !cast<LoadSDNode>(N)->isNonTemporal());
}]>;
// 128-bit aligned load pattern fragments
diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll
index a9e42ad..9e203cb 100644
--- a/llvm/test/CodeGen/X86/nontemporal-loads.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll
@@ -1797,4 +1797,119 @@ define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) {
ret <64 x i8> %1
}
+define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; SSE2-LABEL: test_masked_v16i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm9
+; SSE2-NEXT: pxor %xmm0, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm11
+; SSE2-NEXT: pxor %xmm0, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pandn (%rdi), %xmm4
+; SSE2-NEXT: pandn %xmm10, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pandn 16(%rdi), %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm11
+; SSE2-NEXT: por %xmm5, %xmm11
+; SSE2-NEXT: pandn 32(%rdi), %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm9
+; SSE2-NEXT: por %xmm6, %xmm9
+; SSE2-NEXT: pandn 48(%rdi), %xmm7
+; SSE2-NEXT: pandn %xmm3, %xmm8
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: movdqa %xmm11, %xmm1
+; SSE2-NEXT: movdqa %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_masked_v16i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm9
+; SSE41-NEXT: pxor %xmm9, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: pxor %xmm9, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm9, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm9
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm10
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm11
+; SSE41-NEXT: movntdqa (%rdi), %xmm4
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm8
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm1
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm2
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm3
+; SSE41-NEXT: movaps %xmm8, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_masked_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm4
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm3
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_masked_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm4
+; AVX2-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm3
+; AVX2-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_masked_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm1
+; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>, <16 x i32>* %vaddr, align 64, !nontemporal !1
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+ ret <16 x i32>%res
+}
+
!1 = !{i32 1}