diff options
-rw-r--r-- | clang/include/clang/Basic/BuiltinsAMDGPU.def | 7 | ||||
-rw-r--r-- | clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 35 | ||||
-rw-r--r-- | clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-monitor.cl | 66 | ||||
-rw-r--r-- | clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl | 12 | ||||
-rw-r--r-- | llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 21 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 16 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/FLATInstructions.td | 36 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 12 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll | 201 |
14 files changed, 430 insertions, 0 deletions
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 0b16e12..945e11b 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -645,6 +645,13 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16 TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts") TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b32, "ii*1Ii", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b64, "V2iV2i*1Ii", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts") + TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index e469a01..70f510a 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -633,6 +633,41 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); return Builder.CreateCall(F, {Addr}); } + case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32: + case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64: + case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128: + case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32: + 
case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64: + case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128: { + + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32: + IID = Intrinsic::amdgcn_global_load_monitor_b32; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64: + IID = Intrinsic::amdgcn_global_load_monitor_b64; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128: + IID = Intrinsic::amdgcn_global_load_monitor_b128; + break; + case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32: + IID = Intrinsic::amdgcn_flat_load_monitor_b32; + break; + case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64: + IID = Intrinsic::amdgcn_flat_load_monitor_b64; + break; + case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128: + IID = Intrinsic::amdgcn_flat_load_monitor_b128; + break; + } + + llvm::Type *LoadTy = ConvertType(E->getType()); + llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); + llvm::Value *Val = EmitScalarExpr(E->getArg(1)); + llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); + return Builder.CreateCall(F, {Addr, Val}); + } case AMDGPU::BI__builtin_amdgcn_load_to_lds: { // Should this have asan instrumentation? 
return emitBuiltinWithOneOverloadedType<5>(*this, E, diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-monitor.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-monitor.cl new file mode 100644 index 0000000..f2552d4 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-monitor.cl @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_monitor_b32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) [[INPTR:%.*]], i32 1) +// CHECK-GFX1250-NEXT: ret i32 [[TMP0]] +// +int test_amdgcn_global_load_monitor_b32(global int* inptr) +{ + return __builtin_amdgcn_global_load_monitor_b32(inptr, 1); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_monitor_b64( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]], i32 10) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_global_load_monitor_b64(global v2i* inptr) +{ + return __builtin_amdgcn_global_load_monitor_b64(inptr, 10); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_monitor_b128( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1) [[INPTR:%.*]], i32 22) +// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]] +// +v4i test_amdgcn_global_load_monitor_b128(global v4i* inptr) +{ + return __builtin_amdgcn_global_load_monitor_b128(inptr, 22); +} + +// CHECK-GFX1250-LABEL: 
@test_amdgcn_flat_load_monitor_b32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.flat.load.monitor.b32.i32(ptr [[INPTR:%.*]], i32 27) +// CHECK-GFX1250-NEXT: ret i32 [[TMP0]] +// +int test_amdgcn_flat_load_monitor_b32(int* inptr) +{ + return __builtin_amdgcn_flat_load_monitor_b32(inptr, 27); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_flat_load_monitor_b64( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.flat.load.monitor.b64.v2i32(ptr [[INPTR:%.*]], i32 1) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_flat_load_monitor_b64(v2i* inptr) +{ + return __builtin_amdgcn_flat_load_monitor_b64(inptr, 1); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_flat_load_monitor_b128( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr [[INPTR:%.*]], i32 0) +// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]] +// +v4i test_amdgcn_flat_load_monitor_b128(v4i* inptr) +{ + return __builtin_amdgcn_flat_load_monitor_b128(inptr, 0); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl index 12a0f3c..3247380 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl @@ -1,6 +1,7 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s +typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); typedef int v8i __attribute__((ext_vector_type(8))); @@ -28,6 +29,17 @@ void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) { __builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}} } +void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, 
global v4i* b128gaddr, int *b32faddr, v2i* b64faddr, v4i *b128faddr, + global int* b32out, global v2i* b64out, global v4i* b128out, int cpol) +{ + *b32out = __builtin_amdgcn_global_load_monitor_b32(b32gaddr, cpol); // expected-error {{'__builtin_amdgcn_global_load_monitor_b32' must be a constant integer}} + *b64out = __builtin_amdgcn_global_load_monitor_b64(b64gaddr, cpol); // expected-error {{'__builtin_amdgcn_global_load_monitor_b64' must be a constant integer}} + *b128out = __builtin_amdgcn_global_load_monitor_b128(b128gaddr, cpol); // expected-error {{'__builtin_amdgcn_global_load_monitor_b128' must be a constant integer}} + *b32out = __builtin_amdgcn_flat_load_monitor_b32(b32faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b32' must be a constant integer}} + *b64out = __builtin_amdgcn_flat_load_monitor_b64(b64faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b64' must be a constant integer}} + *b128out = __builtin_amdgcn_flat_load_monitor_b128(b128faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b128' must be a constant integer}} +} + void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol) { __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f313c6b..3a7db6d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +def flat_ptr_ty : LLVMQualPointerType<0>; def global_ptr_ty : LLVMQualPointerType<1>; def local_ptr_ty : LLVMQualPointerType<3>; @@ -3846,6 +3847,26 @@ def int_amdgcn_tensor_load_to_lds_d2 : def int_amdgcn_tensor_store_from_lds_d2 : ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2; +class 
AMDGPULoadMonitor<LLVMType ptr_ty>: + Intrinsic< + [llvm_any_ty], + [ptr_ty, + llvm_i32_ty], // gfx12+ cachepolicy: + // bits [0-2] = th + // bits [3-4] = scope + [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, + IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree], + "", + [SDNPMemOperand] + >; + +def int_amdgcn_flat_load_monitor_b32 : AMDGPULoadMonitor<flat_ptr_ty>; +def int_amdgcn_flat_load_monitor_b64 : AMDGPULoadMonitor<flat_ptr_ty>; +def int_amdgcn_flat_load_monitor_b128 : AMDGPULoadMonitor<flat_ptr_ty>; +def int_amdgcn_global_load_monitor_b32 : AMDGPULoadMonitor<global_ptr_ty>; +def int_amdgcn_global_load_monitor_b64 : AMDGPULoadMonitor<global_ptr_ty>; +def int_amdgcn_global_load_monitor_b128 : AMDGPULoadMonitor<global_ptr_ty>; + /// Emit an addrspacecast without null pointer checking. /// Should only be inserted by a pass based on analysis of an addrspacecast's src. def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic< diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 108842f..c01e5d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -137,6 +137,9 @@ def gi_global_offset : def gi_global_saddr : GIComplexOperandMatcher<s64, "selectGlobalSAddr">, GIComplexPatternEquiv<GlobalSAddr>; +def gi_global_saddr_cpol : + GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">, + GIComplexPatternEquiv<GlobalSAddrCPol>; def gi_global_saddr_glc : GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 0ca2286..dfaa145 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2020,6 +2020,22 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, return true; } +bool 
AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index a6ce745..5636d89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -168,6 +168,9 @@ private: bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8ca9a97..266dee1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5774,6 +5774,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. 
+ auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 61d9de1..fe9743d0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -261,6 +261,8 @@ private: InstructionSelector::ComplexRendererFns selectGlobalSAddr(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectGlobalSAddrCPol(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 787db67..c5a1d9e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5180,6 +5180,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_ds_load_tr16_b128: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr6_b96: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 5ccf1e5..2cdc589 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -13,6 +13,7 @@ let WantsRoot = true in { def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; + def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>; def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>; def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>; } @@ -1274,6 +1275,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN (inst $vaddr, $offset) >; +class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) @@ -1324,6 +1330,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + +class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))), + (inst $saddr, $voffset, $offset, $cpol) +>; + class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)), @@ -1519,6 +1535,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp } } +multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, 
SDPatternOperator node, ValueType vt> { + def : FlatLoadSignedPat_CPOL<inst, node, vt> { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatSignedLoadPat_D16 <inst, node, vt> { let AddedComplexity = 10; @@ -2055,6 +2081,16 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>; } +let OtherPredicates = [isGFX125xOnly] in { + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>; + + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; +} // End OtherPredicates = [isGFX125xOnly] + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 74fe2b8..0eee7ad 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1477,6 +1477,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case
Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr8_b64: @@ -1603,10 +1609,16 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll new file mode 100644 index 0000000..017d402 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll @@ -0,0 +1,201 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1), i32) +declare <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1), i32) +declare <4 x i32> 
@llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1), i32) +declare i32 @llvm.amdgcn.flat.load.monitor.b32.i32(ptr, i32) +declare <2 x i32> @llvm.amdgcn.flat.load.monitor.b64.v2i32(ptr, i32) +declare <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr, i32) + +define amdgpu_ps void @global_load_monitor_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_monitor_b32_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_monitor_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) %gep, i32 1) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_monitor_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_monitor_b32_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_load_monitor_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) %gep, i32 10) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_monitor_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_monitor_b64_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_monitor_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> 
@llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 22) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_monitor_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_monitor_b64_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 27) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_monitor_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_monitor_b128_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_monitor_b128 v[4:7], v[0:1], off offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1) %gep, i32 0) + store <4 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_monitor_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_monitor_b128_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_load_monitor_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1) %gep, i32 1) 
+ store <4 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @flat_load_monitor_b32(ptr %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: flat_load_monitor_b32: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: flat_load_monitor_b32 v0, v[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4 + %val = call i32 @llvm.amdgcn.flat.load.monitor.b32.i32(ptr addrspace(0) %gep, i32 10) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @flat_load_monitor_b64(ptr %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: flat_load_monitor_b64: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: flat_load_monitor_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.flat.load.monitor.b64.v2i32(ptr addrspace(0) %gep, i32 22) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @flat_load_monitor_b128(ptr %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: flat_load_monitor_b128: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: flat_load_monitor_b128 v[4:7], v[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4 + %val = call <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr addrspace(0) %gep, i32 27) + store <4 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) { +; GFX1250-LABEL: 
global_load_monitor_b32_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_monitor_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom + %val = call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) %gep, i32 1) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) { +; GFX1250-LABEL: global_load_monitor_b64_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i64, ptr addrspace(1) %addr, i64 %idxprom + %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 1) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_monitor_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) { +; GFX1250-SDAG-LABEL: global_load_monitor_b64_saddr_no_scale_offset: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] +; GFX1250-SDAG-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_load_monitor_b64_saddr_no_scale_offset: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; 
GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom + %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 1) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} |