author    Petar Avramovic <Petar.Avramovic@amd.com>  2025-08-26 14:10:41 +0200
committer Petar Avramovic <Petar.Avramovic@amd.com>  2025-08-26 14:14:49 +0200
commit    acde25e13c32446d2ba67c6455e197b334807a72 (patch)
tree      11655366c2440a7cea8cfc7fd657e2e30a89e505
parent    0a46b289afc8922c2d832a6a3c01b9f8451fbd2a (diff)
AMDGPU/GlobalISel: Import D16 load patterns and add combines for them
Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them; this imports the D16 load patterns into GlobalISel's TableGen-generated instruction selector. For the newly imported patterns to be selectable, add combines in AMDGPURegBankCombiner that form the G_AMDGPU_LOAD_D16 instructions.
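
The combine recognizes a read-modify-write pattern on the 32-bit bitcast of a <2 x i16> value and folds the small load into a single D16 load. As a rough sketch (the generic MIR below is illustrative only; register names, the address space, and the inline constant form are hypothetical and not taken from the patch), the lo-half case looks like:

    %m:_(s32)  = G_CONSTANT i32 0xffff0000        ; CleanLo16: keep the high half
    %bc:_(s32) = G_BITCAST %vec:_(<2 x s16>)
    %hi:_(s32) = G_AND %bc, %m
    %ld:_(s32) = G_ZEXTLOAD %ptr:_(p1) :: (load (s16), addrspace 1)
    %or:_(s32) = G_OR %hi, %ld
    %res:_(<2 x s16>) = G_BITCAST %or             ; combine root

    ; combineD16Load/applyD16Load conceptually rewrite this to:
    %res:_(<2 x s16>) = G_AMDGPU_LOAD_D16_LO %ptr, %vec :: (load (s16), addrspace 1)

An 8-bit G_ZEXTLOAD maps to the _U8 variant and a 0xffff-masked 8-bit G_SEXTLOAD to the _I8 variant; the hi-half cases match the same shape with the loaded value shifted left by 16 and the complementary 0xffff mask.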
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td                    |   9
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td                      |   7
 llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp           |  86
 llvm/lib/Target/AMDGPU/SIInstructions.td                   |  15
 llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll    |  15
 llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll  |  15
 llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll |  13
 llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll            | 412
 llvm/test/CodeGen/AMDGPU/global-saddr-load.ll              | 246
 9 files changed, 622 insertions(+), 196 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b5dac95..e8b211f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -71,6 +71,12 @@ def int_minmax_to_med3 : GICombineRule<
[{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
(apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+let Predicates = [Predicate<"Subtarget->d16PreservesUnusedBits()">] in
+def d16_load : GICombineRule<
+ (defs root:$bitcast),
+ (combine (G_BITCAST $dst, $src):$bitcast,
+ [{ return combineD16Load(*${bitcast}); }])>;
+
def fp_minmax_to_med3 : GICombineRule<
(defs root:$min_or_max, med3_matchdata:$matchinfo),
(match (wip_match_opcode G_FMAXNUM,
@@ -219,5 +225,6 @@ def AMDGPURegBankCombiner : GICombiner<
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
- cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+ cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+ d16_load]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 394a143..a4ccf36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -309,6 +309,13 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO, SIload_d16_lo>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_U8, SIload_d16_lo_u8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_I8, SIload_d16_lo_i8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI, SIload_d16_hi>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_U8, SIload_d16_hi_u8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_I8, SIload_d16_hi_i8>;
+
def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
// so we don't mark it as equivalent.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index ee324a5..fd604e1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,10 @@ public:
void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
+ bool combineD16Load(MachineInstr &MI) const;
+ bool applyD16Load(unsigned D16Opc, MachineInstr &DstMI,
+ MachineInstr *SmallLoad, Register SrcReg32ToOverwriteD16) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -392,6 +396,88 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
MI.eraseFromParent();
}
+bool AMDGPURegBankCombinerImpl::combineD16Load(MachineInstr &MI) const {
+ Register Dst;
+ MachineInstr *Load, *SextLoad;
+ const int64_t CleanLo16 = 0xFFFFFFFFFFFF0000;
+ const int64_t CleanHi16 = 0x000000000000FFFF;
+
+ // Load lo
+ if (mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
+ m_Copy(m_SpecificICst(CleanLo16))),
+ m_MInstr(Load)))) {
+
+ if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
+ const MachineMemOperand *MMO = *Load->memoperands_begin();
+ unsigned LoadSize = MMO->getSizeInBits().getValue();
+ if (LoadSize == 8)
+ return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_LO_U8, MI, Load, Dst);
+ if (LoadSize == 16)
+ return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_LO, MI, Load, Dst);
+ return false;
+ }
+
+ if (mi_match(
+ Load, MRI,
+ m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
+ if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
+ return false;
+
+ const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
+ if (MMO->getSizeInBits().getValue() != 8)
+ return false;
+
+ return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_LO_I8, MI, SextLoad, Dst);
+ }
+
+ return false;
+ }
+
+ // Load hi
+ if (mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
+ m_Copy(m_SpecificICst(CleanHi16))),
+ m_GShl(m_MInstr(Load), m_Copy(m_SpecificICst(16)))))) {
+
+ if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
+ const MachineMemOperand *MMO = *Load->memoperands_begin();
+ unsigned LoadSize = MMO->getSizeInBits().getValue();
+ if (LoadSize == 8)
+ return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_HI_U8, MI, Load, Dst);
+ if (LoadSize == 16)
+ return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_HI, MI, Load, Dst);
+ return false;
+ }
+
+ if (mi_match(
+ Load, MRI,
+ m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
+ if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
+ return false;
+ const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
+ if (MMO->getSizeInBits().getValue() != 8)
+ return false;
+
+ return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_HI_I8, MI, SextLoad, Dst);
+ }
+
+ return false;
+ }
+
+ return false;
+}
+
+bool AMDGPURegBankCombinerImpl::applyD16Load(
+ unsigned D16Opc, MachineInstr &DstMI, MachineInstr *SmallLoad,
+ Register SrcReg32ToOverwriteD16) const {
+ B.buildInstr(D16Opc, {DstMI.getOperand(0).getReg()},
+ {SmallLoad->getOperand(1).getReg(), SrcReg32ToOverwriteD16})
+ .setMemRefs(SmallLoad->memoperands());
+ DstMI.eraseFromParent();
+ return true;
+}
+
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e8b4501..56cc324 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4251,6 +4251,21 @@ def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
+class D16LoadGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins ptype1:$addr, type0:$vdata_in);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+def G_AMDGPU_LOAD_D16_LO : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_LO_U8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_LO_I8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI_U8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI_I8 : D16LoadGenericInstruction;
+
+
class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs);
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
index 97694f3..d03bbde 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
@@ -186,11 +186,11 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_hi_vector_insert(ptr %ptr,
; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: flat_load_ushort v3, v[0:1] glc
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
@@ -260,10 +260,11 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_lo_vector_insert(ptr %ptr,
; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000
+; GFX9-NEXT: flat_load_ushort v3, v[0:1] glc
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
index 5d902d5..a8def6e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
@@ -519,11 +519,11 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_hi_vector_insert(ptr addr
; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: global_load_ushort v3, v[0:1], off glc
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
@@ -622,10 +622,11 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_lo_vector_insert(ptr addr
; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_lo_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000
+; GFX9-NEXT: global_load_ushort v3, v[0:1], off glc
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
index 31cdbbe..fc7eafb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
@@ -400,11 +400,10 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_hi_vector_insert(ptr addrs
; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_hi_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_read_u16 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT: ds_read_u16 v2, v0
+; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
@@ -478,10 +477,10 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_lo_vector_insert(ptr addrs
; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_lo_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_read_u16 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000
+; GFX9-NEXT: ds_read_u16 v2, v0
+; GFX9-NEXT: ds_read_u16_d16 v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
new file mode 100644
index 0000000..5d5b752
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s
+
+define amdgpu_ps void @load_P0_B16_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_P0_B16_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_b16 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(0) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P0_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_P0_B16_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(0) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_i8 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_hi_i8 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_u8 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_hi_u8 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P1_B16_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P1_B16_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_b16 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P1_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P1_B16_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_u8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_u8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P3_B16_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_P3_B16_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u16_d16 v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(3) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P3_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_P3_B16_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u16_d16_hi v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(3) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_i8_d16 v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_i8_d16_hi v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u8_d16 v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u8_d16_hi v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P4_B16_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P4_B16_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_b16 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P4_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P4_B16_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_u8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_u8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P5_B16_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: load_P5_B16_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_d16_b16 v0, v1, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(5) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+ store <2 x i16> %res, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P5_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: load_P5_B16_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_d16_hi_b16 v0, v1, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(5) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+ store <2 x i16> %res, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: sextload_P5_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_d16_i8 v0, v1, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(5) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: sextload_P5_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_d16_hi_i8 v0, v1, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(5) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: zextload_P5_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_d16_u8 v0, v1, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(5) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
+; GFX12-LABEL: zextload_P5_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: scratch_load_d16_hi_u8 v0, v1, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(5) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(5) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 1602e31..9af4eae 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4089,19 +4089,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16lo_reg_hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_b16 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
@@ -4125,19 +4118,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr ad
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4162,19 +4148,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrs
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
@@ -4199,19 +4178,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4237,21 +4209,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrs
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_i8 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
@@ -4276,21 +4239,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4492,21 +4446,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16hi_reg_hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
@@ -4530,21 +4475,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr ad
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4569,21 +4505,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrs
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
@@ -4608,21 +4535,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4648,22 +4566,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
@@ -4688,22 +4596,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128