Diffstat (limited to 'llvm/lib/Target')
21 files changed, 383 insertions, 292 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 076a623..639ddcb 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -69,7 +69,6 @@ def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>; def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, - fconstant_to_constant, icmp_redundant_trunc, fold_global_offset, shuffle_to_extract, @@ -341,7 +340,7 @@ def AArch64PostLegalizerLowering : GICombiner<"AArch64PostLegalizerLoweringImpl", [shuffle_vector_lowering, vashr_vlshr_imm, icmp_lowering, build_vector_lowering, - lower_vector_fcmp, form_truncstore, + lower_vector_fcmp, form_truncstore, fconstant_to_constant, vector_sext_inreg_to_shift, unmerge_ext_to_unmerge, lower_mulv2s64, vector_unmerge_lowering, insertelt_nonconst, diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 0f4bbfc3..1e607f4 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -92,9 +92,18 @@ private: bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - MachineBasicBlock * - expandCommitOrRestoreZASave(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI); + struct ConditionalBlocks { + MachineBasicBlock &CondBB; + MachineBasicBlock &EndBB; + }; + ConditionalBlocks expandConditionalPseudo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + MachineInstrBuilder &Branch); + MachineBasicBlock *expandRestoreZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + MachineBasicBlock *expandCommitZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); MachineBasicBlock *expandCondSMToggle(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; @@ -991,72 +1000,97 @@ bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext( return true; } -static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111; - -MachineBasicBlock *AArch64ExpandPseudo::expandCommitOrRestoreZASave( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { - MachineInstr &MI = *MBBI; - bool IsRestoreZA = MI.getOpcode() == AArch64::RestoreZAPseudo; - assert((MI.getOpcode() == AArch64::RestoreZAPseudo || - MI.getOpcode() == AArch64::CommitZASavePseudo) && - "Expected ZA commit or restore"); +AArch64ExpandPseudo::ConditionalBlocks +AArch64ExpandPseudo::expandConditionalPseudo(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + MachineInstrBuilder &Branch) { assert((std::next(MBBI) != MBB.end() || - MI.getParent()->successors().begin() != - MI.getParent()->successors().end()) && - "Unexpected unreachable in block that restores ZA"); - - // Compare TPIDR2_EL0 value against 0. - DebugLoc DL = MI.getDebugLoc(); - MachineInstrBuilder Branch = - BuildMI(MBB, MBBI, DL, - TII->get(IsRestoreZA ? AArch64::CBZX : AArch64::CBNZX)) - .add(MI.getOperand(0)); + MBB.successors().begin() != MBB.successors().end()) && + "Unexpected unreachable in block"); // Split MBB and create two new blocks: - // - MBB now contains all instructions before RestoreZAPseudo. - // - SMBB contains the [Commit|RestoreZA]Pseudo instruction only. - // - EndBB contains all instructions after [Commit|RestoreZA]Pseudo. + // - MBB now contains all instructions before the conditional pseudo. 
+ // - CondBB contains the conditional pseudo instruction only. + // - EndBB contains all instructions after the conditional pseudo. MachineInstr &PrevMI = *std::prev(MBBI); - MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); - MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end() - ? *SMBB->successors().begin() - : SMBB->splitAt(MI, /*UpdateLiveIns*/ true); - - // Add the SMBB label to the CB[N]Z instruction & create a branch to EndBB. - Branch.addMBB(SMBB); + MachineBasicBlock *CondBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); + MachineBasicBlock *EndBB = + std::next(MBBI) == CondBB->end() + ? *CondBB->successors().begin() + : CondBB->splitAt(*MBBI, /*UpdateLiveIns*/ true); + + // Add the SMBB label to the branch instruction & create a branch to EndBB. + Branch.addMBB(CondBB); BuildMI(&MBB, DL, TII->get(AArch64::B)) .addMBB(EndBB); MBB.addSuccessor(EndBB); + // Create branch from CondBB to EndBB. Users of this helper should insert new + // instructions at CondBB.back() -- i.e. before the branch. + BuildMI(CondBB, DL, TII->get(AArch64::B)).addMBB(EndBB); + return {*CondBB, *EndBB}; +} + +MachineBasicBlock * +AArch64ExpandPseudo::expandRestoreZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + // Compare TPIDR2_EL0 against 0. Restore ZA if TPIDR2_EL0 is zero. + MachineInstrBuilder Branch = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBZX)).add(MI.getOperand(0)); + + auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Branch); // Replace the pseudo with a call (BL). MachineInstrBuilder MIB = - BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::BL)); + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::BL)); // Copy operands (mainly the regmask) from the pseudo. for (unsigned I = 2; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); + // Mark the TPIDR2 block pointer (X0) as an implicit use. + MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); - if (IsRestoreZA) { - // Mark the TPIDR2 block pointer (X0) as an implicit use. - MIB.addReg(MI.getOperand(1).getReg(), RegState::Implicit); - } else /*CommitZA*/ { + MI.eraseFromParent(); + return &EndBB; +} + +static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111; + +MachineBasicBlock * +AArch64ExpandPseudo::expandCommitZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + // Compare TPIDR2_EL0 against 0. Commit ZA if TPIDR2_EL0 is non-zero. + MachineInstrBuilder Branch = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBNZX)).add(MI.getOperand(0)); + + auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Branch); + // Replace the pseudo with a call (BL). + MachineInstrBuilder MIB = + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::BL)); + // Copy operands (mainly the regmask) from the pseudo. + for (unsigned I = 2; I < MI.getNumOperands(); ++I) + MIB.add(MI.getOperand(I)); + // Clear TPIDR2_EL0. + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::MSR)) + .addImm(AArch64SysReg::TPIDR2_EL0) + .addReg(AArch64::XZR); + bool ZeroZA = MI.getOperand(1).getImm() != 0; + if (ZeroZA) { [[maybe_unused]] auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); - // Clear TPIDR2_EL0. 
- BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::MSR)) - .addImm(AArch64SysReg::TPIDR2_EL0) - .addReg(AArch64::XZR); - bool ZeroZA = MI.getOperand(1).getImm() != 0; - if (ZeroZA) { - assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); - BuildMI(*SMBB, SMBB->end(), DL, TII->get(AArch64::ZERO_M)) - .addImm(ZERO_ALL_ZA_MASK) - .addDef(AArch64::ZAB0, RegState::ImplicitDefine); - } + assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::ZERO_M)) + .addImm(ZERO_ALL_ZA_MASK) + .addDef(AArch64::ZAB0, RegState::ImplicitDefine); } - BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MI.eraseFromParent(); - return EndBB; + return &EndBB; } MachineBasicBlock * @@ -1130,24 +1164,9 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB, MachineInstrBuilder Tbx = BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(SMReg32).addImm(0); - // Split MBB and create two new blocks: - // - MBB now contains all instructions before MSRcond_pstatesvcrImm1. - // - SMBB contains the MSRcond_pstatesvcrImm1 instruction only. - // - EndBB contains all instructions after MSRcond_pstatesvcrImm1. - MachineInstr &PrevMI = *std::prev(MBBI); - MachineBasicBlock *SMBB = MBB.splitAt(PrevMI, /*UpdateLiveIns*/ true); - MachineBasicBlock *EndBB = std::next(MI.getIterator()) == SMBB->end() - ? *SMBB->successors().begin() - : SMBB->splitAt(MI, /*UpdateLiveIns*/ true); - - // Add the SMBB label to the TB[N]Z instruction & create a branch to EndBB. - Tbx.addMBB(SMBB); - BuildMI(&MBB, DL, TII->get(AArch64::B)) - .addMBB(EndBB); - MBB.addSuccessor(EndBB); - + auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Tbx); // Create the SMSTART/SMSTOP (MSRpstatesvcrImm1) instruction in SMBB. - MachineInstrBuilder MIB = BuildMI(*SMBB, SMBB->begin(), MI.getDebugLoc(), + MachineInstrBuilder MIB = BuildMI(CondBB, CondBB.back(), MI.getDebugLoc(), TII->get(AArch64::MSRpstatesvcrImm1)); // Copy all but the second and third operands of MSRcond_pstatesvcrImm1 (as // these contain the CopyFromReg for the first argument and the flag to @@ -1157,10 +1176,8 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB, for (unsigned i = 4; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); - BuildMI(SMBB, DL, TII->get(AArch64::B)).addMBB(EndBB); - MI.eraseFromParent(); - return EndBB; + return &EndBB; } bool AArch64ExpandPseudo::expandMultiVecPseudo( @@ -1674,15 +1691,21 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandCALL_BTI(MBB, MBBI); case AArch64::StoreSwiftAsyncContext: return expandStoreSwiftAsyncContext(MBB, MBBI); + case AArch64::RestoreZAPseudo: case AArch64::CommitZASavePseudo: - case AArch64::RestoreZAPseudo: { - auto *NewMBB = expandCommitOrRestoreZASave(MBB, MBBI); - if (NewMBB != &MBB) - NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. - return true; - } case AArch64::MSRpstatePseudo: { - auto *NewMBB = expandCondSMToggle(MBB, MBBI); + auto *NewMBB = [&] { + switch (Opcode) { + case AArch64::RestoreZAPseudo: + return expandRestoreZASave(MBB, MBBI); + case AArch64::CommitZASavePseudo: + return expandCommitZASave(MBB, MBBI); + case AArch64::MSRpstatePseudo: + return expandCondSMToggle(MBB, MBBI); + default: + llvm_unreachable("Unexpected conditional pseudo!"); + } + }(); if (NewMBB != &MBB) NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. 
return true; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index c197550e..9e2d698 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -678,8 +678,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0) .clampScalar(0, s8, s64); getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({s32, s64, s128}) - .legalFor(HasFP16, {s16}) + // Always legalize s16 to prevent G_FCONSTANT being widened to G_CONSTANT + .legalFor({s16, s32, s64, s128}) .clampScalar(0, MinFPScalar, s128); // FIXME: fix moreElementsToNextPow2 diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 63313da..23dcaea 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -75,6 +75,31 @@ struct ShuffleVectorPseudo { ShuffleVectorPseudo() = default; }; +/// Return true if a G_FCONSTANT instruction is known to be better-represented +/// as a G_CONSTANT. +bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + Register DstReg = MI.getOperand(0).getReg(); + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + if (DstSize != 16 && DstSize != 32 && DstSize != 64) + return false; + + // When we're storing a value, it doesn't matter what register bank it's on. + // Since not all floating point constants can be materialized using a fmov, + // it makes more sense to just use a GPR. + return all_of(MRI.use_nodbg_instructions(DstReg), + [](const MachineInstr &Use) { return Use.mayStore(); }); +} + +/// Change a G_FCONSTANT into a G_CONSTANT. +void applyFConstantToConstant(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); + MachineIRBuilder MIB(MI); + const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); + MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); + MI.eraseFromParent(); +} + /// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector /// sources of the shuffle are different. std::optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 8c10673..896eab5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -44,31 +44,6 @@ namespace { #include "AArch64GenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_TYPES -/// Return true if a G_FCONSTANT instruction is known to be better-represented -/// as a G_CONSTANT. -bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) { - assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); - Register DstReg = MI.getOperand(0).getReg(); - const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - if (DstSize != 32 && DstSize != 64) - return false; - - // When we're storing a value, it doesn't matter what register bank it's on. - // Since not all floating point constants can be materialized using a fmov, - // it makes more sense to just use a GPR. 
- return all_of(MRI.use_nodbg_instructions(DstReg), - [](const MachineInstr &Use) { return Use.mayStore(); }); -} - -/// Change a G_FCONSTANT into a G_CONSTANT. -void applyFConstantToConstant(MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); - MachineIRBuilder MIB(MI); - const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); - MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); - MI.eraseFromParent(); -} - /// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits /// are sign bits. In this case, we can transform the G_ICMP to directly compare /// the wide value with a zero. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 723d07e..c7a91f4c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -929,7 +929,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { ThinOrFullLTOPhase Phase) { if (Level != OptimizationLevel::O0) { if (!isLTOPreLink(Phase)) { - if (getTargetTriple().isAMDGCN()) { + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { AMDGPUAttributorOptions Opts; MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); } @@ -966,7 +966,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(InternalizePass(mustPreserveGV)); PM.addPass(GlobalDCEPass()); } - if (EnableAMDGPUAttributor) { + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { AMDGPUAttributorOptions Opt; if (HasClosedWorldAssumption) Opt.IsClosedWorld = true; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a8140c3..99ba043 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2105,6 +2105,10 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { // Only plain immediates are inlinable (e.g. "clamp" attribute is not) return false; } + + if (getModifiers().Lit != LitModifier::None) + return false; + // TODO: We should avoid using host float here. It would be better to // check the float bit values which is what a few other places do. // We've had bot failures before due to weird NaN support on mips hosts. 
@@ -2339,6 +2343,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo bool CanUse64BitLiterals = AsmParser->has64BitLiterals() && !(InstDesc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); + LitModifier Lit = getModifiers().Lit; MCContext &Ctx = AsmParser->getContext(); if (Imm.IsFPImm) { // We got fp literal token @@ -2348,7 +2353,8 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), + if (Lit == LitModifier::None && + AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); return; @@ -2372,14 +2378,20 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo if ((OpTy == AMDGPU::OPERAND_REG_IMM_FP64 || OpTy == AMDGPU::OPERAND_REG_INLINE_C_FP64 || - OpTy == AMDGPU::OPERAND_REG_INLINE_AC_FP64) && - CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); + OpTy == AMDGPU::OPERAND_REG_INLINE_AC_FP64)) { + if (CanUse64BitLiterals && Lit == LitModifier::None && + (isInt<32>(Val) || isUInt<32>(Val))) { + // The floating-point operand will be verbalized as an + // integer one. If that integer happens to fit 32 bits, on + // re-assembling it will be intepreted as the high half of + // the actual value, so we have to wrap it into lit64(). + Lit = LitModifier::Lit64; + } else if (Lit == LitModifier::Lit) { + // For FP64 operands lit() specifies the high half of the value. + Val = Hi_32(Val); + } } - return; + break; } // We don't allow fp literals in 64-bit integer instructions. It is @@ -2388,19 +2400,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo llvm_unreachable("fp literal in 64-bit integer instruction."); case AMDGPU::OPERAND_KIMM64: - if (CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + if (CanUse64BitLiterals && Lit == LitModifier::None && + (isInt<32>(Val) || isUInt<32>(Val))) + Lit = LitModifier::Lit64; + break; case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2BF16: - if (AsmParser->hasInv2PiInlineImm() && Literal == 0x3fc45f306725feed) { + if (Lit == LitModifier::None && AsmParser->hasInv2PiInlineImm() && + Literal == 0x3fc45f306725feed) { // This is the 1/(2*pi) which is going to be truncated to bf16 with the // loss of precision. The constant represents ideomatic fp32 value of // 1/(2*pi) = 0.15915494 since bf16 is in fact fp32 with cleared low 16 @@ -2438,14 +2448,19 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // We allow precision lost but not overflow or underflow. 
This should be // checked earlier in isLiteralImm() - uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue(); - Inst.addOperand(MCOperand::createImm(ImmVal)); - return; + Val = FPLiteral.bitcastToAPInt().getZExtValue(); + break; } default: llvm_unreachable("invalid operand size"); } + if (Lit != LitModifier::None) { + Inst.addOperand( + MCOperand::createExpr(AMDGPUMCExpr::createLit(Lit, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } return; } @@ -2465,12 +2480,12 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: - Inst.addOperand(MCOperand::createImm(Val)); - return; + break; case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: - if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { + if (Lit == LitModifier::None && + AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; } @@ -2479,22 +2494,15 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // truncated to uint32_t), if the target doesn't support 64-bit literals, or // the lit modifier is explicitly used, we need to truncate it to the 32 // LSBs. - if (!AsmParser->has64BitLiterals() || - getModifiers().Lit == LitModifier::Lit) + if (!AsmParser->has64BitLiterals() || Lit == LitModifier::Lit) Val = Lo_32(Val); - - if (CanUse64BitLiterals && (!isInt<32>(Val) || !isUInt<32>(Val))) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + break; case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { + if (Lit == LitModifier::None && + AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; } @@ -2509,19 +2517,15 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // 1) explicitly forced by using lit modifier; // 2) the value is a valid 32-bit representation (signed or unsigned), // meanwhile not forced by lit64 modifier. - if (getModifiers().Lit == LitModifier::Lit || - (getModifiers().Lit != LitModifier::Lit64 && - (isInt<32>(Val) || isUInt<32>(Val)))) + if (Lit == LitModifier::Lit || + (Lit != LitModifier::Lit64 && (isInt<32>(Val) || isUInt<32>(Val)))) Val = static_cast<uint64_t>(Val) << 32; } - if (CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + // For FP64 operands lit() specifies the high half of the value. 
+ if (Lit == LitModifier::Lit) + Val = Hi_32(Val); + break; case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: @@ -2534,25 +2538,23 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: - Inst.addOperand(MCOperand::createImm(Val)); - return; + break; case AMDGPU::OPERAND_KIMM64: - if ((isInt<32>(Val) || isUInt<32>(Val)) && - getModifiers().Lit != LitModifier::Lit64) + if ((isInt<32>(Val) || isUInt<32>(Val)) && Lit != LitModifier::Lit64) Val <<= 32; - - if (CanUse64BitLiterals && Lo_32(Val) != 0) { - Inst.addOperand(MCOperand::createExpr( - AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); - } else { - Inst.addOperand(MCOperand::createImm(Val)); - } - return; + break; default: llvm_unreachable("invalid operand type"); } + + if (Lit != LitModifier::None) { + Inst.addOperand( + MCOperand::createExpr(AMDGPUMCExpr::createLit(Lit, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } } void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { @@ -4821,12 +4823,15 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst, const MCOperand &MO = Inst.getOperand(OpIdx); // Exclude special imm operands (like that used by s_set_gpr_idx_on) if (AMDGPU::isSISrcOperand(Desc, OpIdx)) { + bool IsLit = false; std::optional<int64_t> Imm; if (MO.isImm()) { Imm = MO.getImm(); } else if (MO.isExpr()) { - if (isLitExpr(MO.getExpr())) + if (isLitExpr(MO.getExpr())) { + IsLit = true; Imm = getLitValue(MO.getExpr()); + } } else { continue; } @@ -4836,7 +4841,7 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst, } else if (!isInlineConstant(Inst, OpIdx)) { auto OpType = static_cast<AMDGPU::OperandType>( Desc.operands()[OpIdx].OperandType); - int64_t Value = encode32BitLiteral(*Imm, OpType); + int64_t Value = encode32BitLiteral(*Imm, OpType, IsLit); if (NumLiterals == 0 || LiteralValue != Value) { LiteralValue = Value; ++NumLiterals; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index f11b373..be62395 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1551,7 +1551,7 @@ AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const { HasLiteral = true; Literal = Literal64 = Val; - bool UseLit64 = Lo_32(Literal64) != 0; + bool UseLit64 = Hi_32(Literal64) == 0; return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( LitModifier::Lit64, Literal64, getContext())) : MCOperand::createImm(Literal64); @@ -1584,11 +1584,11 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, if (CanUse64BitLiterals) { if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) - UseLit64 = !isInt<32>(Val) || !isUInt<32>(Val); + UseLit64 = false; else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64) - UseLit64 = Lo_32(Val) != 0; + UseLit64 = Hi_32(Literal64) == 0; } return UseLit64 ? 
MCOperand::createExpr(AMDGPUMCExpr::createLit( @@ -1614,12 +1614,12 @@ AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const { const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()]; if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) { - UseLit64 = !isInt<32>(Literal64) || !isUInt<32>(Literal64); + UseLit64 = false; } else { assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64); - UseLit64 = Lo_32(Literal64) != 0; + UseLit64 = Hi_32(Literal64) == 0; } return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index e82f998..703ec0a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -73,7 +73,13 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isExpr()) { + MAI.printExpr(O, *Op.getExpr()); + return; + } + + O << formatHex(Op.getImm() & 0xffffffff); } void AMDGPUInstPrinter::printFP64ImmOperand(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index f2879116..ea758bb 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -270,10 +270,19 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( const MCInstrDesc &Desc, const MCOperand &MO, unsigned OpNo, const MCSubtargetInfo &STI, bool HasMandatoryLiteral) const { const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int64_t Imm; + int64_t Imm = 0; if (MO.isExpr()) { - if (!MO.getExpr()->evaluateAsAbsolute(Imm)) - return AMDGPU::getOperandSize(OpInfo) == 8 ? 254 : 255; + if (!MO.getExpr()->evaluateAsAbsolute(Imm) || + AMDGPU::isLitExpr(MO.getExpr())) { + if (OpInfo.OperandType == AMDGPU::OPERAND_KIMM16 || + OpInfo.OperandType == AMDGPU::OPERAND_KIMM32 || + OpInfo.OperandType == AMDGPU::OPERAND_KIMM64) + return Imm; + if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && + AMDGPU::getOperandSize(OpInfo) == 8) + return 254; + return 255; + } } else { assert(!MO.isDFPImm()); @@ -452,13 +461,16 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, // Yes! Encode it int64_t Imm = 0; + bool IsLit = false; if (Op.isImm()) Imm = Op.getImm(); else if (Op.isExpr()) { - if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr())) + if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr())) { Imm = C->getValue(); - else if (AMDGPU::isLitExpr(Op.getExpr())) + } else if (AMDGPU::isLitExpr(Op.getExpr())) { + IsLit = true; Imm = AMDGPU::getLitValue(Op.getExpr()); + } } else // Exprs will be replaced with a fixup value. 
llvm_unreachable("Must be immediate or expr"); @@ -468,7 +480,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, } else { auto OpType = static_cast<AMDGPU::OperandType>(Desc.operands()[i].OperandType); - Imm = AMDGPU::encode32BitLiteral(Imm, OpType); + Imm = AMDGPU::encode32BitLiteral(Imm, OpType, IsLit); support::endian::write<uint32_t>(CB, Imm, llvm::endianness::little); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 76023d2..3e1b058 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3145,7 +3145,7 @@ bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { return isUInt<32>(Val) || isInt<32>(Val); } -int64_t encode32BitLiteral(int64_t Imm, OperandType Type) { +int64_t encode32BitLiteral(int64_t Imm, OperandType Type, bool IsLit) { switch (Type) { default: break; @@ -3168,7 +3168,7 @@ int64_t encode32BitLiteral(int64_t Imm, OperandType Type) { case OPERAND_REG_INLINE_C_INT32: return Lo_32(Imm); case OPERAND_REG_IMM_FP64: - return Hi_32(Imm); + return IsLit ? Imm : Hi_32(Imm); } return Imm; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 49b4d02..a01a5fd 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1727,7 +1727,7 @@ LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); LLVM_READNONE -int64_t encode32BitLiteral(int64_t Imm, OperandType Type); +int64_t encode32BitLiteral(int64_t Imm, OperandType Type, bool IsLit); bool isArgPassedInSGPR(const Argument *Arg); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 186fdd1..53633ea 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -675,6 +675,45 @@ static void getOperandsForBranch(Register CondReg, RISCVCC::CondCode &CC, CC = getRISCVCCFromICmp(Pred); } +/// Select the RISC-V Zalasr opcode for the G_LOAD or G_STORE operation +/// \p GenericOpc, appropriate for the GPR register bank and of memory access +/// size \p OpSize. +static unsigned selectZalasrLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { + const bool IsStore = GenericOpc == TargetOpcode::G_STORE; + switch (OpSize) { + default: + llvm_unreachable("Unexpected memory size"); + case 8: + return IsStore ? RISCV::SB_RL : RISCV::LB_AQ; + case 16: + return IsStore ? RISCV::SH_RL : RISCV::LH_AQ; + case 32: + return IsStore ? RISCV::SW_RL : RISCV::LW_AQ; + case 64: + return IsStore ? RISCV::SD_RL : RISCV::LD_AQ; + } +} + +/// Select the RISC-V regimm opcode for the G_LOAD or G_STORE operation +/// \p GenericOpc, appropriate for the GPR register bank and of memory access +/// size \p OpSize. \returns \p GenericOpc if the combination is unsupported. +static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { + const bool IsStore = GenericOpc == TargetOpcode::G_STORE; + switch (OpSize) { + case 8: + // Prefer unsigned due to no c.lb in Zcb. + return IsStore ? RISCV::SB : RISCV::LBU; + case 16: + return IsStore ? RISCV::SH : RISCV::LH; + case 32: + return IsStore ? RISCV::SW : RISCV::LW; + case 64: + return IsStore ? 
RISCV::SD : RISCV::LD; + } + + return GenericOpc; +} + bool RISCVInstructionSelector::select(MachineInstr &MI) { MachineIRBuilder MIB(MI); @@ -892,6 +931,59 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { return selectImplicitDef(MI, MIB); case TargetOpcode::G_UNMERGE_VALUES: return selectUnmergeValues(MI, MIB); + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: { + GLoadStore &LdSt = cast<GLoadStore>(MI); + const Register ValReg = LdSt.getReg(0); + const Register PtrReg = LdSt.getPointerReg(); + LLT PtrTy = MRI->getType(PtrReg); + + const RegisterBank &RB = *RBI.getRegBank(ValReg, *MRI, TRI); + if (RB.getID() != RISCV::GPRBRegBankID) + return false; + +#ifndef NDEBUG + const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, *MRI, TRI); + // Check that the pointer register is valid. + assert(PtrRB.getID() == RISCV::GPRBRegBankID && + "Load/Store pointer operand isn't a GPR"); + assert(PtrTy.isPointer() && "Load/Store pointer operand isn't a pointer"); +#endif + + // Can only handle AddressSpace 0. + if (PtrTy.getAddressSpace() != 0) + return false; + + unsigned MemSize = LdSt.getMemSizeInBits().getValue(); + AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); + + if (isStrongerThanMonotonic(Order)) { + MI.setDesc(TII.get(selectZalasrLoadStoreOp(Opc, MemSize))); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + } + + const unsigned NewOpc = selectRegImmLoadStoreOp(MI.getOpcode(), MemSize); + if (NewOpc == MI.getOpcode()) + return false; + + // Check if we can fold anything into the addressing mode. + auto AddrModeFns = selectAddrRegImm(MI.getOperand(1)); + if (!AddrModeFns) + return false; + + // Folded something. Create a new instruction and return it. + auto NewInst = MIB.buildInstr(NewOpc, {}, {}, MI.getFlags()); + if (isa<GStore>(MI)) + NewInst.addUse(ValReg); + else + NewInst.addDef(ValReg); + NewInst.cloneMemRefs(MI); + for (auto &Fn : *AddrModeFns) + Fn(NewInst); + MI.eraseFromParent(); + + return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI); + } default: return false; } diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 1c7cbb9..5dd4bf4 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -287,8 +287,8 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, break; } BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -375,8 +375,8 @@ static void doMaskedAtomicBinOpExpansion(const RISCVInstrInfo *TII, ScratchReg); BuildMI(LoopMBB, DL, TII->get(getSCForRMW32(Ordering, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -535,8 +535,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( // sc.w scratch1, scratch1, (addr) // bnez scratch1, loop BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering, STI)), Scratch1Reg) - .addReg(AddrReg) - .addReg(Scratch1Reg); + .addReg(Scratch1Reg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(Scratch1Reg) .addReg(RISCV::X0) @@ -674,8 +674,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg( // bnez scratch, loophead BuildMI(LoopTailMBB, DL, 
TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(NewValReg); + .addReg(NewValReg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) @@ -707,8 +707,8 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg( MaskReg, ScratchReg); BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) - .addReg(AddrReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(AddrReg); BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) .addReg(ScratchReg) .addReg(RISCV::X0) diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 7dd3385..eba35ef 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -100,65 +100,11 @@ def : LdPat<load, LD, PtrVT>; def : StPat<store, SD, GPR, PtrVT>; } -// Load and store patterns for i16, needed because Zfh makes s16 load/store -// legal and regbank select may not constrain registers to FP. -def : LdPat<load, LH, i16>; -def : StPat<store, SH, GPR, i16>; - -def : LdPat<extloadi8, LBU, i16>; // Prefer unsigned due to no c.lb in Zcb. -def : StPat<truncstorei8, SB, GPR, i16>; - -let Predicates = [HasAtomicLdSt] in { - // Prefer unsigned due to no c.lb in Zcb. - def : LdPat<relaxed_load<atomic_load_aext_8>, LBU, i16>; - def : LdPat<relaxed_load<atomic_load_nonext_16>, LH, i16>; - - def : StPat<relaxed_store<atomic_store_8>, SB, GPR, i16>; - def : StPat<relaxed_store<atomic_store_16>, SH, GPR, i16>; -} - -let Predicates = [HasAtomicLdSt, IsRV64] in { - // Load pattern is in RISCVInstrInfoA.td and shared with RV32. - def : StPat<relaxed_store<atomic_store_32>, SW, GPR, i32>; -} - //===----------------------------------------------------------------------===// // RV64 i32 patterns not used by SelectionDAG //===----------------------------------------------------------------------===// let Predicates = [IsRV64] in { -def : LdPat<extloadi8, LBU, i32>; // Prefer unsigned due to no c.lb in Zcb. -def : LdPat<extloadi16, LH, i32>; - -def : StPat<truncstorei8, SB, GPR, i32>; -def : StPat<truncstorei16, SH, GPR, i32>; - def : Pat<(sext_inreg (i64 (add GPR:$rs1, simm12_lo:$imm)), i32), (ADDIW GPR:$rs1, simm12_lo:$imm)>; } - -//===----------------------------------------------------------------------===// -// Zalasr patterns not used by SelectionDAG -//===----------------------------------------------------------------------===// - -let Predicates = [HasStdExtZalasr] in { - // the sequentially consistent loads use - // .aq instead of .aqrl to match the psABI/A.7 - def : PatLAQ<acquiring_load<atomic_load_aext_8>, LB_AQ, i16>; - def : PatLAQ<seq_cst_load<atomic_load_aext_8>, LB_AQ, i16>; - - def : PatLAQ<acquiring_load<atomic_load_nonext_16>, LH_AQ, i16>; - def : PatLAQ<seq_cst_load<atomic_load_nonext_16>, LH_AQ, i16>; - - def : PatSRL<releasing_store<atomic_store_8>, SB_RL, i16>; - def : PatSRL<seq_cst_store<atomic_store_8>, SB_RL, i16>; - - def : PatSRL<releasing_store<atomic_store_16>, SH_RL, i16>; - def : PatSRL<seq_cst_store<atomic_store_16>, SH_RL, i16>; -} - -let Predicates = [HasStdExtZalasr, IsRV64] in { - // Load pattern is in RISCVInstrInfoZalasr.td and shared with RV32. 
- def : PatSRL<releasing_store<atomic_store_32>, SW_RL, i32>; - def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL, i32>; -} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 9855c47..7a14929 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1980,7 +1980,7 @@ def : LdPat<sextloadi8, LB>; def : LdPat<extloadi8, LBU>; // Prefer unsigned due to no c.lb in Zcb. def : LdPat<sextloadi16, LH>; def : LdPat<extloadi16, LH>; -def : LdPat<load, LW, i32>; +def : LdPat<load, LW, i32>, Requires<[IsRV32]>; def : LdPat<zextloadi8, LBU>; def : LdPat<zextloadi16, LHU>; @@ -1994,7 +1994,7 @@ class StPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy, def : StPat<truncstorei8, SB, GPR, XLenVT>; def : StPat<truncstorei16, SH, GPR, XLenVT>; -def : StPat<store, SW, GPR, i32>; +def : StPat<store, SW, GPR, i32>, Requires<[IsRV32]>; /// Fences diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 2e4326f..571d72f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -33,7 +33,7 @@ multiclass LR_r_aq_rl<bits<3> funct3, string opcodestr> { let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in class SC_r<bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<0b00011, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2), + (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass SC_r_aq_rl<bits<3> funct3, string opcodestr> { @@ -46,7 +46,7 @@ multiclass SC_r_aq_rl<bits<3> funct3, string opcodestr> { let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in class AMO_rr<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPRMemZeroOffset:$rs1, GPR:$rs2), + (outs GPR:$rd), (ins GPR:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass AMO_rr_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr> { @@ -174,8 +174,9 @@ let Predicates = [HasAtomicLdSt] in { def : StPat<relaxed_store<atomic_store_8>, SB, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_16>, SH, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_32>, SW, GPR, XLenVT>; +} - // Used by GISel for RV32 and RV64. 
+let Predicates = [HasAtomicLdSt, IsRV32] in { def : LdPat<relaxed_load<atomic_load_nonext_32>, LW, i32>; } @@ -188,31 +189,34 @@ let Predicates = [HasAtomicLdSt, IsRV64] in { /// AMOs +class PatAMO<SDPatternOperator OpNode, RVInst Inst, ValueType vt = XLenVT> + : Pat<(vt (OpNode (XLenVT GPR:$rs1), (vt GPR:$rs2))), (Inst GPR:$rs2, GPR:$rs1)>; + multiclass AMOPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT, list<Predicate> ExtraPreds = []> { let Predicates = !listconcat([HasStdExtA, NoStdExtZtso], ExtraPreds) in { - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"), - !cast<RVInst>(BaseInst#"_AQ"), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"), - !cast<RVInst>(BaseInst#"_RL"), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"), - !cast<RVInst>(BaseInst#"_AQRL"), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"), - !cast<RVInst>(BaseInst#"_AQRL"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"), + !cast<RVInst>(BaseInst#"_AQ"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"), + !cast<RVInst>(BaseInst#"_RL"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"), + !cast<RVInst>(BaseInst#"_AQRL"), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"), + !cast<RVInst>(BaseInst#"_AQRL"), vt>; } let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in { - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_release"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acq_rel"), - !cast<RVInst>(BaseInst), vt>; - def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"), - !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_monotonic"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acquire"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_release"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_acq_rel"), + !cast<RVInst>(BaseInst), vt>; + def : PatAMO<!cast<PatFrag>(AtomicOp#"_seq_cst"), + !cast<RVInst>(BaseInst), vt>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td index c691aa6..20e2142 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td @@ -44,7 +44,7 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 1, Constraints = "$rd = $rd_wb" class AMO_cas<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr, DAGOperand RC> : RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO, - (outs RC:$rd_wb), (ins RC:$rd, GPRMemZeroOffset:$rs1, RC:$rs2), + (outs RC:$rd_wb), (ins RC:$rd, RC:$rs2, GPRMemZeroOffset:$rs1), opcodestr, "$rd, $rs2, $rs1">; multiclass AMO_cas_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr, @@ -71,48 +71,48 @@ defm AMOCAS_Q : AMO_cas_aq_rl<0b00101, 0b100, "amocas.q", GPRPairRV64>; multiclass AMOCASPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT, list<Predicate> ExtraPreds = []> { let Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds) in { - def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr), + def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - 
(!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr), + (!cast<RVInst>(BaseInst#"_AQ") GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr), + (!cast<RVInst>(BaseInst#"_RL") GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$addr, GPR:$new)>; + (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>; def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$addr, GPR:$new)>; + (!cast<RVInst>(BaseInst#"_AQRL") GPR:$cmp, GPR:$new, GPR:$addr)>; } // Predicates = !listconcat([HasStdExtZacas, NoStdExtZtso], ExtraPreds) let Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds) in { - def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (vt GPR:$addr), + def : Pat<(!cast<PatFrag>(AtomicOp#"_monotonic") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acquire") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_release") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_acq_rel") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; - def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (vt GPR:$addr), + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; + def : Pat<(!cast<PatFrag>(AtomicOp#"_seq_cst") (XLenVT GPR:$addr), (vt GPR:$cmp), (vt GPR:$new)), - (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$addr, GPR:$new)>; + (!cast<RVInst>(BaseInst) GPR:$cmp, GPR:$new, GPR:$addr)>; } // Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds) } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td index f7ceb0d..5f944034 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td @@ -94,11 +94,12 @@ let Predicates = [HasStdExtZalasr] in { def : PatSRL<releasing_store<atomic_store_32>, SW_RL>; def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL>; +} - // Used by GISel for RV32 and RV64. 
+let Predicates = [HasStdExtZalasr, IsRV32] in {
   def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ, i32>;
   def : PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ, i32>;
-} // Predicates = [HasStdExtZalasr]
+} // Predicates = [HasStdExtZalasr, IsRV32]
 
 let Predicates = [HasStdExtZalasr, IsRV64] in {
   def : PatLAQ<acquiring_load<atomic_load_asext_32>, LW_AQ, i64>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
index fc14a03..f7be2a1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp
@@ -32,7 +32,9 @@ class SPIRVLegalizeImplicitBinding : public ModulePass {
 public:
   static char ID;
   SPIRVLegalizeImplicitBinding() : ModulePass(ID) {}
-
+  StringRef getPassName() const override {
+    return "SPIRV Legalize Implicit Binding";
+  }
   bool runOnModule(Module &M) override;
 
 private:
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index c0cd0176e..f66eb9d 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -668,6 +668,7 @@ bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
         .addImm(-1);
     MIBundleBuilder(MBB, InstSTBAR, InstLDSTUB);
     MBB.erase(MI);
+    return true;
   }
   }
   return false;
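
Note (not part of the patch): a minimal sketch of how a new caller would be expected to use the expandConditionalPseudo helper added to AArch64ExpandPseudoInsts.cpp above. The pseudo being expanded, its operands, and the CBZX condition below are hypothetical placeholders; only the helper's contract is taken from the patch, namely that it splits the block, wires up the branch, and leaves an unconditional B to EndBB at CondBB.back(), so the conditional body must be inserted before that terminator.

// Hypothetical member of AArch64ExpandPseudo; "MyCondPseudo" is a placeholder
// opcode used only to illustrate the helper's intended usage.
MachineBasicBlock *
AArch64ExpandPseudo::expandMyCondPseudo(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI) {
  MachineInstr &MI = *MBBI;
  DebugLoc DL = MI.getDebugLoc();

  // Build the compare-and-branch that decides whether the conditional body
  // in CondBB runs (condition and operand are illustrative).
  MachineInstrBuilder Branch =
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::CBZX)).add(MI.getOperand(0));

  // The helper splits MBB: CondBB holds the conditional body, EndBB the
  // fall-through code, and CondBB already ends in a branch to EndBB.
  auto [CondBB, EndBB] = expandConditionalPseudo(MBB, MBBI, DL, Branch);

  // Insert the conditional body before CondBB.back(), i.e. ahead of the
  // branch to EndBB (here just a call, copied from the pseudo's operands).
  BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::BL))
      .add(MI.getOperand(1));

  MI.eraseFromParent();
  return &EndBB;
}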