// LoadPair fusion optimization pass for AArch64.
// Copyright (C) 2023-2025 Free Software Foundation, Inc.
//
// This file is part of GCC.
//
// GCC is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// GCC is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3.  If not see
// <http://www.gnu.org/licenses/>.

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "memmodel.h"
#include "emit-rtl.h"
#include "tm_p.h"
#include "rtl-iter.h"
#include "tree-pass.h"
#include "insn-attr.h"

#include "pair-fusion.h"

static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7;
static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1));
static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1;
static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1;

// AArch64 specialization of the generic pair_fusion pass: these hooks
// tell the target-independent machinery how ldp/stp behave.
struct aarch64_pair_fusion : public pair_fusion
{
  bool fpsimd_op_p (rtx reg_op, machine_mode mem_mode,
		    bool load_p) override final
  {
    // Before RA, we use the modes, noting that stores of constant zero
    // operands use GPRs (even in non-integer modes).  After RA, we use
    // the hard register numbers.
    return reload_completed
      ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
      : (GET_MODE_CLASS (mem_mode) != MODE_INT
	 && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
  }

  bool pair_mem_insn_p (rtx_insn *rti, bool &load_p) override final;

  bool pair_mem_ok_with_policy (rtx base_mem, bool load_p) override final
  {
    return aarch64_mem_ok_with_ldpstp_policy_model (base_mem,
						    load_p,
						    GET_MODE (base_mem));
  }

  bool pair_operand_mode_ok_p (machine_mode mode) override final;

  rtx gen_pair (rtx *pats, rtx writeback, bool load_p) override final;

  bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
			      machine_mode mode) override final
  {
    return (load_p
	    ? aarch64_ldp_reg_operand (reg_op, mode)
	    : aarch64_stp_reg_operand (reg_op, mode));
  }

  int pair_mem_alias_check_limit () override final
  {
    return aarch64_ldp_alias_check_limit;
  }

  bool should_handle_writeback (writeback_type which) override final
  {
    if (which == writeback_type::ALL)
      return aarch64_ldp_writeback > 1;
    else
      return aarch64_ldp_writeback;
  }

  bool track_loads_p () override final
  {
    return aarch64_tune_params.ldp_policy_model
	   != AARCH64_LDP_STP_POLICY_NEVER;
  }

  bool track_stores_p () override final
  {
    return aarch64_tune_params.stp_policy_model
	   != AARCH64_LDP_STP_POLICY_NEVER;
  }

  bool pair_mem_in_range_p (HOST_WIDE_INT offset) override final
  {
    return (offset >= LDP_MIN_IMM && offset <= LDP_MAX_IMM);
  }

  rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem, rtx regs[2],
				  bool load_p) override final;

  rtx destructure_pair (rtx regs[2], rtx pattern, bool load_p) override final;
};
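// A brief illustration (a sketch of the transformation this pass aims
// for, not code taken from this file): two adjacent scalar accesses such
// as
//
//	ldr	x0, [x16, #8]
//	ldr	x1, [x16, #16]
//
// can be fused into a single paired access
//
//	ldp	x0, x1, [x16, #8]
//
// provided the offset fits the pair immediate.  That immediate is a
// signed 7-bit value scaled by the operand size, which is where the
// LDP_MIN_IMM/LDP_MAX_IMM bounds used by pair_mem_in_range_p above come
// from; e.g. with 8-byte operands the reachable byte offsets are
// -512 ... +504.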
// Return true if RTI is already a load or store pair insn, recording in
// LOAD_P whether it is a load pair.
bool
aarch64_pair_fusion::pair_mem_insn_p (rtx_insn *rti, bool &load_p)
{
  rtx pat = PATTERN (rti);
  if (GET_CODE (pat) == PARALLEL
      && XVECLEN (pat, 0) == 2)
    {
      const auto attr = get_attr_ldpstp (rti);
      if (attr == LDPSTP_NONE)
	return false;

      load_p = (attr == LDPSTP_LDP);
      gcc_checking_assert (load_p || attr == LDPSTP_STP);
      return true;
    }
  return false;
}

// Combine the two candidate patterns in PATS (together with any writeback
// effect in WRITEBACK) into a single paired-access pattern.
rtx
aarch64_pair_fusion::gen_pair (rtx *pats, rtx writeback, bool load_p)
{
  if (writeback)
    {
      auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]);
      return gen_rtx_PARALLEL (VOIDmode, patvec);
    }
  else if (load_p)
    return aarch64_gen_load_pair (XEXP (pats[0], 0),
				  XEXP (pats[1], 0),
				  XEXP (pats[0], 1));
  else
    return aarch64_gen_store_pair (XEXP (pats[0], 0),
				   XEXP (pats[0], 1),
				   XEXP (pats[1], 1));
}

// Return true if we should consider forming ldp/stp insns from memory
// accesses with operand mode MODE at this stage in compilation.
bool
aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
{
  if (!aarch64_ldpstp_operand_mode_p (mode))
    return false;

  // We don't pair up TImode accesses before RA because TImode is
  // special in that it can be allocated to a pair of GPRs or a single
  // FPR, and the RA is best placed to make that decision.
  return reload_completed || mode != TImode;
}

// Given a pair mode MODE, return a canonical mode to be used for a single
// operand of such a pair.  Currently we only use this when promoting a
// non-writeback pair into a writeback pair, as it isn't otherwise clear
// which mode to use when storing a modeless CONST_INT.
static machine_mode
aarch64_operand_mode_for_pair_mode (machine_mode mode)
{
  switch (mode)
    {
    case E_V2x4QImode:
      return SImode;
    case E_V2x8QImode:
      return DImode;
    case E_V2x16QImode:
      return V16QImode;
    default:
      gcc_unreachable ();
    }
}

// Given a load pair insn in PATTERN, unpack the insn, storing
// the registers in REGS and returning the mem.
static rtx
aarch64_destructure_load_pair (rtx regs[2], rtx pattern)
{
  rtx mem = NULL_RTX;

  for (int i = 0; i < 2; i++)
    {
      rtx pat = XVECEXP (pattern, 0, i);
      regs[i] = XEXP (pat, 0);
      rtx unspec = XEXP (pat, 1);
      gcc_checking_assert (GET_CODE (unspec) == UNSPEC);
      rtx this_mem = XVECEXP (unspec, 0, 0);
      if (mem)
	gcc_checking_assert (rtx_equal_p (mem, this_mem));
      else
	{
	  gcc_checking_assert (MEM_P (this_mem));
	  mem = this_mem;
	}
    }

  return mem;
}
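// A rough sketch of the RTL shape the destructuring above expects
// (inferred from the accessors it uses, not a verbatim copy of the
// target's load-pair pattern; modes are illustrative): each element of
// the load-pair PARALLEL looks like
//
//   (set (reg:DI xN)
//	  (unspec:DI [(mem:V2x8QI ...)] ...))
//
// with both elements wrapping the same pair mem, which is what
// aarch64_destructure_load_pair hands back to the caller.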
// Given a store pair insn in PATTERN, unpack the insn, storing
// the register operands in REGS, and returning the mem.
static rtx
aarch64_destructure_store_pair (rtx regs[2], rtx pattern)
{
  rtx mem = XEXP (pattern, 0);
  rtx unspec = XEXP (pattern, 1);
  gcc_checking_assert (GET_CODE (unspec) == UNSPEC);
  for (int i = 0; i < 2; i++)
    regs[i] = XVECEXP (unspec, 0, i);
  return mem;
}

rtx
aarch64_pair_fusion::destructure_pair (rtx regs[2], rtx pattern, bool load_p)
{
  if (load_p)
    return aarch64_destructure_load_pair (regs, pattern);
  else
    return aarch64_destructure_store_pair (regs, pattern);
}

rtx
aarch64_pair_fusion::gen_promote_writeback_pair (rtx wb_effect, rtx pair_mem,
						 rtx regs[2], bool load_p)
{
  auto op_mode = aarch64_operand_mode_for_pair_mode (GET_MODE (pair_mem));

  machine_mode modes[2];
  for (int i = 0; i < 2; i++)
    {
      machine_mode mode = GET_MODE (regs[i]);
      if (load_p)
	gcc_checking_assert (mode != VOIDmode);
      else if (mode == VOIDmode)
	mode = op_mode;

      modes[i] = mode;
    }

  const auto op_size = GET_MODE_SIZE (modes[0]);
  gcc_checking_assert (known_eq (op_size, GET_MODE_SIZE (modes[1])));

  rtx pats[2];
  for (int i = 0; i < 2; i++)
    {
      rtx mem = adjust_address_nv (pair_mem, modes[i], op_size * i);
      pats[i] = load_p
	? gen_rtx_SET (regs[i], mem)
	: gen_rtx_SET (mem, regs[i]);
    }

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (3, wb_effect, pats[0], pats[1]));
}

namespace {

const pass_data pass_data_ldp_fusion =
{
  RTL_PASS, /* type */
  "ldp_fusion", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};

class pass_ldp_fusion : public rtl_opt_pass
{
public:
  pass_ldp_fusion (gcc::context *ctx)
    : rtl_opt_pass (pass_data_ldp_fusion, ctx)
    {}

  opt_pass *clone () override { return new pass_ldp_fusion (m_ctxt); }

  bool gate (function *) final override
  {
    if (!optimize || optimize_debug)
      return false;

    if (reload_completed)
      return flag_aarch64_late_ldp_fusion;
    else
      return flag_aarch64_early_ldp_fusion;
  }

  unsigned execute (function *) final override
  {
    aarch64_pair_fusion pass;
    pass.run ();
    return 0;
  }
};

} // anon namespace

rtl_opt_pass *
make_pass_ldp_fusion (gcc::context *ctx)
{
  return new pass_ldp_fusion (ctx);
}
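// make_pass_ldp_fusion is the factory the pass manager calls to
// instantiate this pass; the backend wires it up from aarch64-passes.def
// using the usual INSERT_PASS_* macros.  As a sketch only (the exact
// anchor passes are an assumption here, not taken from this file), the
// early and late variants are registered along the lines of:
//
//   INSERT_PASS_BEFORE (pass_early_remat, 1, pass_ldp_fusion);
//   INSERT_PASS_AFTER (pass_peephole2, 1, pass_ldp_fusion);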