// Copyright (C) 2020-2025 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3.  If not see
// <http://www.gnu.org/licenses/>.

#include "rust-input-source.h"
#include "rust-system.h"
#include "optional.h"
#include "selftest.h"
#include "rust-lex.h"
#include "rust-unicode.h"

#include "rust-unicode-data.h"

namespace Rust {

typedef Codepoint codepoint_t;
typedef std::vector<codepoint_t> string_t;

// These constants are used to compose and decompose of Hangul syllables.
// See `Sample Code for Hangul Algorithms` in 3.1.2
// unicode.org/versions/Unicode15.0.0/ch03.pdf
const uint32_t S_BASE = 0xAC00;
const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
const uint32_t N_COUNT = V_COUNT * T_COUNT;
const uint32_t S_COUNT = L_COUNT * N_COUNT;

// Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
template <std::size_t SIZE>
bool
binary_search_ranges (
  const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  uint32_t target_cp)
{
  auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
			      [] (const std::pair<uint32_t, uint32_t> &a,
				  uint32_t b) { return a.second <= b; });
  if (it == ranges.end ())
    return false;
  else
    return it->first <= target_cp && target_cp < it->second;
}

int
lookup_cc (codepoint_t c)
{
  auto it = CCC_TABLE.find (c.value);
  if (it != CCC_TABLE.end ())
    return it->second;
  else
    // Starter. Returns zero.
    return 0;
}

tl::optional<codepoint_t>
lookup_recomp (codepoint_t starter, codepoint_t c)
{
  auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
  if (it != Rust::RECOMPOSITION_MAP.end ())
    return {it->second};

  it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
  if (it != Rust::RECOMPOSITION_MAP.end ())
    return {it->second};

  return tl::nullopt;
}

void
recursive_decomp_cano (codepoint_t c, string_t &buf)
{
  auto it = Rust::DECOMPOSITION_MAP.find (c.value);
  if (it != Rust::DECOMPOSITION_MAP.end ())
    {
      std::vector<uint32_t> decomped = it->second;
      for (uint32_t cp : decomped)
	recursive_decomp_cano (cp, buf);
    }
  else
    buf.push_back (c);
}

string_t
decomp_cano (string_t s)
{
  string_t buf;
  for (codepoint_t c : s)
    {
      int64_t s_index = c.value - S_BASE;
      if (0 <= s_index && s_index < S_COUNT)
	{
	  // decompose Hangul argorithmically
	  uint32_t l = L_BASE + s_index / N_COUNT;
	  uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
	  uint32_t t = T_BASE + s_index % T_COUNT;
	  buf.push_back (l);
	  buf.push_back (v);
	  if (t != T_BASE)
	    buf.push_back (t);
	  continue;
	}

      // Current character is not hangul
      recursive_decomp_cano (c, buf);
    }
  return buf;
}

void
sort_cano (string_t &s)
{
  int cc_here, cc_prev;
  if (s.size () == 1)
    return;
  for (unsigned int i = 1; i < s.size (); i++)
    {
      cc_here = lookup_cc (s[i]);
      cc_prev = lookup_cc (s[i - 1]);
      if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
	{
	  // swap
	  codepoint_t tmp = s[i];
	  s[i] = s[i - 1];
	  s[i - 1] = tmp;
	  if (i > 1)
	    i -= 2;
	}
    }
}

string_t
compose_hangul (string_t s)
{
  string_t buf;
  if (s.size () < 2)
    return s;

  codepoint_t last = s[0];
  buf.push_back (last);
  for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
    {
      codepoint_t ch = s[src_pos];

      // L V => LV
      int64_t l_index = last.value - L_BASE;
      if (0 <= l_index && l_index < L_COUNT)
	{
	  int64_t v_index = ch.value - V_BASE;
	  if (0 <= v_index && v_index < V_COUNT)
	    {
	      last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
	      // pop L
	      buf.pop_back ();
	      buf.push_back (last);
	      continue;
	    }
	}

      // LV T => LVT
      int64_t s_index = last.value - S_BASE;
      if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
	{
	  int64_t t_index = ch.value - T_BASE;
	  if (0 < t_index && t_index < T_COUNT)
	    {
	      last.value += t_index;
	      // pop LV
	      buf.pop_back ();
	      buf.push_back (last);
	      continue;
	    }
	}
      last = ch;
      buf.push_back (last);
    }
  return buf;
}

string_t
recomp (string_t s)
{
  // compose hangul first
  s = compose_hangul (s);

  string_t buf;
  if (s.size () < 2)
    return s;

  int last_class = -1;
  // int starter_pos = 0; // Assume the first character is Starter. Correct?
  // int target_pos = 1;
  codepoint_t starter_ch = s[0];

  for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
    {
      // get current character
      codepoint_t ch = s[src_pos];

      int ch_class = lookup_cc (ch);
      tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
      if (composite.has_value () && last_class < ch_class)
	{
	  // ch can be composed
	  buf.push_back (composite.value ());
	  starter_ch = composite.value ();
	}
      else if (ch_class == 0)
	{
	  // ch is Starter and cannot be composed.
	  if (src_pos == 1)
	    // FIXME: buggy?
	    buf.push_back (starter_ch);
	  starter_ch = ch;
	  last_class = -1;
	  buf.push_back (ch);
	}
      else
	{
	  if (src_pos == 1)
	    // FIXME: buggy?
	    buf.push_back (starter_ch);
	  // ch is not Starter.
	  last_class = ch_class;
	  buf.push_back (ch);
	}
    }
  return buf;
}

// see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
QuickCheckResult
nfc_quick_check (const string_t &s)
{
  int last_canonical_class = 0;
  QuickCheckResult res = QuickCheckResult::YES;

  for (unsigned long i = 0; i < s.size (); i++)
    {
      codepoint_t c = s[i];

      if (c.is_supplementary_character ())
	i++;

      int canonical_class = lookup_cc (c);
      if (last_canonical_class > canonical_class && canonical_class != 0)
	return QuickCheckResult::NO;

      if (is_nfc_qc_no (c.value))
	return QuickCheckResult::NO;

      if (is_nfc_qc_maybe (c.value))
	res = QuickCheckResult::MAYBE;

      last_canonical_class = canonical_class;
    }
  return res;
}

string_t
nfc_normalize (const string_t &s)
{
  if (nfc_quick_check (s) == QuickCheckResult::YES)
    return s;

  // TODO: optimize normalization.
  // i.e. only normalize a limited area around MAYBE character, instead of
  // performing complete normlization of the entire string

  // decompose
  string_t d = decomp_cano (s);
  sort_cano (d);

  // recompose
  string_t r = recomp (d);
  return r;
}

Utf8String
Utf8String::nfc_normalize () const
{
  return Utf8String (Rust::nfc_normalize (chars));
}

bool
is_alphabetic (uint32_t codepoint)
{
  return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
}

bool
is_numeric (uint32_t codepoint)
{
  return std::binary_search (NUMERIC_CODEPOINTS.begin (),
			     NUMERIC_CODEPOINTS.end (), codepoint);
}

bool
is_nfc_qc_maybe (uint32_t codepoint)
{
  return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
}

bool
is_nfc_qc_no (uint32_t codepoint)
{
  return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
}

bool
is_ascii_only (const std::string &str)
{
  for (char c : str)
    if (static_cast<uint32_t> (c) > MAX_ASCII_CODEPOINT)
      return false;
  return true;
}

} // namespace Rust

#if CHECKING_P

namespace selftest {

void
rust_nfc_qc_test ()
{
  ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
	     Rust::QuickCheckResult::YES);
  ASSERT_EQ (Rust::nfc_quick_check (
	       {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
	     Rust::QuickCheckResult::MAYBE);
  ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
	     Rust::QuickCheckResult::NO);
}

void
assert_normalize (const std::vector<Rust::Codepoint> origin,
		  const std::vector<Rust::Codepoint> expected)
{
  std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);

  ASSERT_EQ (actual.size (), expected.size ());
  for (unsigned int i = 0; i < actual.size (); i++)
    {
      ASSERT_EQ (actual[i], expected[i]);
    }
}

void
rust_utf8_normalize_test ()
{
  // ASCII
  assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
  // ASCII
  assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});

  // testcases retrieved from Part0 of
  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
  assert_normalize ({0x1e0a}, {0x1e0a});
  assert_normalize ({0x1e0c}, {0x1e0c});
  assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
  assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
  assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});

  // testcases for Hangul from Part0
  assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
  assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
  // testcases for Hangul from Part1
  assert_normalize ({0x3131}, {0x3131});
  assert_normalize ({0x3132}, {0x3132});
  // testcases for Hangul from Part3
  assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
  assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});

  // TODO: add more testcases in
  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
}

void
rust_utf8_property_test ()
{
  ASSERT_TRUE (Rust::is_alphabetic ('A'));
  ASSERT_TRUE (Rust::is_alphabetic ('B'));
  ASSERT_TRUE (Rust::is_alphabetic ('x'));
  ASSERT_TRUE (Rust::is_alphabetic ('z'));
  ASSERT_TRUE (Rust::is_alphabetic (0x00b5));  // µ
  ASSERT_TRUE (Rust::is_alphabetic (0x3093));  // ん
  ASSERT_TRUE (Rust::is_alphabetic (0xa8f2));  // ꣲ
  ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃

  ASSERT_FALSE (Rust::is_alphabetic ('\v'));
  ASSERT_FALSE (Rust::is_alphabetic ('-'));
  ASSERT_FALSE (Rust::is_alphabetic ('_'));
  ASSERT_FALSE (Rust::is_alphabetic ('+'));
  ASSERT_FALSE (Rust::is_alphabetic ('0'));
  ASSERT_FALSE (Rust::is_alphabetic ('1'));
  ASSERT_FALSE (Rust::is_alphabetic ('2'));
  ASSERT_FALSE (Rust::is_alphabetic ('9'));
  ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
  ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁

  // `Nd`s
  ASSERT_TRUE (Rust::is_numeric ('0'));
  ASSERT_TRUE (Rust::is_numeric ('1'));
  ASSERT_TRUE (Rust::is_numeric ('7'));
  ASSERT_TRUE (Rust::is_numeric ('9'));
  ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
  ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
  // `Nl`s
  ASSERT_TRUE (Rust::is_numeric (0x16e6));  // ᛮ
  ASSERT_TRUE (Rust::is_numeric (0xa6e6));  // ꛦ
  ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
  ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
  // `No`s
  ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
  ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱

  ASSERT_FALSE (Rust::is_numeric ('\n'));
  ASSERT_FALSE (Rust::is_numeric ('-'));
  ASSERT_FALSE (Rust::is_numeric ('_'));
  ASSERT_FALSE (Rust::is_numeric ('('));
  ASSERT_FALSE (Rust::is_numeric ('z'));
  ASSERT_FALSE (Rust::is_numeric (';'));
  ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
  ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
  ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
  ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
  ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
}

} // namespace selftest

#endif // CHECKING_P