// Copyright (C) 2020-2024 Free Software Foundation, Inc. // This file is part of GCC. // GCC is free software; you can redistribute it and/or modify it under // the terms of the GNU General Public License as published by the Free // Software Foundation; either version 3, or (at your option) any later // version. // GCC is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // You should have received a copy of the GNU General Public License // along with GCC; see the file COPYING3. If not see // . // This file provides functions for punycode conversion // See https://datatracker.ietf.org/doc/html/rfc3492 #include "rust-system.h" #include "rust-unicode.h" #include "optional.h" #include "selftest.h" namespace Rust { // https://tools.ietf.org/html/rfc3492#section-4. constexpr uint32_t BASE = 36; constexpr uint32_t TMIN = 1; constexpr uint32_t TMAX = 26; constexpr uint32_t SKEW = 38; constexpr uint32_t DAMP = 700; constexpr uint32_t INITIAL_BIAS = 72; constexpr uint32_t INITIAL_N = 128; constexpr char DELIMITER = '-'; std::string extract_basic_string (const std::vector &src) { std::string basic_string; for (auto c : src) { if (c.is_ascii ()) basic_string += c.as_string (); } return basic_string; } uint32_t adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first) { delta /= is_first ? DAMP : 2; delta += delta / n_points; uint32_t k = 0; while (delta > (BASE - TMIN) * TMAX / 2) { delta /= BASE - TMIN; k += BASE; } return k + (BASE - TMIN + 1) * delta / (delta + SKEW); } uint32_t clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs, const uint32_t max) { if (min + rhs >= lhs) return min; else if (max + rhs <= lhs) return max; else return lhs - rhs; } uint32_t min_gt_or_eq (const std::vector &l, const uint32_t threshold) { uint32_t min = UINT32_MAX; for (auto c : l) if (c.value >= threshold && c.value < min) min = c.value; return min; } char encode_digit (const uint32_t d) { return d + 22 + (d < 26 ? 75 : 0); } tl::optional encode_punycode (const Utf8String &input) { std::vector input_chars = input.get_chars (); uint32_t n = INITIAL_N; uint32_t delta = 0; uint32_t bias = INITIAL_BIAS; std::string output = extract_basic_string (input_chars); uint32_t h = output.size (); const uint32_t b = h; if (b > 0) output += DELIMITER; while (h < input_chars.size ()) { const uint32_t m = min_gt_or_eq (input_chars, n); if (m - n > ((UINT32_MAX - delta) / (h + 1))) return tl::nullopt; delta += (m - n) * (h + 1); n = m; for (const auto c : input_chars) { if (c.value < n) delta++; else if (c.value == n) { uint32_t q = delta; // encode as a variable length integer for (uint32_t k = 1;; k++) { const uint32_t kb = k * BASE; const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX); if (q < t) break; output += encode_digit (t + (q - t) % (BASE - t)); q = (q - t) / (BASE - t); } output += encode_digit (q); bias = adapt_bias (delta, h + 1, h == b); delta = 0; h++; } } delta++; n++; } return {output}; } } // namespace Rust #if CHECKING_P namespace selftest { void encode_assert (const std::string &input, const std::string &expected) { Rust::Utf8String input_utf8 = Rust::Utf8String::make_utf8_string (input).value (); std::string actual = Rust::encode_punycode (input_utf8).value (); ASSERT_EQ (actual, expected); } void rust_punycode_encode_test () { encode_assert ("abc", "abc-"); encode_assert ("12345", "12345-"); encode_assert ("香港", "j6w193g"); // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1 encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn"); encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye"); encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb"); encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a"); } } // namespace selftest #endif // CHECKING_P