aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author Raiki Tamura <tamaron1203@gmail.com> 2023-07-30 19:54:36 +0900
committer P-E-P <32375388+P-E-P@users.noreply.github.com> 2023-08-09 12:41:01 +0000
commit ebd449ca9723df317e50ad7c0b66522470b6d909 (patch)
tree 30317574367dd636458d12b73c5c0e5adfce7c42 /gcc
parent 67d1f4a53e124ff4e0293449a9a73d5e0a012c8e (diff)
download gcc-ebd449ca9723df317e50ad7c0b66522470b6d909.zip
gcc-ebd449ca9723df317e50ad7c0b66522470b6d909.tar.gz
gcc-ebd449ca9723df317e50ad7c0b66522470b6d909.tar.bz2
gccrs: Add function `Rust::encode_punycode`
gcc/rust/ChangeLog:

    * Make-lang.in: Add rust-punycode.o.
    * rust-lang.cc (run_rust_tests): Add selftest.
    * util/rust-punycode.cc: New file.
    * util/rust-punycode.h: New file.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/rust/Make-lang.in 1
-rw-r--r-- gcc/rust/rust-lang.cc 2
-rw-r--r-- gcc/rust/util/rust-punycode.cc 180
-rw-r--r-- gcc/rust/util/rust-punycode.h 46
4 files changed, 229 insertions, 0 deletions
diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in
index aa68640..a4b5e1d 100644
--- a/gcc/rust/Make-lang.in
+++ b/gcc/rust/Make-lang.in
@@ -186,6 +186,7 @@ GRS_OBJS = \
rust/rust-feature-gate.o \
rust/rust-dir-owner.o \
rust/rust-unicode.o \
+ rust/rust-punycode.o \
$(END)
# removed object files from here
diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc
index 44dc3fc..e544d03 100644
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -40,6 +40,7 @@
#include "rust-lex.h"
#include "optional.h"
#include "rust-unicode.h"
+#include "rust-punycode.h"
#include <mpfr.h>
// note: header files must be in this order or else forward declarations don't
@@ -456,6 +457,7 @@ run_rust_tests ()
// Call tests for the rust frontend here
rust_input_source_test ();
rust_utf8_normalize_test ();
+ rust_punycode_encode_test ();
rust_cfg_parser_test ();
rust_privacy_ctx_test ();
rust_crate_name_validation_test ();
diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc
new file mode 100644
index 0000000..a35d54a
--- /dev/null
+++ b/gcc/rust/util/rust-punycode.cc
@@ -0,0 +1,180 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// This file provides functions for punycode conversion
+// See https://datatracker.ietf.org/doc/html/rfc3492
+
+#include "rust-system.h"
+#include "rust-unicode.h"
+#include "optional.h"
+#include "selftest.h"
+
+namespace Rust {
+
+// Punycode parameters; see https://tools.ietf.org/html/rfc3492#section-5
+constexpr uint32_t BASE = 36; // radix of the generated digits
+constexpr uint32_t TMIN = 1; // lower clamp of the digit threshold
+constexpr uint32_t TMAX = 26; // upper clamp of the digit threshold
+constexpr uint32_t SKEW = 38; // used by adapt_bias when scaling delta
+constexpr uint32_t DAMP = 700; // divisor for the first bias adaptation
+constexpr uint32_t INITIAL_BIAS = 72; // starting bias in encode_punycode
+constexpr uint32_t INITIAL_N = 128; // first code point above ASCII
+constexpr char DELIMITER = '-'; // follows the basic code points, if any
+
+constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; // largest basic code point
+
+/* Collect every code point of SRC whose value does not exceed
+   MAX_ASCII_CODEPOINT and return them, in their original order,
+   concatenated into one string.  */
+std::string
+extract_basic_string (const std::vector<Codepoint> &src)
+{
+  std::string ascii_only;
+  for (auto codepoint : src)
+    {
+      const bool is_basic = codepoint.value <= MAX_ASCII_CODEPOINT;
+      if (!is_basic)
+        continue;
+      ascii_only.append (codepoint.as_string ());
+    }
+  return ascii_only;
+}
+
+/* Compute the bias to use for the next delta.
+
+   DELTA is the delta that has just been encoded, N_POINTS the number of
+   code points handled so far (must be non-zero, it is used as a divisor)
+   and IS_FIRST tells whether this is the first adaptation, in which case
+   DELTA is divided by DAMP instead of 2.  */
+uint32_t
+adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first)
+{
+  const uint32_t range = BASE - TMIN;
+  const uint32_t limit = range * TMAX / 2;
+
+  if (is_first)
+    delta = delta / DAMP;
+  else
+    delta = delta / 2;
+  delta = delta + delta / n_points;
+
+  // Scale DELTA down until it is within LIMIT, counting BASE per step.
+  uint32_t k;
+  for (k = 0; delta > limit; k += BASE)
+    delta = delta / range;
+
+  return k + (range + 1) * delta / (delta + SKEW);
+}
+
+/* Return LHS - RHS clamped to the range [MIN, MAX].
+
+   The subtraction is treated as if it were done on unbounded integers:
+   when RHS is not smaller than LHS the (non-positive) difference clamps to
+   MIN.  No addition is performed, so the result is correct even when
+   MIN + RHS or MAX + RHS would wrap around in uint32_t.  When MIN > MAX
+   the lower bound takes precedence.  */
+uint32_t
+clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
+             const uint32_t max)
+{
+  // A non-positive difference can never exceed MIN.
+  if (lhs <= rhs)
+    return min;
+
+  // LHS > RHS, hence the subtraction below cannot wrap around.
+  const uint32_t diff = lhs - rhs;
+  if (diff <= min)
+    return min;
+  if (diff >= max)
+    return max;
+  return diff;
+}
+
+/* Return the smallest code point value in L that is greater than or equal
+   to THRESHOLD, or UINT32_MAX when L holds no such value.  */
+uint32_t
+min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold)
+{
+  uint32_t lowest = UINT32_MAX;
+  for (const auto &codepoint : l)
+    {
+      if (codepoint.value < threshold)
+        continue;
+      if (codepoint.value < lowest)
+        lowest = codepoint.value;
+    }
+  return lowest;
+}
+
+/* Map the digit value D to the character that represents it: values
+   0..25 yield the ASCII codes of 'a'..'z' and values 26..35 those of
+   '0'..'9'.  */
+char
+encode_digit (const uint32_t d)
+{
+  // 97 is ASCII 'a'; 22 is ASCII '0' (48) minus the 26 letter digits.
+  const uint32_t offset = (d < 26) ? 97 : 22;
+  return d + offset;
+}
+
+/* Encode INPUT as punycode, following the encoding procedure of RFC 3492
+   section 6.3.
+
+   The basic (ASCII) code points of INPUT are emitted first, followed by
+   DELIMITER when there is at least one of them.  Every remaining code
+   point is then emitted, in increasing code point order, as a generalized
+   variable-length integer.
+
+   Returns the encoded string, or tl::nullopt when DELTA would overflow
+   a uint32_t.  */
+tl::optional<std::string>
+encode_punycode (const Utf8String &input)
+{
+  std::vector<Codepoint> input_chars = input.get_chars ();
+
+  uint32_t n = INITIAL_N;
+  uint32_t delta = 0;
+  uint32_t bias = INITIAL_BIAS;
+
+  std::string output = extract_basic_string (input_chars);
+  // H counts the code points handled so far, B the basic ones among them.
+  uint32_t h = output.size ();
+  const uint32_t b = h;
+  if (b > 0)
+    output += DELIMITER;
+
+  while (h < input_chars.size ())
+    {
+      // Smallest code point that has not been handled yet.
+      const uint32_t m = min_gt_or_eq (input_chars, n);
+
+      // Advancing N to M must not overflow DELTA.
+      if (m - n > ((UINT32_MAX - delta) / (h + 1)))
+        return tl::nullopt;
+
+      delta += (m - n) * (h + 1);
+      n = m;
+
+      for (const auto c : input_chars)
+        {
+          if (c.value < n)
+            {
+              // Same wrap-around check as the RFC 3492 reference encoder.
+              if (++delta == 0)
+                return tl::nullopt;
+            }
+          else if (c.value == n)
+            {
+              uint32_t q = delta;
+              // encode as a variable length integer
+              for (uint32_t k = 1;; k++)
+                {
+                  const uint32_t kb = k * BASE;
+                  const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX);
+                  if (q < t)
+                    break;
+
+                  output += encode_digit (t + (q - t) % (BASE - t));
+                  q = (q - t) / (BASE - t);
+                }
+              output += encode_digit (q);
+
+              bias = adapt_bias (delta, h + 1, h == b);
+              delta = 0;
+              h++;
+            }
+        }
+      delta++;
+      n++;
+    }
+
+  return {output};
+}
+
+} // namespace Rust
+
+namespace selftest {
+
+void
+encode_assert (const std::string &input, const std::string &expected)
+{
+ Rust::Utf8String input_utf8
+ = Rust::Utf8String::make_utf8_string (input).value ();
+ std::string actual = Rust::encode_punycode (input_utf8).value ();
+ ASSERT_EQ (actual, expected);
+}
+
+void
+rust_punycode_encode_test ()
+{
+ encode_assert ("abc", "abc-");
+ encode_assert ("12345", "12345-");
+ encode_assert ("香港", "j6w193g");
+
+ // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
+ encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn");
+ encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye");
+ encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb");
+ encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a");
+}
+
+} // namespace selftest
diff --git a/gcc/rust/util/rust-punycode.h b/gcc/rust/util/rust-punycode.h
new file mode 100644
index 0000000..ffb139a
--- /dev/null
+++ b/gcc/rust/util/rust-punycode.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_PUNYCODE_H
+#define RUST_PUNYCODE_H
+
+#include "rust-unicode.h"
+#include "optional.h"
+
+namespace Rust {
+
+/* Encode SRC as punycode (see RFC 3492).  Returns the encoded string if
+ * encoding is successful: the ASCII code points of SRC, then `-` when there
+ * is at least one of them, then the encoded non-ASCII code points.  Returns
+ * nullopt otherwise, i.e. when an intermediate value would overflow.  Note
+ * that a returned string contains only ASCII characters and that no `xn--`
+ * prefix is prepended to it.  */
+tl::optional<std::string>
+encode_punycode (const Utf8String &src);
+
+} // namespace Rust
+
+#if CHECKING_P
+
+namespace selftest {
+
+void
+rust_punycode_encode_test (); // Selftests for Rust::encode_punycode.
+
+} // namespace selftest
+
+#endif // CHECKING_P
+
+#endif