aboutsummaryrefslogtreecommitdiff
path: root/gcc/rust/util/rust-punycode.cc
blob: 89476f2cc82fac7ade231f5ed827ad5cd60c1584 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
// Copyright (C) 2020-2024 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3.  If not see
// <http://www.gnu.org/licenses/>.

// This file provides functions for punycode conversion
// See https://datatracker.ietf.org/doc/html/rfc3492

#include "rust-system.h"
#include "rust-unicode.h"
#include "optional.h"
#include "selftest.h"

namespace Rust {

// Bootstring parameters used by the Punycode encoder below.
// https://tools.ietf.org/html/rfc3492#section-4.
// Radix of the variable-length integers: multiplied by the digit position
// and used as the upper bound when splitting off each digit.
constexpr uint32_t BASE = 36;
// Lower bound passed to clamped_sub when computing a digit threshold.
constexpr uint32_t TMIN = 1;
// Upper bound passed to clamped_sub when computing a digit threshold.
constexpr uint32_t TMAX = 26;
// Added to the denominator of the final term in adapt_bias.
constexpr uint32_t SKEW = 38;
// Divisor applied to delta by adapt_bias on the first adaptation; later
// adaptations divide by 2 instead.
constexpr uint32_t DAMP = 700;
// Value of the bias before any code point has been encoded.
constexpr uint32_t INITIAL_BIAS = 72;
// First code point value considered by the encoder's main loop.
constexpr uint32_t INITIAL_N = 128;
// Appended after the copied ASCII characters when there is at least one.
constexpr char DELIMITER = '-';

// Collect the ASCII code points of SRC, in their original order, into a
// single string.  Non-ASCII code points are skipped.
std::string
extract_basic_string (const std::vector<Codepoint> &src)
{
  std::string ascii_only;
  for (auto cp : src)
    {
      if (!cp.is_ascii ())
	continue;
      ascii_only += cp.as_string ();
    }
  return ascii_only;
}

// Compute the bias to use for the next variable-length integer.
//
// DELTA is the delta that has just been encoded, N_POINTS the number of
// code points handled so far (including the one just encoded) and IS_FIRST
// tells whether this is the first adaptation for the current string.
uint32_t
adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first)
{
  // The very first delta is damped much harder than the following ones.
  if (is_first)
    delta /= DAMP;
  else
    delta /= 2;
  delta += delta / n_points;

  constexpr uint32_t span = BASE - TMIN;
  constexpr uint32_t limit = span * TMAX / 2;

  // Shrink delta until it is at most LIMIT, accumulating BASE in K for
  // every division performed.
  uint32_t k = 0;
  for (; delta > limit; k += BASE)
    delta /= span;

  return k + (span + 1) * delta / (delta + SKEW);
}

// Return LHS - RHS clamped to the range [MIN, MAX].
//
// The subtraction saturates at zero, so a RHS larger than LHS yields MIN.
// The comparisons are done on the (saturated) difference rather than on
// MIN + RHS and MAX + RHS, because those sums can wrap around in uint32_t
// when RHS is large and would then select the wrong bound.
uint32_t
clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
	     const uint32_t max)
{
  const uint32_t diff = lhs > rhs ? lhs - rhs : 0;

  if (diff <= min)
    return min;
  if (diff >= max)
    return max;
  return diff;
}

// Return the smallest code point value in L that is greater than or equal
// to THRESHOLD.  If no element qualifies, UINT32_MAX is returned.
uint32_t
min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold)
{
  uint32_t smallest = UINT32_MAX;
  for (const auto &cp : l)
    {
      if (cp.value < threshold)
	continue;
      if (cp.value < smallest)
	smallest = cp.value;
    }
  return smallest;
}

// Map the digit value D to its output character: values below 26 are
// offset by 97 (ASCII 'a'), all other values by 22 so that 26 maps to 48
// (ASCII '0').
char
encode_digit (const uint32_t d)
{
  if (d < 26)
    return static_cast<char> (d + 97);
  else
    return static_cast<char> (d + 22);
}

// Encode INPUT as a Punycode string.
//
// The ASCII code points of INPUT are copied to the output first, followed
// by DELIMITER if there is at least one of them.  The remaining code points
// are then processed in increasing order of value, each one being emitted
// as a variable-length integer.
//
// Returns tl::nullopt if the running delta would overflow uint32_t.
tl::optional<std::string>
encode_punycode (const Utf8String &input)
{
  const std::vector<Codepoint> codepoints = input.get_chars ();

  std::string encoded = extract_basic_string (codepoints);
  const uint32_t basic_count = encoded.size ();
  if (basic_count != 0)
    encoded += DELIMITER;

  uint32_t current = INITIAL_N;
  uint32_t delta = 0;
  uint32_t bias = INITIAL_BIAS;
  // Number of code points that have been written out so far.
  uint32_t handled = basic_count;

  while (handled < codepoints.size ())
    {
      // Jump directly to the next code point value present in the input.
      const uint32_t next = min_gt_or_eq (codepoints, current);
      const uint32_t gap = next - current;
      const uint32_t weight = handled + 1;

      // Bail out if delta + gap * weight does not fit in uint32_t.
      if (gap > (UINT32_MAX - delta) / weight)
	return tl::nullopt;

      delta += gap * weight;
      current = next;

      for (const auto &cp : codepoints)
	{
	  if (cp.value < current)
	    {
	      delta++;
	      continue;
	    }
	  if (cp.value != current)
	    continue;

	  // Emit delta as a variable-length integer; K is the 1-based
	  // position of the digit being produced.
	  uint32_t q = delta;
	  uint32_t k = 1;
	  while (true)
	    {
	      const uint32_t t = clamped_sub (TMIN, k * BASE, bias, TMAX);
	      if (q < t)
		break;

	      const uint32_t rest = q - t;
	      const uint32_t radix = BASE - t;
	      encoded += encode_digit (t + rest % radix);
	      q = rest / radix;
	      k++;
	    }
	  encoded += encode_digit (q);

	  // HANDLED changes inside this loop, so it is re-read here instead
	  // of reusing WEIGHT.
	  bias = adapt_bias (delta, handled + 1, handled == basic_count);
	  delta = 0;
	  handled++;
	}
      delta++;
      current++;
    }

  return {encoded};
}

} // namespace Rust

#if CHECKING_P

namespace selftest {

// Encode INPUT with Rust::encode_punycode and check that the result is
// equal to EXPECTED.  INPUT must be valid UTF-8 and must be encodable,
// as both optional results are unwrapped unconditionally.
void
encode_assert (const std::string &input, const std::string &expected)
{
  const auto utf8 = Rust::Utf8String::make_utf8_string (input);
  const auto encoded = Rust::encode_punycode (utf8.value ());
  const std::string actual = encoded.value ();
  ASSERT_EQ (actual, expected);
}

// Run the Punycode encoder over a fixed table of inputs and check each
// result against its expected encoding.
void
rust_punycode_encode_test ()
{
  const struct
  {
    const char *input;
    const char *expected;
  } cases[] = {
    {"abc", "abc-"},
    {"12345", "12345-"},
    {"香港", "j6w193g"},

    // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
    {"ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn"},
    {"他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye"},
    {"他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb"},
    {"Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a"},
  };

  for (const auto &tc : cases)
    encode_assert (tc.input, tc.expected);
}

} // namespace selftest

#endif // CHECKING_P