diff options
author | Raiki Tamura <tamaron1203@gmail.com> | 2023-08-06 19:17:17 +0900 |
---|---|---|
committer | Arthur Cohen <arthur.cohen@embecosm.com> | 2024-01-16 19:00:30 +0100 |
commit | 5b47923fe512f088a4f1c31466236843c20b7ff9 (patch) | |
tree | 4c3258bbf931ea7093e47d8c7008bcc35d9dbeaa /gcc/rust | |
parent | 1e288d66cb4f0a25a249c0c6dacc2efbf9e44dc8 (diff) | |
download | gcc-5b47923fe512f088a4f1c31466236843c20b7ff9.zip gcc-5b47923fe512f088a4f1c31466236843c20b7ff9.tar.gz gcc-5b47923fe512f088a4f1c31466236843c20b7ff9.tar.bz2 |
gccrs: clean up Codepoint and InputSource
gcc/rust/ChangeLog:
* lex/rust-codepoint.h: Moved to...
* util/rust-codepoint.h: ...here.
* lex/rust-input-source.h: Add missing license
* util/rust-unicode.cc: Add missing license
* util/rust-punycode.cc (extract_basic_string): Remove constant
Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Diffstat (limited to 'gcc/rust')
-rw-r--r-- | gcc/rust/lex/rust-input-source.h | 70 | ||||
-rw-r--r-- | gcc/rust/util/rust-codepoint.h (renamed from gcc/rust/lex/rust-codepoint.h) | 0 | ||||
-rw-r--r-- | gcc/rust/util/rust-punycode.cc | 4 | ||||
-rw-r--r-- | gcc/rust/util/rust-unicode.cc | 18 |
4 files changed, 66 insertions, 26 deletions
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h index 07137de..32261a0 100644 --- a/gcc/rust/lex/rust-input-source.h +++ b/gcc/rust/lex/rust-input-source.h @@ -1,3 +1,21 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + #ifndef RUST_INPUT_SOURCE_H #define RUST_INPUT_SOURCE_H @@ -5,6 +23,14 @@ #include "optional.h" namespace Rust { + +constexpr uint8_t UTF8_BOM1 = 0xEF; +constexpr uint8_t UTF8_BOM2 = 0xBB; +constexpr uint8_t UTF8_BOM3 = 0xBF; + +constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; +constexpr uint32_t CODEPOINT_INVALID = 0xFFFE; + // Input source wrapper thing. class InputSource { @@ -23,7 +49,7 @@ private: if ((int32_t) input == EOF) return Codepoint::eof (); - else if (input < 128) + else if (input <= MAX_ASCII_CODEPOINT) { // ascii -- 1 byte return {input}; @@ -31,14 +57,14 @@ private: else if ((input & 0xC0) == 0x80) { // invalid (continuation; can't be first char) - return {0xFFFE}; + return {CODEPOINT_INVALID}; } else if ((input & 0xE0) == 0xC0) { // 2 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); return output; @@ -50,23 +76,23 @@ private: // If the second byte is equal to 0xBB then the input is no longer a // valid UTF-8 char. Then, we check if the third byte makes up a UTF // BOM. - if (input == 0xEF && input2 == 0xBB) + if (input == UTF8_BOM1 && input2 == UTF8_BOM2) { uint8_t input3 = next_byte (); - if (input3 == 0xBF) + if (input3 == UTF8_BOM3) // found BOM return next_codepoint (); else - return {0xFFFE}; + return {CODEPOINT_INVALID}; } if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); @@ -77,15 +103,15 @@ private: // 4 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input4 = next_byte (); if ((input4 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); @@ -93,23 +119,26 @@ private: } else { - return {0xFFFE}; + return {CODEPOINT_INVALID}; } } protected: - // Check if the input source is valid as utf-8 and copy all characters to - // `chars`. + // This method must be called by the constructor to initialize the input + // source. We cannot move this to the constructor because it calls a + // virtual method . void init () { + // Check if the input source is valid as utf-8 and copy all characters to + // `chars`. Codepoint char32 = next_codepoint (); - while (!char32.is_eof () && char32 != 0xFFFE) + while (!char32.is_eof () && char32 != CODEPOINT_INVALID) { chars.push_back (char32); char32 = next_codepoint (); } - if (char32 == 0xFFFE) + if (char32 == CODEPOINT_INVALID) { // Input source is not valid as utf-8. is_valid_utf8 = false; @@ -158,11 +187,7 @@ private: public: // Create new input source from file. - FileInputSource (FILE *input) : InputSource (), input (input) - { - // TODO make this better? - init (); - } + FileInputSource (FILE *input) : InputSource (), input (input) { init (); } }; class BufferInputSource : public InputSource @@ -175,7 +200,7 @@ private: { if (offs >= buffer.size ()) return EOF; - return (uint8_t) buffer.at (offs++); + return static_cast<uint8_t> (buffer.at (offs++)); } public: @@ -183,7 +208,6 @@ public: BufferInputSource (const std::string &b, size_t offset) : InputSource (), buffer (b), offs (offset) { - // TODO make this better? init (); } }; diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h index 755c837..755c837 100644 --- a/gcc/rust/lex/rust-codepoint.h +++ b/gcc/rust/util/rust-codepoint.h diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc index a35d54a..6c796ab 100644 --- a/gcc/rust/util/rust-punycode.cc +++ b/gcc/rust/util/rust-punycode.cc @@ -36,15 +36,13 @@ constexpr uint32_t INITIAL_BIAS = 72; constexpr uint32_t INITIAL_N = 128; constexpr char DELIMITER = '-'; -constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; - std::string extract_basic_string (const std::vector<Codepoint> &src) { std::string basic_string; for (auto c : src) { - if (c.value <= MAX_ASCII_CODEPOINT) + if (c.value <= 0x7F) basic_string += c.as_string (); } return basic_string; diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc index b2ddaf0..95653cb 100644 --- a/gcc/rust/util/rust-unicode.cc +++ b/gcc/rust/util/rust-unicode.cc @@ -1,3 +1,21 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + #include "rust-system.h" #include "optional.h" #include "selftest.h" |