From 2b1d37ced31362a6212ebd8f04883d7e4cf28e05 Mon Sep 17 00:00:00 2001 From: Raiki Tamura Date: Sun, 6 Aug 2023 19:17:17 +0900 Subject: gccrs: clean up Codepoint and InputSource gcc/rust/ChangeLog: * lex/rust-codepoint.h: Moved to... * util/rust-codepoint.h: ...here. * lex/rust-input-source.h: Add missing license * util/rust-unicode.cc: Add missing license * util/rust-punycode.cc (extract_basic_string): Remove constant Signed-off-by: Raiki Tamura --- gcc/rust/lex/rust-codepoint.h | 48 --------------------------- gcc/rust/lex/rust-input-source.h | 70 +++++++++++++++++++++++++++------------- gcc/rust/util/rust-codepoint.h | 48 +++++++++++++++++++++++++++ gcc/rust/util/rust-punycode.cc | 4 +-- gcc/rust/util/rust-unicode.cc | 18 +++++++++++ 5 files changed, 114 insertions(+), 74 deletions(-) delete mode 100644 gcc/rust/lex/rust-codepoint.h create mode 100644 gcc/rust/util/rust-codepoint.h (limited to 'gcc') diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h deleted file mode 100644 index eaed664..0000000 --- a/gcc/rust/lex/rust-codepoint.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2020-2023 Free Software Foundation, Inc. - -// This file is part of GCC. - -// GCC is free software; you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free -// Software Foundation; either version 3, or (at your option) any later -// version. - -// GCC is distributed in the hope that it will be useful, but WITHOUT ANY -// WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. - -// You should have received a copy of the GNU General Public License -// along with GCC; see the file COPYING3. If not see -// <http://www.gnu.org/licenses/>. - -#ifndef RUST_CODEPOINT_H -#define RUST_CODEPOINT_H - -#include "rust-system.h" - -namespace Rust { - -// FIXME: move this to rust-unicode.h? 
-struct Codepoint -{ - uint32_t value; - - // Creates a zero codepoint. - Codepoint () : value (0) {} - - // Creates a codepoint from an encoded UTF-8 value. - Codepoint (uint32_t value) : value (value) {} - - static Codepoint eof () { return Codepoint (UINT32_MAX); } - bool is_eof () const { return value == UINT32_MAX; } - - // Returns a C++ string containing string value of codepoint. - std::string as_string (); - - bool operator== (Codepoint other) const { return value == other.value; } - bool operator!= (Codepoint other) const { return !operator== (other); } -}; -} // namespace Rust - -#endif diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h index 07137de..32261a0 100644 --- a/gcc/rust/lex/rust-input-source.h +++ b/gcc/rust/lex/rust-input-source.h @@ -1,3 +1,21 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + #ifndef RUST_INPUT_SOURCE_H #define RUST_INPUT_SOURCE_H @@ -5,6 +23,14 @@ #include "optional.h" namespace Rust { + +constexpr uint8_t UTF8_BOM1 = 0xEF; +constexpr uint8_t UTF8_BOM2 = 0xBB; +constexpr uint8_t UTF8_BOM3 = 0xBF; + +constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; +constexpr uint32_t CODEPOINT_INVALID = 0xFFFE; + // Input source wrapper thing. 
class InputSource { @@ -23,7 +49,7 @@ private: if ((int32_t) input == EOF) return Codepoint::eof (); - else if (input < 128) + else if (input <= MAX_ASCII_CODEPOINT) { // ascii -- 1 byte return {input}; @@ -31,14 +57,14 @@ private: else if ((input & 0xC0) == 0x80) { // invalid (continuation; can't be first char) - return {0xFFFE}; + return {CODEPOINT_INVALID}; } else if ((input & 0xE0) == 0xC0) { // 2 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); return output; @@ -50,23 +76,23 @@ private: // If the second byte is equal to 0xBB then the input is no longer a // valid UTF-8 char. Then, we check if the third byte makes up a UTF // BOM. - if (input == 0xEF && input2 == 0xBB) + if (input == UTF8_BOM1 && input2 == UTF8_BOM2) { uint8_t input3 = next_byte (); - if (input3 == 0xBF) + if (input3 == UTF8_BOM3) // found BOM return next_codepoint (); else - return {0xFFFE}; + return {CODEPOINT_INVALID}; } if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); @@ -77,15 +103,15 @@ private: // 4 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input4 = next_byte (); if ((input4 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); @@ -93,23 +119,26 @@ private: } else { - return {0xFFFE}; + return {CODEPOINT_INVALID}; } } protected: - // Check if the input source is valid as utf-8 and copy all characters to - // `chars`. 
+ // This method must be called by the constructor to initialize the input + // source. We cannot move this to the constructor because it calls a + // virtual method . void init () { + // Check if the input source is valid as utf-8 and copy all characters to + // `chars`. Codepoint char32 = next_codepoint (); - while (!char32.is_eof () && char32 != 0xFFFE) + while (!char32.is_eof () && char32 != CODEPOINT_INVALID) { chars.push_back (char32); char32 = next_codepoint (); } - if (char32 == 0xFFFE) + if (char32 == CODEPOINT_INVALID) { // Input source is not valid as utf-8. is_valid_utf8 = false; @@ -158,11 +187,7 @@ private: public: // Create new input source from file. - FileInputSource (FILE *input) : InputSource (), input (input) - { - // TODO make this better? - init (); - } + FileInputSource (FILE *input) : InputSource (), input (input) { init (); } }; class BufferInputSource : public InputSource @@ -175,7 +200,7 @@ private: { if (offs >= buffer.size ()) return EOF; - return (uint8_t) buffer.at (offs++); + return static_cast<uint8_t> (buffer.at (offs++)); } public: @@ -183,7 +208,6 @@ public: BufferInputSource (const std::string &b, size_t offset) : InputSource (), buffer (b), offs (offset) { - // TODO make this better? init (); } }; diff --git a/gcc/rust/util/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h new file mode 100644 index 0000000..eaed664 --- /dev/null +++ b/gcc/rust/util/rust-codepoint.h @@ -0,0 +1,48 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +#ifndef RUST_CODEPOINT_H +#define RUST_CODEPOINT_H + +#include "rust-system.h" + +namespace Rust { + +// FIXME: move this to rust-unicode.h? +struct Codepoint +{ + uint32_t value; + + // Creates a zero codepoint. + Codepoint () : value (0) {} + + // Creates a codepoint from an encoded UTF-8 value. + Codepoint (uint32_t value) : value (value) {} + + static Codepoint eof () { return Codepoint (UINT32_MAX); } + bool is_eof () const { return value == UINT32_MAX; } + + // Returns a C++ string containing string value of codepoint. + std::string as_string (); + + bool operator== (Codepoint other) const { return value == other.value; } + bool operator!= (Codepoint other) const { return !operator== (other); } +}; +} // namespace Rust + +#endif diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc index a35d54a..6c796ab 100644 --- a/gcc/rust/util/rust-punycode.cc +++ b/gcc/rust/util/rust-punycode.cc @@ -36,15 +36,13 @@ constexpr uint32_t INITIAL_BIAS = 72; constexpr uint32_t INITIAL_N = 128; constexpr char DELIMITER = '-'; -constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; - std::string extract_basic_string (const std::vector<Codepoint> &src) { std::string basic_string; for (auto c : src) { - if (c.value <= MAX_ASCII_CODEPOINT) + if (c.value <= 0x7F) basic_string += c.as_string (); } return basic_string; diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc index b2ddaf0..95653cb 100644 --- a/gcc/rust/util/rust-unicode.cc +++ b/gcc/rust/util/rust-unicode.cc @@ -1,3 +1,21 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. 
+ +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + #include "rust-system.h" #include "optional.h" #include "selftest.h" -- cgit v1.1