// Copyright (C) 2020-2025 Free Software Foundation, Inc. // This file is part of GCC. // GCC is free software; you can redistribute it and/or modify it under // the terms of the GNU General Public License as published by the Free // Software Foundation; either version 3, or (at your option) any later // version. // GCC is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // You should have received a copy of the GNU General Public License // along with GCC; see the file COPYING3. If not see // <http://www.gnu.org/licenses/>. #ifndef RUST_INPUT_SOURCE_H #define RUST_INPUT_SOURCE_H #include "rust-codepoint.h" #include "optional.h" namespace Rust { constexpr uint8_t UTF8_BOM1 = 0xEF; constexpr uint8_t UTF8_BOM2 = 0xBB; constexpr uint8_t UTF8_BOM3 = 0xBF; // Input source wrapper thing. class InputSource { private: // position of current character unsigned int pos; std::vector<Codepoint> chars; bool is_valid_utf8; // Overload operator () to return next char from input stream. virtual int next_byte () = 0; Codepoint next_codepoint () { uint32_t input = next_byte (); if ((int32_t) input == EOF) return Codepoint::eof (); else if (input <= MAX_ASCII_CODEPOINT) { // ascii -- 1 byte return {input}; } else if ((input & 0xC0) == 0x80) { // invalid (continuation; can't be first char) return {CODEPOINT_INVALID}; } else if ((input & 0xE0) == 0xC0) { // 2 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); return output; } else if ((input & 0xF0) == 0xE0) { // 3 bytes or UTF-8 BOM uint8_t input2 = next_byte (); // If the second byte is equal to 0xBB then the input is no longer a // valid UTF-8 char. Then, we check if the third byte makes up a UTF // BOM. if (input == UTF8_BOM1 && input2 == UTF8_BOM2) { uint8_t input3 = next_byte (); if (input3 == UTF8_BOM3) // found BOM return next_codepoint (); else return {CODEPOINT_INVALID}; } if ((input2 & 0xC0) != 0x80) return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); return {output}; } else if ((input & 0xF8) == 0xF0) { // 4 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) return {CODEPOINT_INVALID}; uint8_t input4 = next_byte (); if ((input4 & 0xC0) != 0x80) return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); return {output}; } else { return {CODEPOINT_INVALID}; } } protected: // This method must be called by the constructor to initialize the input // source. We cannot move this to the constructor because it calls a // virtual method . void init () { // Check if the input source is valid as utf-8 and copy all characters to // `chars`. Codepoint char32 = next_codepoint (); while (!char32.is_eof () && char32 != CODEPOINT_INVALID) { chars.push_back (char32); char32 = next_codepoint (); } if (char32 == CODEPOINT_INVALID) { // Input source is not valid as utf-8. is_valid_utf8 = false; } } public: InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {} virtual ~InputSource () {} // Checks if input source is a valid UTF-8 string bool is_valid () { return is_valid_utf8; } // get the next UTF-8 character Codepoint next () { if (pos >= chars.size ()) return Codepoint::eof (); else { Codepoint c = chars[pos]; pos++; return c; } } // Returns codepoint if input source is a valid UTF-8 string. Returns // nullopt otherwise. tl::optional<std::vector<Codepoint>> get_chars () { if (is_valid ()) return {chars}; else return tl::nullopt; } }; class FileInputSource : public InputSource { private: // Input source file. FILE *input; int next_byte () override { return fgetc (input); } public: // Create new input source from file. FileInputSource (FILE *input) : InputSource (), input (input) { init (); } }; class BufferInputSource : public InputSource { private: const std::string &buffer; size_t offs; int next_byte () override { if (offs >= buffer.size ()) return EOF; return static_cast<uint8_t> (buffer.at (offs++)); } public: // Create new input source from file. BufferInputSource (const std::string &b, size_t offset) : InputSource (), buffer (b), offs (offset) { init (); } }; } // namespace Rust #endif