diff options
Diffstat (limited to 'gcc/rust/lex/rust-input-source.h')
-rw-r--r-- | gcc/rust/lex/rust-input-source.h | 193 |
1 files changed, 193 insertions, 0 deletions
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h new file mode 100644 index 0000000..07137de --- /dev/null +++ b/gcc/rust/lex/rust-input-source.h @@ -0,0 +1,193 @@ +#ifndef RUST_INPUT_SOURCE_H +#define RUST_INPUT_SOURCE_H + +#include "rust-codepoint.h" +#include "optional.h" + +namespace Rust { +// Input source wrapper thing. +class InputSource +{ +private: + // position of current character + unsigned int pos; + std::vector<Codepoint> chars; + bool is_valid_utf8; + + // Overload operator () to return next char from input stream. + virtual int next_byte () = 0; + + Codepoint next_codepoint () + { + uint32_t input = next_byte (); + + if ((int32_t) input == EOF) + return Codepoint::eof (); + else if (input < 128) + { + // ascii -- 1 byte + return {input}; + } + else if ((input & 0xC0) == 0x80) + { + // invalid (continuation; can't be first char) + return {0xFFFE}; + } + else if ((input & 0xE0) == 0xC0) + { + // 2 bytes + uint8_t input2 = next_byte (); + if ((input2 & 0xC0) != 0x80) + return {0xFFFE}; + + uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); + return output; + } + else if ((input & 0xF0) == 0xE0) + { + // 3 bytes or UTF-8 BOM + uint8_t input2 = next_byte (); + // If the second byte is equal to 0xBB then the input is no longer a + // valid UTF-8 char. Then, we check if the third byte makes up a UTF + // BOM. + if (input == 0xEF && input2 == 0xBB) + { + uint8_t input3 = next_byte (); + if (input3 == 0xBF) + // found BOM + return next_codepoint (); + else + return {0xFFFE}; + } + + if ((input2 & 0xC0) != 0x80) + return {0xFFFE}; + + uint8_t input3 = next_byte (); + + if ((input3 & 0xC0) != 0x80) + return {0xFFFE}; + + uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) + | ((input3 & 0x3F) << 0); + return {output}; + } + else if ((input & 0xF8) == 0xF0) + { + // 4 bytes + uint8_t input2 = next_byte (); + if ((input2 & 0xC0) != 0x80) + return {0xFFFE}; + + uint8_t input3 = next_byte (); + if ((input3 & 0xC0) != 0x80) + return {0xFFFE}; + + uint8_t input4 = next_byte (); + if ((input4 & 0xC0) != 0x80) + return {0xFFFE}; + + uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); + return {output}; + } + else + { + return {0xFFFE}; + } + } + +protected: + // Check if the input source is valid as utf-8 and copy all characters to + // `chars`. + void init () + { + Codepoint char32 = next_codepoint (); + while (!char32.is_eof () && char32 != 0xFFFE) + { + chars.push_back (char32); + char32 = next_codepoint (); + } + + if (char32 == 0xFFFE) + { + // Input source is not valid as utf-8. + is_valid_utf8 = false; + } + } + +public: + InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {} + + virtual ~InputSource () {} + + // Checks if input source is a valid UTF-8 string + bool is_valid () { return is_valid_utf8; } + + // get the next UTF-8 character + Codepoint next () + { + if (pos >= chars.size ()) + return Codepoint::eof (); + else + { + Codepoint c = chars[pos]; + pos++; + return c; + } + } + + // Returns codepoint if input source is a valid UTF-8 string. Returns + // nullopt otherwise. + tl::optional<std::vector<Codepoint>> get_chars () + { + if (is_valid ()) + return {chars}; + else + return tl::nullopt; + } +}; + +class FileInputSource : public InputSource +{ +private: + // Input source file. + FILE *input; + + int next_byte () override { return fgetc (input); } + +public: + // Create new input source from file. + FileInputSource (FILE *input) : InputSource (), input (input) + { + // TODO make this better? + init (); + } +}; + +class BufferInputSource : public InputSource +{ +private: + const std::string &buffer; + size_t offs; + + int next_byte () override + { + if (offs >= buffer.size ()) + return EOF; + return (uint8_t) buffer.at (offs++); + } + +public: + // Create new input source from file. + BufferInputSource (const std::string &b, size_t offset) + : InputSource (), buffer (b), offs (offset) + { + // TODO make this better? + init (); + } +}; + +} // namespace Rust + +#endif |