gccrs: clean up Codepoint and InputSource

gcc/rust/ChangeLog:

	* lex/rust-codepoint.h: Moved to...
	* util/rust-codepoint.h: ...here.
	* lex/rust-input-source.h: Add missing license.
	* util/rust-unicode.cc: Add missing license.
	* util/rust-punycode.cc (extract_basic_string): Remove constant.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
author: Raiki Tamura <tamaron1203@gmail.com> 2023-08-06 19:17:17 +0900
committer: Arthur Cohen <arthur.cohen@embecosm.com> 2024-01-16 19:00:30 +0100
commit: 5b47923fe512f088a4f1c31466236843c20b7ff9 (patch)
tree: 4c3258bbf931ea7093e47d8c7008bcc35d9dbeaa /gcc/rust
parent: 1e288d66cb4f0a25a249c0c6dacc2efbf9e44dc8 (diff)
download: gcc-5b47923fe512f088a4f1c31466236843c20b7ff9.zip
gcc-5b47923fe512f088a4f1c31466236843c20b7ff9.tar.gz
gcc-5b47923fe512f088a4f1c31466236843c20b7ff9.tar.bz2
4 files changed, 66 insertions, 26 deletions
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h
index 07137de..32261a0 100644
--- a/gcc/rust/lex/rust-input-source.h
+++ b/gcc/rust/lex/rust-input-source.h
@@ -1,3 +1,21 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
 #ifndef RUST_INPUT_SOURCE_H
 #define RUST_INPUT_SOURCE_H
 
@@ -5,6 +23,14 @@
 #include "optional.h"
 
 namespace Rust {
+
+constexpr uint8_t UTF8_BOM1 = 0xEF;
+constexpr uint8_t UTF8_BOM2 = 0xBB;
+constexpr uint8_t UTF8_BOM3 = 0xBF;
+
+constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
+constexpr uint32_t CODEPOINT_INVALID = 0xFFFE;
+
 // Input source wrapper thing.
 class InputSource
 {
@@ -23,7 +49,7 @@ private:
 
     if ((int32_t) input == EOF)
       return Codepoint::eof ();
-    else if (input < 128)
+    else if (input <= MAX_ASCII_CODEPOINT)
       {
 	// ascii -- 1 byte
 	return {input};
@@ -31,14 +57,14 @@ private:
     else if ((input & 0xC0) == 0x80)
       {
 	// invalid (continuation; can't be first char)
-	return {0xFFFE};
+	return {CODEPOINT_INVALID};
       }
     else if ((input & 0xE0) == 0xC0)
       {
 	// 2 bytes
 	uint8_t input2 = next_byte ();
 	if ((input2 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
 	return output;
@@ -50,23 +76,23 @@ private:
 	// If the second byte is equal to 0xBB then the input is no longer a
 	// valid UTF-8 char. Then, we check if the third byte makes up a UTF
 	// BOM.
-	if (input == 0xEF && input2 == 0xBB)
+	if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
 	  {
 	    uint8_t input3 = next_byte ();
-	    if (input3 == 0xBF)
+	    if (input3 == UTF8_BOM3)
 	      // found BOM
 	      return next_codepoint ();
 	    else
-	      return {0xFFFE};
+	      return {CODEPOINT_INVALID};
 	  }
 
 	if ((input2 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint8_t input3 = next_byte ();
 
 	if ((input3 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
 			  | ((input3 & 0x3F) << 0);
@@ -77,15 +103,15 @@ private:
 	// 4 bytes
 	uint8_t input2 = next_byte ();
 	if ((input2 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint8_t input3 = next_byte ();
 	if ((input3 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint8_t input4 = next_byte ();
 	if ((input4 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
 			  | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
@@ -93,23 +119,26 @@ private:
       }
     else
       {
-	return {0xFFFE};
+	return {CODEPOINT_INVALID};
       }
   }
 
 protected:
-  // Check if the input source is valid as utf-8 and copy all characters to
-  // `chars`.
+  // Each derived class's constructor must call this method to initialize the
+  // input source. It cannot be moved into the base-class constructor because
+  // it calls a virtual method.
   void init ()
   {
+    // Check if the input source is valid as utf-8 and copy all characters to
+    // `chars`.
     Codepoint char32 = next_codepoint ();
-    while (!char32.is_eof () && char32 != 0xFFFE)
+    while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
       {
 	chars.push_back (char32);
 	char32 = next_codepoint ();
       }
 
-    if (char32 == 0xFFFE)
+    if (char32 == CODEPOINT_INVALID)
       {
 	// Input source is not valid as utf-8.
 	is_valid_utf8 = false;
@@ -158,11 +187,7 @@ private:
 
 public:
   // Create new input source from file.
-  FileInputSource (FILE *input) : InputSource (), input (input)
-  {
-    // TODO make this better?
-    init ();
-  }
+  FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
 };
 
 class BufferInputSource : public InputSource
@@ -175,7 +200,7 @@ private:
   {
     if (offs >= buffer.size ())
       return EOF;
-    return (uint8_t) buffer.at (offs++);
+    return static_cast<uint8_t> (buffer.at (offs++));
   }
 
 public:
@@ -183,7 +208,6 @@ public:
   BufferInputSource (const std::string &b, size_t offset)
     : InputSource (), buffer (b), offs (offset)
   {
-    // TODO make this better?
     init ();
   }
 };
diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h
index 755c837..755c837 100644
--- a/gcc/rust/lex/rust-codepoint.h
+++ b/gcc/rust/util/rust-codepoint.h
diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc
index a35d54a..6c796ab 100644
--- a/gcc/rust/util/rust-punycode.cc
+++ b/gcc/rust/util/rust-punycode.cc
@@ -36,15 +36,13 @@ constexpr uint32_t INITIAL_BIAS = 72;
 constexpr uint32_t INITIAL_N = 128;
 constexpr char DELIMITER = '-';
 
-constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
-
 std::string
 extract_basic_string (const std::vector<Codepoint> &src)
 {
   std::string basic_string;
   for (auto c : src)
     {
-      if (c.value <= MAX_ASCII_CODEPOINT)
+      if (c.value <= 0x7F)
 	basic_string += c.as_string ();
     }
   return basic_string;
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
index b2ddaf0..95653cb 100644
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@@ -1,3 +1,21 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
 #include "rust-system.h"
 #include "optional.h"
 #include "selftest.h"
author	Raiki Tamura <tamaron1203@gmail.com>	2023-08-06 19:17:17 +0900
committer	Arthur Cohen <arthur.cohen@embecosm.com>	2024-01-16 19:00:30 +0100
commit	5b47923fe512f088a4f1c31466236843c20b7ff9 (patch)
tree	4c3258bbf931ea7093e47d8c7008bcc35d9dbeaa /gcc/rust
parent	1e288d66cb4f0a25a249c0c6dacc2efbf9e44dc8 (diff)
download	gcc-5b47923fe512f088a4f1c31466236843c20b7ff9.zip gcc-5b47923fe512f088a4f1c31466236843c20b7ff9.tar.gz gcc-5b47923fe512f088a4f1c31466236843c20b7ff9.tar.bz2