gccrs: libgrust: Add format_parser library

Compile libformat_parser and link to it. gcc/rust/ChangeLog: * Make-lang.in: Compile libformat_parser. * ast/rust-fmt.cc: New FFI definitions. * ast/rust-fmt.h: Likewise. * expand/rust-macro-builtins.cc (MacroBuiltin::format_args_handler): Call into libformat_parser. * expand/rust-macro-builtins.h: Define format_args!() handler proper. libgrust/ChangeLog: * libformat_parser/Cargo.lock: New file. * libformat_parser/Cargo.toml: New file. * libformat_parser/generic_format_parser/Cargo.toml: New file. * libformat_parser/generic_format_parser/src/lib.rs: New file. * libformat_parser/src/bin.rs: New file. * libformat_parser/src/lib.rs: New file.
author: Arthur Cohen <arthur.cohen@embecosm.com> 2024-04-23 13:38:58 +0200
committer: Arthur Cohen <arthur.cohen@embecosm.com> 2024-08-01 12:47:19 +0200
commit: 6fef4d6ffcab0fec8518adcb05458cba5dbeac25 (patch)
tree: bb3bdab1b69382086cc1bab294d79f60f75295c7 /libgrust
parent: 473feb033d5ccb139f8af8e0e54193b176d1cd93 (diff)
download: gcc-6fef4d6ffcab0fec8518adcb05458cba5dbeac25.zip
gcc-6fef4d6ffcab0fec8518adcb05458cba5dbeac25.tar.gz
gcc-6fef4d6ffcab0fec8518adcb05458cba5dbeac25.tar.bz2
6 files changed, 1210 insertions, 0 deletions
diff --git a/libgrust/libformat_parser/Cargo.lock b/libgrust/libformat_parser/Cargo.lock
new file mode 100644
index 0000000..65e4826
--- /dev/null
+++ b/libgrust/libformat_parser/Cargo.lock
@@ -0,0 +1,30 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "generic_format_parser"
+version = "0.1.0"
+dependencies = [
+ "unicode-xid",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.152"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7"
+
+[[package]]
+name = "libformat_parser"
+version = "0.1.0"
+dependencies = [
+ "generic_format_parser",
+ "libc",
+]
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
diff --git a/libgrust/libformat_parser/Cargo.toml b/libgrust/libformat_parser/Cargo.toml
new file mode 100644
index 0000000..0fcfa3e
--- /dev/null
+++ b/libgrust/libformat_parser/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "libformat_parser"
+version = "0.1.0"
+edition = "2021"
+
+[workspace]
+
+members = [
+  "generic_format_parser",
+]
+
+[dependencies]
+libc = "0.2"
+generic_format_parser = { path = "generic_format_parser" }
+
+[lib]
+crate_type = ["staticlib", "rlib"]
+
+[[bin]]
+name = "format_parser_test"
+path = "src/bin.rs"
diff --git a/libgrust/libformat_parser/generic_format_parser/Cargo.toml b/libgrust/libformat_parser/generic_format_parser/Cargo.toml
new file mode 100644
index 0000000..3457703
--- /dev/null
+++ b/libgrust/libformat_parser/generic_format_parser/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "generic_format_parser"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+unicode-xid = "0.2.0"
diff --git a/libgrust/libformat_parser/generic_format_parser/src/lib.rs b/libgrust/libformat_parser/generic_format_parser/src/lib.rs
new file mode 100644
index 0000000..f42c9d8
--- /dev/null
+++ b/libgrust/libformat_parser/generic_format_parser/src/lib.rs
@@ -0,0 +1,1102 @@
+//! Macro support for format strings
+//!
+//! These structures are used when parsing format strings for the compiler.
+//! Parsing does not happen at runtime: structures of `std::fmt::rt` are
+//! generated instead.
+
+#![doc(
+    html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/",
+    html_playground_url = "https://play.rust-lang.org/",
+    test(attr(deny(warnings)))
+)]
+#![deny(rustc::untranslatable_diagnostic)]
+#![deny(rustc::diagnostic_outside_of_impl)]
+// WARNING: We want to be able to build this crate with a stable compiler,
+//          so no `#![feature]` attributes should be added!
+
+#[deprecated(note = "Use a proper lexer function for this")]
+fn is_id_start(c: char) -> bool {
+    c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
+}
+
+#[deprecated(note = "Use a proper lexer function for this")]
+fn is_id_continue(c: char) -> bool {
+    unicode_xid::UnicodeXID::is_xid_continue(c)
+}
+
+// use rustc_lexer::unescape;
+pub use Alignment::*;
+pub use Count::*;
+pub use Piece::*;
+pub use Position::*;
+
+use std::iter;
+use std::str;
+use std::string;
+
+// Note: copied from rustc_span
+/// Range inside of a `Span` used for diagnostics when we only have access to relative positions.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub struct InnerSpan {
+    pub start: usize,
+    pub end: usize,
+}
+
+impl InnerSpan {
+    pub fn new(start: usize, end: usize) -> InnerSpan {
+        InnerSpan { start, end }
+    }
+}
+
+/// The location and before/after width of a character whose width has changed from its source code
+/// representation
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub struct InnerWidthMapping {
+    /// Index of the character in the source
+    pub position: usize,
+    /// The inner width in characters
+    pub before: usize,
+    /// The transformed width in characters
+    pub after: usize,
+}
+
+impl InnerWidthMapping {
+    pub fn new(position: usize, before: usize, after: usize) -> InnerWidthMapping {
+        InnerWidthMapping {
+            position,
+            before,
+            after,
+        }
+    }
+}
+
+/// Whether the input string is a literal. If yes, it contains the inner width mappings.
+#[derive(Clone, PartialEq, Eq)]
+enum InputStringKind {
+    NotALiteral,
+    Literal {
+        width_mappings: Vec<InnerWidthMapping>,
+    },
+}
+
+/// The type of format string that we are parsing.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub enum ParseMode {
+    /// A normal format string as per `format_args!`.
+    Format,
+    /// An inline assembly template string for `asm!`.
+    InlineAsm,
+}
+
+#[derive(Copy, Clone)]
+struct InnerOffset(usize);
+
+impl InnerOffset {
+    fn to(self, end: InnerOffset) -> InnerSpan {
+        InnerSpan::new(self.0, end.0)
+    }
+}
+
+/// A piece is a portion of the format string which represents the next part
+/// to emit. These are emitted as a stream by the `Parser` class.
+#[derive(Clone, Debug, PartialEq)]
+pub enum Piece<'a> {
+    /// A literal string which should directly be emitted
+    String(&'a str),
+    /// This describes that formatting should process the next argument (as
+    /// specified inside) for emission.
+    NextArgument(Box<Argument<'a>>),
+}
+
+/// Representation of an argument specification.
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub struct Argument<'a> {
+    /// Where to find this argument
+    pub position: Position<'a>,
+    /// The span of the position indicator. Includes any whitespace in implicit
+    /// positions (`{  }`).
+    pub position_span: InnerSpan,
+    /// How to format the argument
+    pub format: FormatSpec<'a>,
+}
+
+/// Specification for the formatting of an argument in the format string.
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub struct FormatSpec<'a> {
+    /// Optionally specified character to fill alignment with.
+    pub fill: Option<char>,
+    /// Span of the optionally specified fill character.
+    pub fill_span: Option<InnerSpan>,
+    /// Optionally specified alignment.
+    pub align: Alignment,
+    /// The `+` or `-` flag.
+    pub sign: Option<Sign>,
+    /// The `#` flag.
+    pub alternate: bool,
+    /// The `0` flag.
+    pub zero_pad: bool,
+    /// The `x` or `X` flag. (Only for `Debug`.)
+    pub debug_hex: Option<DebugHex>,
+    /// The integer precision to use.
+    pub precision: Count<'a>,
+    /// The span of the precision formatting flag (for diagnostics).
+    pub precision_span: Option<InnerSpan>,
+    /// The string width requested for the resulting format.
+    pub width: Count<'a>,
+    /// The span of the width formatting flag (for diagnostics).
+    pub width_span: Option<InnerSpan>,
+    /// The descriptor string representing the name of the format desired for
+    /// this argument, this can be empty or any number of characters, although
+    /// it is required to be one word.
+    pub ty: &'a str,
+    /// The span of the descriptor string (for diagnostics).
+    pub ty_span: Option<InnerSpan>,
+}
+
+/// Enum describing where an argument for a format can be located.
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub enum Position<'a> {
+    /// The argument is implied to be located at an index
+    ArgumentImplicitlyIs(usize),
+    /// The argument is located at a specific index given in the format,
+    ArgumentIs(usize),
+    /// The argument has a name.
+    ArgumentNamed(&'a str),
+}
+
+impl Position<'_> {
+    pub fn index(&self) -> Option<usize> {
+        match self {
+            ArgumentIs(i, ..) | ArgumentImplicitlyIs(i) => Some(*i),
+            _ => None,
+        }
+    }
+}
+
+/// Enum of alignments which are supported.
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub enum Alignment {
+    /// The value will be aligned to the left.
+    AlignLeft,
+    /// The value will be aligned to the right.
+    AlignRight,
+    /// The value will be aligned in the center.
+    AlignCenter,
+    /// The value will take on a default alignment.
+    AlignUnknown,
+}
+
+/// Enum for the sign flags.
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub enum Sign {
+    /// The `+` flag.
+    Plus,
+    /// The `-` flag.
+    Minus,
+}
+
+/// Enum for the debug hex flags.
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub enum DebugHex {
+    /// The `x` flag in `{:x?}`.
+    Lower,
+    /// The `X` flag in `{:X?}`.
+    Upper,
+}
+
+/// A count is used for the precision and width parameters of an integer, and
+/// can reference either an argument or a literal integer.
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub enum Count<'a> {
+    /// The count is specified explicitly.
+    CountIs(usize),
+    /// The count is specified by the argument with the given name.
+    CountIsName(&'a str, InnerSpan),
+    /// The count is specified by the argument at the given index.
+    CountIsParam(usize),
+    /// The count is specified by a star (like in `{:.*}`) that refers to the argument at the given index.
+    CountIsStar(usize),
+    /// The count is implied and cannot be explicitly specified.
+    CountImplied,
+}
+
+pub struct ParseError {
+    pub description: string::String,
+    pub note: Option<string::String>,
+    pub label: string::String,
+    pub span: InnerSpan,
+    pub secondary_label: Option<(string::String, InnerSpan)>,
+    pub suggestion: Suggestion,
+}
+
+pub enum Suggestion {
+    None,
+    /// Replace inline argument with positional argument:
+    /// `format!("{foo.bar}")` -> `format!("{}", foo.bar)`
+    UsePositional,
+    /// Remove `r#` from identifier:
+    /// `format!("{r#foo}")` -> `format!("{foo}")`
+    RemoveRawIdent(InnerSpan),
+}
+
+/// The parser structure for interpreting the input format string. This is
+/// modeled as an iterator over `Piece` structures to form a stream of tokens
+/// being output.
+///
+/// This is a recursive-descent parser for the sake of simplicity, and if
+/// necessary there's probably lots of room for improvement performance-wise.
+pub struct Parser<'a> {
+    mode: ParseMode,
+    input: &'a str,
+    cur: iter::Peekable<str::CharIndices<'a>>,
+    /// Error messages accumulated during parsing
+    pub errors: Vec<ParseError>,
+    /// Current position of implicit positional argument pointer
+    pub curarg: usize,
+    /// `Some(raw count)` when the string is "raw", used to position spans correctly
+    style: Option<usize>,
+    /// Start and end byte offset of every successfully parsed argument
+    pub arg_places: Vec<InnerSpan>,
+    /// Characters whose length has been changed from their in-code representation
+    width_map: Vec<InnerWidthMapping>,
+    /// Span of the last opening brace seen, used for error reporting
+    last_opening_brace: Option<InnerSpan>,
+    /// Whether the source string is comes from `println!` as opposed to `format!` or `print!`
+    append_newline: bool,
+    /// Whether this formatting string was written directly in the source. This controls whether we
+    /// can use spans to refer into it and give better error messages.
+    /// N.B: This does _not_ control whether implicit argument captures can be used.
+    pub is_source_literal: bool,
+    /// Start position of the current line.
+    cur_line_start: usize,
+    /// Start and end byte offset of every line of the format string. Excludes
+    /// newline characters and leading whitespace.
+    pub line_spans: Vec<InnerSpan>,
+}
+
+impl<'a> Iterator for Parser<'a> {
+    type Item = Piece<'a>;
+
+    fn next(&mut self) -> Option<Piece<'a>> {
+        if let Some(&(pos, c)) = self.cur.peek() {
+            match c {
+                '{' => {
+                    let curr_last_brace = self.last_opening_brace;
+                    let byte_pos = self.to_span_index(pos);
+                    let lbrace_end = InnerOffset(byte_pos.0 + self.to_span_width(pos));
+                    self.last_opening_brace = Some(byte_pos.to(lbrace_end));
+                    self.cur.next();
+                    if self.consume('{') {
+                        self.last_opening_brace = curr_last_brace;
+
+                        Some(String(self.string(pos + 1)))
+                    } else {
+                        let arg = self.argument(lbrace_end);
+                        if let Some(rbrace_pos) = self.consume_closing_brace(&arg) {
+                            if self.is_source_literal {
+                                let lbrace_byte_pos = self.to_span_index(pos);
+                                let rbrace_byte_pos = self.to_span_index(rbrace_pos);
+
+                                let width = self.to_span_width(rbrace_pos);
+
+                                self.arg_places.push(
+                                    lbrace_byte_pos.to(InnerOffset(rbrace_byte_pos.0 + width)),
+                                );
+                            }
+                        } else {
+                            if let Some(&(_, maybe)) = self.cur.peek() {
+                                if maybe == '?' {
+                                    self.suggest_format();
+                                } else {
+                                    self.suggest_positional_arg_instead_of_captured_arg(arg);
+                                }
+                            }
+                        }
+                        Some(NextArgument(Box::new(arg)))
+                    }
+                }
+                '}' => {
+                    self.cur.next();
+                    if self.consume('}') {
+                        Some(String(self.string(pos + 1)))
+                    } else {
+                        let err_pos = self.to_span_index(pos);
+                        self.err_with_note(
+                            "unmatched `}` found",
+                            "unmatched `}`",
+                            "if you intended to print `}`, you can escape it using `}}`",
+                            err_pos.to(err_pos),
+                        );
+                        None
+                    }
+                }
+                _ => Some(String(self.string(pos))),
+            }
+        } else {
+            if self.is_source_literal {
+                let span = self.span(self.cur_line_start, self.input.len());
+                if self.line_spans.last() != Some(&span) {
+                    self.line_spans.push(span);
+                }
+            }
+            None
+        }
+    }
+}
+
+impl<'a> Parser<'a> {
+    /// Creates a new parser for the given format string
+    pub fn new(
+        s: &'a str,
+        style: Option<usize>,
+        snippet: Option<string::String>,
+        append_newline: bool,
+        mode: ParseMode,
+    ) -> Parser<'a> {
+        let input_string_kind = find_width_map_from_snippet(s, snippet, style);
+        let (width_map, is_source_literal) = match input_string_kind {
+            InputStringKind::Literal { width_mappings } => (width_mappings, true),
+            InputStringKind::NotALiteral => (Vec::new(), false),
+        };
+
+        Parser {
+            mode,
+            input: s,
+            cur: s.char_indices().peekable(),
+            errors: vec![],
+            curarg: 0,
+            style,
+            arg_places: vec![],
+            width_map,
+            last_opening_brace: None,
+            append_newline,
+            is_source_literal,
+            cur_line_start: 0,
+            line_spans: vec![],
+        }
+    }
+
+    /// Notifies of an error. The message doesn't actually need to be of type
+    /// String, but I think it does when this eventually uses conditions so it
+    /// might as well start using it now.
+    fn err<S1: Into<string::String>, S2: Into<string::String>>(
+        &mut self,
+        description: S1,
+        label: S2,
+        span: InnerSpan,
+    ) {
+        self.errors.push(ParseError {
+            description: description.into(),
+            note: None,
+            label: label.into(),
+            span,
+            secondary_label: None,
+            suggestion: Suggestion::None,
+        });
+    }
+
+    /// Notifies of an error. The message doesn't actually need to be of type
+    /// String, but I think it does when this eventually uses conditions so it
+    /// might as well start using it now.
+    fn err_with_note<
+        S1: Into<string::String>,
+        S2: Into<string::String>,
+        S3: Into<string::String>,
+    >(
+        &mut self,
+        description: S1,
+        label: S2,
+        note: S3,
+        span: InnerSpan,
+    ) {
+        self.errors.push(ParseError {
+            description: description.into(),
+            note: Some(note.into()),
+            label: label.into(),
+            span,
+            secondary_label: None,
+            suggestion: Suggestion::None,
+        });
+    }
+
+    /// Optionally consumes the specified character. If the character is not at
+    /// the current position, then the current iterator isn't moved and `false` is
+    /// returned, otherwise the character is consumed and `true` is returned.
+    fn consume(&mut self, c: char) -> bool {
+        self.consume_pos(c).is_some()
+    }
+
+    /// Optionally consumes the specified character. If the character is not at
+    /// the current position, then the current iterator isn't moved and `None` is
+    /// returned, otherwise the character is consumed and the current position is
+    /// returned.
+    fn consume_pos(&mut self, c: char) -> Option<usize> {
+        if let Some(&(pos, maybe)) = self.cur.peek() {
+            if c == maybe {
+                self.cur.next();
+                return Some(pos);
+            }
+        }
+        None
+    }
+
+    fn remap_pos(&self, mut pos: usize) -> InnerOffset {
+        for width in &self.width_map {
+            if pos > width.position {
+                pos += width.before - width.after;
+            } else if pos == width.position && width.after == 0 {
+                pos += width.before;
+            } else {
+                break;
+            }
+        }
+
+        InnerOffset(pos)
+    }
+
+    fn to_span_index(&self, pos: usize) -> InnerOffset {
+        // This handles the raw string case, the raw argument is the number of #
+        // in r###"..."### (we need to add one because of the `r`).
+        let raw = self.style.map_or(0, |raw| raw + 1);
+        let pos = self.remap_pos(pos);
+        InnerOffset(raw + pos.0 + 1)
+    }
+
+    fn to_span_width(&self, pos: usize) -> usize {
+        let pos = self.remap_pos(pos);
+        match self.width_map.iter().find(|w| w.position == pos.0) {
+            Some(w) => w.before,
+            None => 1,
+        }
+    }
+
+    fn span(&self, start_pos: usize, end_pos: usize) -> InnerSpan {
+        let start = self.to_span_index(start_pos);
+        let end = self.to_span_index(end_pos);
+        start.to(end)
+    }
+
+    /// Forces consumption of the specified character. If the character is not
+    /// found, an error is emitted.
+    fn consume_closing_brace(&mut self, arg: &Argument<'_>) -> Option<usize> {
+        self.ws();
+
+        let pos;
+        let description;
+
+        if let Some(&(peek_pos, maybe)) = self.cur.peek() {
+            if maybe == '}' {
+                self.cur.next();
+                return Some(peek_pos);
+            }
+
+            pos = peek_pos;
+            description = format!("expected `'}}'`, found `{maybe:?}`");
+        } else {
+            description = "expected `'}'` but string was terminated".to_owned();
+            // point at closing `"`
+            pos = self.input.len() - if self.append_newline { 1 } else { 0 };
+        }
+
+        let pos = self.to_span_index(pos);
+
+        let label = "expected `'}'`".to_owned();
+        let (note, secondary_label) = if arg.format.fill == Some('}') {
+            (
+                Some("the character `'}'` is interpreted as a fill character because of the `:` that precedes it".to_owned()),
+                arg.format.fill_span.map(|sp| ("this is not interpreted as a formatting closing brace".to_owned(), sp)),
+            )
+        } else {
+            (
+                Some("if you intended to print `{`, you can escape it using `{{`".to_owned()),
+                self.last_opening_brace
+                    .map(|sp| ("because of this opening brace".to_owned(), sp)),
+            )
+        };
+
+        self.errors.push(ParseError {
+            description,
+            note,
+            label,
+            span: pos.to(pos),
+            secondary_label,
+            suggestion: Suggestion::None,
+        });
+
+        None
+    }
+
+    /// Consumes all whitespace characters until the first non-whitespace character
+    fn ws(&mut self) {
+        while let Some(&(_, c)) = self.cur.peek() {
+            if c.is_whitespace() {
+                self.cur.next();
+            } else {
+                break;
+            }
+        }
+    }
+
+    /// Parses all of a string which is to be considered a "raw literal" in a
+    /// format string. This is everything outside of the braces.
+    fn string(&mut self, start: usize) -> &'a str {
+        // we may not consume the character, peek the iterator
+        while let Some(&(pos, c)) = self.cur.peek() {
+            match c {
+                '{' | '}' => {
+                    return &self.input[start..pos];
+                }
+                '\n' if self.is_source_literal => {
+                    self.line_spans.push(self.span(self.cur_line_start, pos));
+                    self.cur_line_start = pos + 1;
+                    self.cur.next();
+                }
+                _ => {
+                    if self.is_source_literal && pos == self.cur_line_start && c.is_whitespace() {
+                        self.cur_line_start = pos + c.len_utf8();
+                    }
+                    self.cur.next();
+                }
+            }
+        }
+        &self.input[start..self.input.len()]
+    }
+
+    /// Parses an `Argument` structure, or what's contained within braces inside the format string.
+    fn argument(&mut self, start: InnerOffset) -> Argument<'a> {
+        let pos = self.position();
+
+        let end = self
+            .cur
+            .clone()
+            .find(|(_, ch)| !ch.is_whitespace())
+            .map_or(start, |(end, _)| self.to_span_index(end));
+        let position_span = start.to(end);
+
+        let format = match self.mode {
+            ParseMode::Format => self.format(),
+            ParseMode::InlineAsm => self.inline_asm(),
+        };
+
+        // Resolve position after parsing format spec.
+        let pos = match pos {
+            Some(position) => position,
+            None => {
+                let i = self.curarg;
+                self.curarg += 1;
+                ArgumentImplicitlyIs(i)
+            }
+        };
+
+        Argument {
+            position: pos,
+            position_span,
+            format,
+        }
+    }
+
+    /// Parses a positional argument for a format. This could either be an
+    /// integer index of an argument, a named argument, or a blank string.
+    /// Returns `Some(parsed_position)` if the position is not implicitly
+    /// consuming a macro argument, `None` if it's the case.
+    fn position(&mut self) -> Option<Position<'a>> {
+        if let Some(i) = self.integer() {
+            Some(ArgumentIs(i))
+        } else {
+            match self.cur.peek() {
+                Some(&(lo, c)) if is_id_start(c) => {
+                    let word = self.word();
+
+                    // Recover from `r#ident` in format strings.
+                    // FIXME: use a let chain
+                    if word == "r" {
+                        if let Some((pos, '#')) = self.cur.peek() {
+                            if self.input[pos + 1..]
+                                .chars()
+                                .next()
+                                .is_some_and(is_id_start)
+                            {
+                                self.cur.next();
+                                let word = self.word();
+                                let prefix_span = self.span(lo, lo + 2);
+                                let full_span = self.span(lo, lo + 2 + word.len());
+                                self.errors.insert(0, ParseError {
+                                    description: "raw identifiers are not supported".to_owned(),
+                                    note: Some("identifiers in format strings can be keywords and don't need to be prefixed with `r#`".to_string()),
+                                    label: "raw identifier used here".to_owned(),
+                                    span: full_span,
+                                    secondary_label: None,
+                                    suggestion: Suggestion::RemoveRawIdent(prefix_span),
+                                });
+                                return Some(ArgumentNamed(word));
+                            }
+                        }
+                    }
+
+                    Some(ArgumentNamed(word))
+                }
+
+                // This is an `ArgumentNext`.
+                // Record the fact and do the resolution after parsing the
+                // format spec, to make things like `{:.*}` work.
+                _ => None,
+            }
+        }
+    }
+
+    fn current_pos(&mut self) -> usize {
+        if let Some(&(pos, _)) = self.cur.peek() {
+            pos
+        } else {
+            self.input.len()
+        }
+    }
+
+    /// Parses a format specifier at the current position, returning all of the
+    /// relevant information in the `FormatSpec` struct.
+    fn format(&mut self) -> FormatSpec<'a> {
+        let mut spec = FormatSpec {
+            fill: None,
+            fill_span: None,
+            align: AlignUnknown,
+            sign: None,
+            alternate: false,
+            zero_pad: false,
+            debug_hex: None,
+            precision: CountImplied,
+            precision_span: None,
+            width: CountImplied,
+            width_span: None,
+            ty: &self.input[..0],
+            ty_span: None,
+        };
+        if !self.consume(':') {
+            return spec;
+        }
+
+        // fill character
+        if let Some(&(idx, c)) = self.cur.peek() {
+            if let Some((_, '>' | '<' | '^')) = self.cur.clone().nth(1) {
+                spec.fill = Some(c);
+                spec.fill_span = Some(self.span(idx, idx + 1));
+                self.cur.next();
+            }
+        }
+        // Alignment
+        if self.consume('<') {
+            spec.align = AlignLeft;
+        } else if self.consume('>') {
+            spec.align = AlignRight;
+        } else if self.consume('^') {
+            spec.align = AlignCenter;
+        }
+        // Sign flags
+        if self.consume('+') {
+            spec.sign = Some(Sign::Plus);
+        } else if self.consume('-') {
+            spec.sign = Some(Sign::Minus);
+        }
+        // Alternate marker
+        if self.consume('#') {
+            spec.alternate = true;
+        }
+        // Width and precision
+        let mut havewidth = false;
+
+        if self.consume('0') {
+            // small ambiguity with '0$' as a format string. In theory this is a
+            // '0' flag and then an ill-formatted format string with just a '$'
+            // and no count, but this is better if we instead interpret this as
+            // no '0' flag and '0$' as the width instead.
+            if let Some(end) = self.consume_pos('$') {
+                spec.width = CountIsParam(0);
+                spec.width_span = Some(self.span(end - 1, end + 1));
+                havewidth = true;
+            } else {
+                spec.zero_pad = true;
+            }
+        }
+
+        if !havewidth {
+            let start = self.current_pos();
+            spec.width = self.count(start);
+            if spec.width != CountImplied {
+                let end = self.current_pos();
+                spec.width_span = Some(self.span(start, end));
+            }
+        }
+
+        if let Some(start) = self.consume_pos('.') {
+            if self.consume('*') {
+                // Resolve `CountIsNextParam`.
+                // We can do this immediately as `position` is resolved later.
+                let i = self.curarg;
+                self.curarg += 1;
+                spec.precision = CountIsStar(i);
+            } else {
+                spec.precision = self.count(start + 1);
+            }
+            let end = self.current_pos();
+            spec.precision_span = Some(self.span(start, end));
+        }
+
+        let ty_span_start = self.current_pos();
+        // Optional radix followed by the actual format specifier
+        if self.consume('x') {
+            if self.consume('?') {
+                spec.debug_hex = Some(DebugHex::Lower);
+                spec.ty = "?";
+            } else {
+                spec.ty = "x";
+            }
+        } else if self.consume('X') {
+            if self.consume('?') {
+                spec.debug_hex = Some(DebugHex::Upper);
+                spec.ty = "?";
+            } else {
+                spec.ty = "X";
+            }
+        } else if self.consume('?') {
+            spec.ty = "?";
+        } else {
+            spec.ty = self.word();
+            if !spec.ty.is_empty() {
+                let ty_span_end = self.current_pos();
+                spec.ty_span = Some(self.span(ty_span_start, ty_span_end));
+            }
+        }
+        spec
+    }
+
+    /// Parses an inline assembly template modifier at the current position, returning the modifier
+    /// in the `ty` field of the `FormatSpec` struct.
+    fn inline_asm(&mut self) -> FormatSpec<'a> {
+        let mut spec = FormatSpec {
+            fill: None,
+            fill_span: None,
+            align: AlignUnknown,
+            sign: None,
+            alternate: false,
+            zero_pad: false,
+            debug_hex: None,
+            precision: CountImplied,
+            precision_span: None,
+            width: CountImplied,
+            width_span: None,
+            ty: &self.input[..0],
+            ty_span: None,
+        };
+        if !self.consume(':') {
+            return spec;
+        }
+
+        let ty_span_start = self.current_pos();
+        spec.ty = self.word();
+        if !spec.ty.is_empty() {
+            let ty_span_end = self.current_pos();
+            spec.ty_span = Some(self.span(ty_span_start, ty_span_end));
+        }
+
+        spec
+    }
+
+    /// Parses a `Count` parameter at the current position. This does not check
+    /// for 'CountIsNextParam' because that is only used in precision, not
+    /// width.
+    fn count(&mut self, start: usize) -> Count<'a> {
+        if let Some(i) = self.integer() {
+            if self.consume('$') {
+                CountIsParam(i)
+            } else {
+                CountIs(i)
+            }
+        } else {
+            let tmp = self.cur.clone();
+            let word = self.word();
+            if word.is_empty() {
+                self.cur = tmp;
+                CountImplied
+            } else if let Some(end) = self.consume_pos('$') {
+                let name_span = self.span(start, end);
+                CountIsName(word, name_span)
+            } else {
+                self.cur = tmp;
+                CountImplied
+            }
+        }
+    }
+
+    /// Parses a word starting at the current position. A word is the same as
+    /// Rust identifier, except that it can't start with `_` character.
+    fn word(&mut self) -> &'a str {
+        let start = match self.cur.peek() {
+            Some(&(pos, c)) if is_id_start(c) => {
+                self.cur.next();
+                pos
+            }
+            _ => {
+                return "";
+            }
+        };
+        let mut end = None;
+        while let Some(&(pos, c)) = self.cur.peek() {
+            if is_id_continue(c) {
+                self.cur.next();
+            } else {
+                end = Some(pos);
+                break;
+            }
+        }
+        let end = end.unwrap_or(self.input.len());
+        let word = &self.input[start..end];
+        if word == "_" {
+            self.err_with_note(
+                "invalid argument name `_`",
+                "invalid argument name",
+                "argument name cannot be a single underscore",
+                self.span(start, end),
+            );
+        }
+        word
+    }
+
+    fn integer(&mut self) -> Option<usize> {
+        let mut cur: usize = 0;
+        let mut found = false;
+        let mut overflow = false;
+        let start = self.current_pos();
+        while let Some(&(_, c)) = self.cur.peek() {
+            if let Some(i) = c.to_digit(10) {
+                let (tmp, mul_overflow) = cur.overflowing_mul(10);
+                let (tmp, add_overflow) = tmp.overflowing_add(i as usize);
+                if mul_overflow || add_overflow {
+                    overflow = true;
+                }
+                cur = tmp;
+                found = true;
+                self.cur.next();
+            } else {
+                break;
+            }
+        }
+
+        if overflow {
+            let end = self.current_pos();
+            let overflowed_int = &self.input[start..end];
+            self.err(
+                format!(
+                    "integer `{}` does not fit into the type `usize` whose range is `0..={}`",
+                    overflowed_int,
+                    usize::MAX
+                ),
+                "integer out of range for `usize`",
+                self.span(start, end),
+            );
+        }
+
+        found.then_some(cur)
+    }
+
+    fn suggest_format(&mut self) {
+        if let (Some(pos), Some(_)) = (self.consume_pos('?'), self.consume_pos(':')) {
+            let word = self.word();
+            let _end = self.current_pos();
+            let pos = self.to_span_index(pos);
+            self.errors.insert(
+                0,
+                ParseError {
+                    description: "expected format parameter to occur after `:`".to_owned(),
+                    note: Some(format!(
+                        "`?` comes after `:`, try `{}:{}` instead",
+                        word, "?"
+                    )),
+                    label: "expected `?` to occur after `:`".to_owned(),
+                    span: pos.to(pos),
+                    secondary_label: None,
+                    suggestion: Suggestion::None,
+                },
+            );
+        }
+    }
+
+    fn suggest_positional_arg_instead_of_captured_arg(&mut self, arg: Argument<'a>) {
+        if let Some(end) = self.consume_pos('.') {
+            let byte_pos = self.to_span_index(end);
+            let start = InnerOffset(byte_pos.0 + 1);
+            let field = self.argument(start);
+            // We can only parse `foo.bar` field access, any deeper nesting,
+            // or another type of expression, like method calls, are not supported
+            if !self.consume('}') {
+                return;
+            }
+            if let ArgumentNamed(_) = arg.position {
+                if let ArgumentNamed(_) = field.position {
+                    self.errors.insert(
+                        0,
+                        ParseError {
+                            description: "field access isn't supported".to_string(),
+                            note: None,
+                            label: "not supported".to_string(),
+                            span: InnerSpan::new(arg.position_span.start, field.position_span.end),
+                            secondary_label: None,
+                            suggestion: Suggestion::UsePositional,
+                        },
+                    );
+                }
+            }
+        }
+    }
+}
+
+/// Finds the indices of all characters that have been processed and differ between the actual
+/// written code (code snippet) and the `InternedString` that gets processed in the `Parser`
+/// in order to properly synthesise the intra-string `Span`s for error diagnostics.
+// TODO: Can we give an escaped string here? probably yes - and a valid one too
+fn find_width_map_from_snippet(
+    input: &str,
+    snippet: Option<string::String>,
+    str_style: Option<usize>,
+) -> InputStringKind {
+    let snippet = match snippet {
+        Some(ref s) if s.starts_with('"') || s.starts_with("r\"") || s.starts_with("r#") => s,
+        _ => return InputStringKind::NotALiteral,
+    };
+
+    if str_style.is_some() {
+        return InputStringKind::Literal {
+            width_mappings: Vec::new(),
+        };
+    }
+
+    // Strip quotes.
+    let snippet = &snippet[1..snippet.len() - 1];
+
+    // Macros like `println` add a newline at the end. That technically doesn't make them "literals" anymore, but it's fine
+    // since we will never need to point our spans there, so we lie about it here by ignoring it.
+    // Since there might actually be newlines in the source code, we need to normalize away all trailing newlines.
+    // If we only trimmed it off the input, `format!("\n")` would cause a mismatch as here we they actually match up.
+    // Alternatively, we could just count the trailing newlines and only trim one from the input if they don't match up.
+    let input_no_nl = input.trim_end_matches('\n');
+    let Some(unescaped) = unescape_string(snippet) else {
+        return InputStringKind::NotALiteral;
+    };
+
+    let unescaped_no_nl = unescaped.trim_end_matches('\n');
+
+    if unescaped_no_nl != input_no_nl {
+        // The source string that we're pointing at isn't our input, so spans pointing at it will be incorrect.
+        // This can for example happen with proc macros that respan generated literals.
+        return InputStringKind::NotALiteral;
+    }
+
+    let mut s = snippet.char_indices();
+    let mut width_mappings = vec![];
+    while let Some((pos, c)) = s.next() {
+        match (c, s.clone().next()) {
+            // skip whitespace and empty lines ending in '\\'
+            ('\\', Some((_, '\n'))) => {
+                let _ = s.next();
+                let mut width = 2;
+
+                while let Some((_, c)) = s.clone().next() {
+                    if matches!(c, ' ' | '\n' | '\t') {
+                        width += 1;
+                        let _ = s.next();
+                    } else {
+                        break;
+                    }
+                }
+
+                width_mappings.push(InnerWidthMapping::new(pos, width, 0));
+            }
+            ('\\', Some((_, 'n' | 't' | 'r' | '0' | '\\' | '\'' | '\"'))) => {
+                width_mappings.push(InnerWidthMapping::new(pos, 2, 1));
+                let _ = s.next();
+            }
+            ('\\', Some((_, 'x'))) => {
+                // consume `\xAB` literal
+                s.nth(2);
+                width_mappings.push(InnerWidthMapping::new(pos, 4, 1));
+            }
+            ('\\', Some((_, 'u'))) => {
+                let mut width = 2;
+                let _ = s.next();
+
+                if let Some((_, next_c)) = s.next() {
+                    if next_c == '{' {
+                        // consume up to 6 hexanumeric chars
+                        let digits_len = s
+                            .clone()
+                            .take(6)
+                            .take_while(|(_, c)| c.is_digit(16))
+                            .count();
+
+                        let len_utf8 = s
+                            .as_str()
+                            .get(..digits_len)
+                            .and_then(|digits| u32::from_str_radix(digits, 16).ok())
+                            .and_then(char::from_u32)
+                            .map_or(1, char::len_utf8);
+
+                        // Skip the digits, for chars that encode to more than 1 utf-8 byte
+                        // exclude as many digits as it is greater than 1 byte
+                        //
+                        // So for a 3 byte character, exclude 2 digits
+                        let required_skips = digits_len.saturating_sub(len_utf8.saturating_sub(1));
+
+                        // skip '{' and '}' also
+                        width += required_skips + 2;
+
+                        s.nth(digits_len);
+                    } else if next_c.is_digit(16) {
+                        width += 1;
+
+                        // We suggest adding `{` and `}` when appropriate, accept it here as if
+                        // it were correct
+                        let mut i = 0; // consume up to 6 hexanumeric chars
+                        while let (Some((_, c)), _) = (s.next(), i < 6) {
+                            if c.is_digit(16) {
+                                width += 1;
+                            } else {
+                                break;
+                            }
+                            i += 1;
+                        }
+                    }
+                }
+
+                width_mappings.push(InnerWidthMapping::new(pos, width, 1));
+            }
+            _ => {}
+        }
+    }
+
+    InputStringKind::Literal { width_mappings }
+}
+
+// TODO: I guess we can provide an `unescape_string` function to the parser... but how do we do that
+// Store it in the parser struct? we need to make it FFI-aware
+// SO this is not possible because we need `unescape_string` *before* we have a parser
+
+fn unescape_string(string: &str) -> Option<string::String> {
+    // let mut buf = string::String::new();
+    // let mut ok = true;
+    // unescape::unescape_literal(string, unescape::Mode::Str, &mut |_, unescaped_char| {
+    //     match unescaped_char {
+    //         Ok(c) => buf.push(c),
+    //         Err(_) => ok = false,
+    //     }
+    // });
+
+    let buf = string::String::from(string);
+    let ok = true;
+
+    ok.then_some(buf)
+}
+
+// Assert a reasonable size for `Piece`
+// #[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))]
+// rustc_index::static_assert_size!(Piece<'_>, 16);
+
+// #[cfg(test)]
+// mod tests;
+\ No newline at end of file
diff --git a/libgrust/libformat_parser/src/bin.rs b/libgrust/libformat_parser/src/bin.rs
new file mode 100644
index 0000000..4b1f903
--- /dev/null
+++ b/libgrust/libformat_parser/src/bin.rs
@@ -0,0 +1,7 @@
+use libformat_parser::rust;
+
+fn main() {
+    dbg!(rust::collect_pieces(
+        std::env::args().nth(1).unwrap().as_str()
+    ));
+}
diff --git a/libgrust/libformat_parser/src/lib.rs b/libgrust/libformat_parser/src/lib.rs
new file mode 100644
index 0000000..e6dc16e
--- /dev/null
+++ b/libgrust/libformat_parser/src/lib.rs
@@ -0,0 +1,41 @@
+//! FFI interface for `rustc_format_parser`
+
+// what's the plan? Have a function return something that can be constructed into a vector?
+// or an iterator?
+
+use std::ffi::CStr;
+
+// TODO: Use rustc's version here #3
+use generic_format_parser::Piece;
+
+// FIXME: Rename?
+pub mod rust {
+    use generic_format_parser::{ParseMode, Parser, Piece};
+
+    pub fn collect_pieces(input: &str) -> Vec<Piece<'_>> {
+        // let parser = Parser::new();
+        let parser = Parser::new(input, None, None, true, ParseMode::Format);
+
+        parser.into_iter().collect()
+    }
+}
+
+#[repr(C)]
+pub struct PieceSlice {
+    base_ptr: *const Piece<'static /* FIXME: That's wrong */>,
+    len: usize,
+}
+
+#[no_mangle]
+pub extern "C" fn collect_pieces(input: *const libc::c_char) -> PieceSlice {
+    // FIXME: Add comment
+    let str = unsafe { CStr::from_ptr(input) };
+
+    // FIXME: No unwrap
+    let pieces = rust::collect_pieces(str.to_str().unwrap());
+
+    PieceSlice {
+        base_ptr: pieces.as_ptr(),
+        len: pieces.len(),
+    }
+}
author	Arthur Cohen <arthur.cohen@embecosm.com>	2024-04-23 13:38:58 +0200
committer	Arthur Cohen <arthur.cohen@embecosm.com>	2024-08-01 12:47:19 +0200
commit	6fef4d6ffcab0fec8518adcb05458cba5dbeac25 (patch)
tree	bb3bdab1b69382086cc1bab294d79f60f75295c7 /libgrust
parent	473feb033d5ccb139f8af8e0e54193b176d1cd93 (diff)
download	gcc-6fef4d6ffcab0fec8518adcb05458cba5dbeac25.zip gcc-6fef4d6ffcab0fec8518adcb05458cba5dbeac25.tar.gz gcc-6fef4d6ffcab0fec8518adcb05458cba5dbeac25.tar.bz2