about summary refs log tree commit diff
path: root/libcpp
diff options
context:
space:
mode:
Diffstat (limited to 'libcpp')
-rw-r--r-- libcpp/ChangeLog 18
-rw-r--r-- libcpp/charset.c 52
-rw-r--r-- libcpp/directives.c 6
-rw-r--r-- libcpp/include/cpplib.h 5
-rw-r--r-- libcpp/internal.h 4
-rw-r--r-- libcpp/lex.c 223
-rw-r--r-- libcpp/macro.c 3
7 files changed, 295 insertions, 16 deletions
diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog
index 3259c56..5946b29 100644
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@@ -1,3 +1,21 @@
+2009-10-19 Jakub Jelinek <jakub@redhat.com>
+
+ * charset.c (cpp_init_iconv): Initialize utf8_cset_desc.
+ (_cpp_destroy_iconv): Destroy utf8_cset_desc, char16_cset_desc
+ and char32_cset_desc.
+ (converter_for_type): Handle CPP_UTF8STRING.
+ (cpp_interpret_string): Handle CPP_UTF8STRING and raw-strings.
+ * directives.c (get__Pragma_string): Handle CPP_UTF8STRING.
+ (parse_include): Reject raw strings.
+ * include/cpplib.h (CPP_UTF8STRING): New token type.
+ * internal.h (struct cpp_reader): Add utf8_cset_desc field.
+ * lex.c (lex_raw_string): New function.
+ (lex_string): Handle u8 string literals, call lex_raw_string
+ for raw string literals.
+ (_cpp_lex_direct): Call lex_string even for u8" and {,u,U,L,u8}R"
+ sequences.
+ * macro.c (stringify_arg): Handle CPP_UTF8STRING.
+
2009-10-14 Jakub Jelinek <jakub@redhat.com>
PR preprocessor/41543
diff --git a/libcpp/charset.c b/libcpp/charset.c
index bd24ec24..837ccd7 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -721,6 +721,8 @@ cpp_init_iconv (cpp_reader *pfile)
pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
+ pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET);
+ pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
pfile->char16_cset_desc = init_iconv_desc (pfile,
be ? "UTF-16BE" : "UTF-16LE",
SOURCE_CHARSET);
@@ -741,6 +743,12 @@ _cpp_destroy_iconv (cpp_reader *pfile)
{
if (pfile->narrow_cset_desc.func == convert_using_iconv)
iconv_close (pfile->narrow_cset_desc.cd);
+ if (pfile->utf8_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->utf8_cset_desc.cd);
+ if (pfile->char16_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->char16_cset_desc.cd);
+ if (pfile->char32_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->char32_cset_desc.cd);
if (pfile->wide_cset_desc.func == convert_using_iconv)
iconv_close (pfile->wide_cset_desc.cd);
}
@@ -1339,6 +1347,8 @@ converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
{
default:
return pfile->narrow_cset_desc;
+ case CPP_UTF8STRING:
+ return pfile->utf8_cset_desc;
case CPP_CHAR16:
case CPP_STRING16:
return pfile->char16_cset_desc;
@@ -1373,7 +1383,47 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
for (i = 0; i < count; i++)
{
p = from[i].text;
- if (*p == 'L' || *p == 'u' || *p == 'U') p++;
+ if (*p == 'u')
+ {
+ if (*++p == '8')
+ p++;
+ }
+ else if (*p == 'L' || *p == 'U') p++;
+ if (*p == 'R')
+ {
+ const uchar *prefix;
+
+ /* Skip over 'R"'. */
+ p += 2;
+ prefix = p;
+ while (*p != '[')
+ p++;
+ p++;
+ limit = from[i].text + from[i].len;
+ if (limit >= p + (p - prefix) + 1)
+ limit -= (p - prefix) + 1;
+
+ for (;;)
+ {
+ base = p;
+ while (p < limit && (*p != '\\' || (p[1] != 'u' && p[1] != 'U')))
+ p++;
+ if (p > base)
+ {
+ /* We have a run of normal characters; these can be fed
+ directly to convert_cset. */
+ if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
+ goto fail;
+ }
+ if (p == limit)
+ break;
+
+ p = convert_ucn (pfile, p + 1, limit, &tbuf, cvt);
+ }
+
+ continue;
+ }
+
p++; /* Skip leading quote. */
limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
diff --git a/libcpp/directives.c b/libcpp/directives.c
index f9dba53..01bb599 100644
--- a/libcpp/directives.c
+++ b/libcpp/directives.c
@@ -697,7 +697,8 @@ parse_include (cpp_reader *pfile, int *pangle_brackets,
/* Allow macro expansion. */
header = get_token_no_padding (pfile);
*location = header->src_loc;
- if (header->type == CPP_STRING || header->type == CPP_HEADER_NAME)
+ if ((header->type == CPP_STRING && header->val.str.text[0] != 'R')
+ || header->type == CPP_HEADER_NAME)
{
fname = XNEWVEC (char, header->val.str.len - 1);
memcpy (fname, header->val.str.text + 1, header->val.str.len - 2);
@@ -1537,7 +1538,8 @@ get__Pragma_string (cpp_reader *pfile)
if (string->type == CPP_EOF)
_cpp_backup_tokens (pfile, 1);
if (string->type != CPP_STRING && string->type != CPP_WSTRING
- && string->type != CPP_STRING32 && string->type != CPP_STRING16)
+ && string->type != CPP_STRING32 && string->type != CPP_STRING16
+ && string->type != CPP_UTF8STRING)
return NULL;
paren = get_token_no_padding (pfile);
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index df04668..e95f01a 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -127,6 +127,7 @@ struct _cpp_file;
TK(WSTRING, LITERAL) /* L"string" */ \
TK(STRING16, LITERAL) /* u"string" */ \
TK(STRING32, LITERAL) /* U"string" */ \
+ TK(UTF8STRING, LITERAL) /* u8"string" */ \
TK(OBJC_STRING, LITERAL) /* @"string" - Objective-C */ \
TK(HEADER_NAME, LITERAL) /* <stdio.h> in #include */ \
\
@@ -728,10 +729,10 @@ extern const unsigned char *cpp_macro_definition (cpp_reader *,
extern void _cpp_backup_tokens (cpp_reader *, unsigned int);
extern const cpp_token *cpp_peek_token (cpp_reader *, int);
-/* Evaluate a CPP_CHAR or CPP_WCHAR token. */
+/* Evaluate a CPP_*CHAR* token. */
extern cppchar_t cpp_interpret_charconst (cpp_reader *, const cpp_token *,
unsigned int *, int *);
-/* Evaluate a vector of CPP_STRING or CPP_WSTRING tokens. */
+/* Evaluate a vector of CPP_*STRING* tokens. */
extern bool cpp_interpret_string (cpp_reader *,
const cpp_string *, size_t,
cpp_string *, enum cpp_ttype);
diff --git a/libcpp/internal.h b/libcpp/internal.h
index 21e51c6..aaa231c 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -397,6 +397,10 @@ struct cpp_reader
struct cset_converter narrow_cset_desc;
/* Descriptor for converting from the source character set to the
+ UTF-8 execution character set. */
+ struct cset_converter utf8_cset_desc;
+
+ /* Descriptor for converting from the source character set to the
UTF-16 execution character set. */
struct cset_converter char16_cset_desc;
diff --git a/libcpp/lex.c b/libcpp/lex.c
index bab14a4..55bffa9 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -617,12 +617,192 @@ create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
token->val.str.text = dest;
}
+/* Lexes a raw string. The stored string contains the spelling, including
+ double quotes, delimiter string, '[' and ']', any leading
+ 'L', 'u', 'U' or 'u8' and 'R' modifier. Nothing is returned: the type of
+ the literal goes into TOKEN, with CPP_OTHER used for a malformed delimiter or
+ a newline where the string cannot span lines, and CPP_EOF if input runs out.
+ The spelling is NUL-terminated, but it is not guaranteed that this
+ is the first NUL since embedded NULs are preserved. */
+
+static void
+lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
+ const uchar *cur)
+{
+ source_location saw_NUL = 0; /* Location of the first NUL seen, 0 if none. */
+ const uchar *raw_prefix;
+ unsigned int raw_prefix_len = 0;
+ enum cpp_ttype type;
+ size_t total_len = 0; /* Bytes already saved in the buffer chain. */
+ _cpp_buff *first_buff = NULL, *last_buff = NULL; /* Chain of saved text. */
+
+ type = (*base == 'L' ? CPP_WSTRING :
+ *base == 'U' ? CPP_STRING32 :
+ *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
+ : CPP_STRING);
+
+ raw_prefix = cur + 1; /* The delimiter starts one past the character at CUR. */
+ while (raw_prefix_len < 16) /* Accept at most 16 delimiter characters. */
+ {
+ switch (raw_prefix[raw_prefix_len])
+ {
+ case ' ': case '[': case ']': case '\t':
+ case '\v': case '\f': case '\n': default:
+ break; /* Not a delimiter character: leave the switch, then the loop. */
+ /* Basic source charset except the above chars: counted as delimiter. */
+ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+ case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+ case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+ case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+ case 'y': case 'z':
+ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+ case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+ case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+ case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+ case 'Y': case 'Z':
+ case '0': case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7': case '8': case '9':
+ case '_': case '{': case '}': case '#': case '(': case ')':
+ case '<': case '>': case '%': case ':': case ';': case '.':
+ case '?': case '*': case '+': case '-': case '/': case '^':
+ case '&': case '|': case '~': case '!': case '=': case ',':
+ case '\\': case '"': case '\'':
+ raw_prefix_len++;
+ continue;
+ }
+ break; /* Reached only via the non-delimiter cases; ends the scan. */
+ }
+
+ if (raw_prefix[raw_prefix_len] != '[') /* The delimiter must end at a '['. */
+ {
+ int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
+ + 1;
+ if (raw_prefix_len == 16)
+ cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
+ "raw string delimiter longer than 16 characters");
+ else
+ cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
+ "invalid character '%c' in raw string delimiter",
+ (int) raw_prefix[raw_prefix_len]);
+ pfile->buffer->cur = raw_prefix - 1; /* Buffer position: one before the delimiter. */
+ create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
+ return;
+ }
+
+ cur = raw_prefix + raw_prefix_len + 1; /* First character after the '['. */
+ for (;;)
+ {
+ cppchar_t c = *cur++;
+
+ if (c == ']'
+ && strncmp ((const char *) cur, (const char *) raw_prefix,
+ raw_prefix_len) == 0
+ && cur[raw_prefix_len] == '"')
+ {
+ cur += raw_prefix_len + 1; /* Step past the delimiter and closing quote. */
+ break;
+ }
+ else if (c == '\n')
+ {
+ if (pfile->state.in_directive
+ || pfile->state.parsing_args
+ || pfile->state.in_deferred_pragma)
+ {
+ cur--; /* Leave the newline unconsumed. */
+ type = CPP_OTHER;
+ cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
+ "unterminated raw string");
+ break;
+ }
+
+ /* raw strings allow embedded non-escaped newlines, which
+ complicates this routine a lot: text lexed so far is copied into a buffer chain before the next line is fetched. */
+ if (first_buff == NULL)
+ {
+ total_len = cur - base;
+ first_buff = last_buff = _cpp_get_buff (pfile, total_len);
+ memcpy (BUFF_FRONT (last_buff), base, total_len);
+ raw_prefix = BUFF_FRONT (last_buff) + (raw_prefix - base); /* Re-point the delimiter at the saved copy. */
+ BUFF_FRONT (last_buff) += total_len;
+ }
+ else
+ {
+ size_t len = cur - base;
+ size_t cur_len = len > BUFF_ROOM (last_buff)
+ ? BUFF_ROOM (last_buff) : len; /* As much as fits in the last buffer. */
+
+ total_len += len;
+ memcpy (BUFF_FRONT (last_buff), base, cur_len);
+ BUFF_FRONT (last_buff) += cur_len;
+ if (len > cur_len) /* The remainder goes into a newly appended buffer. */
+ {
+ last_buff = _cpp_append_extend_buff (pfile, last_buff,
+ len - cur_len);
+ memcpy (BUFF_FRONT (last_buff), base + cur_len,
+ len - cur_len);
+ BUFF_FRONT (last_buff) += len - cur_len;
+ }
+ }
+
+ if (pfile->buffer->cur < pfile->buffer->rlimit)
+ CPP_INCREMENT_LINE (pfile, 0);
+ pfile->buffer->need_line = true;
+
+ if (!_cpp_get_fresh_line (pfile)) /* No further line: the literal is unterminated. */
+ {
+ source_location src_loc = token->src_loc;
+ token->type = CPP_EOF;
+ /* Tell the compiler the line number of the EOF token. */
+ token->src_loc = pfile->line_table->highest_line;
+ token->flags = BOL;
+ if (first_buff != NULL)
+ _cpp_release_buff (pfile, first_buff);
+ cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
+ "unterminated raw string");
+ return;
+ }
+
+ cur = base = pfile->buffer->cur; /* Continue scanning from the buffer's current position. */
+ }
+ else if (c == '\0' && !saw_NUL) /* Only the first NUL is recorded. */
+ LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table,
+ CPP_BUF_COLUMN (pfile->buffer, cur));
+ }
+
+ if (saw_NUL && !pfile->state.skipping)
+ cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0,
+ "null character(s) preserved in literal");
+
+ pfile->buffer->cur = cur;
+ if (first_buff == NULL) /* No newline was crossed, so nothing was saved. */
+ create_literal (pfile, token, base, cur - base, type);
+ else
+ {
+ uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1); /* Saved text + unsaved tail + NUL. */
+
+ token->type = type;
+ token->val.str.len = total_len + (cur - base);
+ token->val.str.text = dest;
+ last_buff = first_buff;
+ while (last_buff != NULL) /* Concatenate the saved pieces in chain order. */
+ {
+ memcpy (dest, last_buff->base,
+ BUFF_FRONT (last_buff) - last_buff->base);
+ dest += BUFF_FRONT (last_buff) - last_buff->base;
+ last_buff = last_buff->next;
+ }
+ _cpp_release_buff (pfile, first_buff);
+ memcpy (dest, base, cur - base); /* Append the tail that was never saved. */
+ dest[cur - base] = '\0';
+ }
+}
+
/* Lexes a string, character constant, or angle-bracketed header file
name. The stored string contains the spelling, including opening
- quote and leading any leading 'L', 'u' or 'U'. It returns the type
- of the literal, or CPP_OTHER if it was not properly terminated, or
- CPP_LESS for an unterminated header name which must be relexed as
- normal tokens.
+ quote and any leading 'L', 'u', 'U' or 'u8' and optional
+ 'R' modifier. It returns the type of the literal, or CPP_OTHER
+ if it was not properly terminated, or CPP_LESS for an unterminated
+ header name which must be relexed as normal tokens.
The spelling is NUL-terminated, but it is not guaranteed that this
is the first NUL since embedded NULs are preserved. */
@@ -636,12 +816,24 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
cur = base;
terminator = *cur++;
- if (terminator == 'L' || terminator == 'u' || terminator == 'U')
+ if (terminator == 'L' || terminator == 'U')
terminator = *cur++;
- if (terminator == '\"')
+ else if (terminator == 'u')
+ {
+ terminator = *cur++;
+ if (terminator == '8')
+ terminator = *cur++;
+ }
+ if (terminator == 'R')
+ {
+ lex_raw_string (pfile, token, base, cur);
+ return;
+ }
+ if (terminator == '"')
type = (*base == 'L' ? CPP_WSTRING :
*base == 'U' ? CPP_STRING32 :
- *base == 'u' ? CPP_STRING16 : CPP_STRING);
+ *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
+ : CPP_STRING);
else if (terminator == '\'')
type = (*base == 'L' ? CPP_WCHAR :
*base == 'U' ? CPP_CHAR32 :
@@ -1101,10 +1293,21 @@ _cpp_lex_direct (cpp_reader *pfile)
case 'L':
case 'u':
case 'U':
- /* 'L', 'u' or 'U' may introduce wide characters or strings. */
+ case 'R':
+ /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
+ wide strings or raw strings. */
if (c == 'L' || CPP_OPTION (pfile, uliterals))
{
- if (*buffer->cur == '\'' || *buffer->cur == '"')
+ if ((*buffer->cur == '\'' && c != 'R')
+ || *buffer->cur == '"'
+ || (*buffer->cur == 'R'
+ && c != 'R'
+ && buffer->cur[1] == '"'
+ && CPP_OPTION (pfile, uliterals))
+ || (*buffer->cur == '8'
+ && c == 'u'
+ && (buffer->cur[1] == '"'
+ || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
{
lex_string (pfile, result, buffer->cur - 1);
break;
@@ -1120,7 +1323,7 @@ _cpp_lex_direct (cpp_reader *pfile)
case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
case 'G': case 'H': case 'I': case 'J': case 'K':
- case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+ case 'M': case 'N': case 'O': case 'P': case 'Q':
case 'S': case 'T': case 'V': case 'W': case 'X':
case 'Y': case 'Z':
result->type = CPP_NAME;
diff --git a/libcpp/macro.c b/libcpp/macro.c
index f318059..1d284cf 100644
--- a/libcpp/macro.c
+++ b/libcpp/macro.c
@@ -379,7 +379,8 @@ stringify_arg (cpp_reader *pfile, macro_arg *arg)
escape_it = (token->type == CPP_STRING || token->type == CPP_CHAR
|| token->type == CPP_WSTRING || token->type == CPP_WCHAR
|| token->type == CPP_STRING32 || token->type == CPP_CHAR32
- || token->type == CPP_STRING16 || token->type == CPP_CHAR16);
+ || token->type == CPP_STRING16 || token->type == CPP_CHAR16
+ || token->type == CPP_UTF8STRING);
/* Room for each char being written in octal, initial space and
final quote and NUL. */