aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gcc/ChangeLog7
-rw-r--r--gcc/doc/cpp.texi32
-rw-r--r--gcc/doc/cppopts.texi5
-rw-r--r--gcc/testsuite/ChangeLog39
-rw-r--r--gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c15
-rw-r--r--gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C17
-rw-r--r--gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C24
-rw-r--r--gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C23
-rw-r--r--gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C17
-rw-r--r--gcc/testsuite/g++.dg/other/ucnid-1-utf8.C28
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c26
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c8
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c30
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c13
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c5
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c6
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c6
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c16
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c7
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c17
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c5
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c21
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c8
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-1-utf8.c25
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-10-utf8.c11
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-11-utf8.c7
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-12-utf8.c7
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-13-utf8.c15
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-14-utf8.c23
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-15-utf8.c38
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-16-utf8.c6
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-2-utf8.c28
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-3-utf8.c28
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-4-utf8.c28
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-5-utf8.c19
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-6-utf8.c28
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-7-utf8.c9
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-8-utf8.c16
-rw-r--r--gcc/testsuite/gcc.dg/ucnid-9-utf8.c25
-rw-r--r--libcpp/ChangeLog10
-rw-r--r--libcpp/charset.c83
-rw-r--r--libcpp/internal.h8
-rw-r--r--libcpp/lex.c55
43 files changed, 807 insertions, 37 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index e7feded..7f16c16 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
+
+ PR c/67224
+ * doc/cpp.texi: Document support for extended characters in
+ identifiers.
+ * doc/cppopts.texi: Likewise.
+
2019-09-19 Richard Biener <rguenther@suse.de>
* tree-vect-loop.c (vect_is_slp_reduction): Remove.
diff --git a/gcc/doc/cpp.texi b/gcc/doc/cpp.texi
index e271f51..f2de39a 100644
--- a/gcc/doc/cpp.texi
+++ b/gcc/doc/cpp.texi
@@ -274,11 +274,11 @@ the character in the source character set that they represent, then
converted to the execution character set, just like unescaped
characters.
-In identifiers, characters outside the ASCII range can only be
-specified with the @samp{\u} and @samp{\U} escapes, not used
-directly. If strict ISO C90 conformance is specified with an option
+In identifiers, characters outside the ASCII range can be specified
+with the @samp{\u} and @samp{\U} escapes or used directly in the input
+encoding. If strict ISO C90 conformance is specified with an option
such as @option{-std=c90}, or @option{-fno-extended-identifiers} is
-used, then those escapes are not permitted in identifiers.
+used, then those constructs are not permitted in identifiers.
@node Initial processing
@section Initial processing
@@ -503,8 +503,7 @@ In the 1999 C standard, identifiers may contain letters which are not
part of the ``basic source character set'', at the implementation's
discretion (such as accented Latin letters, Greek letters, or Chinese
ideograms). This may be done with an extended character set, or the
-@samp{\u} and @samp{\U} escape sequences. GCC only accepts such
-characters in the @samp{\u} and @samp{\U} forms.
+@samp{\u} and @samp{\U} escape sequences.
As an extension, GCC treats @samp{$} as a letter. This is for
compatibility with some systems, such as VMS, where @samp{$} is commonly
@@ -584,15 +583,15 @@ Punctuator: @{ @} [ ] # ##
@end smallexample
@cindex other tokens
-Any other single character is considered ``other''. It is passed on to
-the preprocessor's output unmolested. The C compiler will almost
-certainly reject source code containing ``other'' tokens. In ASCII, the
-only other characters are @samp{@@}, @samp{$}, @samp{`}, and control
+Any other single byte is considered ``other'' and passed on to the
+preprocessor's output unchanged. The C compiler will almost certainly
+reject source code containing ``other'' tokens. In ASCII, the only
+``other'' characters are @samp{@@}, @samp{$}, @samp{`}, and control
characters other than NUL (all bits zero). (Note that @samp{$} is
-normally considered a letter.) All characters with the high bit set
-(numeric range 0x7F--0xFF) are also ``other'' in the present
-implementation. This will change when proper support for international
-character sets is added to GCC@.
+normally considered a letter.) All bytes with the high bit set
+(numeric range 0x80--0xFF) that were not successfully interpreted as
+part of an extended character in the input encoding are also ``other''
+in the present implementation.
NUL is a special case because of the high probability that its
appearance is accidental, and because it may be invisible to the user
@@ -4179,7 +4178,10 @@ be controlled using the @option{-fexec-charset} and
The C and C++ standards allow identifiers to be composed of @samp{_}
and the alphanumeric characters. C++ also allows universal character
names. C99 and later C standards permit both universal character
-names and implementation-defined characters.
+names and implementation-defined characters. In both C and C++ modes,
+GCC accepts in identifiers exactly those extended characters that
+correspond to universal character names permitted by the chosen
+standard.
GCC allows the @samp{$} character in identifiers as an extension for
most targets. This is true regardless of the @option{std=} switch,
diff --git a/gcc/doc/cppopts.texi b/gcc/doc/cppopts.texi
index 61e22cd..f4bc3f5 100644
--- a/gcc/doc/cppopts.texi
+++ b/gcc/doc/cppopts.texi
@@ -254,8 +254,9 @@ Accept @samp{$} in identifiers.
@item -fextended-identifiers
@opindex fextended-identifiers
-Accept universal character names in identifiers. This option is
-enabled by default for C99 (and later C standard versions) and C++.
+Accept universal character names and extended characters in
+identifiers. This option is enabled by default for C99 (and later C
+standard versions) and C++.
@item -fno-canonical-system-headers
@opindex fno-canonical-system-headers
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 7efdac9..1f9b5ac 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,42 @@
+2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
+
+ PR c/67224
+ * c-c++-common/cpp/ucnid-2011-1-utf8.c: New test.
+ * g++.dg/cpp/ucnid-1-utf8.C: New test.
+ * g++.dg/cpp/ucnid-2-utf8.C: New test.
+ * g++.dg/cpp/ucnid-3-utf8.C: New test.
+ * g++.dg/cpp/ucnid-4-utf8.C: New test.
+ * g++.dg/other/ucnid-1-utf8.C: New test.
+ * gcc.dg/cpp/ucnid-1-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-10-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-11-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-12-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-13-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-14-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-15-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-2-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-3-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-4-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-6-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-7-utf8.c: New test.
+ * gcc.dg/cpp/ucnid-9-utf8.c: New test.
+ * gcc.dg/ucnid-1-utf8.c: New test.
+ * gcc.dg/ucnid-10-utf8.c: New test.
+ * gcc.dg/ucnid-11-utf8.c: New test.
+ * gcc.dg/ucnid-12-utf8.c: New test.
+ * gcc.dg/ucnid-13-utf8.c: New test.
+ * gcc.dg/ucnid-14-utf8.c: New test.
+ * gcc.dg/ucnid-15-utf8.c: New test.
+ * gcc.dg/ucnid-16-utf8.c: New test.
+ * gcc.dg/ucnid-2-utf8.c: New test.
+ * gcc.dg/ucnid-3-utf8.c: New test.
+ * gcc.dg/ucnid-4-utf8.c: New test.
+ * gcc.dg/ucnid-5-utf8.c: New test.
+ * gcc.dg/ucnid-6-utf8.c: New test.
+ * gcc.dg/ucnid-7-utf8.c: New test.
+ * gcc.dg/ucnid-8-utf8.c: New test.
+ * gcc.dg/ucnid-9-utf8.c: New test.
+
2019-09-19 Iain Sandoe <iain@sandoe.co.uk>
* gcc.dg/pr89313.c: Test for __POWERPC__ in addition to
diff --git a/gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c b/gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c
new file mode 100644
index 0000000..02c5fc0
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c
@@ -0,0 +1,15 @@
+/* { dg-do preprocess } */
+/* { dg-options "-std=c11 -pedantic" { target c } } */
+/* { dg-options "-std=c++11 -pedantic" { target c++ } } */
+
+
+B̀
+
+̀ /* { dg-error "not valid at the start of an identifier" } */
+
+À /* { dg-warning "not in NFC" } */
+
+𐀀
+🿽
+󡈴
diff --git a/gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C b/gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C
new file mode 100644
index 0000000..839b188
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C
@@ -0,0 +1,17 @@
+/* { dg-do preprocess } */
+/* { dg-options "-std=gnu++98 -pedantic" } */
+
+ª /* { dg-error "not valid in an identifier" } */
+« /* { dg-error "not valid in an identifier" } */
+¶ /* { dg-error "not valid in an identifier" } */
+º /* { dg-error "not valid in an identifier" } */
+
+٩ /* { dg-error "not valid in an identifier" } */
+A٩ /* { dg-error "not valid in an identifier" } */
+0º /* { dg-error "not valid in an identifier" } */
+0٩ /* { dg-error "not valid in an identifier" } */
+๙
+A๙
diff --git a/gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C b/gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C
new file mode 100644
index 0000000..0381452
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C
@@ -0,0 +1,24 @@
+/* Test stringization of identifiers with extended characters works. */
+
+/* Note: The results expected in these tests are what GCC currently
+outputs, but they are not technically standard-conforming. If GCC is
+changed in the future to produce the standard-conforming output, then
+this test will fail and should be adjusted to check for UCNs in the
+output rather than UTF-8. See PR 91755 for more details. */
+
+/* { dg-do run } */
+
+#include <stdlib.h>
+#include <string.h>
+
+#define h(s) #s
+#define str(s) h(s)
+
+int
+main ()
+{
+ if (strcmp (str (str (Á)), "\"Á\""))
+ abort ();
+ if (strcmp (str (str (Á)), "\"Á\""))
+ abort ();
+}
diff --git a/gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C b/gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C
new file mode 100644
index 0000000..5c3044a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C
@@ -0,0 +1,23 @@
+/* Test pasting of identifiers with extended characters works. */
+
+/* Note: The results expected in these tests are what GCC currently
+outputs, but they are not technically standard-conforming. If GCC is
+changed in the future to produce the standard-conforming output, then
+this test will fail and should be adjusted to check for UCNs in the
+output rather than UTF-8. See PR 91755 for more details. */
+
+/* { dg-do run } */
+
+#include <stdlib.h>
+#include <string.h>
+
+#define c(s1, s2) s1 ## s2
+#define h(s) #s
+#define str(s) h(s)
+
+int
+main ()
+{
+ if (strcmp (str (str (c (Á, Á))), "\"ÁÁ\""))
+ abort ();
+}
diff --git a/gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C b/gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C
new file mode 100644
index 0000000..de252e8
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C
@@ -0,0 +1,17 @@
+/* { dg-do preprocess } */
+/* { dg-options "-std=gnu++98"} */
+
+« /* { dg-error "not valid in an identifier" } */
+¶ /* { dg-error "not valid in an identifier" } */
+
+٩ /* OK in C++ */
+A٩
+0º
+0٩
+๙ /* OK in C++ */
+A๙
diff --git a/gcc/testsuite/g++.dg/other/ucnid-1-utf8.C b/gcc/testsuite/g++.dg/other/ucnid-1-utf8.C
new file mode 100644
index 0000000..dab4152
--- /dev/null
+++ b/gcc/testsuite/g++.dg/other/ucnid-1-utf8.C
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-options "" } */
+/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
+/* { dg-skip-if "" { ! ucn } } */
+#include <cstdlib>
+
+int À(void) { return 1; }
+int Á(void) { return 2; }
+int Â(void) { return 3; }
+int whÿ(void) { return 4; }
+int aÄbсδe(void) { return 5; }
+
+int main (void)
+{
+
+ if (À() != 1)
+ abort ();
+ if (Á() != 2)
+ abort ();
+ if (Â() != 3)
+ abort ();
+ if (whÿ() != 4)
+ abort ();
+ if (aÄbсδe() != 5)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c
new file mode 100644
index 0000000..9100b98
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-options "-std=c99 -g3" } */
+void abort (void);
+
+#define À 1
+#define Á 2
+#define  3
+#define whÿ 4
+#define aÄbсδe 5
+
+int main (void)
+{
+
+ if (À != 1)
+ abort ();
+ if (Á != 2)
+ abort ();
+ if (Â != 3)
+ abort ();
+ if (whÿ != 4)
+ abort ();
+ if (aÄbсδe != 5)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c
new file mode 100644
index 0000000..7eeb026
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c
@@ -0,0 +1,8 @@
+/* Test UTF-8 is allowed in preprocessing numbers. */
+/* { dg-do compile } */
+/* { dg-options "-std=c99" } */
+
+#define a(x) b(x)
+#define b(x) 0
+#define p )
+int c = a(0À.p);
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c
new file mode 100644
index 0000000..56b88f8b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c
@@ -0,0 +1,30 @@
+/* Test spelling differences in UCNs are properly diagnosed for macro
+ redefinitions. */
+/* { dg-do preprocess } */
+/* { dg-options "-std=c99 -pedantic-errors" } */
+
+/* Different spelling of UCN in expansion. */
+#define m1 \u00c1 /* { dg-message "-:previous definition" } */
+#define m1 Á /* { dg-error "-:redefined" } */
+
+#define m1ok Á
+#define m1ok Á
+
+/* Different spelling of UCN in argument name. */
+#define m2(\u00c1) /* { dg-message "-:previous definition" } */
+#define m2(Á) /* { dg-error "-:redefined" } */
+
+#define m2ok(Á)
+#define m2ok(Á)
+
+/* Same spelling in argument name but different spelling when used in
+ expansion. */
+#define m3(\u00c1) \u00c1 /* { dg-message "-:previous definition" } */
+#define m3(\u00c1) Á /* { dg-error "-:redefined" } */
+
+#define m3ok(\u00c1) Á
+#define m3ok(\u00c1) Á
+
+/* Different spelling of the macro name itself is OK. */
+#define m4ok\u00c1
+#define m4okÁ
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c
new file mode 100644
index 0000000..9b54249
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c
@@ -0,0 +1,13 @@
+/* Test spelling differences in UCNs in macro definitions still count
+ as the same identifier for macro expansion. */
+/* { dg-do compile } */
+/* { dg-options "-std=c99 -pedantic-errors" } */
+
+#define m1\u00c1
+#ifndef m1Á
+#error not defined
+#endif
+
+#define m2(\u00c1) Á
+
+int i = m2 (0);
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c
new file mode 100644
index 0000000..aff39b6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c
@@ -0,0 +1,5 @@
+/* Verify macros named with UTF-8 are output in -dD output with UCNs. */
+/* { dg-do preprocess } */
+/* { dg-options "-std=c99 -dD" } */
+/* { dg-final { scan-file ucnid-13-utf8.i "\\\\U000000c1" } } */
+#define Á 1
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c
new file mode 100644
index 0000000..6ea14eb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c
@@ -0,0 +1,6 @@
+/* Verify macro definitions with UTF-8 are output in -dD output with
+ the original spelling. */
+/* { dg-do preprocess } */
+/* { dg-options "-std=c99 -dD" } */
+/* { dg-final { scan-file ucnid-14-utf8.i "Á" } } */
+#define a Á
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c
new file mode 100644
index 0000000..cf2289a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c
@@ -0,0 +1,6 @@
+/* Verify macro definitions with UTF-8 in argument names are output in
+ -dD output with the original spelling. */
+/* { dg-do preprocess } */
+/* { dg-options "-std=c99 -dD" } */
+/* { dg-final { scan-file ucnid-15-utf8.i "#define a\\(Á\\) x:Á:y:Á:z" } } */
+#define a(Á) x:Á:y:Á:z
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c
new file mode 100644
index 0000000..e3730f8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-options "-std=c99" } */
+#include <stdlib.h>
+#include <string.h>
+
+#define str(t) #t
+
+int main (void)
+{
+ const char s[] = str (ゲ);
+
+ if (strcmp (s, "ゲ") != 0)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c
new file mode 100644
index 0000000..4c9ed25
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c99" } */
+
+#define paste(x, y) x ## y
+
+int paste(ª, Ա) = 3;
+
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c
new file mode 100644
index 0000000..ccc7a1e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c
@@ -0,0 +1,17 @@
+/* { dg-do preprocess } */
+/* { dg-options "-std=c99" } */
+
+« /* not a preprocessing error because we lex it into its own token */
+¶ /* not a preprocessing error because we lex it into its own token */
+
+٩ /* { dg-error "not valid at the start of an identifier" } */
+A٩
+0º
+0٩
+๙ /* { dg-error "not valid at the start of an identifier" } */
+A๙
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c
new file mode 100644
index 0000000..b4dd094
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c89" } */
+#define a b(
+#define b(x) q
+int aª);
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c
new file mode 100644
index 0000000..22aff7e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c99" } */
+
+/* When GCC reads UTF-8-encoded input into its internal UTF-8
+representation, it does not apply any transformation to the data, and
+in particular it makes no attempt to verify that the encoding is valid
+UTF-8. Historically, if any non-ASCII characters were found outside a
+string or comment, they were treated as stray tokens and did not
+necessarily produce an error, e.g. if, as in this test, they disappear
+in the preprocessor. Now that UTF-8 is also supported in identifiers,
+the basic structure of this process has not changed; GCC just treats
+invalid UTF-8 as a stray token. This test verifies that the historical
+behavior is unchanged. In the future, if GCC were changed, say, to
+validate the UTF-8 on input, then this test would no longer be
+appropriate. */
+
+
+#define a b(
+#define b(x) q
+/* The line below contains invalid UTF-8. */
+int a);
diff --git a/gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c b/gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c
new file mode 100644
index 0000000..1558eca
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c
@@ -0,0 +1,8 @@
+/* { dg-do preprocess } */
+/* { dg-options "-std=c99 -pedantic" } */
+
+Ⅰ
+ↂ
+〇
+〡
+〩
diff --git a/gcc/testsuite/gcc.dg/ucnid-1-utf8.c b/gcc/testsuite/gcc.dg/ucnid-1-utf8.c
new file mode 100644
index 0000000..7213673
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-1-utf8.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-std=c99 -g" } */
+void abort (void);
+
+int main (void)
+{
+ int À = 1;
+ int Á = 2;
+ int  = 3;
+ int whÿ = 4;
+ int aÄbсδe = 5;
+
+ if (À != 1)
+ abort ();
+ if (Á != 2)
+ abort ();
+ if (Â != 3)
+ abort ();
+ if (whÿ != 4)
+ abort ();
+ if (aÄbсδe != 5)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/ucnid-10-utf8.c b/gcc/testsuite/gcc.dg/ucnid-10-utf8.c
new file mode 100644
index 0000000..86830b8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-10-utf8.c
@@ -0,0 +1,11 @@
+/* Verify diagnostics for extended identifiers refer to UCNs (in the C
+ locale). Test #pragma pack diagnostics. */
+/* { dg-do compile } */
+/* { dg-options "-std=gnu99" } */
+/* { dg-require-ascii-locale "" } */
+/* { dg-skip-if "" { powerpc-ibm-aix* } } */
+
+#pragma pack(push)
+#pragma pack(pop, ó) /* { dg-warning "pop, \\\\U000000f3.*push, \\\\U000000f3" } */
+#pragma pack(ç) /* { dg-warning "unknown action '\\\\U000000e7'" } */
+
diff --git a/gcc/testsuite/gcc.dg/ucnid-11-utf8.c b/gcc/testsuite/gcc.dg/ucnid-11-utf8.c
new file mode 100644
index 0000000..c6a89ba
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-11-utf8.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
+/* { dg-skip-if "" { ! ucn } } */
+/* { dg-skip-if "-fdata-sections not supported" { { hppa*-*-hpux* } && { ! lp64 } } } */
+/* { dg-options "-std=c99 -fdata-sections -g" } */
+
+#include "ucnid-3-utf8.c"
diff --git a/gcc/testsuite/gcc.dg/ucnid-12-utf8.c b/gcc/testsuite/gcc.dg/ucnid-12-utf8.c
new file mode 100644
index 0000000..cfdffba
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-12-utf8.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
+/* { dg-skip-if "" { ! ucn } } */
+/* { dg-skip-if "-ffunction-sections not supported" { { hppa*-*-hpux* } && { ! lp64 } } } */
+/* { dg-options "-std=c99 -ffunction-sections -g" } */
+
+#include "ucnid-4-utf8.c"
diff --git a/gcc/testsuite/gcc.dg/ucnid-13-utf8.c b/gcc/testsuite/gcc.dg/ucnid-13-utf8.c
new file mode 100644
index 0000000..41536c3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-13-utf8.c
@@ -0,0 +1,15 @@
+/* Verify diagnostics for extended identifiers refer to UCNs (in the C
+ locale). Miscellaneous diagnostics. */
+/* { dg-do compile } */
+/* { dg-options "-std=gnu99 -Wpacked" } */
+/* { dg-require-ascii-locale "" } */
+/* { dg-skip-if "" { powerpc-ibm-aix* } } */
+
+int a __attribute__((À)); /* { dg-warning "'\\\\U000000c0' attribute directive ignored" } */
+
+extern void Á (void) __attribute__((deprecated));
+void g (void) { Á (); } /* { dg-warning "'\\\\U000000c1' is deprecated" } */
+
+struct  { char c; } __attribute__((packed)); /* { dg-warning "'\\\\U000000c2'" } */
+
+void h (void) { asm ("%[Ã]" : : ); } /* { dg-error "undefined named operand '\\\\U000000c3'" } */
diff --git a/gcc/testsuite/gcc.dg/ucnid-14-utf8.c b/gcc/testsuite/gcc.dg/ucnid-14-utf8.c
new file mode 100644
index 0000000..e781ed6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-14-utf8.c
@@ -0,0 +1,23 @@
+/* Test miscellaneous uses of UTF-8 in identifiers compile and run OK,
+ with debug info enabled. */
+/* { dg-do run } */
+/* { dg-options "-std=c99 -g" } */
+
+extern void abort (void);
+extern void exit (int);
+
+int
+main (void)
+{
+ struct À { int Á; } x;
+ struct À *y = &x;
+ y->Á = 1;
+ if (x.Á != 1)
+ abort ();
+ goto ÿ;
+ ÿ: ;
+ enum e { Â = 4 };
+ if (Â != 4)
+ abort ();
+ exit (0);
+}
diff --git a/gcc/testsuite/gcc.dg/ucnid-15-utf8.c b/gcc/testsuite/gcc.dg/ucnid-15-utf8.c
new file mode 100644
index 0000000..e233689
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-15-utf8.c
@@ -0,0 +1,38 @@
+/* Test combinations of UTF-8 in various parts of identifiers. */
+/* { dg-do run } */
+/* { dg-xfail-if "" { "powerpc-ibm-aix*" } } */
+/* { dg-skip-if "" { ! ucn } } */
+/* { dg-options "-std=c99" } */
+
+extern void abort (void);
+
+int π = 3;
+int π² = 9;
+int πp1 = 4;
+int twoπ = 6;
+int four_plus_π_ = 7;
+int 😀ÀÁÂÃÄÅßàáâãäaåbæçèéêcëìígîïð7ñ9__òóô4õöÆ3ÇÈÉÊËabcÌÍÎÏÐÑÒÓÔÕÖ😄😅🤣😂_ÿ = 2;
+int π\u03C0 = 9;
+
+int main() {
+ if (π != 3)
+ abort ();
+
+ if (π² != 9)
+ abort ();
+
+ if (πp1 != 4)
+ abort ();
+
+ if (twoπ != 6)
+ abort ();
+
+ if (four_plus_π_ != 7)
+ abort () ;
+
+ if (😀ÀÁÂÃÄÅßàáâãäaåbæçèéêcëìígîïð7ñ9__òóô4õöÆ3ÇÈÉÊËabcÌÍÎÏÐÑÒÓÔÕÖ😄😅🤣😂_ÿ != 2)
+ abort ();
+
+ if(ππ != π²)
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.dg/ucnid-16-utf8.c b/gcc/testsuite/gcc.dg/ucnid-16-utf8.c
new file mode 100644
index 0000000..5d000a0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-16-utf8.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c99 -g -finput-charset=latin1" } */
+/* { dg-final { scan-file ucnid-16-utf8.s "²" } } */
+
+/* This superscript is encoded in latin1; verify that we still get UTF-8 in the output. */
+int x = 9;
diff --git a/gcc/testsuite/gcc.dg/ucnid-2-utf8.c b/gcc/testsuite/gcc.dg/ucnid-2-utf8.c
new file mode 100644
index 0000000..70f9464
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-2-utf8.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
+/* { dg-skip-if "" { ! ucn } } */
+/* { dg-options "-std=c99 -g" } */
+void abort (void);
+
+static int À = 1;
+static int Á = 2;
+static int  = 3;
+static int whÿ = 4;
+static int aÄbсδe = 5;
+
+int main (void)
+{
+
+ if (À != 1)
+ abort ();
+ if (Á != 2)
+ abort ();
+ if (Â != 3)
+ abort ();
+ if (whÿ != 4)
+ abort ();
+ if (aÄbсδe != 5)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/ucnid-3-utf8.c b/gcc/testsuite/gcc.dg/ucnid-3-utf8.c
new file mode 100644
index 0000000..f8509a6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-3-utf8.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
+/* { dg-skip-if "" { ! ucn } } */
+/* { dg-options "-std=c99 -g" } */
+void abort (void);
+
+int À = 1;
+int Á = 2;
+int  = 3;
+int whÿ = 4;
+int aÄbсδe = 5;
+
+int main (void)
+{
+
+ if (À != 1)
+ abort ();
+ if (Á != 2)
+ abort ();
+ if (Â != 3)
+ abort ();
+ if (whÿ != 4)
+ abort ();
+ if (aÄbсδe != 5)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/ucnid-4-utf8.c b/gcc/testsuite/gcc.dg/ucnid-4-utf8.c
new file mode 100644
index 0000000..bf1c403
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-4-utf8.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
+/* { dg-skip-if "" { ! ucn } } */
+/* { dg-options "-std=c99 -g" } */
+void abort (void);
+
+int À(void) { return 1; }
+int Á(void) { return 2; }
+int Â(void) { return 3; }
+int whÿ(void) { return 4; }
+int aÄbсδe(void) { return 5; }
+
+int main (void)
+{
+
+ if (À() != 1)
+ abort ();
+ if (Á() != 2)
+ abort ();
+ if (Â() != 3)
+ abort ();
+ if (whÿ() != 4)
+ abort ();
+ if (aÄbсδe() != 5)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/ucnid-5-utf8.c b/gcc/testsuite/gcc.dg/ucnid-5-utf8.c
new file mode 100644
index 0000000..f4473e1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-5-utf8.c
@@ -0,0 +1,19 @@
+/* { dg-do run } */
+/* { dg-skip-if "No dollar in identifiers" { avr-*-* powerpc-ibm-aix* } } */
+/* { dg-options "-std=c99 -fdollars-in-identifiers -g" } */
+void abort (void);
+
+int a$b(void) { return 1; }
+int a$b😀(void) { return 2; }
+
+int main (void)
+{
+
+ if (a$b() != 1)
+ abort ();
+
+ if (a$b😀() != 2)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/ucnid-6-utf8.c b/gcc/testsuite/gcc.dg/ucnid-6-utf8.c
new file mode 100644
index 0000000..36ce52b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-6-utf8.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-xfail-if "" { "powerpc-ibm-aix*" } } */
+/* { dg-skip-if "" { ! ucn } } */
+/* { dg-options "-std=c99 -save-temps -g" } */
+void abort (void);
+
+int À(void) { return 1; }
+int Á(void) { return 2; }
+int Â(void) { return 3; }
+int whÿ(void) { return 4; }
+int aÄbсδe(void) { return 5; }
+
+int main (void)
+{
+
+ if (À() != 1)
+ abort ();
+ if (Á() != 2)
+ abort ();
+ if (Â() != 3)
+ abort ();
+ if (whÿ() != 4)
+ abort ();
+ if (aÄbсδe() != 5)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/ucnid-7-utf8.c b/gcc/testsuite/gcc.dg/ucnid-7-utf8.c
new file mode 100644
index 0000000..07f5ca0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-7-utf8.c
@@ -0,0 +1,9 @@
+/* Verify diagnostics for extended identifiers refer to UCNs (in the C
+ locale). */
+/* { dg-do compile } */
+/* { dg-options "-std=c99" } */
+/* { dg-require-ascii-locale "" } */
+/* { dg-skip-if "" { "powerpc-ibm-aix*" } } */
+
+void *p = &é; /* { dg-error "'\\\\U000000e9' undeclared" } */
+void *q = &Ḁ; /* { dg-error "'\\\\U00001e00' undeclared" } */
diff --git a/gcc/testsuite/gcc.dg/ucnid-8-utf8.c b/gcc/testsuite/gcc.dg/ucnid-8-utf8.c
new file mode 100644
index 0000000..e6c440d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-8-utf8.c
@@ -0,0 +1,16 @@
+/* Verify diagnostics for extended identifiers refer to UCNs (in the C
+ locale). Further tests of C front-end diagnostics. */
+/* { dg-do compile } */
+/* { dg-options "-std=gnu99 -Wvla" } */
+/* { dg-require-ascii-locale "" } */
+/* { dg-skip-if "" { powerpc-ibm-aix* } } */
+
+int a __attribute__((__mode__(é))); /* { dg-error "unknown machine mode '\\\\U000000e9'" } */
+struct s1 { int é : 0; }; /* { dg-error "zero width for bit-field '\\\\U000000e9'" } */
+
+void f (int b) { int é[b]; } /* { dg-warning "variable length array '\\\\U000000e9'" } */
+
+void g (static int é); /* { dg-error "storage class specified for parameter '\\\\U000000e9'" } */
+
+struct s2 { int á; } é = { { 0 } }; /* { dg-warning "braces around scalar initializer" } */
+/* { dg-message "near initialization for '\\\\U000000e9\\.\\\\U000000e1'" "UCN diag" { target *-*-* } .-1 } */
diff --git a/gcc/testsuite/gcc.dg/ucnid-9-utf8.c b/gcc/testsuite/gcc.dg/ucnid-9-utf8.c
new file mode 100644
index 0000000..c937196
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ucnid-9-utf8.c
@@ -0,0 +1,25 @@
+/* Test __func__ with extended identifiers and character set
+ conversions. */
+/* { dg-do run } */
+/* { dg-xfail-if "" { "powerpc-ibm-aix*" } } */
+/* { dg-skip-if "" { ! ucn } } */
+/* { dg-options "-std=c99 -fexec-charset=ISO-8859-1 -g" } */
+/* { dg-require-iconv "ISO-8859-1" } */
+
+extern int strcmp (const char *, const char *);
+extern void abort (void);
+extern void exit (int);
+
+void
+é (void)
+{
+ if (strcmp (__func__, "é") != 0)
+ abort ();
+}
+
+int
+main (void)
+{
+ é ();
+ exit (0);
+}
diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog
index 1ec8541..0c85195 100644
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@@ -1,3 +1,13 @@
+2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
+
+ PR c/67224
+ * charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens.
+ * internal.h (_cpp_valid_utf8): Declare.
+ * lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers.
+ (_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens.
+ Do all work in "default" case to avoid slowing down typical code paths.
+ Also handle $ and UCN in the default case for consistency.
+
2019-08-30 Nathan Sidwell <nathan@acm.org>
New # semantics for popping to "" name.
diff --git a/libcpp/charset.c b/libcpp/charset.c
index 8a0e5cb..1028621 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -1198,6 +1198,84 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
return from;
}
+/* Performs a task similar to _cpp_valid_ucn, but parses UTF-8-encoded
+ extended characters rather than UCNs. The bytes examined run from *PSTR
+ up to LIMIT. If the return value is TRUE, then a character was
+ successfully decoded and stored in *CP; *PSTR has been updated to point
+ one past the valid UTF-8 sequence. Diagnostics may have been emitted if
+ the character parsed is not allowed in the current context. If the return
+ value is FALSE, then *PSTR has not been modified and *CP is either 0,
+ meaning *PSTR does not form a valid UTF-8 sequence, or, when processing an
+ identifier in C mode, a codepoint that was validly encoded but is not
+ allowed in an identifier. In either case no diagnostic is emitted, and
+ the FALSE return should cause a new token to be formed.
+
+ Unlike _cpp_valid_ucn, this will never be called when lexing a string; only
+ a potential identifier, or a CPP_OTHER token. NST is unused in the latter
+ case.
+
+ As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
+ the start of an identifier, or 2 otherwise. */
+
+extern bool
+_cpp_valid_utf8 (cpp_reader *pfile,
+ const uchar **pstr,
+ const uchar *limit,
+ int identifier_pos,
+ struct normalize_state *nst,
+ cppchar_t *cp)
+{
+ const uchar *base = *pstr;
+ size_t inbytesleft = limit - base;
+ if (one_utf8_to_cppchar (pstr, &inbytesleft, cp))
+ {
+ /* Not valid UTF-8. No diagnostic here; this byte will instead
+ become a new token. */
+ *cp = 0;
+ return false;
+ }
+
+ if (identifier_pos)
+ {
+ switch (ucn_valid_in_identifier (pfile, *cp, nst))
+ {
+
+ case 0:
+ /* In C++, this is an error for an invalid character in an identifier,
+ because logically the UTF-8 was converted to a UCN during translation
+ phase 1 (even though we don't physically do it that way). In C, the
+ bytes instead become grammatically a separate token, so back up and
+ report failure without a diagnostic. */
+
+ if (CPP_OPTION (pfile, cplusplus))
+ cpp_error (pfile, CPP_DL_ERROR,
+ "extended character %.*s is not valid in an identifier",
+ (int) (*pstr - base), base);
+ else
+ {
+ *pstr = base; /* A FALSE return must leave *PSTR where it started. */
+ return false;
+ }
+
+ break;
+
+ case 2:
+ if (identifier_pos == 1)
+ {
+ /* This is treated the same way in C++ or C99 -- lexed as an
+ identifier which is then invalid because an identifier is
+ not allowed to start with this character. */
+ cpp_error (pfile, CPP_DL_ERROR,
+ "extended character %.*s is not valid at the start of an identifier",
+ (int) (*pstr - base), base);
+ }
+ break;
+ }
+ }
+
+ return true; /* Decoded; any diagnostic was already issued above. */
+}
+
/* Subroutine of convert_hex and convert_oct. N is the representation
in the execution character set of a numeric escape; write it into the
string buffer TBUF and update the end-of-string pointer therein. WIDE
@@ -1956,8 +2034,9 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
}
/* Convert an identifier denoted by ID and LEN, which might contain
- UCN escapes, to the source character set, either UTF-8 or
- UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
+ UCN escapes or UTF-8 multibyte chars, to the source character set,
+ either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually
+ a valid identifier. */
cpp_hashnode *
_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
{
diff --git a/libcpp/internal.h b/libcpp/internal.h
index f9bcd37..90263bd 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -791,6 +791,14 @@ extern bool _cpp_valid_ucn (cpp_reader *, const unsigned char **,
cppchar_t *,
source_range *char_range,
cpp_string_location_reader *loc_reader);
+
+extern bool _cpp_valid_utf8 (cpp_reader *pfile,
+ const uchar **pstr,
+ const uchar *limit,
+ int identifier_pos,
+ struct normalize_state *nst,
+ cppchar_t *cp);
+
extern void _cpp_destroy_iconv (cpp_reader *);
extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
unsigned char *, size_t, size_t,
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 52e5bce..0e8de38 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1313,7 +1313,9 @@ warn_about_normalization (cpp_reader *pfile,
}
}
-/* Returns TRUE if the sequence starting at buffer->cur is invalid in
+static const cppchar_t utf8_signifier = 0xC0; /* Lowest multibyte UTF-8 lead byte. */
+
+/* Returns TRUE if the sequence starting at buffer->cur is valid in
an identifier. FIRST is TRUE if this starts an identifier. */
static bool
forms_identifier_p (cpp_reader *pfile, int first,
@@ -1336,17 +1338,25 @@ forms_identifier_p (cpp_reader *pfile, int first,
return true;
}
- /* Is this a syntactically valid UCN? */
- if (CPP_OPTION (pfile, extended_identifiers)
- && *buffer->cur == '\\'
- && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+ /* Is this a syntactically valid UCN or a valid UTF-8 char? */
+ if (CPP_OPTION (pfile, extended_identifiers))
{
cppchar_t s;
- buffer->cur += 2;
- if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
- state, &s, NULL, NULL))
- return true;
- buffer->cur -= 2;
+ if (*buffer->cur >= utf8_signifier)
+ {
+ if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+ state, &s))
+ return true;
+ }
+ else if (*buffer->cur == '\\'
+ && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+ {
+ buffer->cur += 2;
+ if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+ state, &s, NULL, NULL))
+ return true;
+ buffer->cur -= 2;
+ }
}
return false;
@@ -1464,7 +1474,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
pfile->buffer->cur = cur;
if (starts_ucn || forms_identifier_p (pfile, false, nst))
{
- /* Slower version for identifiers containing UCNs (or $). */
+ /* Slower version for identifiers containing UCNs
+ or extended chars (including $). */
do {
while (ISIDNUM (*pfile->buffer->cur))
{
@@ -3123,12 +3134,12 @@ _cpp_lex_direct (cpp_reader *pfile)
/* @ is a punctuator in Objective-C. */
case '@': result->type = CPP_ATSIGN; break;
- case '$':
- case '\\':
+ default:
{
const uchar *base = --buffer->cur;
- struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+ /* Check for an extended identifier ($ or UCN or UTF-8). */
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
if (forms_identifier_p (pfile, true, &nst))
{
result->type = CPP_NAME;
@@ -3137,13 +3148,21 @@ _cpp_lex_direct (cpp_reader *pfile)
warn_about_normalization (pfile, result, &nst);
break;
}
+
+ /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
+ single token. */
buffer->cur++;
+ if (c >= utf8_signifier)
+ {
+ const uchar *pstr = base;
+ cppchar_t s;
+ if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
+ buffer->cur = pstr;
+ }
+ create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
+ break;
}
- /* FALLTHRU */
- default:
- create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
- break;
}
/* Potentially convert the location of the token to a range. */