From: Tom Tromey <tom@tromey.com>
To: gdb-patches@sourceware.org
Cc: Tom Tromey <tom@tromey.com>
Subject: [PATCH] Allow non-ASCII characters in Rust identifiers
Date: Wed, 26 Jan 2022 16:15:01 -0700 [thread overview]
Message-ID: <20220126231501.1031201-1-tom@tromey.com> (raw)
Rust 1.53 (quite a while ago now) ungated the support for non-ASCII
identifiers. This didn't work in gdb. This is PR rust/20166.
This patch fixes the problem by allowing non-ASCII characters to be
considered as identifier components. It seemed simplest to just pass
them through -- doing any extra checking didn't seem worthwhile.
The new test also verifies that such characters are allowed in strings
and character literals as well. The latter also required a bit of
work in the lexer.
Bug: https://sourceware.org/bugzilla/show_bug.cgi?id=20166
---
gdb/rust-parse.c | 70 ++++++++++++++++++++++--------
gdb/testsuite/gdb.rust/unicode.exp | 51 ++++++++++++++++++++++
gdb/testsuite/gdb.rust/unicode.rs | 26 +++++++++++
3 files changed, 129 insertions(+), 18 deletions(-)
create mode 100644 gdb/testsuite/gdb.rust/unicode.exp
create mode 100644 gdb/testsuite/gdb.rust/unicode.rs
diff --git a/gdb/rust-parse.c b/gdb/rust-parse.c
index 31a1ee3b38f..aa215f9cf2a 100644
--- a/gdb/rust-parse.c
+++ b/gdb/rust-parse.c
@@ -33,6 +33,12 @@
using namespace expr;
+#if WORDS_BIGENDIAN
+#define UTF32 "UTF-32BE"
+#else
+#define UTF32 "UTF-32LE"
+#endif
+
/* A regular expression for matching Rust numbers. This is split up
since it is very long and this gives us a way to comment the
sections. */
@@ -577,6 +583,35 @@ rust_parser::lex_escape (int is_byte)
return result;
}
+/* A helper for lex_character. Store in *LEN the byte count up to the
+ closing quote, then convert those host-charset bytes to UTF-32. */
+
+static uint32_t
+lex_multibyte_char (const char *text, int *len)
+{
+ /* Scan to the closing quote or NUL. The length is not capped here;
+ a literal longer than one character is rejected after conversion. */
+ int quote;
+ gdb_assert (text[0] != '\'');
+ for (quote = 1; text[quote] != '\0' && text[quote] != '\''; ++quote)
+ ;
+ *len = quote;
+ /* No closing quote: the caller will issue an error. */
+ if (text[quote] == '\0')
+ return 0;
+
+ auto_obstack result;
+ convert_between_encodings (host_charset (), UTF32, (const gdb_byte *) text,
+ quote, 1, &result, translit_none);
+
+ int size = obstack_object_size (&result);
+ if (size > 4)
+ error (_("overlong character literal"));
+ uint32_t value = 0; /* Stays 0 if fewer than 4 bytes were produced. */
+ memcpy (&value, obstack_finish (&result), size);
+ return value;
+}
+
/* Lex a character constant. */
int
@@ -592,13 +627,15 @@ rust_parser::lex_character ()
}
gdb_assert (pstate->lexptr[0] == '\'');
++pstate->lexptr;
- /* This should handle UTF-8 here. */
- if (pstate->lexptr[0] == '\\')
+ if (pstate->lexptr[0] == '\'')
+ error (_("empty character literal"));
+ else if (pstate->lexptr[0] == '\\')
value = lex_escape (is_byte);
else
{
- value = pstate->lexptr[0] & 0xff;
- ++pstate->lexptr;
+ int len;
+ value = lex_multibyte_char (&pstate->lexptr[0], &len);
+ pstate->lexptr += len;
}
if (pstate->lexptr[0] != '\'')
@@ -695,16 +732,9 @@ rust_parser::lex_string ()
if (is_byte)
obstack_1grow (&obstack, value);
else
- {
-#if WORDS_BIGENDIAN
-#define UTF32 "UTF-32BE"
-#else
-#define UTF32 "UTF-32LE"
-#endif
- convert_between_encodings (UTF32, "UTF-8", (gdb_byte *) &value,
- sizeof (value), sizeof (value),
- &obstack, translit_none);
- }
+ convert_between_encodings (UTF32, "UTF-8", (gdb_byte *) &value,
+ sizeof (value), sizeof (value),
+ &obstack, translit_none);
}
else if (pstate->lexptr[0] == '\0')
error (_("Unexpected EOF in string"));
@@ -746,7 +776,10 @@ rust_identifier_start_p (char c)
return ((c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| c == '_'
- || c == '$');
+ || c == '$'
+ /* Allow any non-ASCII character as an identifier. There
+ doesn't seem to be a need to be picky about this. */
+ || (c & 0x80) != 0);
}
/* Lex an identifier. */
@@ -772,13 +805,14 @@ rust_parser::lex_identifier ()
++pstate->lexptr;
- /* For the time being this doesn't handle Unicode rules. Non-ASCII
- identifiers are gated anyway. */
+ /* Allow any non-ASCII character here. This "handles" UTF-8 by
+ passing it through. */
while ((pstate->lexptr[0] >= 'a' && pstate->lexptr[0] <= 'z')
|| (pstate->lexptr[0] >= 'A' && pstate->lexptr[0] <= 'Z')
|| pstate->lexptr[0] == '_'
|| (is_gdb_var && pstate->lexptr[0] == '$')
- || (pstate->lexptr[0] >= '0' && pstate->lexptr[0] <= '9'))
+ || (pstate->lexptr[0] >= '0' && pstate->lexptr[0] <= '9')
+ || (pstate->lexptr[0] & 0x80) != 0)
++pstate->lexptr;
diff --git a/gdb/testsuite/gdb.rust/unicode.exp b/gdb/testsuite/gdb.rust/unicode.exp
new file mode 100644
index 00000000000..9de0a0e724f
--- /dev/null
+++ b/gdb/testsuite/gdb.rust/unicode.exp
@@ -0,0 +1,51 @@
+# Copyright (C) 2022 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Test non-ASCII identifiers, strings, and character literals.
+
+load_lib rust-support.exp
+if {[skip_rust_tests]} {
+ continue
+}
+
+# Non-ASCII identifiers were allowed starting in 1.53.
+set v [split [rust_compiler_version] .]
+if {[lindex $v 0] == 1 && [lindex $v 1] < 53} {
+ untested "this test requires rust 1.53 or greater"
+ return -1
+}
+
+# Enable basic use of UTF-8. LC_ALL gets reset for each testfile.
+setenv LC_ALL C.UTF-8
+
+standard_testfile .rs
+if {[prepare_for_testing "failed to prepare" $testfile $srcfile {debug rust}]} {
+ return -1
+}
+
+set line [gdb_get_line_number "set breakpoint here"]
+if {![runto ${srcfile}:$line]} {
+ untested "could not run to breakpoint"
+ return -1
+}
+
+gdb_test "print 𝕯" " = 98" "print D"
+gdb_test "print \"𝕯\"" " = \"𝕯\"" "print D in string"
+# This output is maybe not ideal, but it also isn't incorrect.
+gdb_test "print '𝕯'" " = 120175 '\\\\u\\\{01d56f\\\}'" \
+ "print D as char"
+gdb_test "print cç" " = 97" "print cc"
+
+gdb_test "print 'çc'" "overlong character literal" "print cc as char"
diff --git a/gdb/testsuite/gdb.rust/unicode.rs b/gdb/testsuite/gdb.rust/unicode.rs
new file mode 100644
index 00000000000..c6ca90e6450
--- /dev/null
+++ b/gdb/testsuite/gdb.rust/unicode.rs
@@ -0,0 +1,26 @@
+// Copyright (C) 2022 Free Software Foundation, Inc.
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#![allow(dead_code)]
+#![allow(unused_variables)]
+#![allow(unused_assignments)]
+#![allow(uncommon_codepoints)]
+#![allow(non_snake_case)]
+
+fn main() {
+ let 𝕯 = 98; // non-ASCII identifier; unicode.exp prints it and expects 98
+ let cç = 97; // mixed ASCII/non-ASCII identifier; unicode.exp expects 97
+ println!("{}, {}", 𝕯, cç); // set breakpoint here
+}
--
2.31.1
next reply other threads:[~2022-01-26 23:17 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-01-26 23:15 Tom Tromey [this message]
2022-02-06 20:23 ` Tom Tromey
2022-04-03 16:17 ` Andrew Burgess via Gdb-patches
2022-04-03 16:51 ` Tom Tromey
2022-04-03 17:34 ` Andrew Burgess via Gdb-patches
2022-04-04 9:10 ` Andrew Burgess via Gdb-patches
2022-04-04 9:48 ` Tom Tromey
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220126231501.1031201-1-tom@tromey.com \
--to=tom@tromey.com \
--cc=gdb-patches@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox