gdb/python: handle non-utf-8 character from gdb.execute()

I noticed that it was not possible to return a string containing
non-utf-8 characters using gdb.execute().  For example, using the
binary from the gdb.python/py-source-styling.exp test:

  (gdb) file ./gdb/testsuite/outputs/gdb.python/py-source-styling/py-source-styling
  Reading symbols from ./gdb/testsuite/outputs/gdb.python/py-source-styling/py-source-styling...
  (gdb) set style enabled off
  (gdb) list 26
  21        int some_variable = 1234;
  22
  23        /* The following line contains a character that is non-utf-8.  This is a
  24           critical part of the test as Python 3 can't convert this into a string
  25           using its default mechanism.  */
  26        char c[] = "�";     /* List this line.  */
  27
  28        return 0;
  29      }
  (gdb) python print(gdb.execute('list 26', to_string=True))
  Python Exception <class 'UnicodeDecodeError'>: 'utf-8' codec can't decode byte 0xc0 in position 250: invalid start byte
  Error occurred in Python: 'utf-8' codec can't decode byte 0xc0 in position 250: invalid start byte

It is necessary to disable styling before the initial 'list 26',
otherwise the source will be passed through GNU source highlight, and
GNU source highlight seems to be smart enough to figure out the
character encoding, and convert it to UTF-8.  This conversion is then
cached in the source cache, and the later Python gdb.execute call will
get back a pure UTF-8 string.

If source styling is disabled, then GDB caches the string without the
conversion to UTF-8, so the gdb.execute call now gets back the string
with a non-UTF-8 character within it, and Python throws an error
during its attempt to create a string object.

I'm not, at this point, proposing a solution that tries to guess the
source file encoding, though I guess such a thing could be done.
Instead, I think we should make use of the host_charset(), as set by
the user with 'set host-charset ....' during the creation of the
Python string.

To do this, in execute_gdb_command, we should switch from
PyUnicode_FromString, which requires the input be a UTF-8 string, to
using PyUnicode_Decode, which allows GDB to specify the string
encoding.  We will use host_charset().

With this done, it is now possible to list the file contents using
gdb.execute(), with the contents passing through a string:

  (gdb) set host-charset ISO-8859-1
  (gdb) python print(gdb.execute('list 26', to_string=True), end='')
  21        int some_variable = 1234;
  22
  23        /* The following line contains a character that is non-utf-8.  This is a
  24           critical part of the test as Python 3 can't convert this into a string
  25           using its default mechanism.  */
  26        char c[] = "À";     /* List this line.  */
  27
  28        return 0;
  29      }
  (gdb)

There are already plenty of other places in GDB's Python code where we
use PyUnicode_Decode to create a string from something that might
contain user generated content, so I believe this is the correct
approach.
author: Andrew Burgess <aburgess@redhat.com> 2025-02-14 11:51:41 +0000
committer: Andrew Burgess <aburgess@redhat.com> 2025-03-15 12:36:46 +0000
commit: 8bfe8a6bfdd45de43c626a12fd176750486a0759 (patch)
tree: f009f658a4f681d144a0c390a7ec3f23524f11a6 /gdb/testsuite/gdb.python
parent: c7d973ab6189290fc894529cec5db7f585074ab4 (diff)
download: binutils-8bfe8a6bfdd45de43c626a12fd176750486a0759.zip
binutils-8bfe8a6bfdd45de43c626a12fd176750486a0759.tar.gz
binutils-8bfe8a6bfdd45de43c626a12fd176750486a0759.tar.bz2
1 files changed, 55 insertions, 14 deletions
diff --git a/gdb/testsuite/gdb.python/py-source-styling.exp b/gdb/testsuite/gdb.python/py-source-styling.exp
index a406771..ba7e795 100644
--- a/gdb/testsuite/gdb.python/py-source-styling.exp
+++ b/gdb/testsuite/gdb.python/py-source-styling.exp
@@ -13,9 +13,13 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-# Check that the Python pygments module can be used for source
-# highlighting when GNU source highlight is not available (or is
-# disabled, as is done in this test).
+# Test related to source code highlighting and Python.  Includes a
+# test for using the Pygments module as a fall back to GNU source
+# highlight.
+#
+# This script also includes tests for handling a non-utf-8 character
+# with both Pygments highlighting, and with gdb.execute (when using
+# the list command).
 
 require allow_python_tests
 
@@ -23,14 +27,17 @@ load_lib gdb-python.exp
 
 standard_testfile
 
-with_ansi_styling_terminal {
-    # We need an ANSI-capable terminal to get the output, additionally
-    # we need to set LC_ALL so GDB knows the terminal is UTF-8
-    # capable, otherwise we'll get a UnicodeEncodeError trying to
-    # encode the output.
-    if { [prepare_for_testing "failed to prepare" ${testfile} ${srcfile}] } {
-	return
-    }
+if { [build_executable "failed to build" ${testfile} ${srcfile}] == -1 } {
+    return
+}
+
+set line_number [gdb_get_line_number "List this line."]
+
+# Check that the Python pygments module can be used for source
+# highlighting when GNU source highlight is not available (or is
+# disabled, as is done in this test).
+proc test_pygments_styling {} {
+    clean_restart $::binfile
 
     if { ![gdb_py_module_available "pygments"] } {
 	unsupported "pygments module not available"
@@ -46,8 +53,7 @@ with_ansi_styling_terminal {
     gdb_test "maint flush source-cache" "Source cache flushed\\."
 
     set seen_style_escape false
-    set line_number [gdb_get_line_number "List this line."]
-    gdb_test_multiple "list ${line_number}" "" {
+    gdb_test_multiple "list $::line_number" "" {
 	-re "Python Exception.*" {
 	    fail $gdb_test_name
 	}
@@ -55,8 +61,43 @@ with_ansi_styling_terminal {
 	    set seen_style_escape true
 	    exp_continue
 	}
-	-re "$gdb_prompt $" {
+	-re "$::gdb_prompt $" {
 	    gdb_assert { $seen_style_escape } $gdb_test_name
 	}
     }
 }
+
+# Use gdb.execute to list source code containing non-utf-8 character.
+# Check that initially GDB fails to convert the source code to a
+# string, then set the correct host encoding, and try again.  This
+# time the conversion should succeed.
+proc test_gdb_execute_non_utf8_source {} {
+    clean_restart $::binfile
+
+    # The host charset used by default here ('ascii', per the error below)
+    # cannot decode the non-utf-8 character in the source, so this will fail.
+    gdb_test \
+	"python source = gdb.execute('list $::line_number', True, True)" \
+	[multi_line \
+	     "Python Exception <class 'UnicodeDecodeError'>: 'ascii' codec can't decode byte 0xc0 in position 250: ordinal not in range\\(128\\)" \
+	     "Error occurred in Python: 'ascii' codec can't decode byte 0xc0 in position 250: ordinal not in range\\(128\\)"] \
+	"gdb.execute fails to convert result to string"
+
+    # Set the correct host charset, and try the conversion again.
+    gdb_test_no_output "set host-charset ISO-8859-1"
+    gdb_test_no_output \
+	"python source = gdb.execute('list $::line_number', True, True)" \
+	"gdb.execute does convert result to string"
+
+    # Check that we captured something that looks like the expected source.
+    gdb_test "python print(source)" ".*List this line.*"
+}
+
+# We need an ANSI-capable terminal to get the output, additionally we
+# need to set LC_ALL so GDB knows the terminal is UTF-8 capable,
+# otherwise we'll get a UnicodeEncodeError trying to encode the
+# output.
+with_ansi_styling_terminal {
+    test_pygments_styling
+    test_gdb_execute_non_utf8_source
+}
author	Andrew Burgess <aburgess@redhat.com>	2025-02-14 11:51:41 +0000
committer	Andrew Burgess <aburgess@redhat.com>	2025-03-15 12:36:46 +0000
commit	8bfe8a6bfdd45de43c626a12fd176750486a0759 (patch)
tree	f009f658a4f681d144a0c390a7ec3f23524f11a6 /gdb/testsuite/gdb.python
parent	c7d973ab6189290fc894529cec5db7f585074ab4 (diff)
download	binutils-8bfe8a6bfdd45de43c626a12fd176750486a0759.zip binutils-8bfe8a6bfdd45de43c626a12fd176750486a0759.tar.gz binutils-8bfe8a6bfdd45de43c626a12fd176750486a0759.tar.bz2