From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 22993 invoked by alias); 25 Sep 2009 20:40:44 -0000 Received: (qmail 22972 invoked by uid 22791); 25 Sep 2009 20:40:42 -0000 X-SWARE-Spam-Status: No, hits=-2.4 required=5.0 tests=AWL,BAYES_00,SPF_HELO_PASS,SPF_PASS X-Spam-Check-By: sourceware.org Received: from mx1.redhat.com (HELO mx1.redhat.com) (209.132.183.28) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Fri, 25 Sep 2009 20:40:33 +0000 Received: from int-mx05.intmail.prod.int.phx2.redhat.com (int-mx05.intmail.prod.int.phx2.redhat.com [10.5.11.18]) by mx1.redhat.com (8.13.8/8.13.8) with ESMTP id n8PKeV6X001631 for ; Fri, 25 Sep 2009 16:40:31 -0400 Received: from ns3.rdu.redhat.com (ns3.rdu.redhat.com [10.11.255.199]) by int-mx05.intmail.prod.int.phx2.redhat.com (8.13.8/8.13.8) with ESMTP id n8PKeULW008740; Fri, 25 Sep 2009 16:40:31 -0400 Received: from opsy.redhat.com (ovpn01.gateway.prod.ext.phx2.redhat.com [10.5.9.1]) by ns3.rdu.redhat.com (8.13.8/8.13.8) with ESMTP id n8PKeTZx028455; Fri, 25 Sep 2009 16:40:29 -0400 Received: by opsy.redhat.com (Postfix, from userid 500) id 2B98237818C; Fri, 25 Sep 2009 14:40:29 -0600 (MDT) From: Tom Tromey To: gdb-patches@sourceware.org Subject: FYI: use UTF-16 and UTF-32, not UCS-2 and UCS-4 Reply-To: tromey@redhat.com Date: Fri, 25 Sep 2009 20:40:00 -0000 Message-ID: User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/23.1 (gnu/linux) MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Mailing-List: contact gdb-patches-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: gdb-patches-owner@sourceware.org X-SW-Source: 2009-09/txt/msg00819.txt.bz2 I'm checking this in on the trunk. This changes gdb to use UTF-16 and UTF-32, not UCS-2 and UCS-4. UCS-2 is obsolete, and UCS-4 is basically the same as UTF-32 -- but changing this name is more consistent, and also interoperates more nicely with Python, which apparently only recognizes the UTF names. Built and regtested on x86-64 (compile farm). I am undecided as to whether I ought to push this into the 7.0 branch. Any opinions? Tom 2009-09-25 Tom Tromey * charset.c (iconv_open): Use UTF-16 and UTF-32, not UCS-2 and UCS-4. (iconv): Rename 'ucs_flag'. (GDB_DEFAULT_TARGET_WIDE_CHARSET): Use UTF-32. * c-lang.c (charset_for_string_type): Use UTF-16 and UTF-32, not UCS-2 and UCS-4. (convert_ucn): Use UTF-32. 2009-09-25 Tom Tromey * gdb.base/charset.exp: Use UTF-16 and UTF-32, not UCS-2 and UCS-4. * gdb.base/charset.c (utf_32_string): Rename. (init_utf32): Rename. (main): Update. diff --git a/gdb/c-lang.c b/gdb/c-lang.c index 4ba81ba..911aa5f 100644 --- a/gdb/c-lang.c +++ b/gdb/c-lang.c @@ -52,17 +52,17 @@ charset_for_string_type (enum c_string_type str_type, case C_WIDE_STRING: return target_wide_charset (byte_order); case C_STRING_16: - /* FIXME: UCS-2 is not always correct. */ + /* FIXME: UTF-16 is not always correct. */ if (byte_order == BFD_ENDIAN_BIG) - return "UCS-2BE"; + return "UTF-16BE"; else - return "UCS-2LE"; + return "UTF-16LE"; case C_STRING_32: - /* FIXME: UCS-4 is not always correct. */ + /* FIXME: UTF-32 is not always correct. */ if (byte_order == BFD_ENDIAN_BIG) - return "UCS-4BE"; + return "UTF-32BE"; else - return "UCS-4LE"; + return "UTF-32LE"; } internal_error (__FILE__, __LINE__, "unhandled c_string_type"); } @@ -763,7 +763,7 @@ convert_ucn (char *p, char *limit, const char *dest_charset, result >>= 8; } - convert_between_encodings ("UCS-4BE", dest_charset, data, 4, 4, output, + convert_between_encodings ("UTF-32BE", dest_charset, data, 4, 4, output, translit_none); return p; diff --git a/gdb/charset.c b/gdb/charset.c index a59d9c6..f5281ed 100644 --- a/gdb/charset.c +++ b/gdb/charset.c @@ -102,17 +102,17 @@ iconv_t iconv_open (const char *to, const char *from) { - /* We allow conversions from UCS-4BE, wchar_t, and the host charset. + /* We allow conversions from UTF-32BE, wchar_t, and the host charset. We allow conversions to wchar_t and the host charset. */ - if (strcmp (from, "UCS-4BE") && strcmp (from, "wchar_t") + if (strcmp (from, "UTF-32BE") && strcmp (from, "wchar_t") && strcmp (from, GDB_DEFAULT_HOST_CHARSET)) return -1; if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET)) return -1; - /* Return 1 if we are converting from UCS-4BE, 0 otherwise. This is + /* Return 1 if we are converting from UTF-32BE, 0 otherwise. This is used as a flag in calls to iconv. */ - return !strcmp (from, "UCS-4BE"); + return !strcmp (from, "UTF-32BE"); } int @@ -122,10 +122,10 @@ iconv_close (iconv_t arg) } size_t -iconv (iconv_t ucs_flag, const char **inbuf, size_t *inbytesleft, +iconv (iconv_t utf_flag, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { - if (ucs_flag) + if (utf_flag) { while (*inbytesleft >= 4) { @@ -193,7 +193,7 @@ iconv (iconv_t ucs_flag, const char **inbuf, size_t *inbytesleft, #endif #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET -#define GDB_DEFAULT_TARGET_WIDE_CHARSET "UCS-4" +#define GDB_DEFAULT_TARGET_WIDE_CHARSET "UTF-32" #endif static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET; diff --git a/gdb/testsuite/gdb.base/charset.c b/gdb/testsuite/gdb.base/charset.c index b61e047..5f07c9e 100644 --- a/gdb/testsuite/gdb.base/charset.c +++ b/gdb/testsuite/gdb.base/charset.c @@ -50,10 +50,10 @@ char ebcdic_us_string[NUM_CHARS]; char ibm1047_string[NUM_CHARS]; /* We make a phony wchar_t and then pretend that this platform uses - UCS-4 (or UCS-2, depending on the size -- same difference for the + UTF-32 (or UTF-16, depending on the size -- same difference for the purposes of this test). */ typedef unsigned int wchar_t; -wchar_t ucs_4_string[NUM_CHARS]; +wchar_t utf_32_string[NUM_CHARS]; /* We also define a couple phony types for testing the u'' and U'' support. It is ok if these have the wrong size on some platforms @@ -103,12 +103,12 @@ fill_run (char string[], int start, int len, int first) void -init_ucs4 () +init_utf32 () { int i; for (i = 0; i < NUM_CHARS; ++i) - ucs_4_string[i] = iso_8859_1_string[i] & 0xff; + utf_32_string[i] = iso_8859_1_string[i] & 0xff; } int main () @@ -171,9 +171,9 @@ int main () /* The digits, at least, are contiguous. */ fill_run (ibm1047_string, 59, 10, 240); - init_ucs4 (); + init_utf32 (); - myvar = ucs_4_string[7]; + myvar = utf_32_string[7]; return 0; /* all strings initialized */ } diff --git a/gdb/testsuite/gdb.base/charset.exp b/gdb/testsuite/gdb.base/charset.exp index fe1fbb0..7a96bb8 100644 --- a/gdb/testsuite/gdb.base/charset.exp +++ b/gdb/testsuite/gdb.base/charset.exp @@ -375,10 +375,10 @@ gdb_expect { set wchar_size [get_sizeof wchar_t 99] set wchar_ok 0 if {$wchar_size == 2} { - lappend charset_subset UCS-2 + lappend charset_subset UTF-16 set wchar_ok 1 } elseif {$wchar_size == 4} { - lappend charset_subset UCS-4 + lappend charset_subset UTF-32 set wchar_ok 1 } @@ -388,7 +388,7 @@ foreach target_charset $charset_subset { continue } - if {$target_charset == "UCS-4" || $target_charset == "UCS-2"} { + if {$target_charset == "UTF-32" || $target_charset == "UTF-16"} { set param target-wide-charset set L L } else { @@ -424,10 +424,10 @@ foreach target_charset $charset_subset { # a string in $target_charset. The variable's name is the # character set's name, in lower-case, with all non-identifier # characters replaced with '_', with "_string" stuck on the end. - if {$target_charset == "UCS-2"} { - # We still use the ucs_4_string variable -- but the size is - # correct for UCS-2. - set var_name ucs_4_string + if {$target_charset == "UTF-16"} { + # We still use the utf_32_string variable -- but the size is + # correct for UTF-16. + set var_name utf_32_string } else { set var_name [string tolower "${target_charset}_string"] regsub -all -- "\[^a-z0-9_\]" $var_name "_" var_name @@ -556,7 +556,7 @@ gdb_test "print '\\9'" " = \[0-9\]+ '9'" gdb_test "print \"\\1011\"" " = \"A1\"" # Tests for wide- or unicode- strings. L is the prefix letter to use, -# either "L" (for wide strings), "u" (for UCS-2), or "U" (for UCS-4). +# either "L" (for wide strings), "u" (for UTF-16), or "U" (for UTF-32). # NAME is used in the test names and should be related to the prefix # letter in some easy-to-undestand way. proc test_wide_or_unicode {L name} { @@ -582,12 +582,12 @@ if {$wchar_ok} { set ucs2_ok [expr {[get_sizeof char16_t 99] == 2}] if {$ucs2_ok} { - test_wide_or_unicode u UCS-2 + test_wide_or_unicode u UTF-16 } set ucs4_ok [expr {[get_sizeof char32_t 99] == 4}] if {$ucs4_ok} { - test_wide_or_unicode U UCS-4 + test_wide_or_unicode U UTF-32 } # Test an invalid string combination. @@ -598,16 +598,16 @@ proc test_combination {L1 name1 L2 name2} { } if {$wchar_ok && $ucs2_ok} { - test_combination L wide u UCS-2 + test_combination L wide u UTF-16 } if {$wchar_ok && $ucs4_ok} { - test_combination L wide U UCS-4 + test_combination L wide U UTF-32 # Regression test for a typedef to a typedef. gdb_test "print myvar" "= \[0-9\]+ L'A'" \ "typedef to wchar_t" } if {$ucs2_ok && $ucs4_ok} { - test_combination u UCS-2 U UCS-4 + test_combination u UTF-16 U UTF-32 } # Regression test for a cleanup bug in the charset code.