1 /* Character set conversion support for GDB.
3 Copyright (C) 2001, 2003, 2007, 2008, 2009 Free Software Foundation, Inc.
5 This file is part of GDB.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "gdb_assert.h"
24 #include "gdb_obstack.h"
25 #include "charset-list.h"
29 #include "gdb_string.h"
33 /* How GDB's character set support works
35 GDB has three global settings:
37 - The `current host character set' is the character set GDB should
38 use in talking to the user, and which (hopefully) the user's
39 terminal knows how to display properly. Most users should not
42 - The `current target character set' is the character set the
43 program being debugged uses.
45 - The `current target wide character set' is the wide character set
46 the program being debugged uses, that is, the encoding used for
49 There are commands to set each of these, and mechanisms for
50 choosing reasonable default values. GDB has a global list of
51 character sets that it can use as its host or target character
54 The header file `charset.h' declares various functions that
55 different pieces of GDB need to perform tasks like:
57 - printing target strings and characters to the user's terminal
58 (mostly target->host conversions),
60 - building target-appropriate representations of strings and
61 characters the user enters in expressions (mostly host->target
66 To avoid excessive code duplication and maintenance efforts,
67 GDB simply requires a capable iconv function. Users on platforms
68 without a suitable iconv can use the GNU iconv library. */
73 /* Provide a phony iconv that does as little as possible. Also,
74 arrange for there to be a single available character set. */
76 #undef GDB_DEFAULT_HOST_CHARSET
77 #define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
78 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
79 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
80 #undef DEFAULT_CHARSET_NAMES
81 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
90 #define ICONV_CONST const
93 iconv_open (const char *to, const char *from)
95 /* We allow conversions from UCS-4BE, wchar_t, and the host charset.
96 We allow conversions to wchar_t and the host charset. */
97 if (strcmp (from, "UCS-4BE") && strcmp (from, "wchar_t")
98 && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
100 if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
103 /* Return 1 if we are converting from UCS-4BE, 0 otherwise. This is
104 used as a flag in calls to iconv. */
105 return !strcmp (from, "UCS-4BE");
109 iconv_close (iconv_t arg)
115 iconv (iconv_t ucs_flag, const char **inbuf, size_t *inbytesleft,
116 char **outbuf, size_t *outbytesleft)
120 while (*inbytesleft >= 4)
125 for (j = 0; j < 4; ++j)
128 c += (*inbuf)[j] & 0xff;
143 if (*inbytesleft < 4)
151 /* In all other cases we simply copy input bytes to the
153 size_t amt = *inbytesleft;
154 if (amt > *outbytesleft)
156 memcpy (*outbuf, *inbuf, amt);
160 *outbytesleft -= amt;
169 /* The number of non-reversible conversions -- but they were all
178 /* The global lists of character sets and translations. */
181 #ifndef GDB_DEFAULT_TARGET_CHARSET
182 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
185 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
186 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UCS-4"
189 static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
190 static const char *host_charset_name = "auto";
/* Show callback for the host character set.  When VALUE is "auto",
   the report also includes the name currently stored in
   auto_host_charset_name; any other VALUE is printed as-is.  */
192 show_host_charset_name (struct ui_file *file, int from_tty,
193 struct cmd_list_element *c,
196 if (!strcmp (value, "auto"))
197 fprintf_filtered (file,
198 _("The host character set is \"auto; currently %s\".\n"),
199 auto_host_charset_name);
/* An explicitly chosen character set name is reported unchanged.  */
201 fprintf_filtered (file, _("The host character set is \"%s\".\n"), value);
204 static const char *target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
206 show_target_charset_name (struct ui_file *file, int from_tty,
207 struct cmd_list_element *c, const char *value)
209 fprintf_filtered (file, _("The target character set is \"%s\".\n"),
213 static const char *target_wide_charset_name = GDB_DEFAULT_TARGET_WIDE_CHARSET;
215 show_target_wide_charset_name (struct ui_file *file, int from_tty,
216 struct cmd_list_element *c, const char *value)
218 fprintf_filtered (file, _("The target wide character set is \"%s\".\n"),
222 static const char *default_charset_names[] =
224 DEFAULT_CHARSET_NAMES
228 static const char **charset_enum;
231 /* If the target wide character set has big- or little-endian
232 variants, these are the corresponding names. */
233 static const char *target_wide_charset_be_name;
234 static const char *target_wide_charset_le_name;
236 /* A helper function for validate that sets the names of the big- and
237 little-endian variants of the target wide character set, if any. */
240 set_be_le_names (void)
/* Forget any variant names recorded for a previous setting.  */
244 target_wide_charset_le_name = NULL;
245 target_wide_charset_be_name = NULL;
247 len = strlen (target_wide_charset_name);
/* Scan charset_enum up to its terminating NULL entry.  */
248 for (i = 0; charset_enum[i]; ++i)
/* A nonzero strncmp result means this entry does not begin with the
   generic target wide character set name.  */
250 if (strncmp (target_wide_charset_name, charset_enum[i], len))
/* Accept only a two-letter "BE" or "LE" suffix that ends the name.  */
252 if ((charset_enum[i][len] == 'B'
253 || charset_enum[i][len] == 'L')
254 && charset_enum[i][len + 1] == 'E'
255 && charset_enum[i][len + 2] == '\0')
/* The first suffix letter selects which variant name is recorded.  */
257 if (charset_enum[i][len] == 'B')
258 target_wide_charset_be_name = charset_enum[i];
260 target_wide_charset_le_name = charset_enum[i];
265 /* The sfunc's for the 'set charset', 'set host-charset',
266 'set target-charset', and 'set target-wide-charset' commands. */
272 const char *host_cset = host_charset ();
274 desc = iconv_open (target_wide_charset_name, host_cset);
275 if (desc == (iconv_t) -1)
276 error ("Cannot convert between character sets `%s' and `%s'",
277 target_wide_charset_name, host_cset);
280 desc = iconv_open (target_charset_name, host_cset);
281 if (desc == (iconv_t) -1)
282 error ("Cannot convert between character sets `%s' and `%s'",
283 target_charset_name, host_cset);
289 /* This is the sfunc for the 'set charset' command. */
291 set_charset_sfunc (char *charset, int from_tty, struct cmd_list_element *c)
293 /* CAREFUL: set the target charset here as well. */
294 target_charset_name = host_charset_name;
298 /* 'set host-charset' command sfunc. We need a wrapper here because
299 the function needs to have a specific signature. */
301 set_host_charset_sfunc (char *charset, int from_tty,
302 struct cmd_list_element *c)
307 /* Wrapper for the 'set target-charset' command. */
309 set_target_charset_sfunc (char *charset, int from_tty,
310 struct cmd_list_element *c)
315 /* Wrapper for the 'set target-wide-charset' command. */
317 set_target_wide_charset_sfunc (char *charset, int from_tty,
318 struct cmd_list_element *c)
323 /* sfunc for the 'show charset' command. */
325 show_charset (struct ui_file *file, int from_tty, struct cmd_list_element *c,
328 show_host_charset_name (file, from_tty, c, host_charset_name);
329 show_target_charset_name (file, from_tty, c, target_charset_name);
330 show_target_wide_charset_name (file, from_tty, c, target_wide_charset_name);
334 /* Accessor functions. */
339 if (!strcmp (host_charset_name, "auto"))
340 return auto_host_charset_name;
341 return host_charset_name;
345 target_charset (void)
347 return target_charset_name;
/* Return the name of the target wide character set.  An
   endian-specific variant (target_wide_charset_be_name or
   target_wide_charset_le_name) is preferred when one has been
   recorded; otherwise the generic target_wide_charset_name is
   returned.  */
351 target_wide_charset (void)
/* The big-endian variant is considered when current_gdbarch is
   big-endian.  */
353 if (gdbarch_byte_order (current_gdbarch) == BFD_ENDIAN_BIG)
355 if (target_wide_charset_be_name)
356 return target_wide_charset_be_name;
/* NOTE(review): presumably the little-endian check below sits in the
   else branch of the byte-order test -- confirm against the full
   source.  */
360 if (target_wide_charset_le_name)
361 return target_wide_charset_le_name;
/* No endian-specific variant is known; use the generic name.  */
364 return target_wide_charset_name;
368 /* Host character set management. For the time being, we assume that
369 the host character set is some superset of ASCII. */
372 host_letter_to_control_character (char c)
379 /* Convert a host character, C, to its hex value. C must already have
380 been validated using isxdigit. */
383 host_hex_value (char c)
387 if (c >= 'a' && c <= 'f')
389 gdb_assert (c >= 'A' && c <= 'F');
394 /* Public character management functions. */
396 /* A cleanup function which is run to close an iconv descriptor. */
399 cleanup_iconv (void *p)
402 iconv_close (*descp);
406 convert_between_encodings (const char *from, const char *to,
407 const gdb_byte *bytes, unsigned int num_bytes,
408 int width, struct obstack *output,
409 enum transliterations translit)
412 struct cleanup *cleanups;
415 unsigned int space_request;
417 /* Often, the host and target charsets will be the same. */
418 if (!strcmp (from, to))
420 obstack_grow (output, bytes, num_bytes);
424 desc = iconv_open (to, from);
425 if (desc == (iconv_t) -1)
426 perror_with_name ("Converting character sets");
427 cleanups = make_cleanup (cleanup_iconv, &desc);
430 inp = (char *) bytes;
432 space_request = num_bytes;
440 old_size = obstack_object_size (output);
441 obstack_blank (output, space_request);
443 outp = obstack_base (output) + old_size;
444 outleft = space_request;
446 r = iconv (desc, (ICONV_CONST char **) &inp, &inleft, &outp, &outleft);
448 /* Now make sure that the object on the obstack only includes
449 bytes we have converted. */
450 obstack_blank (output, - (int) outleft);
452 if (r == (size_t) -1)
460 /* Invalid input sequence. */
461 if (translit == translit_none)
462 error (_("Could not convert character to `%s' character set"),
465 /* We emit escape sequence for the bytes, skip them,
467 for (i = 0; i < width; ++i)
471 sprintf (octal, "\\%.3o", *inp & 0xff);
472 obstack_grow_str (output, octal);
481 /* We ran out of space in the output buffer. Make it
482 bigger next time around. */
487 /* Incomplete input sequence. FIXME: ought to report this
488 to the caller somehow. */
493 perror_with_name ("Internal error while converting character sets");
498 do_cleanups (cleanups);
503 /* An iterator that returns host wchar_t's from a target string. */
504 struct wchar_iterator
506 /* The underlying iconv descriptor. */
509 /* The input string. This is updated as we convert characters. */
511 /* The number of bytes remaining in the input. */
514 /* The width of an input character. */
517 /* The output buffer and its size. */
522 /* Create a new iterator. */
523 struct wchar_iterator *
524 make_wchar_iterator (const gdb_byte *input, size_t bytes, const char *charset,
527 struct wchar_iterator *result;
530 desc = iconv_open ("wchar_t", charset);
531 if (desc == (iconv_t) -1)
532 perror_with_name ("Converting character sets");
534 result = XNEW (struct wchar_iterator);
536 result->input = (char *) input;
537 result->bytes = bytes;
538 result->width = width;
540 result->out = XNEW (gdb_wchar_t);
541 result->out_size = 1;
547 do_cleanup_iterator (void *p)
549 struct wchar_iterator *iter = p;
551 iconv_close (iter->desc);
/* Register do_cleanup_iterator as a cleanup with ITER as its
   argument, and return the value make_cleanup yields for that
   registration.  */
557 make_cleanup_wchar_iterator (struct wchar_iterator *iter)
559 return make_cleanup (do_cleanup_iterator, iter);
563 wchar_iterate (struct wchar_iterator *iter,
564 enum wchar_iterate_result *out_result,
565 gdb_wchar_t **out_chars,
566 const gdb_byte **ptr,
571 /* Try to convert some characters. At first we try to convert just
572 a single character. The reason for this is that iconv does not
573 necessarily update its outgoing arguments when it encounters an
574 invalid input sequence -- but we want to reliably report this to
575 our caller so it can emit an escape sequence. */
577 while (iter->bytes > 0)
579 char *outptr = (char *) &iter->out[0];
580 char *orig_inptr = iter->input;
581 size_t orig_in = iter->bytes;
582 size_t out_avail = out_request * sizeof (gdb_wchar_t);
586 size_t r = iconv (iter->desc,
587 (ICONV_CONST char **) &iter->input, &iter->bytes,
588 &outptr, &out_avail);
589 if (r == (size_t) -1)
594 /* Invalid input sequence. Skip it, and let the caller
596 *out_result = wchar_iterate_invalid;
599 iter->input += iter->width;
600 iter->bytes -= iter->width;
604 /* We ran out of space. We still might have converted a
605 character; if so, return it. Otherwise, grow the
606 buffer and try again. */
607 if (out_avail < out_request * sizeof (gdb_wchar_t))
611 if (out_request > iter->out_size)
613 iter->out_size = out_request;
614 iter->out = xrealloc (iter->out,
615 out_request * sizeof (gdb_wchar_t));
620 /* Incomplete input sequence. Let the caller know, and
621 arrange for future calls to see EOF. */
622 *out_result = wchar_iterate_incomplete;
629 perror_with_name ("Internal error while converting character sets");
633 /* We converted something. */
634 num = out_request - out_avail / sizeof (gdb_wchar_t);
635 *out_result = wchar_iterate_ok;
636 *out_chars = iter->out;
638 *len = orig_in - iter->bytes;
643 *out_result = wchar_iterate_eof;
648 /* The charset.c module initialization function. */
650 extern initialize_file_ftype _initialize_charset; /* -Wmissing-prototypes */
652 typedef char *char_ptr;
653 DEF_VEC_P (char_ptr);
655 static VEC (char_ptr) *charsets;
/* PHONY_ICONV variant: the only character set added to the charsets
   vector is GDB_DEFAULT_HOST_CHARSET, followed by a NULL entry that
   terminates the list.  */
660 find_charset_names (void)
662 VEC_safe_push (char_ptr, charsets, GDB_DEFAULT_HOST_CHARSET);
663 VEC_safe_push (char_ptr, charsets, NULL);
666 #else /* PHONY_ICONV */
667 #ifdef HAVE_ICONVLIST
669 /* A helper function that adds some character sets to the vector of
670 all character sets. This is a callback function for iconvlist. */
673 add_one (unsigned int count, const char *const *names, void *data)
677 for (i = 0; i < count; ++i)
678 VEC_safe_push (char_ptr, charsets, xstrdup (names[i]));
/* HAVE_ICONVLIST variant: iconvlist is called with add_one as its
   callback, which pushes copies of the reported names onto the
   charsets vector; a NULL entry is then pushed to terminate the
   list.  */
684 find_charset_names (void)
686 iconvlist (add_one, NULL);
687 VEC_safe_push (char_ptr, charsets, NULL);
693 find_charset_names (void)
697 in = popen ("iconv -l", "r");
698 /* It is ok to ignore errors; we'll fall back on a default. */
702 /* POSIX says that iconv -l uses an unspecified format. We parse
703 the glibc format; feel free to add others as needed. */
706 /* The size of buf is chosen arbitrarily. A character set name
707 longer than this would not be very nice. */
710 char *r = fgets (buf, sizeof (buf), in);
716 if (buf[len - 2] == '/' && buf[len - 3] == '/')
718 VEC_safe_push (char_ptr, charsets, xstrdup (buf));
723 VEC_safe_push (char_ptr, charsets, NULL);
726 #endif /* HAVE_ICONVLIST */
727 #endif /* PHONY_ICONV */
730 _initialize_charset (void)
732 struct cmd_list_element *new_cmd;
734 /* The first element is always "auto"; then we skip it for the
735 commands where it is not allowed. */
736 VEC_safe_push (char_ptr, charsets, "auto");
737 find_charset_names ();
739 if (VEC_length (char_ptr, charsets) > 1)
740 charset_enum = (const char **) VEC_address (char_ptr, charsets);
742 charset_enum = default_charset_names;
745 #ifdef HAVE_LANGINFO_CODESET
746 auto_host_charset_name = nl_langinfo (CODESET);
747 target_charset_name = auto_host_charset_name;
753 add_setshow_enum_cmd ("charset", class_support,
754 &charset_enum[1], &host_charset_name, _("\
755 Set the host and target character sets."), _("\
756 Show the host and target character sets."), _("\
757 The `host character set' is the one used by the system GDB is running on.\n\
758 The `target character set' is the one used by the program being debugged.\n\
759 You may only use supersets of ASCII for your host character set; GDB does\n\
760 not support any others.\n\
761 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
762 /* Note that the sfunc below needs to set
763 target_charset_name, because the 'set
764 charset' command sets two variables. */
767 &setlist, &showlist);
769 add_setshow_enum_cmd ("host-charset", class_support,
770 charset_enum, &host_charset_name, _("\
771 Set the host character set."), _("\
772 Show the host character set."), _("\
773 The `host character set' is the one used by the system GDB is running on.\n\
774 You may only use supersets of ASCII for your host character set; GDB does\n\
775 not support any others.\n\
776 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
777 set_host_charset_sfunc,
778 show_host_charset_name,
779 &setlist, &showlist);
781 add_setshow_enum_cmd ("target-charset", class_support,
782 &charset_enum[1], &target_charset_name, _("\
783 Set the target character set."), _("\
784 Show the target character set."), _("\
785 The `target character set' is the one used by the program being debugged.\n\
786 GDB translates characters and strings between the host and target\n\
787 character sets as needed.\n\
788 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
789 set_target_charset_sfunc,
790 show_target_charset_name,
791 &setlist, &showlist);
793 add_setshow_enum_cmd ("target-wide-charset", class_support,
794 &charset_enum[1], &target_wide_charset_name,
796 Set the target wide character set."), _("\
797 Show the target wide character set."), _("\
798 The `target wide character set' is the one used by the program being debugged.\n\
799 In particular it is the encoding used by `wchar_t'.\n\
800 GDB translates characters and strings between the host and target\n\
801 character sets as needed.\n\
802 To see a list of the character sets GDB supports, type\n\
803 `set target-wide-charset'<TAB>"),
804 set_target_wide_charset_sfunc,
805 show_target_wide_charset_name,
806 &setlist, &showlist);