gdb/charset.c

   1 /* Character set conversion support for GDB.
   2
   3    Copyright (C) 2001-2021 Free Software Foundation, Inc.
   4
   5    This file is part of GDB.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 #include "defs.h"
  21 #include "charset.h"
  22 #include "gdbcmd.h"
  23 #include "gdb_obstack.h"
  24 #include "gdbsupport/gdb_wait.h"
  25 #include "charset-list.h"
  26 #include "gdbsupport/environ.h"
  27 #include "arch-utils.h"
  28 #include "gdbsupport/gdb_vecs.h"
  29 #include <ctype.h>
  30
  31 #ifdef USE_WIN32API
  32 #include <windows.h>
  33 #endif
  34 \f
  35 /* How GDB's character set support works
  36
  37    GDB has three global settings:
  38
  39    - The `current host character set' is the character set GDB should
  40      use in talking to the user, and which (hopefully) the user's
  41      terminal knows how to display properly.  Most users should not
  42      change this.
  43
  44    - The `current target character set' is the character set the
  45      program being debugged uses.
  46
  47    - The `current target wide character set' is the wide character set
  48      the program being debugged uses, that is, the encoding used for
  49      wchar_t.
  50
  51    There are commands to set each of these, and mechanisms for
  52    choosing reasonable default values.  GDB has a global list of
  53    character sets that it can use as its host or target character
  54    sets.
  55
  56    The header file `charset.h' declares various functions that
  57    different pieces of GDB need to perform tasks like:
  58
  59    - printing target strings and characters to the user's terminal
  60      (mostly target->host conversions),
  61
  62    - building target-appropriate representations of strings and
  63      characters the user enters in expressions (mostly host->target
  64      conversions),
  65
  66      and so on.
  67
  68    To avoid excessive code duplication and maintenance efforts,
  69    GDB simply requires a capable iconv function.  Users on platforms
  70    without a suitable iconv can use the GNU iconv library.  */
  71
  72 \f
  73 #ifdef PHONY_ICONV
  74
  75 /* Provide a phony iconv that does as little as possible.  Also,
  76    arrange for there to be a single available character set.  */
  77
  78 #undef GDB_DEFAULT_HOST_CHARSET
  79 #ifdef USE_WIN32API
  80 # define GDB_DEFAULT_HOST_CHARSET "CP1252"
  81 #else
  82 # define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
  83 #endif
  84 #define GDB_DEFAULT_TARGET_CHARSET GDB_DEFAULT_HOST_CHARSET
  85 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UTF-32"
  86 #undef DEFAULT_CHARSET_NAMES
  87 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
  88
  89 #undef iconv_t
  90 #define iconv_t int
  91 #undef iconv_open
  92 #define iconv_open phony_iconv_open
  93 #undef iconv
  94 #define iconv phony_iconv
  95 #undef iconv_close
  96 #define iconv_close phony_iconv_close
  97
  98 #undef ICONV_CONST
  99 #define ICONV_CONST const
 100
 101 /* We allow conversions from UTF-32, wchar_t, and the host charset.
 102    We allow conversions to wchar_t and the host charset.
 103    Return 1 if we are converting from UTF-32BE, 2 if from UTF32-LE,
 104    0 otherwise.  This is used as a flag in calls to iconv.  */
 105
 106 static iconv_t
 107 phony_iconv_open (const char *to, const char *from)
 108 {
 109   if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
 110     return -1;
 111
 112   if (!strcmp (from, "UTF-32BE") || !strcmp (from, "UTF-32"))
 113     return 1;
 114
 115   if (!strcmp (from, "UTF-32LE"))
 116     return 2;
 117
 118   if (strcmp (from, "wchar_t") && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
 119     return -1;
 120
 121   return 0;
 122 }
 123
 124 static int
 125 phony_iconv_close (iconv_t arg)
 126 {
 127   return 0;
 128 }
 129
 130 static size_t
 131 phony_iconv (iconv_t utf_flag, const char **inbuf, size_t *inbytesleft,
 132              char **outbuf, size_t *outbytesleft)
 133 {
 134   if (utf_flag)
 135     {
 136       enum bfd_endian endian
 137         = utf_flag == 1 ? BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE;
 138       while (*inbytesleft >= 4)
 139         {
 140           unsigned long c
 141             = extract_unsigned_integer ((const gdb_byte *)*inbuf, 4, endian);
 142
 143           if (c >= 256)
 144             {
 145               errno = EILSEQ;
 146               return -1;
 147             }
 148           if (*outbytesleft < 1)
 149             {
 150               errno = E2BIG;
 151               return -1;
 152             }
 153           **outbuf = c & 0xff;
 154           ++*outbuf;
 155           --*outbytesleft;
 156
 157           *inbuf += 4;
 158           *inbytesleft -= 4;
 159         }
 160       if (*inbytesleft)
 161         {
 162           /* Partial sequence on input.  */
 163           errno = EINVAL;
 164           return -1;
 165         }
 166     }
 167   else
 168     {
 169       /* In all other cases we simply copy input bytes to the
 170          output.  */
 171       size_t amt = *inbytesleft;
 172
 173       if (amt > *outbytesleft)
 174         amt = *outbytesleft;
 175       memcpy (*outbuf, *inbuf, amt);
 176       *inbuf += amt;
 177       *outbuf += amt;
 178       *inbytesleft -= amt;
 179       *outbytesleft -= amt;
 180       if (*inbytesleft)
 181         {
 182           errno = E2BIG;
 183           return -1;
 184         }
 185     }
 186
 187   /* The number of non-reversible conversions -- but they were all
 188      reversible.  */
 189   return 0;
 190 }
 191
 192 #else /* PHONY_ICONV */
 193
 194 /* On systems that don't have EILSEQ, GNU iconv's iconv.h defines it
 195    to ENOENT, while gnulib defines it to a different value.  Always
 196    map ENOENT to gnulib's EILSEQ, leaving callers agnostic.  */
 197
 198 static size_t
 199 gdb_iconv (iconv_t utf_flag, ICONV_CONST char **inbuf, size_t *inbytesleft,
 200            char **outbuf, size_t *outbytesleft)
 201 {
 202   size_t ret;
 203
 204   ret = iconv (utf_flag, inbuf, inbytesleft, outbuf, outbytesleft);
 205   if (errno == ENOENT)
 206     errno = EILSEQ;
 207   return ret;
 208 }
 209
 210 #undef iconv
 211 #define iconv gdb_iconv
 212
 213 #endif /* PHONY_ICONV */
 214
 215 \f
 216 /* The global lists of character sets and translations.  */
 217
 218
 219 #ifndef GDB_DEFAULT_TARGET_CHARSET
 220 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
 221 #endif
 222
 223 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
 224 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UTF-32"
 225 #endif
 226
 227 static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
 228 static const char *host_charset_name = "auto";
 229 static void
 230 show_host_charset_name (struct ui_file *file, int from_tty,
 231                         struct cmd_list_element *c,
 232                         const char *value)
 233 {
 234   if (!strcmp (value, "auto"))
 235     fprintf_filtered (file,
 236                       _("The host character set is \"auto; currently %s\".\n"),
 237                       auto_host_charset_name);
 238   else
 239     fprintf_filtered (file, _("The host character set is \"%s\".\n"), value);
 240 }
 241
 242 static const char *target_charset_name = "auto";
 243 static void
 244 show_target_charset_name (struct ui_file *file, int from_tty,
 245                           struct cmd_list_element *c, const char *value)
 246 {
 247   if (!strcmp (value, "auto"))
 248     fprintf_filtered (file,
 249                       _("The target character set is \"auto; "
 250                         "currently %s\".\n"),
 251                       gdbarch_auto_charset (get_current_arch ()));
 252   else
 253     fprintf_filtered (file, _("The target character set is \"%s\".\n"),
 254                       value);
 255 }
 256
 257 static const char *target_wide_charset_name = "auto";
 258 static void
 259 show_target_wide_charset_name (struct ui_file *file,
 260                                int from_tty,
 261                                struct cmd_list_element *c,
 262                                const char *value)
 263 {
 264   if (!strcmp (value, "auto"))
 265     fprintf_filtered (file,
 266                       _("The target wide character set is \"auto; "
 267                         "currently %s\".\n"),
 268                       gdbarch_auto_wide_charset (get_current_arch ()));
 269   else
 270     fprintf_filtered (file, _("The target wide character set is \"%s\".\n"),
 271                       value);
 272 }
 273
 274 static const char * const default_charset_names[] =
 275 {
 276   DEFAULT_CHARSET_NAMES
 277   0
 278 };
 279
 280 static const char * const *charset_enum;
 281
 282 \f
 283 /* If the target wide character set has big- or little-endian
 284    variants, these are the corresponding names.  */
 285 static const char *target_wide_charset_be_name;
 286 static const char *target_wide_charset_le_name;
 287
 288 /* The architecture for which the BE- and LE-names are valid.  */
 289 static struct gdbarch *be_le_arch;
 290
 291 /* A helper function which sets the target wide big- and little-endian
 292    character set names, if possible.  */
 293
 294 static void
 295 set_be_le_names (struct gdbarch *gdbarch)
 296 {
 297   if (be_le_arch == gdbarch)
 298     return;
 299   be_le_arch = gdbarch;
 300
 301 #ifdef PHONY_ICONV
 302   /* Match the wide charset names recognized by phony_iconv_open.  */
 303   target_wide_charset_le_name = "UTF-32LE";
 304   target_wide_charset_be_name = "UTF-32BE";
 305 #else
 306   int i, len;
 307   const char *target_wide;
 308
 309   target_wide_charset_le_name = NULL;
 310   target_wide_charset_be_name = NULL;
 311
 312   target_wide = target_wide_charset_name;
 313   if (!strcmp (target_wide, "auto"))
 314     target_wide = gdbarch_auto_wide_charset (gdbarch);
 315
 316   len = strlen (target_wide);
 317   for (i = 0; charset_enum[i]; ++i)
 318     {
 319       if (strncmp (target_wide, charset_enum[i], len))
 320         continue;
 321       if ((charset_enum[i][len] == 'B'
 322            || charset_enum[i][len] == 'L')
 323           && charset_enum[i][len + 1] == 'E'
 324           && charset_enum[i][len + 2] == '\0')
 325         {
 326           if (charset_enum[i][len] == 'B')
 327             target_wide_charset_be_name = charset_enum[i];
 328           else
 329             target_wide_charset_le_name = charset_enum[i];
 330         }
 331     }
 332 # endif  /* PHONY_ICONV */
 333 }
 334
 335 /* 'Set charset', 'set host-charset', 'set target-charset', 'set
 336    target-wide-charset', 'set charset' sfunc's.  */
 337
 338 static void
 339 validate (struct gdbarch *gdbarch)
 340 {
 341   iconv_t desc;
 342   const char *host_cset = host_charset ();
 343   const char *target_cset = target_charset (gdbarch);
 344   const char *target_wide_cset = target_wide_charset_name;
 345
 346   if (!strcmp (target_wide_cset, "auto"))
 347     target_wide_cset = gdbarch_auto_wide_charset (gdbarch);
 348
 349   desc = iconv_open (target_wide_cset, host_cset);
 350   if (desc == (iconv_t) -1)
 351     error (_("Cannot convert between character sets `%s' and `%s'"),
 352            target_wide_cset, host_cset);
 353   iconv_close (desc);
 354
 355   desc = iconv_open (target_cset, host_cset);
 356   if (desc == (iconv_t) -1)
 357     error (_("Cannot convert between character sets `%s' and `%s'"),
 358            target_cset, host_cset);
 359   iconv_close (desc);
 360
 361   /* Clear the cache.  */
 362   be_le_arch = NULL;
 363 }
 364
 365 /* This is the sfunc for the 'set charset' command.  */
 366 static void
 367 set_charset_sfunc (const char *charset, int from_tty,
 368                    struct cmd_list_element *c)
 369 {
 370   /* CAREFUL: set the target charset here as well.  */
 371   target_charset_name = host_charset_name;
 372   validate (get_current_arch ());
 373 }
 374
 375 /* 'set host-charset' command sfunc.  We need a wrapper here because
 376    the function needs to have a specific signature.  */
 377 static void
 378 set_host_charset_sfunc (const char *charset, int from_tty,
 379                         struct cmd_list_element *c)
 380 {
 381   validate (get_current_arch ());
 382 }
 383
 384 /* Wrapper for the 'set target-charset' command.  */
 385 static void
 386 set_target_charset_sfunc (const char *charset, int from_tty,
 387                           struct cmd_list_element *c)
 388 {
 389   validate (get_current_arch ());
 390 }
 391
 392 /* Wrapper for the 'set target-wide-charset' command.  */
 393 static void
 394 set_target_wide_charset_sfunc (const char *charset, int from_tty,
 395                                struct cmd_list_element *c)
 396 {
 397   validate (get_current_arch ());
 398 }
 399
 400 /* sfunc for the 'show charset' command.  */
 401 static void
 402 show_charset (struct ui_file *file, int from_tty,
 403               struct cmd_list_element *c,
 404               const char *name)
 405 {
 406   show_host_charset_name (file, from_tty, c, host_charset_name);
 407   show_target_charset_name (file, from_tty, c, target_charset_name);
 408   show_target_wide_charset_name (file, from_tty, c,
 409                                  target_wide_charset_name);
 410 }
 411
 412 \f
 413 /* Accessor functions.  */
 414
 415 const char *
 416 host_charset (void)
 417 {
 418   if (!strcmp (host_charset_name, "auto"))
 419     return auto_host_charset_name;
 420   return host_charset_name;
 421 }
 422
 423 const char *
 424 target_charset (struct gdbarch *gdbarch)
 425 {
 426   if (!strcmp (target_charset_name, "auto"))
 427     return gdbarch_auto_charset (gdbarch);
 428   return target_charset_name;
 429 }
 430
 431 const char *
 432 target_wide_charset (struct gdbarch *gdbarch)
 433 {
 434   enum bfd_endian byte_order = gdbarch_byte_order (gdbarch);
 435
 436   set_be_le_names (gdbarch);
 437   if (byte_order == BFD_ENDIAN_BIG)
 438     {
 439       if (target_wide_charset_be_name)
 440         return target_wide_charset_be_name;
 441     }
 442   else
 443     {
 444       if (target_wide_charset_le_name)
 445         return target_wide_charset_le_name;
 446     }
 447
 448   if (!strcmp (target_wide_charset_name, "auto"))
 449     return gdbarch_auto_wide_charset (gdbarch);
 450
 451   return target_wide_charset_name;
 452 }
 453
 454 \f
 455 /* Host character set management.  For the time being, we assume that
 456    the host character set is some superset of ASCII.  */
 457
 458 char
 459 host_letter_to_control_character (char c)
 460 {
 461   if (c == '?')
 462     return 0177;
 463   return c & 0237;
 464 }
 465
 466 /* Convert a host character, C, to its hex value.  C must already have
 467    been validated using isxdigit.  */
 468
 469 int
 470 host_hex_value (char c)
 471 {
 472   if (isdigit (c))
 473     return c - '0';
 474   if (c >= 'a' && c <= 'f')
 475     return 10 + c - 'a';
 476   gdb_assert (c >= 'A' && c <= 'F');
 477   return 10 + c - 'A';
 478 }
 479
 480 \f
 481 /* Public character management functions.  */
 482
 483 class iconv_wrapper
 484 {
 485 public:
 486
 487   iconv_wrapper (const char *to, const char *from)
 488   {
 489     m_desc = iconv_open (to, from);
 490     if (m_desc == (iconv_t) -1)
 491       perror_with_name (_("Converting character sets"));
 492   }
 493
 494   ~iconv_wrapper ()
 495   {
 496     iconv_close (m_desc);
 497   }
 498
 499   size_t convert (ICONV_CONST char **inp, size_t *inleft, char **outp,
 500                   size_t *outleft)
 501   {
 502     return iconv (m_desc, inp, inleft, outp, outleft);
 503   }
 504
 505 private:
 506
 507   iconv_t m_desc;
 508 };
 509
 510 void
 511 convert_between_encodings (const char *from, const char *to,
 512                            const gdb_byte *bytes, unsigned int num_bytes,
 513                            int width, struct obstack *output,
 514                            enum transliterations translit)
 515 {
 516   size_t inleft;
 517   ICONV_CONST char *inp;
 518   unsigned int space_request;
 519
 520   /* Often, the host and target charsets will be the same.  */
 521   if (!strcmp (from, to))
 522     {
 523       obstack_grow (output, bytes, num_bytes);
 524       return;
 525     }
 526
 527   iconv_wrapper desc (to, from);
 528
 529   inleft = num_bytes;
 530   inp = (ICONV_CONST char *) bytes;
 531
 532   space_request = num_bytes;
 533
 534   while (inleft > 0)
 535     {
 536       char *outp;
 537       size_t outleft, r;
 538       int old_size;
 539
 540       old_size = obstack_object_size (output);
 541       obstack_blank (output, space_request);
 542
 543       outp = (char *) obstack_base (output) + old_size;
 544       outleft = space_request;
 545
 546       r = desc.convert (&inp, &inleft, &outp, &outleft);
 547
 548       /* Now make sure that the object on the obstack only includes
 549          bytes we have converted.  */
 550       obstack_blank_fast (output, -(ssize_t) outleft);
 551
 552       if (r == (size_t) -1)
 553         {
 554           switch (errno)
 555             {
 556             case EILSEQ:
 557               {
 558                 int i;
 559
 560                 /* Invalid input sequence.  */
 561                 if (translit == translit_none)
 562                   error (_("Could not convert character "
 563                            "to `%s' character set"), to);
 564
 565                 /* We emit escape sequence for the bytes, skip them,
 566                    and try again.  */
 567                 for (i = 0; i < width; ++i)
 568                   {
 569                     char octal[5];
 570
 571                     xsnprintf (octal, sizeof (octal), "\\%.3o", *inp & 0xff);
 572                     obstack_grow_str (output, octal);
 573
 574                     ++inp;
 575                     --inleft;
 576                   }
 577               }
 578               break;
 579
 580             case E2BIG:
 581               /* We ran out of space in the output buffer.  Make it
 582                  bigger next time around.  */
 583               space_request *= 2;
 584               break;
 585
 586             case EINVAL:
 587               /* Incomplete input sequence.  FIXME: ought to report this
 588                  to the caller somehow.  */
 589               inleft = 0;
 590               break;
 591
 592             default:
 593               perror_with_name (_("Internal error while "
 594                                   "converting character sets"));
 595             }
 596         }
 597     }
 598 }
 599
 600 \f
 601
 602 /* Create a new iterator.  */
 603 wchar_iterator::wchar_iterator (const gdb_byte *input, size_t bytes,
 604                                 const char *charset, size_t width)
 605 : m_input (input),
 606   m_bytes (bytes),
 607   m_width (width),
 608   m_out (1)
 609 {
 610   m_desc = iconv_open (INTERMEDIATE_ENCODING, charset);
 611   if (m_desc == (iconv_t) -1)
 612     perror_with_name (_("Converting character sets"));
 613 }
 614
 615 wchar_iterator::~wchar_iterator ()
 616 {
 617   if (m_desc != (iconv_t) -1)
 618     iconv_close (m_desc);
 619 }
 620
 621 int
 622 wchar_iterator::iterate (enum wchar_iterate_result *out_result,
 623                          gdb_wchar_t **out_chars,
 624                          const gdb_byte **ptr,
 625                          size_t *len)
 626 {
 627   size_t out_request;
 628
 629   /* Try to convert some characters.  At first we try to convert just
 630      a single character.  The reason for this is that iconv does not
 631      necessarily update its outgoing arguments when it encounters an
 632      invalid input sequence -- but we want to reliably report this to
 633      our caller so it can emit an escape sequence.  */
 634   out_request = 1;
 635   while (m_bytes > 0)
 636     {
 637       ICONV_CONST char *inptr = (ICONV_CONST char *) m_input;
 638       char *outptr = (char *) m_out.data ();
 639       const gdb_byte *orig_inptr = m_input;
 640       size_t orig_in = m_bytes;
 641       size_t out_avail = out_request * sizeof (gdb_wchar_t);
 642       size_t num;
 643       size_t r = iconv (m_desc, &inptr, &m_bytes, &outptr, &out_avail);
 644
 645       m_input = (gdb_byte *) inptr;
 646
 647       if (r == (size_t) -1)
 648         {
 649           switch (errno)
 650             {
 651             case EILSEQ:
 652               /* Invalid input sequence.  We still might have
 653                  converted a character; if so, return it.  */
 654               if (out_avail < out_request * sizeof (gdb_wchar_t))
 655                 break;
 656
 657               /* Otherwise skip the first invalid character, and let
 658                  the caller know about it.  */
 659               *out_result = wchar_iterate_invalid;
 660               *ptr = m_input;
 661               *len = m_width;
 662               m_input += m_width;
 663               m_bytes -= m_width;
 664               return 0;
 665
 666             case E2BIG:
 667               /* We ran out of space.  We still might have converted a
 668                  character; if so, return it.  Otherwise, grow the
 669                  buffer and try again.  */
 670               if (out_avail < out_request * sizeof (gdb_wchar_t))
 671                 break;
 672
 673               ++out_request;
 674               if (out_request > m_out.size ())
 675                 m_out.resize (out_request);
 676               continue;
 677
 678             case EINVAL:
 679               /* Incomplete input sequence.  Let the caller know, and
 680                  arrange for future calls to see EOF.  */
 681               *out_result = wchar_iterate_incomplete;
 682               *ptr = m_input;
 683               *len = m_bytes;
 684               m_bytes = 0;
 685               return 0;
 686
 687             default:
 688               perror_with_name (_("Internal error while "
 689                                   "converting character sets"));
 690             }
 691         }
 692
 693       /* We converted something.  */
 694       num = out_request - out_avail / sizeof (gdb_wchar_t);
 695       *out_result = wchar_iterate_ok;
 696       *out_chars = m_out.data ();
 697       *ptr = orig_inptr;
 698       *len = orig_in - m_bytes;
 699       return num;
 700     }
 701
 702   /* Really done.  */
 703   *out_result = wchar_iterate_eof;
 704   return -1;
 705 }
 706
 707 struct charset_vector
 708 {
 709   ~charset_vector ()
 710   {
 711     clear ();
 712   }
 713
 714   void clear ()
 715   {
 716     for (char *c : charsets)
 717       xfree (c);
 718
 719     charsets.clear ();
 720   }
 721
 722   std::vector<char *> charsets;
 723 };
 724
 725 static charset_vector charsets;
 726
 727 #ifdef PHONY_ICONV
 728
 729 static void
 730 find_charset_names (void)
 731 {
 732   charsets.charsets.push_back (xstrdup (GDB_DEFAULT_HOST_CHARSET));
 733   charsets.charsets.push_back (NULL);
 734 }
 735
 736 #else /* PHONY_ICONV */
 737
 738 /* Sometimes, libiconv redefines iconvlist as libiconvlist -- but
 739    provides different symbols in the static and dynamic libraries.
 740    So, configure may see libiconvlist but not iconvlist.  But, calling
 741    iconvlist is the right thing to do and will work.  Hence we do a
 742    check here but unconditionally call iconvlist below.  */
 743 #if defined (HAVE_ICONVLIST) || defined (HAVE_LIBICONVLIST)
 744
 745 /* A helper function that adds some character sets to the vector of
 746    all character sets.  This is a callback function for iconvlist.  */
 747
 748 static int
 749 add_one (unsigned int count, const char *const *names, void *data)
 750 {
 751   unsigned int i;
 752
 753   for (i = 0; i < count; ++i)
 754     charsets.charsets.push_back (xstrdup (names[i]));
 755
 756   return 0;
 757 }
 758
 759 static void
 760 find_charset_names (void)
 761 {
 762   iconvlist (add_one, NULL);
 763
 764   charsets.charsets.push_back (NULL);
 765 }
 766
 767 #else
 768
 769 /* Return non-zero if LINE (output from iconv) should be ignored.
 770    Older iconv programs (e.g. 2.2.2) include the human readable
 771    introduction even when stdout is not a tty.  Newer versions omit
 772    the intro if stdout is not a tty.  */
 773
 774 static int
 775 ignore_line_p (const char *line)
 776 {
 777   /* This table is used to filter the output.  If this text appears
 778      anywhere in the line, it is ignored (strstr is used).  */
 779   static const char * const ignore_lines[] =
 780     {
 781       "The following",
 782       "not necessarily",
 783       "the FROM and TO",
 784       "listed with several",
 785       NULL
 786     };
 787   int i;
 788
 789   for (i = 0; ignore_lines[i] != NULL; ++i)
 790     {
 791       if (strstr (line, ignore_lines[i]) != NULL)
 792         return 1;
 793     }
 794
 795   return 0;
 796 }
 797
 798 static void
 799 find_charset_names (void)
 800 {
 801   struct pex_obj *child;
 802   const char *args[3];
 803   int err, status;
 804   int fail = 1;
 805   int flags;
 806   gdb_environ iconv_env = gdb_environ::from_host_environ ();
 807   char *iconv_program;
 808
 809   /* Older iconvs, e.g. 2.2.2, don't omit the intro text if stdout is
 810      not a tty.  We need to recognize it and ignore it.  This text is
 811      subject to translation, so force LANGUAGE=C.  */
 812   iconv_env.set ("LANGUAGE", "C");
 813   iconv_env.set ("LC_ALL", "C");
 814
 815   child = pex_init (PEX_USE_PIPES, "iconv", NULL);
 816
 817 #ifdef ICONV_BIN
 818   {
 819     std::string iconv_dir = relocate_gdb_directory (ICONV_BIN,
 820                                                     ICONV_BIN_RELOCATABLE);
 821     iconv_program
 822       = concat (iconv_dir.c_str(), SLASH_STRING, "iconv", (char *) NULL);
 823   }
 824 #else
 825   iconv_program = xstrdup ("iconv");
 826 #endif
 827   args[0] = iconv_program;
 828   args[1] = "-l";
 829   args[2] = NULL;
 830   flags = PEX_STDERR_TO_STDOUT;
 831 #ifndef ICONV_BIN
 832   flags |= PEX_SEARCH;
 833 #endif
 834   /* Note that we simply ignore errors here.  */
 835   if (!pex_run_in_environment (child, flags,
 836                                args[0], const_cast<char **> (args),
 837                                iconv_env.envp (),
 838                                NULL, NULL, &err))
 839     {
 840       FILE *in = pex_read_output (child, 0);
 841
 842       /* POSIX says that iconv -l uses an unspecified format.  We
 843          parse the glibc and libiconv formats; feel free to add others
 844          as needed.  */
 845
 846       while (in != NULL && !feof (in))
 847         {
 848           /* The size of buf is chosen arbitrarily.  */
 849           char buf[1024];
 850           char *start, *r;
 851           int len;
 852
 853           r = fgets (buf, sizeof (buf), in);
 854           if (!r)
 855             break;
 856           len = strlen (r);
 857           if (len <= 3)
 858             continue;
 859           if (ignore_line_p (r))
 860             continue;
 861
 862           /* Strip off the newline.  */
 863           --len;
 864           /* Strip off one or two '/'s.  glibc will print lines like
 865              "8859_7//", but also "10646-1:1993/UCS4/".  */
 866           if (buf[len - 1] == '/')
 867             --len;
 868           if (buf[len - 1] == '/')
 869             --len;
 870           buf[len] = '\0';
 871
 872           /* libiconv will print multiple entries per line, separated
 873              by spaces.  Older iconvs will print multiple entries per
 874              line, indented by two spaces, and separated by ", "
 875              (i.e. the human readable form).  */
 876           start = buf;
 877           while (1)
 878             {
 879               int keep_going;
 880               char *p;
 881
 882               /* Skip leading blanks.  */
 883               for (p = start; *p && *p == ' '; ++p)
 884                 ;
 885               start = p;
 886               /* Find the next space, comma, or end-of-line.  */
 887               for ( ; *p && *p != ' ' && *p != ','; ++p)
 888                 ;
 889               /* Ignore an empty result.  */
 890               if (p == start)
 891                 break;
 892               keep_going = *p;
 893               *p = '\0';
 894               charsets.charsets.push_back (xstrdup (start));
 895               if (!keep_going)
 896                 break;
 897               /* Skip any extra spaces.  */
 898               for (start = p + 1; *start && *start == ' '; ++start)
 899                 ;
 900             }
 901         }
 902
 903       if (pex_get_status (child, 1, &status)
 904           && WIFEXITED (status) && !WEXITSTATUS (status))
 905         fail = 0;
 906
 907     }
 908
 909   xfree (iconv_program);
 910   pex_free (child);
 911
 912   if (fail)
 913     {
 914       /* Some error occurred, so drop the vector.  */
 915       charsets.clear ();
 916     }
 917   else
 918     charsets.charsets.push_back (NULL);
 919 }
 920
 921 #endif /* HAVE_ICONVLIST || HAVE_LIBICONVLIST */
 922 #endif /* PHONY_ICONV */
 923
 924 /* The "auto" target charset used by default_auto_charset.  */
 925 static const char *auto_target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
 926
 927 const char *
 928 default_auto_charset (void)
 929 {
 930   return auto_target_charset_name;
 931 }
 932
 933 const char *
 934 default_auto_wide_charset (void)
 935 {
 936   return GDB_DEFAULT_TARGET_WIDE_CHARSET;
 937 }
 938
 939
 940 #ifdef USE_INTERMEDIATE_ENCODING_FUNCTION
 941 /* Macro used for UTF or UCS endianness suffix.  */
 942 #if WORDS_BIGENDIAN
 943 #define ENDIAN_SUFFIX "BE"
 944 #else
 945 #define ENDIAN_SUFFIX "LE"
 946 #endif
 947
 948 /* GDB cannot handle strings correctly if this size is different.  */
 949
 950 gdb_static_assert (sizeof (gdb_wchar_t) == 2 || sizeof (gdb_wchar_t) == 4);
 951
 952 /* intermediate_encoding returns the charset used internally by
 953    GDB to convert between target and host encodings. As the test above
 954    compiled, sizeof (gdb_wchar_t) is either 2 or 4 bytes.
 955    UTF-16/32 is tested first, UCS-2/4 is tested as a second option,
 956    otherwise an error is generated.  */
 957
 958 const char *
 959 intermediate_encoding (void)
 960 {
 961   iconv_t desc;
 962   static const char *stored_result = NULL;
 963   char *result;
 964
 965   if (stored_result)
 966     return stored_result;
 967   result = xstrprintf ("UTF-%d%s", (int) (sizeof (gdb_wchar_t) * 8),
 968                        ENDIAN_SUFFIX);
 969   /* Check that the name is supported by iconv_open.  */
 970   desc = iconv_open (result, host_charset ());
 971   if (desc != (iconv_t) -1)
 972     {
 973       iconv_close (desc);
 974       stored_result = result;
 975       return result;
 976     }
 977   /* Not valid, free the allocated memory.  */
 978   xfree (result);
 979   /* Second try, with UCS-2 type.  */
 980   result = xstrprintf ("UCS-%d%s", (int) sizeof (gdb_wchar_t),
 981                        ENDIAN_SUFFIX);
 982   /* Check that the name is supported by iconv_open.  */
 983   desc = iconv_open (result, host_charset ());
 984   if (desc != (iconv_t) -1)
 985     {
 986       iconv_close (desc);
 987       stored_result = result;
 988       return result;
 989     }
 990   /* Not valid, free the allocated memory.  */
 991   xfree (result);
 992   /* No valid charset found, generate error here.  */
 993   error (_("Unable to find a valid charset for string conversions"));
 994 }
 995
 996 #endif /* USE_INTERMEDIATE_ENCODING_FUNCTION */
 997
 998 void _initialize_charset ();
 999 void
1000 _initialize_charset ()
1001 {
1002   /* The first element is always "auto".  */
1003   charsets.charsets.push_back (xstrdup ("auto"));
1004   find_charset_names ();
1005
1006   if (charsets.charsets.size () > 1)
1007     charset_enum = (const char * const *) charsets.charsets.data ();
1008   else
1009     charset_enum = default_charset_names;
1010
1011 #ifndef PHONY_ICONV
1012 #ifdef HAVE_LANGINFO_CODESET
1013   /* The result of nl_langinfo may be overwritten later.  This may
1014      leak a little memory, if the user later changes the host charset,
1015      but that doesn't matter much.  */
1016   auto_host_charset_name = xstrdup (nl_langinfo (CODESET));
1017   /* Solaris will return `646' here -- but the Solaris iconv then does
1018      not accept this.  Darwin (and maybe FreeBSD) may return "" here,
1019      which GNU libiconv doesn't like (infinite loop).  */
1020   if (!strcmp (auto_host_charset_name, "646") || !*auto_host_charset_name)
1021     auto_host_charset_name = "ASCII";
1022   auto_target_charset_name = auto_host_charset_name;
1023 #elif defined (USE_WIN32API)
1024   {
1025     /* "CP" + x<=5 digits + paranoia.  */
1026     static char w32_host_default_charset[16];
1027
1028     snprintf (w32_host_default_charset, sizeof w32_host_default_charset,
1029               "CP%d", GetACP());
1030     auto_host_charset_name = w32_host_default_charset;
1031     auto_target_charset_name = auto_host_charset_name;
1032   }
1033 #endif
1034 #endif
1035
1036   add_setshow_enum_cmd ("charset", class_support,
1037                         charset_enum, &host_charset_name, _("\
1038 Set the host and target character sets."), _("\
1039 Show the host and target character sets."), _("\
1040 The `host character set' is the one used by the system GDB is running on.\n\
1041 The `target character set' is the one used by the program being debugged.\n\
1042 You may only use supersets of ASCII for your host character set; GDB does\n\
1043 not support any others.\n\
1044 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
1045                         /* Note that the sfunc below needs to set
1046                            target_charset_name, because the 'set
1047                            charset' command sets two variables.  */
1048                         set_charset_sfunc,
1049                         show_charset,
1050                         &setlist, &showlist);
1051
1052   add_setshow_enum_cmd ("host-charset", class_support,
1053                         charset_enum, &host_charset_name, _("\
1054 Set the host character set."), _("\
1055 Show the host character set."), _("\
1056 The `host character set' is the one used by the system GDB is running on.\n\
1057 You may only use supersets of ASCII for your host character set; GDB does\n\
1058 not support any others.\n\
1059 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
1060                         set_host_charset_sfunc,
1061                         show_host_charset_name,
1062                         &setlist, &showlist);
1063
1064   add_setshow_enum_cmd ("target-charset", class_support,
1065                         charset_enum, &target_charset_name, _("\
1066 Set the target character set."), _("\
1067 Show the target character set."), _("\
1068 The `target character set' is the one used by the program being debugged.\n\
1069 GDB translates characters and strings between the host and target\n\
1070 character sets as needed.\n\
1071 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
1072                         set_target_charset_sfunc,
1073                         show_target_charset_name,
1074                         &setlist, &showlist);
1075
1076   add_setshow_enum_cmd ("target-wide-charset", class_support,
1077                         charset_enum, &target_wide_charset_name,
1078                         _("\
1079 Set the target wide character set."), _("\
1080 Show the target wide character set."), _("\
1081 The `target wide character set' is the one used by the program being debugged.\
1082 \nIn particular it is the encoding used by `wchar_t'.\n\
1083 GDB translates characters and strings between the host and target\n\
1084 character sets as needed.\n\
1085 To see a list of the character sets GDB supports, type\n\
1086 `set target-wide-charset'<TAB>"),
1087                         set_target_wide_charset_sfunc,
1088                         show_target_wide_charset_name,
1089                         &setlist, &showlist);
1090 }