#!/usr/bin/env python3
#
# Script to generate tables for libstdc++ std::text_encoding.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.

# To update the Libstdc++ static data in <bits/text_encoding-data.h> download
# the latest:
# https://www.iana.org/assignments/character-sets/character-sets-1.csv
# Then run this script and save the output to
# include/bits/text_encoding-data.h

import sys
import csv
import os

# License text and Doxygen file comment written at the top of the generated
# header. The leading newline separates it from the "Generated by" line.
HEADER_BOILERPLATE = """
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

/** @file bits/text_encoding-data.h
 * This is an internal header file, included by other library headers.
 * Do not attempt to use it directly. @headername{text_encoding}
 */
"""

# MIB values dropped from the table: "NATS-DANO" and "NATS-DANO-ADD" are
# removed as specified by the C++ standard.
EXCLUDED_MIBS = (33, 34)

# This is not an official IANA alias, but we include it in the
# implementation-defined superset of aliases for US-ASCII.
# See also LWG 4043.
EXTRA_ALIASES = {3: ["ASCII"]}


def read_charsets(csv_path):
    """Read the character-sets CSV and return {mib: [primary, alias, ...]}.

    The first row is skipped as a header. For every other non-empty row,
    column 1 is the primary name, column 2 the MIB number and column 5 a
    whitespace-separated list of aliases. The primary name is always the
    first element of the resulting list.

    Raises ValueError if two rows use the same MIB number (or if a MIB
    field is not an integer).
    """
    charsets = {}
    with open(csv_path, newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip header row
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line;
                # skip it instead of failing on row[2].
                continue
            mib = int(row[2])
            if mib in charsets:
                raise ValueError("Multiple rows for mibEnum={}".format(mib))
            name = row[1]
            aliases = row[5].split()
            # Ensure primary name comes first
            if name in aliases:
                aliases.remove(name)
            charsets[mib] = [name] + aliases
    return charsets


def table_lines(charsets, extra_aliases):
    """Return the output lines for the { mib, alias } initializer list.

    We need to generate a list of initializers of the form { mib, alias },
    e.g.,
      { 3, "US-ASCII" },
      { 3, "ISO646-US" },
      { 3, "csASCII" },
      { 4, "ISO_8859-1:1987" },
      { 4, "latin1" },
    The initializers must be sorted by the mib value. The first entry for
    a given mib must be the primary name for the encoding. Any aliases for
    the encoding come after the primary name.

    We also define a macro _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET which is the
    offset into the list of the mib=106, alias="UTF-8" entry. This is used
    to optimize the common case, so we don't need to search for "UTF-8".

    Names listed in extra_aliases[mib] are appended after the names from
    charsets[mib] and are marked as libstdc++ extensions.
    """
    lines = []
    count = 0  # number of initializers emitted so far
    for mib in sorted(charsets):
        names = charsets[mib]
        if names[0] == "UTF-8":
            # Emitted just before the UTF-8 entry, so count is its index.
            lines.append(
                "#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET {}".format(count))
        for name in names:
            lines.append(' {{ {:4}, "{}" }},'.format(mib, name))
        count += len(names)
        extras = extra_aliases.get(mib, ())
        for name in extras:
            lines.append(
                ' {{ {:4}, "{}" }}, // libstdc++ extension'.format(mib, name))
        count += len(extras)
    return lines


def main(argv):
    """Write the generated header to standard output.

    argv is the full argument vector: argv[0] is the program name and
    argv[1] is the path of the character-sets CSV file. Returns 1 after
    printing a usage message to stderr when the argument count is wrong,
    and 0 on success.
    """
    if len(argv) != 2:
        print("Usage: %s CHARACTER_SETS_CSV" % argv[0], file=sys.stderr)
        return 1

    script_name = os.path.basename(__file__)
    print("// Generated by scripts/{}, do not edit.".format(script_name))
    print(HEADER_BOILERPLATE)
    print("#ifndef _GLIBCXX_GET_ENCODING_DATA")
    print('# error "This is not a public header, do not include it directly"')
    print("#endif\n")

    charsets = read_charsets(argv[1])
    for mib in EXCLUDED_MIBS:
        charsets.pop(mib, None)

    for line in table_lines(charsets, EXTRA_ALIASES):
        print(line)

    # <text_encoding> gives an error if this macro is left defined.
    # Do this last, so that the generated output is not usable unless we
    # reach here.
    print("\n#undef _GLIBCXX_GET_ENCODING_DATA")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))