------------------------------------------------------------------------------ -- -- -- GNAT RUN-TIME COMPONENTS -- -- -- -- S Y S T E M . W C H _ C N V -- -- -- -- B o d y -- -- -- -- Copyright (C) 1992-2024, Free Software Foundation, Inc. -- -- -- -- GNAT is free software; you can redistribute it and/or modify it under -- -- terms of the GNU General Public License as published by the Free Soft- -- -- ware Foundation; either version 3, or (at your option) any later ver- -- -- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- -- or FITNESS FOR A PARTICULAR PURPOSE. -- -- -- -- As a special exception under Section 7 of GPL version 3, you are granted -- -- additional permissions described in the GCC Runtime Library Exception, -- -- version 3.1, as published by the Free Software Foundation. -- -- -- -- You should have received a copy of the GNU General Public License and -- -- a copy of the GCC Runtime Library Exception along with this program; -- -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- -- . -- -- -- -- GNAT was originally developed by the GNAT team at New York University. -- -- Extensive contributions were provided by Ada Core Technologies Inc. -- -- -- ------------------------------------------------------------------------------ with Interfaces; use Interfaces; with System.WCh_Con; use System.WCh_Con; with System.WCh_JIS; use System.WCh_JIS; package body System.WCh_Cnv is ----------------------------- -- Char_Sequence_To_UTF_32 -- ----------------------------- function Char_Sequence_To_UTF_32 (C : Character; EM : System.WCh_Con.WC_Encoding_Method) return UTF_32_Code is B1 : Unsigned_32; C1 : Character; U : Unsigned_32; W : Unsigned_32; procedure Get_Hex (N : Character); -- If N is a hex character, then set B1 to 16 * B1 + character N. -- Raise Constraint_Error if character N is not a hex character. procedure Get_UTF_Byte; pragma Inline (Get_UTF_Byte); -- Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode. -- Reads a byte, and raises CE if the first two bits are not 10. -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits. ------------- -- Get_Hex -- ------------- procedure Get_Hex (N : Character) is B2 : constant Unsigned_32 := Character'Pos (N); begin if B2 in Character'Pos ('0') .. Character'Pos ('9') then B1 := B1 * 16 + B2 - Character'Pos ('0'); elsif B2 in Character'Pos ('A') .. Character'Pos ('F') then B1 := B1 * 16 + B2 - (Character'Pos ('A') - 10); elsif B2 in Character'Pos ('a') .. Character'Pos ('f') then B1 := B1 * 16 + B2 - (Character'Pos ('a') - 10); else raise Constraint_Error; end if; end Get_Hex; ------------------ -- Get_UTF_Byte -- ------------------ procedure Get_UTF_Byte is begin U := Unsigned_32 (Character'Pos (In_Char)); if (U and 2#11000000#) /= 2#10_000000# then raise Constraint_Error; end if; W := Shift_Left (W, 6) or (U and 2#00111111#); end Get_UTF_Byte; -- Start of processing for Char_Sequence_To_UTF_32 begin case EM is when WCEM_Hex => if C /= ASCII.ESC then return Character'Pos (C); else B1 := 0; Get_Hex (In_Char); Get_Hex (In_Char); Get_Hex (In_Char); Get_Hex (In_Char); return UTF_32_Code (B1); end if; when WCEM_Upper => if C > ASCII.DEL then return 256 * Character'Pos (C) + Character'Pos (In_Char); else return Character'Pos (C); end if; when WCEM_Shift_JIS => if C > ASCII.DEL then return Wide_Character'Pos (Shift_JIS_To_JIS (C, In_Char)); else return Character'Pos (C); end if; when WCEM_EUC => if C > ASCII.DEL then return Wide_Character'Pos (EUC_To_JIS (C, In_Char)); else return Character'Pos (C); end if; when WCEM_UTF8 => -- Note: for details of UTF8 encoding see RFC 3629 U := Unsigned_32 (Character'Pos (C)); -- 16#00_0000#-16#00_007F#: 0xxxxxxx if (U and 2#10000000#) = 2#00000000# then return Character'Pos (C); -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx elsif (U and 2#11100000#) = 2#110_00000# then W := U and 2#00011111#; Get_UTF_Byte; return UTF_32_Code (W); -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx elsif (U and 2#11110000#) = 2#1110_0000# then W := U and 2#00001111#; Get_UTF_Byte; Get_UTF_Byte; return UTF_32_Code (W); -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx elsif (U and 2#11111000#) = 2#11110_000# then W := U and 2#00000111#; for K in 1 .. 3 loop Get_UTF_Byte; end loop; return UTF_32_Code (W); -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx -- 10xxxxxx 10xxxxxx elsif (U and 2#11111100#) = 2#111110_00# then W := U and 2#00000011#; for K in 1 .. 4 loop Get_UTF_Byte; end loop; return UTF_32_Code (W); -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx -- 10xxxxxx 10xxxxxx 10xxxxxx elsif (U and 2#11111110#) = 2#1111110_0# then W := U and 2#00000001#; for K in 1 .. 5 loop Get_UTF_Byte; end loop; return UTF_32_Code (W); else raise Constraint_Error; end if; when WCEM_Brackets => if C /= '[' then return Character'Pos (C); end if; if In_Char /= '"' then raise Constraint_Error; end if; B1 := 0; Get_Hex (In_Char); Get_Hex (In_Char); C1 := In_Char; if C1 /= '"' then Get_Hex (C1); Get_Hex (In_Char); C1 := In_Char; if C1 /= '"' then Get_Hex (C1); Get_Hex (In_Char); C1 := In_Char; if C1 /= '"' then Get_Hex (C1); Get_Hex (In_Char); if B1 > Unsigned_32 (UTF_32_Code'Last) then raise Constraint_Error; end if; if In_Char /= '"' then raise Constraint_Error; end if; end if; end if; end if; if In_Char /= ']' then raise Constraint_Error; end if; return UTF_32_Code (B1); end case; end Char_Sequence_To_UTF_32; -------------------------------- -- Char_Sequence_To_Wide_Char -- -------------------------------- function Char_Sequence_To_Wide_Char (C : Character; EM : System.WCh_Con.WC_Encoding_Method) return Wide_Character is function Char_Sequence_To_UTF is new Char_Sequence_To_UTF_32 (In_Char); U : constant UTF_32_Code := Char_Sequence_To_UTF (C, EM); begin if U > 16#FFFF# then raise Constraint_Error; else return Wide_Character'Val (U); end if; end Char_Sequence_To_Wide_Char; ----------------------------- -- UTF_32_To_Char_Sequence -- ----------------------------- procedure UTF_32_To_Char_Sequence (Val : UTF_32_Code; EM : System.WCh_Con.WC_Encoding_Method) is Hexc : constant array (UTF_32_Code range 0 .. 15) of Character := "0123456789ABCDEF"; C1, C2 : Character; U : Unsigned_32; begin -- Raise CE for invalid UTF_32_Code if not Val'Valid then raise Constraint_Error; end if; -- Processing depends on encoding mode case EM is when WCEM_Hex => if Val < 256 then Out_Char (Character'Val (Val)); elsif Val <= 16#FFFF# then Out_Char (ASCII.ESC); Out_Char (Hexc (Val / (16**3))); Out_Char (Hexc ((Val / (16**2)) mod 16)); Out_Char (Hexc ((Val / 16) mod 16)); Out_Char (Hexc (Val mod 16)); else raise Constraint_Error; end if; when WCEM_Upper => if Val < 128 then Out_Char (Character'Val (Val)); elsif Val < 16#8000# or else Val > 16#FFFF# then raise Constraint_Error; else Out_Char (Character'Val (Val / 256)); Out_Char (Character'Val (Val mod 256)); end if; when WCEM_Shift_JIS => if Val < 128 then Out_Char (Character'Val (Val)); elsif Val <= 16#FFFF# then JIS_To_Shift_JIS (Wide_Character'Val (Val), C1, C2); Out_Char (C1); Out_Char (C2); else raise Constraint_Error; end if; when WCEM_EUC => if Val < 128 then Out_Char (Character'Val (Val)); elsif Val <= 16#FFFF# then JIS_To_EUC (Wide_Character'Val (Val), C1, C2); Out_Char (C1); Out_Char (C2); else raise Constraint_Error; end if; when WCEM_UTF8 => -- Note: for details of UTF8 encoding see RFC 3629 U := Unsigned_32 (Val); -- 16#00_0000#-16#00_007F#: 0xxxxxxx if U <= 16#00_007F# then Out_Char (Character'Val (U)); -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx elsif U <= 16#00_07FF# then Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6))); Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); -- 16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx elsif U <= 16#00_FFFF# then Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx elsif U <= 16#10_FFFF# then Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx -- 10xxxxxx 10xxxxxx elsif U <= 16#03FF_FFFF# then Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx -- 10xxxxxx 10xxxxxx 10xxxxxx elsif U <= 16#7FFF_FFFF# then Out_Char (Character'Val (2#11111100# or Shift_Right (U, 30))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 24) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6) and 2#00111111#))); Out_Char (Character'Val (2#10000000# or (U and 2#00111111#))); else raise Constraint_Error; end if; when WCEM_Brackets => -- Values in the range 0-255 are directly output. Note that there -- is an issue with [ (16#5B#) since this will cause confusion -- if the resulting string is interpreted using brackets encoding. -- One possibility would be to always output [ as ["5B"] but in -- practice this is undesirable, since for example normal use of -- Wide_Text_IO for output (much more common than input), really -- does want to be able to say something like -- Put_Line ("Start of output [first run]"); -- and have it come out as intended, rather than contaminated by -- a ["5B"] sequence in place of the left bracket. if Val < 256 then Out_Char (Character'Val (Val)); -- Otherwise use brackets notation for vales greater than 255 else Out_Char ('['); Out_Char ('"'); if Val > 16#FFFF# then if Val > 16#00FF_FFFF# then Out_Char (Hexc (Val / 16 ** 7)); Out_Char (Hexc ((Val / 16 ** 6) mod 16)); end if; Out_Char (Hexc ((Val / 16 ** 5) mod 16)); Out_Char (Hexc ((Val / 16 ** 4) mod 16)); end if; Out_Char (Hexc ((Val / 16 ** 3) mod 16)); Out_Char (Hexc ((Val / 16 ** 2) mod 16)); Out_Char (Hexc ((Val / 16) mod 16)); Out_Char (Hexc (Val mod 16)); Out_Char ('"'); Out_Char (']'); end if; end case; end UTF_32_To_Char_Sequence; -------------------------------- -- Wide_Char_To_Char_Sequence -- -------------------------------- procedure Wide_Char_To_Char_Sequence (WC : Wide_Character; EM : System.WCh_Con.WC_Encoding_Method) is procedure UTF_To_Char_Sequence is new UTF_32_To_Char_Sequence (Out_Char); begin UTF_To_Char_Sequence (Wide_Character'Pos (WC), EM); end Wide_Char_To_Char_Sequence; end System.WCh_Cnv;