diff options
author | Michael Brown <mcb30@ipxe.org> | 2022-02-28 13:37:40 +0000 |
---|---|---|
committer | Michael Brown <mcb30@ipxe.org> | 2022-03-01 15:57:33 +0000 |
commit | 3cd3a7326178bd10fb38e09eb702b27bc463d3c6 (patch) | |
tree | a863df88bf8509fe64395d6bb479d66871043bc7 | |
parent | 2acdc92994e7aca397b0d24b112e4973e82e0f91 (diff) | |
download | ipxe-3cd3a7326178bd10fb38e09eb702b27bc463d3c6.zip ipxe-3cd3a7326178bd10fb38e09eb702b27bc463d3c6.tar.gz ipxe-3cd3a7326178bd10fb38e09eb702b27bc463d3c6.tar.bz2 |
[utf8] Add ability to accumulate Unicode characters from UTF-8 bytes
Signed-off-by: Michael Brown <mcb30@ipxe.org>
-rw-r--r-- | src/core/utf8.c | 137 | ||||
-rw-r--r-- | src/include/ipxe/utf8.h | 69 |
2 files changed, 206 insertions, 0 deletions
diff --git a/src/core/utf8.c b/src/core/utf8.c new file mode 100644 index 0000000..4ee01ba --- /dev/null +++ b/src/core/utf8.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2022 Michael Brown <mbrown@fensystems.co.uk>. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * You can also choose to distribute this program under the terms of + * the Unmodified Binary Distribution Licence (as given in the file + * COPYING.UBDL), provided that you have satisfied its requirements. + */ + +FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); + +#include <stdint.h> +#include <assert.h> +#include <ipxe/utf8.h> + +/** @file + * + * UTF-8 Unicode encoding + * + */ + +/** + * Accumulate Unicode character from UTF-8 byte sequence + * + * @v utf8 UTF-8 accumulator + * @v byte UTF-8 byte + * @ret character Unicode character, or 0 if incomplete + */ +unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, uint8_t byte ) { + static unsigned int min[] = { + UTF8_MIN_TWO, + UTF8_MIN_THREE, + UTF8_MIN_FOUR, + }; + unsigned int shift; + unsigned int len; + uint8_t tmp; + + /* Handle continuation bytes */ + if ( UTF8_IS_CONTINUATION ( byte ) ) { + + /* Fail if this is an unexpected continuation byte */ + if ( utf8->remaining == 0 ) { + DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte ); + return UTF8_INVALID; + } + + /* Apply continuation byte */ + utf8->character <<= UTF8_CONTINUATION_BITS; + utf8->character |= ( byte & UTF8_CONTINUATION_MASK ); + + /* Return 0 if more continuation bytes are expected */ + if ( --utf8->remaining != 0 ) + return 0; + + /* Fail if sequence is illegal */ + if ( utf8->character < utf8->min ) { + DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8, + utf8->character ); + return UTF8_INVALID; + } + + /* Sanity check */ + assert ( utf8->character != 0 ); + + /* Return completed character */ + DBGC2 ( utf8, "UTF8 %p accumulated %02x\n", + utf8, utf8->character ); + return utf8->character; + } + + /* Reset state and report failure if this is an unexpected + * non-continuation byte. Do not return UTF8_INVALID since + * doing so could cause us to drop a valid ASCII character. + */ + if ( utf8->remaining != 0 ) { + shift = ( utf8->remaining * UTF8_CONTINUATION_BITS ); + DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n", + utf8, byte, ( utf8->character << shift ), + ( ( 1 << shift ) - 1 ) ); + utf8->remaining = 0; + } + + /* Handle initial bytes */ + if ( ! UTF8_IS_ASCII ( byte ) ) { + + /* Sanity check */ + assert ( utf8->remaining == 0 ); + + /* Count total number of bytes in sequence */ + tmp = byte; + len = 0; + while ( tmp & UTF8_HIGH_BIT ) { + tmp <<= 1; + len++; + } + + /* Check for illegal length */ + if ( len > UTF8_MAX_LEN ) { + DBGC ( utf8, "UTF8 %p illegal %02x length %d\n", + utf8, byte, len ); + return UTF8_INVALID; + } + + /* Store initial bits of character */ + utf8->character = ( tmp >> len ); + + /* Store number of bytes remaining */ + len--; + utf8->remaining = len; + assert ( utf8->remaining > 0 ); + + /* Store minimum legal value */ + utf8->min = min[ len - 1 ]; + assert ( utf8->min > 0 ); + + /* Await continuation bytes */ + return 0; + } + + /* Handle ASCII bytes */ + return byte; +} diff --git a/src/include/ipxe/utf8.h b/src/include/ipxe/utf8.h new file mode 100644 index 0000000..299c255 --- /dev/null +++ b/src/include/ipxe/utf8.h @@ -0,0 +1,69 @@ +#ifndef _IPXE_UTF8_H +#define _IPXE_UTF8_H + +/** @file + * + * UTF-8 Unicode encoding + * + */ + +FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); + +#include <stdint.h> + +/** Maximum length of UTF-8 sequence */ +#define UTF8_MAX_LEN 4 + +/** Minimum legal value for two-byte UTF-8 sequence */ +#define UTF8_MIN_TWO 0x80 + +/** Minimum legal value for three-byte UTF-8 sequence */ +#define UTF8_MIN_THREE 0x800 + +/** Minimum legal value for four-byte UTF-8 sequence */ +#define UTF8_MIN_FOUR 0x10000 + +/** High bit of UTF-8 bytes */ +#define UTF8_HIGH_BIT 0x80 + +/** Number of data bits in each continuation byte */ +#define UTF8_CONTINUATION_BITS 6 + +/** Bit mask for data bits in a continuation byte */ +#define UTF8_CONTINUATION_MASK ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 ) + +/** Non-data bits in a continuation byte */ +#define UTF8_CONTINUATION 0x80 + +/** Check for a continuation byte + * + * @v byte UTF-8 byte + * @ret is_continuation Byte is a continuation byte + */ +#define UTF8_IS_CONTINUATION( byte ) \ + ( ( (byte) & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION ) + +/** Check for an ASCII byte + * + * @v byte UTF-8 byte + * @ret is_ascii Byte is an ASCII byte + */ +#define UTF8_IS_ASCII( byte ) ( ! ( (byte) & UTF8_HIGH_BIT ) ) + +/** Invalid character returned when decoding fails */ +#define UTF8_INVALID 0xfffd + +/** A UTF-8 character accumulator */ +struct utf8_accumulator { + /** Character in progress */ + unsigned int character; + /** Number of remaining continuation bytes */ + unsigned int remaining; + /** Minimum legal character */ + unsigned int min; +}; + +extern unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, + uint8_t byte ); + +#endif /* _IPXE_UTF8_H */ |