From 50b7ae4b971d2e7b9d16230de966ec5452a367c6 Mon Sep 17 00:00:00 2001 From: Greg Hudson Date: Sat, 10 Dec 2022 01:26:36 -0500 Subject: Remove unused Unicode functions --- src/include/k5-unicode.h | 29 - src/include/k5-utf8.h | 74 - src/lib/krb5/unicode/Makefile.in | 11 +- src/lib/krb5/unicode/ucdata/bidiapi.txt | 84 - src/lib/krb5/unicode/ucdata/ucpgba.c | 755 --------- src/lib/krb5/unicode/ucdata/ucpgba.h | 166 -- src/lib/krb5/unicode/ucdata/ucpgba.man | 97 -- src/lib/krb5/unicode/ucstr.c | 248 +-- src/lib/krb5/unicode/ure/README | 212 --- src/lib/krb5/unicode/ure/ure.c | 2139 ------------------------- src/lib/krb5/unicode/ure/ure.h | 152 -- src/lib/krb5/unicode/ure/urestubs.c | 125 -- src/lib/krb5/unicode/utbm/README | 121 -- src/lib/krb5/unicode/utbm/utbm.c | 475 ------ src/lib/krb5/unicode/utbm/utbm.h | 110 -- src/lib/krb5/unicode/utbm/utbmstub.c | 108 -- src/util/support/libkrb5support-fixed.exports | 1 - src/util/support/t_utf8.c | 6 +- src/util/support/utf8.c | 343 ---- 19 files changed, 7 insertions(+), 5249 deletions(-) delete mode 100644 src/lib/krb5/unicode/ucdata/bidiapi.txt delete mode 100644 src/lib/krb5/unicode/ucdata/ucpgba.c delete mode 100644 src/lib/krb5/unicode/ucdata/ucpgba.h delete mode 100644 src/lib/krb5/unicode/ucdata/ucpgba.man delete mode 100644 src/lib/krb5/unicode/ure/README delete mode 100644 src/lib/krb5/unicode/ure/ure.c delete mode 100644 src/lib/krb5/unicode/ure/ure.h delete mode 100644 src/lib/krb5/unicode/ure/urestubs.c delete mode 100644 src/lib/krb5/unicode/utbm/README delete mode 100644 src/lib/krb5/unicode/utbm/utbm.c delete mode 100644 src/lib/krb5/unicode/utbm/utbm.h delete mode 100644 src/lib/krb5/unicode/utbm/utbmstub.c diff --git a/src/include/k5-unicode.h b/src/include/k5-unicode.h index 45c1788..81c495f 100644 --- a/src/include/k5-unicode.h +++ b/src/include/k5-unicode.h @@ -87,41 +87,12 @@ typedef krb5_ucs4 krb5_unicode; -int krb5int_ucstrncmp( - const krb5_unicode *, - const krb5_unicode *, - size_t); - -int krb5int_ucstrncasecmp( - const krb5_unicode *, - const krb5_unicode *, - size_t); - -krb5_unicode *krb5int_ucstrnchr( - const krb5_unicode *, - size_t, - krb5_unicode); - -krb5_unicode *krb5int_ucstrncasechr( - const krb5_unicode *, - size_t, - krb5_unicode); - -void krb5int_ucstr2upper( - krb5_unicode *, - size_t); - #define KRB5_UTF8_NOCASEFOLD 0x0U #define KRB5_UTF8_CASEFOLD 0x1U #define KRB5_UTF8_ARG1NFC 0x2U #define KRB5_UTF8_ARG2NFC 0x4U #define KRB5_UTF8_APPROX 0x8U -krb5_error_code krb5int_utf8_normalize( - const krb5_data *, - krb5_data **, - unsigned); - int krb5int_utf8_normcmp( const krb5_data *, const krb5_data *, diff --git a/src/include/k5-utf8.h b/src/include/k5-utf8.h index 7cc8cda..11949f9 100644 --- a/src/include/k5-utf8.h +++ b/src/include/k5-utf8.h @@ -73,9 +73,6 @@ typedef uint16_t krb5_ucs2; typedef uint32_t krb5_ucs4; -int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out); -size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf); - int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out); size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf); @@ -96,49 +93,6 @@ int k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, int k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out); -/* returns the number of bytes in the UTF-8 string */ -size_t krb5int_utf8_bytes(const char *); -/* returns the number of UTF-8 characters in the string */ -size_t krb5int_utf8_chars(const char *); -/* returns the number of UTF-8 characters in the counted string */ -size_t krb5int_utf8c_chars(const char *, size_t); -/* returns the length (in bytes) of the UTF-8 character */ -int krb5int_utf8_offset(const char *); -/* returns the length (in bytes) indicated by the UTF-8 character */ -int krb5int_utf8_charlen(const char *); - -/* returns the length (in bytes) indicated by the UTF-8 character - * also checks that shortest possible encoding was used - */ -int krb5int_utf8_charlen2(const char *); - -/* copies a UTF-8 character and returning number of bytes copied */ -int krb5int_utf8_copy(char *, const char *); - -/* returns pointer of next UTF-8 character in string */ -char *krb5int_utf8_next( const char *); -/* returns pointer of previous UTF-8 character in string */ -char *krb5int_utf8_prev( const char *); - -/* primitive ctype routines -- not aware of non-ascii characters */ -int krb5int_utf8_isascii( const char *); -int krb5int_utf8_isalpha( const char *); -int krb5int_utf8_isalnum( const char *); -int krb5int_utf8_isdigit( const char *); -int krb5int_utf8_isxdigit( const char *); -int krb5int_utf8_isspace( const char *); - -/* span characters not in set, return bytes spanned */ -size_t krb5int_utf8_strcspn( const char* str, const char *set); -/* span characters in set, return bytes spanned */ -size_t krb5int_utf8_strspn( const char* str, const char *set); -/* return first occurrence of character in string */ -char *krb5int_utf8_strchr( const char* str, const char *chr); -/* return first character of set in string */ -char *krb5int_utf8_strpbrk( const char* str, const char *set); -/* reentrant tokenizer */ -char *krb5int_utf8_strtok( char* sp, const char* sep, char **last); - /* Optimizations */ extern const char krb5int_utf8_lentab[128]; extern const char krb5int_utf8_mintab[32]; @@ -157,38 +111,10 @@ extern const char krb5int_utf8_mintab[32]; (krb5int_utf8_mintab[KRB5_UTF8_BV(p) & 0x1f] & (p)[1])) ? \ l : 0) -#define KRB5_UTF8_OFFSET(p) (KRB5_UTF8_ISASCII(p) \ - ? 1 : krb5int_utf8_offset((p)) ) - -#define KRB5_UTF8_COPY(d,s) (KRB5_UTF8_ISASCII(s) \ - ? (*(d) = *(s), 1) : krb5int_utf8_copy((d),(s))) - -#define KRB5_UTF8_NEXT(p) (KRB5_UTF8_ISASCII(p) \ - ? (char *)(p)+1 : krb5int_utf8_next((p))) - -#define KRB5_UTF8_INCR(p) ((p) = KRB5_UTF8_NEXT(p)) - -/* For symmetry */ -#define KRB5_UTF8_PREV(p) (krb5int_utf8_prev((p))) -#define KRB5_UTF8_DECR(p) ((p)=KRB5_UTF8_PREV((p))) - /* * these macros assume 'x' is an ASCII x * and assume the "C" locale */ -#define KRB5_ASCII(c) (!((c) & 0x80)) -#define KRB5_SPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') -#define KRB5_DIGIT(c) ((c) >= '0' && (c) <= '9') -#define KRB5_LOWER(c) ((c) >= 'a' && (c) <= 'z') #define KRB5_UPPER(c) ((c) >= 'A' && (c) <= 'Z') -#define KRB5_ALPHA(c) (KRB5_LOWER(c) || KRB5_UPPER(c)) -#define KRB5_ALNUM(c) (KRB5_ALPHA(c) || KRB5_DIGIT(c)) - -#define KRB5_LDH(c) (KRB5_ALNUM(c) || (c) == '-') - -#define KRB5_HEXLOWER(c) ((c) >= 'a' && (c) <= 'f') -#define KRB5_HEXUPPER(c) ((c) >= 'A' && (c) <= 'F') -#define KRB5_HEX(c) (KRB5_DIGIT(c) || \ - KRB5_HEXLOWER(c) || KRB5_HEXUPPER(c)) #endif /* K5_UTF8_H */ diff --git a/src/lib/krb5/unicode/Makefile.in b/src/lib/krb5/unicode/Makefile.in index e23028d..d7dc0f5 100644 --- a/src/lib/krb5/unicode/Makefile.in +++ b/src/lib/krb5/unicode/Makefile.in @@ -6,19 +6,15 @@ BUILDTOP=$(REL)..$(S)..$(S).. ##DOS##OBJFILE=..\$(OUTPRE)$(PREFIXDIR).lst XXDIR = $(srcdir)/ucdata/ -XXHEADERS = ucdata.h ure.h uctable.h -XXSRCS = ucdata.c ucgendat.c ure.c urestubs.c +XXHEADERS = ucdata.h uctable.h +XXSRCS = ucdata.c ucgendat.c STLIBOBJS= \ ucdata.o \ - ure.o \ - urestubs.o \ ucstr.o OBJS= \ $(OUTPRE)ucdata.$(OBJEXT) \ - $(OUTPRE)ure.$(OBJEXT) \ - $(OUTPRE)urestubs.$(OBJEXT) \ $(OUTPRE)ucstr.$(OBJEXT) SRCS= \ @@ -57,9 +53,6 @@ ucgendat: ucgendat.o ##DOS## $(CP) $(srcdir)\ucdata\ucdata.c ucdata.c ##DOS## $(CP) $(srcdir)\ucdata\ucgendat.c ucgendat.c ##DOS## $(CP) $(srcdir)\ucdata\uctable.h uctable.h -##DOS## $(CP) $(srcdir)\ure\ure.h ure.h -##DOS## $(CP) $(srcdir)\ure\ure.c ure.c -##DOS## $(CP) $(srcdir)\ure\urestubs.c urestubs.c ##DOS## $(CP) nul .links $(XXSRCS) $(XXHEADERS) : .links diff --git a/src/lib/krb5/unicode/ucdata/bidiapi.txt b/src/lib/krb5/unicode/ucdata/bidiapi.txt deleted file mode 100644 index dffd12e..0000000 --- a/src/lib/krb5/unicode/ucdata/bidiapi.txt +++ /dev/null @@ -1,84 +0,0 @@ -# -# $Id: bidiapi.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $ -# - - "Pretty Good Bidi Algorithm" API - -The PGBA (Pretty Good Bidi Algorithm) is an effective alternative to the -Unicode BiDi algorithm. It currently provides only implicit reordering and -does not yet support explicit reordering codes that the Unicode BiDi algorithm -supports. In addition to reordering, the PGBA includes cursor movement -support for both visual and logical navigation. - ------------------------------------------------------------------------------ - -#define UCPGBA_LTR 0 -#define UCPGBA_RTL 1 - - These macros appear in the `direction' field of the data structures. - -#define UCPGBA_CURSOR_VISUAL 0 -#define UCPGBA_CURSOR_LOGICAL 1 - - These macros are used to set the cursor movement for each reordered string. - ------------------------------------------------------------------------------ - -ucstring_t *ucstring_create(unsigned long *source, unsigned long start, - unsigned long end, int default_direction, - int cursor_motion) - - This function will create a reordered string by using the implicit - directionality of the characters in the specified substring. - - The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL - and is used only in cases where a string contains no characters with strong - directionality. - - The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or - UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion - behavior. This behavior can be switched at any time using - ustring_set_cursor_motion(). - ------------------------------------------------------------------------------ - -void ucstring_free(ucstring_t *string) - - This function will deallocate the memory used by the string, incuding the - string itself. - ------------------------------------------------------------------------------ - -void ucstring_cursor_info(ustring_t *string, int *direction, - unsigned long *position) - - This function will return the text position of the internal cursor and the - directionality of the text at that position. The position returned is the - original text position of the character. - ------------------------------------------------------------------------------ - -int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion) - - This function will change the cursor motion type and return the previous - cursor motion type. - ------------------------------------------------------------------------------ - -int ucstring_cursor_right(ucstring_t *string, int count) - - This function will move the internal cursor to the right according to the - type of cursor motion set for the string. - - If no cursor motion is performed, it returns 0. Otherwise it will return a - 1. - ------------------------------------------------------------------------------ - -int ucstring_cursor_left(ucstring_t *string, int count) - - This function will move the internal cursor to the left according to the - type of cursor motion set for the string. - - If no cursor motion is performed, it returns 0. Otherwise it will return a - 1. diff --git a/src/lib/krb5/unicode/ucdata/ucpgba.c b/src/lib/krb5/unicode/ucdata/ucpgba.c deleted file mode 100644 index 5190703..0000000 --- a/src/lib/krb5/unicode/ucdata/ucpgba.c +++ /dev/null @@ -1,755 +0,0 @@ -/* - * Copyright 1998-2008 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ -/* Copyright 2001 Computing Research Labs, New Mexico State University - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT - * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This work is part of OpenLDAP Software . - * $OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucpgba.c,v 1.9 2008/01/07 23:20:05 kurt Exp $ - * $Id: ucpgba.c,v 1.5 2001/01/02 18:46:20 mleisher Exp $ - */ - -#include "k5-int.h" -#include "k5-utf8.h" -#include "k5-unicode.h" - -#include -#include - -#include "ucdata.h" -#include "ucpgba.h" - -/* - * These macros are used while reordering of RTL runs of text for the - * special case of non-spacing characters being in runs of weakly - * directional text. They check for weak and non-spacing, and digits and - * non-spacing. - */ -#define ISWEAKSPECIAL(cc) ucisprop(cc, UC_EN|UC_ES|UC_MN, UC_ET|UC_AN|UC_CS) -#define ISDIGITSPECIAL(cc) ucisprop(cc, UC_ND|UC_MN, 0) - -/* - * These macros are used while breaking a string into runs of text in - * different directions. Descriptions: - * - * ISLTR_LTR - Test for members of an LTR run in an LTR context. This looks - * for characters with ltr, non-spacing, weak, and neutral - * properties. - * - * ISRTL_RTL - Test for members of an RTL run in an RTL context. This looks - * for characters with rtl, non-spacing, weak, and neutral - * properties. - * - * ISRTL_NEUTRAL - Test for RTL or neutral characters. - * - * ISWEAK_NEUTRAL - Test for weak or neutral characters. - */ -#define ISLTR_LTR(cc) ucisprop(cc, UC_L|UC_MN|UC_EN|UC_ES,\ - UC_ET|UC_CS|UC_B|UC_S|UC_WS|UC_ON) - -#define ISRTL_RTL(cc) ucisprop(cc, UC_R|UC_MN|UC_EN|UC_ES,\ - UC_ET|UC_AN|UC_CS|UC_B|UC_S|UC_WS|UC_ON) - -#define ISRTL_NEUTRAL(cc) ucisprop(cc, UC_R, UC_B|UC_S|UC_WS|UC_ON) -#define ISWEAK_NEUTRAL(cc) ucisprop(cc, UC_EN|UC_ES, \ - UC_B|UC_S|UC_WS|UC_ON|UC_ET|UC_AN|UC_CS) - -/* - * This table is temporarily hard-coded here until it can be constructed - * automatically somehow. - */ -static unsigned long _symmetric_pairs[] = { - 0x0028, 0x0029, 0x0029, 0x0028, 0x003C, 0x003E, 0x003E, 0x003C, - 0x005B, 0x005D, 0x005D, 0x005B, 0x007B, 0x007D, 0x007D, 0x007B, - 0x2045, 0x2046, 0x2046, 0x2045, 0x207D, 0x207E, 0x207E, 0x207D, - 0x208D, 0x208E, 0x208E, 0x208D, 0x3008, 0x3009, 0x3009, 0x3008, - 0x300A, 0x300B, 0x300B, 0x300A, 0x300C, 0x300D, 0x300D, 0x300C, - 0x300E, 0x300F, 0x300F, 0x300E, 0x3010, 0x3011, 0x3011, 0x3010, - 0x3014, 0x3015, 0x3015, 0x3014, 0x3016, 0x3017, 0x3017, 0x3016, - 0x3018, 0x3019, 0x3019, 0x3018, 0x301A, 0x301B, 0x301B, 0x301A, - 0xFD3E, 0xFD3F, 0xFD3F, 0xFD3E, 0xFE59, 0xFE5A, 0xFE5A, 0xFE59, - 0xFE5B, 0xFE5C, 0xFE5C, 0xFE5B, 0xFE5D, 0xFE5E, 0xFE5E, 0xFE5D, - 0xFF08, 0xFF09, 0xFF09, 0xFF08, 0xFF3B, 0xFF3D, 0xFF3D, 0xFF3B, - 0xFF5B, 0xFF5D, 0xFF5D, 0xFF5B, 0xFF62, 0xFF63, 0xFF63, 0xFF62, -}; - -static int _symmetric_pairs_size = -sizeof(_symmetric_pairs)/sizeof(_symmetric_pairs[0]); - -/* - * This routine looks up the other form of a symmetric pair. - */ -static unsigned long -_ucsymmetric_pair(unsigned long c) -{ - int i; - - for (i = 0; i < _symmetric_pairs_size; i += 2) { - if (_symmetric_pairs[i] == c) - return _symmetric_pairs[i+1]; - } - return c; -} - -/* - * This routine creates a new run, copies the text into it, links it into the - * logical text order chain and returns it to the caller to be linked into - * the visual text order chain. - */ -static ucrun_t * -_add_run(ucstring_t *str, unsigned long *src, - unsigned long start, unsigned long end, int direction) -{ - long i, t; - ucrun_t *run; - - run = (ucrun_t *) malloc(sizeof(ucrun_t)); - run->visual_next = run->visual_prev = 0; - run->direction = direction; - - run->cursor = ~0; - - run->chars = (unsigned long *) - malloc(sizeof(unsigned long) * ((end - start) << 1)); - run->positions = run->chars + (end - start); - - run->source = src; - run->start = start; - run->end = end; - - if (direction == UCPGBA_RTL) { - /* - * Copy the source text into the run in reverse order and select - * replacements for the pairwise punctuation and the <> characters. - */ - for (i = 0, t = end - 1; start < end; start++, t--, i++) { - run->positions[i] = t; - if (ucissymmetric(src[t]) || src[t] == '<' || src[t] == '>') - run->chars[i] = _ucsymmetric_pair(src[t]); - else - run->chars[i] = src[t]; - } - } else { - /* - * Copy the source text into the run directly. - */ - for (i = start; i < end; i++) { - run->positions[i - start] = i; - run->chars[i - start] = src[i]; - } - } - - /* - * Add the run to the logical list for cursor traversal. - */ - if (str->logical_first == 0) - str->logical_first = str->logical_last = run; - else { - run->logical_prev = str->logical_last; - str->logical_last->logical_next = run; - str->logical_last = run; - } - - return run; -} - -static void -_ucadd_rtl_segment(ucstring_t *str, unsigned long *source, unsigned long start, - unsigned long end) -{ - unsigned long s, e; - ucrun_t *run, *lrun; - - /* - * This is used to splice runs into strings with overall LTR direction. - * The `lrun' variable will never be NULL because at least one LTR run was - * added before this RTL run. - */ - lrun = str->visual_last; - - for (e = s = start; s < end;) { - for (; e < end && ISRTL_NEUTRAL(source[e]); e++) ; - - if (e > s) { - run = _add_run(str, source, s, e, UCPGBA_RTL); - - /* - * Add the run to the visual list for cursor traversal. - */ - if (str->visual_first != 0) { - if (str->direction == UCPGBA_LTR) { - run->visual_prev = lrun; - run->visual_next = lrun->visual_next; - if (lrun->visual_next != 0) - lrun->visual_next->visual_prev = run; - lrun->visual_next = run; - if (lrun == str->visual_last) - str->visual_last = run; - } else { - run->visual_next = str->visual_first; - str->visual_first->visual_prev = run; - str->visual_first = run; - } - } else - str->visual_first = str->visual_last = run; - } - - /* - * Handle digits in a special way. This makes sure the weakly - * directional characters appear on the expected sides of a number - * depending on whether that number is Arabic or not. - */ - for (s = e; e < end && ISWEAKSPECIAL(source[e]); e++) { - if (!ISDIGITSPECIAL(source[e]) && - (e + 1 == end || !ISDIGITSPECIAL(source[e + 1]))) - break; - } - - if (e > s) { - run = _add_run(str, source, s, e, UCPGBA_LTR); - - /* - * Add the run to the visual list for cursor traversal. - */ - if (str->visual_first != 0) { - if (str->direction == UCPGBA_LTR) { - run->visual_prev = lrun; - run->visual_next = lrun->visual_next; - if (lrun->visual_next != 0) - lrun->visual_next->visual_prev = run; - lrun->visual_next = run; - if (lrun == str->visual_last) - str->visual_last = run; - } else { - run->visual_next = str->visual_first; - str->visual_first->visual_prev = run; - str->visual_first = run; - } - } else - str->visual_first = str->visual_last = run; - } - - /* - * Collect all weak non-digit sequences for an RTL segment. These - * will appear as part of the next RTL segment or will be added as - * an RTL segment by themselves. - */ - for (s = e; e < end && ucisweak(source[e]) && !ucisdigit(source[e]); - e++) ; - } - - /* - * Capture any weak non-digit sequences that occur at the end of the RTL - * run. - */ - if (e > s) { - run = _add_run(str, source, s, e, UCPGBA_RTL); - - /* - * Add the run to the visual list for cursor traversal. - */ - if (str->visual_first != 0) { - if (str->direction == UCPGBA_LTR) { - run->visual_prev = lrun; - run->visual_next = lrun->visual_next; - if (lrun->visual_next != 0) - lrun->visual_next->visual_prev = run; - lrun->visual_next = run; - if (lrun == str->visual_last) - str->visual_last = run; - } else { - run->visual_next = str->visual_first; - str->visual_first->visual_prev = run; - str->visual_first = run; - } - } else - str->visual_first = str->visual_last = run; - } -} - -static void -_ucadd_ltr_segment(ucstring_t *str, unsigned long *source, unsigned long start, - unsigned long end) -{ - ucrun_t *run; - - run = _add_run(str, source, start, end, UCPGBA_LTR); - - /* - * Add the run to the visual list for cursor traversal. - */ - if (str->visual_first != 0) { - if (str->direction == UCPGBA_LTR) { - run->visual_prev = str->visual_last; - str->visual_last->visual_next = run; - str->visual_last = run; - } else { - run->visual_next = str->visual_first; - str->visual_first->visual_prev = run; - str->visual_first = run; - } - } else - str->visual_first = str->visual_last = run; -} - -ucstring_t * -ucstring_create(unsigned long *source, unsigned long start, unsigned long end, - int default_direction, int cursor_motion) -{ - int rtl_first; - unsigned long s, e, ld; - ucstring_t *str; - - str = (ucstring_t *) malloc(sizeof(ucstring_t)); - - /* - * Set the initial values. - */ - str->cursor_motion = cursor_motion; - str->logical_first = str->logical_last = 0; - str->visual_first = str->visual_last = str->cursor = 0; - str->source = source; - str->start = start; - str->end = end; - - /* - * If the length of the string is 0, then just return it at this point. - */ - if (start == end) - return str; - - /* - * This flag indicates whether the collection loop for RTL is called - * before the LTR loop the first time. - */ - rtl_first = 0; - - /* - * Look for the first character in the string that has strong - * directionality. - */ - for (s = start; s < end && !ucisstrong(source[s]); s++) ; - - if (s == end) - /* - * If the string contains no characters with strong directionality, use - * the default direction. - */ - str->direction = default_direction; - else - str->direction = ucisrtl(source[s]) ? UCPGBA_RTL : UCPGBA_LTR; - - if (str->direction == UCPGBA_RTL) - /* - * Set the flag that causes the RTL collection loop to run first. - */ - rtl_first = 1; - - /* - * This loop now separates the string into runs based on directionality. - */ - for (s = e = 0; s < end; s = e) { - if (!rtl_first) { - /* - * Determine the next run of LTR text. - */ - - ld = s; - while (e < end && ISLTR_LTR(source[e])) { - if (ucisdigit(source[e]) && - !(0x660 <= source[e] && source[e] <= 0x669)) - ld = e; - e++; - } - if (str->direction != UCPGBA_LTR) { - while (e > ld && ISWEAK_NEUTRAL(source[e - 1])) - e--; - } - - /* - * Add the LTR segment to the string. - */ - if (e > s) - _ucadd_ltr_segment(str, source, s, e); - } - - /* - * Determine the next run of RTL text. - */ - ld = s = e; - while (e < end && ISRTL_RTL(source[e])) { - if (ucisdigit(source[e]) && - !(0x660 <= source[e] && source[e] <= 0x669)) - ld = e; - e++; - } - if (str->direction != UCPGBA_RTL) { - while (e > ld && ISWEAK_NEUTRAL(source[e - 1])) - e--; - } - - /* - * Add the RTL segment to the string. - */ - if (e > s) - _ucadd_rtl_segment(str, source, s, e); - - /* - * Clear the flag that allowed the RTL collection loop to run first - * for strings with overall RTL directionality. - */ - rtl_first = 0; - } - - /* - * Set up the initial cursor run. - */ - str->cursor = str->logical_first; - if (str != 0) - str->cursor->cursor = (str->cursor->direction == UCPGBA_RTL) ? - str->cursor->end - str->cursor->start : 0; - - return str; -} - -void -ucstring_free(ucstring_t *s) -{ - ucrun_t *l, *r; - - if (s == 0) - return; - - for (l = 0, r = s->visual_first; r != 0; r = r->visual_next) { - if (r->end > r->start) - free((char *) r->chars); - if (l) - free((char *) l); - l = r; - } - if (l) - free((char *) l); - - free((char *) s); -} - -int -ucstring_set_cursor_motion(ucstring_t *str, int cursor_motion) -{ - int n; - - if (str == 0) - return -1; - - n = str->cursor_motion; - str->cursor_motion = cursor_motion; - return n; -} - -static int -_ucstring_visual_cursor_right(ucstring_t *str, int count) -{ - int cnt = count; - unsigned long size; - ucrun_t *cursor; - - if (str == 0) - return 0; - - cursor = str->cursor; - while (cnt > 0) { - size = cursor->end - cursor->start; - if ((cursor->direction == UCPGBA_RTL && cursor->cursor + 1 == size) || - cursor->cursor + 1 > size) { - /* - * If the next run is NULL, then the cursor is already on the - * far right end already. - */ - if (cursor->visual_next == 0) - /* - * If movement occured, then report it. - */ - return (cnt != count); - - /* - * Move to the next run. - */ - str->cursor = cursor = cursor->visual_next; - cursor->cursor = (cursor->direction == UCPGBA_RTL) ? -1 : 0; - size = cursor->end - cursor->start; - } else - cursor->cursor++; - cnt--; - } - return 1; -} - -static int -_ucstring_logical_cursor_right(ucstring_t *str, int count) -{ - int cnt = count; - unsigned long size; - ucrun_t *cursor; - - if (str == 0) - return 0; - - cursor = str->cursor; - while (cnt > 0) { - size = cursor->end - cursor->start; - if (str->direction == UCPGBA_RTL) { - if (cursor->direction == UCPGBA_RTL) { - if (cursor->cursor + 1 == size) { - if (cursor == str->logical_first) - /* - * Already at the beginning of the string. - */ - return (cnt != count); - - str->cursor = cursor = cursor->logical_prev; - size = cursor->end - cursor->start; - cursor->cursor = (cursor->direction == UCPGBA_LTR) ? - size : 0; - } else - cursor->cursor++; - } else { - if (cursor->cursor == 0) { - if (cursor == str->logical_first) - /* - * At the beginning of the string already. - */ - return (cnt != count); - - str->cursor = cursor = cursor->logical_prev; - size = cursor->end - cursor->start; - cursor->cursor = (cursor->direction == UCPGBA_LTR) ? - size : 0; - } else - cursor->cursor--; - } - } else { - if (cursor->direction == UCPGBA_RTL) { - if (cursor->cursor == 0) { - if (cursor == str->logical_last) - /* - * Already at the end of the string. - */ - return (cnt != count); - - str->cursor = cursor = cursor->logical_next; - size = cursor->end - cursor->start; - cursor->cursor = (cursor->direction == UCPGBA_LTR) ? - 0 : size - 1; - } else - cursor->cursor--; - } else { - if (cursor->cursor + 1 > size) { - if (cursor == str->logical_last) - /* - * Already at the end of the string. - */ - return (cnt != count); - - str->cursor = cursor = cursor->logical_next; - cursor->cursor = (cursor->direction == UCPGBA_LTR) ? - 0 : size - 1; - } else - cursor->cursor++; - } - } - cnt--; - } - return 1; -} - -int -ucstring_cursor_right(ucstring_t *str, int count) -{ - if (str == 0) - return 0; - return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ? - _ucstring_visual_cursor_right(str, count) : - _ucstring_logical_cursor_right(str, count); -} - -static int -_ucstring_visual_cursor_left(ucstring_t *str, int count) -{ - int cnt = count; - unsigned long size; - ucrun_t *cursor; - - if (str == 0) - return 0; - - cursor = str->cursor; - while (cnt > 0) { - size = cursor->end - cursor->start; - if ((cursor->direction == UCPGBA_LTR && cursor->cursor == 0) || - cursor->cursor - 1 < -1) { - /* - * If the preceding run is NULL, then the cursor is already on the - * far left end already. - */ - if (cursor->visual_prev == 0) - /* - * If movement occured, then report it. - */ - return (cnt != count); - - /* - * Move to the previous run. - */ - str->cursor = cursor = cursor->visual_prev; - size = cursor->end - cursor->start; - cursor->cursor = (cursor->direction == UCPGBA_RTL) ? - size : size - 1; - } else - cursor->cursor--; - cnt--; - } - return 1; -} - -static int -_ucstring_logical_cursor_left(ucstring_t *str, int count) -{ - int cnt = count; - unsigned long size; - ucrun_t *cursor; - - if (str == 0) - return 0; - - cursor = str->cursor; - while (cnt > 0) { - size = cursor->end - cursor->start; - if (str->direction == UCPGBA_RTL) { - if (cursor->direction == UCPGBA_RTL) { - if (cursor->cursor == -1) { - if (cursor == str->logical_last) - /* - * Already at the end of the string. - */ - return (cnt != count); - - str->cursor = cursor = cursor->logical_next; - size = cursor->end - cursor->start; - cursor->cursor = (cursor->direction == UCPGBA_LTR) ? - 0 : size - 1; - } else - cursor->cursor--; - } else { - if (cursor->cursor + 1 > size) { - if (cursor == str->logical_last) - /* - * At the end of the string already. - */ - return (cnt != count); - - str->cursor = cursor = cursor->logical_next; - size = cursor->end - cursor->start; - cursor->cursor = (cursor->direction == UCPGBA_LTR) ? - 0 : size - 1; - } else - cursor->cursor++; - } - } else { - if (cursor->direction == UCPGBA_RTL) { - if (cursor->cursor + 1 == size) { - if (cursor == str->logical_first) - /* - * Already at the beginning of the string. - */ - return (cnt != count); - - str->cursor = cursor = cursor->logical_prev; - size = cursor->end - cursor->start; - cursor->cursor = (cursor->direction == UCPGBA_LTR) ? - size : 0; - } else - cursor->cursor++; - } else { - if (cursor->cursor == 0) { - if (cursor == str->logical_first) - /* - * Already at the beginning of the string. - */ - return (cnt != count); - - str->cursor = cursor = cursor->logical_prev; - cursor->cursor = (cursor->direction == UCPGBA_LTR) ? - size : 0; - } else - cursor->cursor--; - } - } - cnt--; - } - return 1; -} - -int -ucstring_cursor_left(ucstring_t *str, int count) -{ - if (str == 0) - return 0; - return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ? - _ucstring_visual_cursor_left(str, count) : - _ucstring_logical_cursor_left(str, count); -} - -void -ucstring_cursor_info(ucstring_t *str, int *direction, unsigned long *position) -{ - long c; - unsigned long size; - ucrun_t *cursor; - - if (str == 0 || direction == 0 || position == 0) - return; - - cursor = str->cursor; - - *direction = cursor->direction; - - c = cursor->cursor; - size = cursor->end - cursor->start; - - if (c == size) - *position = (cursor->direction == UCPGBA_RTL) ? - cursor->start : cursor->positions[c - 1]; - else if (c == -1) - *position = (cursor->direction == UCPGBA_RTL) ? - cursor->end : cursor->start; - else - *position = cursor->positions[c]; -} diff --git a/src/lib/krb5/unicode/ucdata/ucpgba.h b/src/lib/krb5/unicode/ucdata/ucpgba.h deleted file mode 100644 index 7e1d570..0000000 --- a/src/lib/krb5/unicode/ucdata/ucpgba.h +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright 1998-2008 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ -/* Copyright 1999 Computing Research Labs, New Mexico State University - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT - * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This work is part of OpenLDAP Software . - * $OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucpgba.h,v 1.10 2008/01/07 23:20:05 kurt Exp $ - * $Id: ucpgba.h,v 1.4 1999/11/19 15:24:30 mleisher Exp $ - */ - -#ifndef _h_ucpgba -#define _h_ucpgba - -#include "k5-int.h" - -/*************************************************************************** - * - * Macros and types. - * - ***************************************************************************/ - -/* - * These are the direction values that can appear in render runs and render - * strings. - */ -#define UCPGBA_LTR 0 -#define UCPGBA_RTL 1 - -/* - * These are the flags for cursor motion. - */ -#define UCPGBA_CURSOR_VISUAL 0 -#define UCPGBA_CURSOR_LOGICAL 1 - -/* - * This structure is used to contain runs of text in a particular direction. - */ -typedef struct _ucrun_t { - struct _ucrun_t *visual_prev; /* Pointer to the previous visual run. */ - struct _ucrun_t *visual_next; /* Pointer to the next visual run. */ - - struct _ucrun_t *logical_prev; /* Pointer to the previous logical run. */ - struct _ucrun_t *logical_next; /* Pointer to the next logical run. */ - - int direction; /* Direction of the run. */ - - long cursor; /* Position of "cursor" in the string. */ - - unsigned long *chars; /* List of characters for the run. */ - unsigned long *positions; /* List of original positions in source. */ - - unsigned long *source; /* The source string. */ - unsigned long start; /* Beginning offset in the source string. */ - unsigned long end; /* Ending offset in the source string. */ -} ucrun_t; - -/* - * This represents a string of runs rendered up to a point that is not - * platform specific. - */ -typedef struct _ucstring_t { - int direction; /* Overall direction of the string. */ - - int cursor_motion; /* Logical or visual cursor motion flag. */ - - ucrun_t *cursor; /* The run containing the "cursor." */ - - ucrun_t *logical_first; /* First run in the logical order. */ - ucrun_t *logical_last; /* Last run in the logical order. */ - - ucrun_t *visual_first; /* First run in the visual order. */ - ucrun_t *visual_last; /* Last run in the visual order. */ - - unsigned long *source; /* The source string. */ - unsigned long start; /* The beginning offset in the source. */ - unsigned long end; /* The ending offset in the source. */ -} ucstring_t; - -/*************************************************************************** - * - * API - * - ***************************************************************************/ - -/* - * This creates and reorders the specified substring using the - * "Pretty Good Bidi Algorithm." A default direction is provided for cases - * of a string containing no strong direction characters and the default - * cursor motion should be provided. - */ -ucstring_t * -ucstring_create (unsigned long *source, - unsigned long start, - unsigned long end, - int default_direction, - int cursor_motion); -/* - * This releases the string. - */ -void ucstring_free (ucstring_t *string); - -/* - * This changes the cursor motion flag for the string. - */ -int -ucstring_set_cursor_motion (ucstring_t *string, - int cursor_motion); - -/* - * This function will move the cursor to the right depending on the - * type of cursor motion that was specified for the string. - * - * A 0 is returned if no cursor motion is performed, otherwise a - * 1 is returned. - */ -int -ucstring_cursor_right (ucstring_t *string, int count); - -/* - * This function will move the cursor to the left depending on the - * type of cursor motion that was specified for the string. - * - * A 0 is returned if no cursor motion is performed, otherwise a - * 1 is returned. - */ -int -ucstring_cursor_left (ucstring_t *string, int count); - -/* - * This routine retrieves the direction of the run containing the cursor - * and the actual position in the original text string. - */ -void -ucstring_cursor_info (ucstring_t *string, int *direction, - unsigned long *position); - -#endif /* _h_ucpgba */ diff --git a/src/lib/krb5/unicode/ucdata/ucpgba.man b/src/lib/krb5/unicode/ucdata/ucpgba.man deleted file mode 100644 index 4486509..0000000 --- a/src/lib/krb5/unicode/ucdata/ucpgba.man +++ /dev/null @@ -1,97 +0,0 @@ -.\" -.\" $Id: ucpgba.man,v 1.1 1999/11/19 16:08:34 mleisher Exp $ -.\" -.TH ucpgba 3 "19 November 1999" -.SH NAME -ucpgba \- functions for doing bidirectional reordering of Unicode text and -logical and visual cursor motion - -.SH SYNOPSIS -.nf -#include -#include - -ucstring_t *ucstring_create(unsigned long *source, unsigned long start, - unsigned long end, int default_direction, - int cursor_motion) -.sp -void ucstring_free(ucstring_t *string) -.sp -int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion) -.sp -int ucstring_cursor_right(ucstring_t *string, int count) -.sp -int ucstring_cursor_left(ucstring_t *string, int count) -.sp -void ucstring_cursor_info(ucstring_t *string, int *direction, - unsigned long *position) - -.SH DESCRIPTION -.TP 4 -.BR Macros -UCPGBA_LTR -.br -UCPGBA_RTL -.br -UCPGBA_CURSOR_VISUAL -.br -UCPGBA_CURSOR_LOGICAL - -.TP 4 -.BR ucstring_create() -This function will create a reordered string by using the implicit -directionality of the characters in the specified substring. -.sp -The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL -and is used only in cases where a string contains no characters with strong -directionality. -.sp -The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or -UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion -behavior. This behavior can be switched at any time using -ustring_set_cursor_motion(). - -.TP 4 -.BR ucstring_free() -This function will deallocate the memory used by the string, incuding the -string itself. - -.TP 4 -.BR ucstring_cursor_info() -This function will return the text position of the internal cursor and the -directionality of the text at that position. The position returned is the -original text position of the character. - -.TP 4 -.BR ucstring_set_cursor_motion() -This function will change the cursor motion type and return the previous -cursor motion type. - -.TP 4 -.BR ucstring_cursor_right() -This function will move the internal cursor to the right according to the -type of cursor motion set for the string. -.sp -If no cursor motion is performed, it returns 0. Otherwise it will return a 1. - -.TP 4 -.BR ucstring_cursor_left() -This function will move the internal cursor to the left according to the -type of cursor motion set for the string. -.sp -If no cursor motion is performed, it returns 0. Otherwise it will return a 1. - -.SH "SEE ALSO" -ucdata(3) - -.SH ACKNOWLEDGMENTS -These are people who have helped with patches or alerted me about problems. - -.SH AUTHOR -Mark Leisher -.br -Computing Research Lab -.br -New Mexico State University -.br -Email: mleisher@crl.nmsu.edu diff --git a/src/lib/krb5/unicode/ucstr.c b/src/lib/krb5/unicode/ucstr.c index 0257882..0a2e5ab 100644 --- a/src/lib/krb5/unicode/ucstr.c +++ b/src/lib/krb5/unicode/ucstr.c @@ -23,7 +23,7 @@ #include -int +static int krb5int_ucstrncmp( const krb5_unicode * u1, const krb5_unicode * u2, @@ -40,7 +40,7 @@ krb5int_ucstrncmp( return 0; } -int +static int krb5int_ucstrncasecmp( const krb5_unicode * u1, const krb5_unicode * u2, @@ -60,47 +60,6 @@ krb5int_ucstrncasecmp( return 0; } -krb5_unicode * -krb5int_ucstrnchr( - const krb5_unicode * u, - size_t n, - krb5_unicode c) -{ - for (; 0 < n; ++u, --n) { - if (*u == c) { - return (krb5_unicode *) u; - } - } - - return NULL; -} - -krb5_unicode * -krb5int_ucstrncasechr( - const krb5_unicode * u, - size_t n, - krb5_unicode c) -{ - c = uctolower(c); - for (; 0 < n; ++u, --n) { - if ((krb5_unicode) uctolower(*u) == c) { - return (krb5_unicode *) u; - } - } - - return NULL; -} - -void -krb5int_ucstr2upper( - krb5_unicode * u, - size_t n) -{ - for (; 0 < n; ++u, --n) { - *u = uctoupper(*u); - } -} - /* Return true if data contains valid UTF-8 sequences. */ krb5_boolean k5_utf8_validate(const krb5_data *data) @@ -127,211 +86,8 @@ k5_utf8_validate(const krb5_data *data) return !in.status; } -#define TOUPPER(c) (islower(c) ? toupper(c) : (c)) #define TOLOWER(c) (isupper(c) ? tolower(c) : (c)) -krb5_error_code -krb5int_utf8_normalize( - const krb5_data * data, - krb5_data ** newdataptr, - unsigned flags) -{ - int i, j, len, clen, outpos = 0, ucsoutlen, outsize; - char *out = NULL, *outtmp, *s; - krb5_ucs4 *ucs = NULL, *p, *ucsout = NULL; - krb5_data *newdata; - krb5_error_code retval = 0; - - static unsigned char mask[] = { - 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01}; - - unsigned casefold = flags & KRB5_UTF8_CASEFOLD; - unsigned approx = flags & KRB5_UTF8_APPROX; - - *newdataptr = NULL; - - s = data->data; - len = data->length; - - newdata = malloc(sizeof(*newdata)); - if (newdata == NULL) - return ENOMEM; - - /* - * Should first check to see if string is already in proper normalized - * form. This is almost as time consuming as the normalization though. - */ - - /* finish off everything up to character before first non-ascii */ - if (KRB5_UTF8_ISASCII(s)) { - if (casefold) { - outsize = len + 7; - out = malloc(outsize); - if (out == NULL) { - retval = ENOMEM; - goto cleanup; - } - - for (i = 1; (i < len) && KRB5_UTF8_ISASCII(s + i); i++) { - out[outpos++] = TOLOWER(s[i - 1]); - } - if (i == len) { - out[outpos++] = TOLOWER(s[len - 1]); - goto cleanup; - } - } else { - for (i = 1; (i < len) && KRB5_UTF8_ISASCII(s + i); i++) { - /* empty */ - } - - if (i == len) { - newdata->length = len; - newdata->data = k5memdup0(s, len, &retval); - if (newdata->data == NULL) - goto cleanup; - *newdataptr = newdata; - return 0; - } - outsize = len + 7; - out = malloc(outsize); - if (out == NULL) { - retval = ENOMEM; - goto cleanup; - } - outpos = i - 1; - memcpy(out, s, outpos); - } - } else { - outsize = len + 7; - out = malloc(outsize); - if (out == NULL) { - retval = ENOMEM; - goto cleanup; - } - i = 0; - } - - p = ucs = malloc(len * sizeof(*ucs)); - if (ucs == NULL) { - retval = ENOMEM; - goto cleanup; - } - /* convert character before first non-ascii to ucs-4 */ - if (i > 0) { - *p = casefold ? TOLOWER(s[i - 1]) : s[i - 1]; - p++; - } - /* s[i] is now first non-ascii character */ - for (;;) { - /* s[i] is non-ascii */ - /* convert everything up to next ascii to ucs-4 */ - while (i < len) { - /* KRB5_UTF8_CHARLEN only looks at the first byte; use it to guard - * against small read overruns. */ - if (KRB5_UTF8_CHARLEN(s + i) > len - i) { - retval = KRB5_ERR_INVALID_UTF8; - goto cleanup; - } - clen = KRB5_UTF8_CHARLEN2(s + i, clen); - if (clen == 0) { - retval = KRB5_ERR_INVALID_UTF8; - goto cleanup; - } - if (clen == 1) { - /* ascii */ - break; - } - *p = s[i] & mask[clen]; - i++; - for (j = 1; j < clen; j++) { - if ((s[i] & 0xc0) != 0x80) { - retval = KRB5_ERR_INVALID_UTF8; - goto cleanup; - } - *p <<= 6; - *p |= s[i] & 0x3f; - i++; - } - if (casefold) { - *p = uctolower(*p); - } - p++; - } - /* normalize ucs of length p - ucs */ - uccompatdecomp(ucs, p - ucs, &ucsout, &ucsoutlen); - if (approx) { - for (j = 0; j < ucsoutlen; j++) { - if (ucsout[j] < 0x80) { - out[outpos++] = ucsout[j]; - } - } - } else { - ucsoutlen = uccanoncomp(ucsout, ucsoutlen); - /* convert ucs to utf-8 and store in out */ - for (j = 0; j < ucsoutlen; j++) { - /* - * allocate more space if not enough room for 6 bytes and - * terminator - */ - if (outsize - outpos < 7) { - outsize = ucsoutlen - j + outpos + 6; - outtmp = realloc(out, outsize); - if (outtmp == NULL) { - retval = ENOMEM; - goto cleanup; - } - out = outtmp; - } - outpos += krb5int_ucs4_to_utf8(ucsout[j], &out[outpos]); - } - } - - free(ucsout); - ucsout = NULL; - - if (i == len) { - break; - } - - /* Allocate more space in out if necessary */ - if (len - i >= outsize - outpos) { - outsize += 1 + ((len - i) - (outsize - outpos)); - outtmp = realloc(out, outsize); - if (outtmp == NULL) { - retval = ENOMEM; - goto cleanup; - } - out = outtmp; - } - /* s[i] is ascii */ - /* finish off everything up to char before next non-ascii */ - for (i++; (i < len) && KRB5_UTF8_ISASCII(s + i); i++) { - out[outpos++] = casefold ? TOLOWER(s[i - 1]) : s[i - 1]; - } - if (i == len) { - out[outpos++] = casefold ? TOLOWER(s[len - 1]) : s[len - 1]; - break; - } - /* convert character before next non-ascii to ucs-4 */ - *ucs = casefold ? TOLOWER(s[i - 1]) : s[i - 1]; - p = ucs + 1; - } - -cleanup: - free(ucs); - free(ucsout); - if (retval) { - free(out); - free(newdata); - return retval; - } - out[outpos] = '\0'; - newdata->data = out; - newdata->length = outpos; - *newdataptr = newdata; - return 0; -} - /* compare UTF8-strings, optionally ignore casing */ /* slow, should be optimized */ int diff --git a/src/lib/krb5/unicode/ure/README b/src/lib/krb5/unicode/ure/README deleted file mode 100644 index c9918f5..0000000 --- a/src/lib/krb5/unicode/ure/README +++ /dev/null @@ -1,212 +0,0 @@ -# -# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $ -# -# Copyright 1997, 1998, 1999 Computing Research Labs, -# New Mexico State University -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY -# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT -# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -# THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# - - - Unicode and Regular Expressions - Version 0.5 - -This is a simple regular expression package for matching against Unicode text -in UCS2 form. The implementation of this URE package is a variation on the -RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark -Hopkins' algorithm had the virtue of being very simple, so it was used as a -model. - ---------------------------------------------------------------------------- - -Assumptions: - - o Regular expression and text already normalized. - - o Conversion to lower case assumes a 1-1 mapping. - -Definitions: - - Separator - any one of U+2028, U+2029, '\n', '\r'. - -Operators: - . - match any character. - * - match zero or more of the last subexpression. - + - match one or more of the last subexpression. - ? - match zero or one of the last subexpression. - () - subexpression grouping. - - Notes: - - o The "." operator normally does not match separators, but a flag is - available for the ure_exec() function that will allow this operator to - match a separator. - -Literals and Constants: - - c - literal UCS2 character. - \x.... - hexadecimal number of up to 4 digits. - \X.... - hexadecimal number of up to 4 digits. - \u.... - hexadecimal number of up to 4 digits. - \U.... - hexadecimal number of up to 4 digits. - -Character classes: - - [...] - Character class. - [^...] - Negated character class. - \pN1,N2,...,Nn - Character properties class. - \PN1,N2,...,Nn - Negated character properties class. - - POSIX character classes recognized: - - :alnum: - :alpha: - :cntrl: - :digit: - :graph: - :lower: - :print: - :punct: - :space: - :upper: - :xdigit: - - Notes: - - o Character property classes are \p or \P followed by a comma separated - list of integers between 1 and 32. These integers are references to - the following character properties: - - N Character Property - -------------------------- - 1 _URE_NONSPACING - 2 _URE_COMBINING - 3 _URE_NUMDIGIT - 4 _URE_NUMOTHER - 5 _URE_SPACESEP - 6 _URE_LINESEP - 7 _URE_PARASEP - 8 _URE_CNTRL - 9 _URE_PUA - 10 _URE_UPPER - 11 _URE_LOWER - 12 _URE_TITLE - 13 _URE_MODIFIER - 14 _URE_OTHERLETTER - 15 _URE_DASHPUNCT - 16 _URE_OPENPUNCT - 17 _URE_CLOSEPUNCT - 18 _URE_OTHERPUNCT - 19 _URE_MATHSYM - 20 _URE_CURRENCYSYM - 21 _URE_OTHERSYM - 22 _URE_LTR - 23 _URE_RTL - 24 _URE_EURONUM - 25 _URE_EURONUMSEP - 26 _URE_EURONUMTERM - 27 _URE_ARABNUM - 28 _URE_COMMONSEP - 29 _URE_BLOCKSEP - 30 _URE_SEGMENTSEP - 31 _URE_WHITESPACE - 32 _URE_OTHERNEUT - - o Character classes can contain literals, constants, and character - property classes. Example: - - [abc\U10A\p1,3,4] - ---------------------------------------------------------------------------- - -Before using URE ----------------- -Before URE is used, two functions need to be created. One to check if a -character matches a set of URE character properties, and one to convert a -character to lower case. - -Stubs for these function are located in the urestubs.c file. - -Using URE ---------- - -Sample pseudo-code fragment. - - ure_buffer_t rebuf; - ure_dfa_t dfa; - ucs2_t *re, *text; - unsigned long relen, textlen; - unsigned long match_start, match_end; - - /* - * Allocate the dynamic storage needed to compile regular expressions. - */ - rebuf = ure_buffer_create(); - - for each regular expression in a list { - re = next regular expression; - relen = length(re); - - /* - * Compile the regular expression with the case insensitive flag - * turned on. - */ - dfa = ure_compile(re, relen, 1, rebuf); - - /* - * Look for the first match in some text. The matching will be done - * in a case insensitive manner because the expression was compiled - * with the case insensitive flag on. - */ - if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end)) - printf("MATCH: %ld %ld\n", match_start, match_end); - - /* - * Look for the first match in some text, ignoring non-spacing - * characters. - */ - if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen, - &match_start, &match_end)) - printf("MATCH: %ld %ld\n", match_start, match_end); - - /* - * Free the DFA. - */ - ure_free_dfa(dfa); - } - - /* - * Free the dynamic storage used for compiling the expressions. - */ - ure_free_buffer(rebuf); - ---------------------------------------------------------------------------- - -Mark Leisher -29 March 1997 - -=========================================================================== - -CHANGES -------- - -Version: 0.5 -Date : 21 September 1999 -========================== - 1. Added copyright stuff and put in CVS. diff --git a/src/lib/krb5/unicode/ure/ure.c b/src/lib/krb5/unicode/ure/ure.c deleted file mode 100644 index 7b30487..0000000 --- a/src/lib/krb5/unicode/ure/ure.c +++ /dev/null @@ -1,2139 +0,0 @@ -/* - * Copyright 1998-2008 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ -/* Copyright 1997, 1998, 1999 Computing Research Labs, - * New Mexico State University - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT - * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This work is part of OpenLDAP Software . - * $OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.19 2008/01/07 23:20:05 kurt Exp $ - * $Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $" - */ - -#include - -#include -#include -#ifndef _WIN32 -#include -#endif - -#include "ure.h" - -/* - * Flags used internally in the DFA. - */ -#define _URE_DFA_CASEFOLD 0x01 -#define _URE_DFA_BLANKLINE 0x02 - -static unsigned long cclass_flags[] = { - 0, - _URE_NONSPACING, - _URE_COMBINING, - _URE_NUMDIGIT, - _URE_NUMOTHER, - _URE_SPACESEP, - _URE_LINESEP, - _URE_PARASEP, - _URE_CNTRL, - _URE_PUA, - _URE_UPPER, - _URE_LOWER, - _URE_TITLE, - _URE_MODIFIER, - _URE_OTHERLETTER, - _URE_DASHPUNCT, - _URE_OPENPUNCT, - _URE_CLOSEPUNCT, - _URE_OTHERPUNCT, - _URE_MATHSYM, - _URE_CURRENCYSYM, - _URE_OTHERSYM, - _URE_LTR, - _URE_RTL, - _URE_EURONUM, - _URE_EURONUMSEP, - _URE_EURONUMTERM, - _URE_ARABNUM, - _URE_COMMONSEP, - _URE_BLOCKSEP, - _URE_SEGMENTSEP, - _URE_WHITESPACE, - _URE_OTHERNEUT, -}; - -/* - * Symbol types for the DFA. - */ -#define _URE_ANY_CHAR 1 -#define _URE_CHAR 2 -#define _URE_CCLASS 3 -#define _URE_NCCLASS 4 -#define _URE_BOL_ANCHOR 5 -#define _URE_EOL_ANCHOR 6 - -/* - * Op codes for converting the NFA to a DFA. - */ -#define _URE_SYMBOL 10 -#define _URE_PAREN 11 -#define _URE_QUEST 12 -#define _URE_STAR 13 -#define _URE_PLUS 14 -#define _URE_ONE 15 -#define _URE_AND 16 -#define _URE_OR 17 - -#define _URE_NOOP 0xffff - -#define _URE_REGSTART 0x8000 -#define _URE_REGEND 0x4000 - -/* - * Structure used to handle a compacted range of characters. - */ -typedef struct { - ucs4_t min_code; - ucs4_t max_code; -} _ure_range_t; - -typedef struct { - _ure_range_t *ranges; - ucs2_t ranges_used; - ucs2_t ranges_size; -} _ure_ccl_t; - -typedef union { - ucs4_t chr; - _ure_ccl_t ccl; -} _ure_sym_t; - -/* - * This is a general element structure used for expressions and stack - * elements. - */ -typedef struct { - ucs2_t reg; - ucs2_t onstack; - ucs2_t type; - ucs2_t lhs; - ucs2_t rhs; -} _ure_elt_t; - -/* - * This is a structure used to track a list or a stack of states. - */ -typedef struct { - ucs2_t *slist; - ucs2_t slist_size; - ucs2_t slist_used; -} _ure_stlist_t; - -/* - * Structure to track the list of unique states for a symbol - * during reduction. - */ -typedef struct { - ucs2_t id; - ucs2_t type; - unsigned long mods; - unsigned long props; - _ure_sym_t sym; - _ure_stlist_t states; -} _ure_symtab_t; - -/* - * Structure to hold a single state. - */ -typedef struct { - ucs2_t id; - ucs2_t accepting; - ucs2_t pad; - _ure_stlist_t st; - _ure_elt_t *trans; - ucs2_t trans_size; - ucs2_t trans_used; -} _ure_state_t; - -/* - * Structure used for keeping lists of states. - */ -typedef struct { - _ure_state_t *states; - ucs2_t states_size; - ucs2_t states_used; -} _ure_statetable_t; - -/* - * Structure to track pairs of DFA states when equivalent states are - * merged. - */ -typedef struct { - ucs2_t l; - ucs2_t r; -} _ure_equiv_t; - -/* - * Structure used for constructing the NFA and reducing to a minimal DFA. - */ -typedef struct _ure_buffer_t { - int reducing; - int error; - unsigned long flags; - - _ure_stlist_t stack; - - /* - * Table of unique symbols encountered. - */ - _ure_symtab_t *symtab; - ucs2_t symtab_size; - ucs2_t symtab_used; - - /* - * Tracks the unique expressions generated for the NFA and when the NFA is - * reduced. - */ - _ure_elt_t *expr; - ucs2_t expr_used; - ucs2_t expr_size; - - /* - * The reduced table of unique groups of NFA states. - */ - _ure_statetable_t states; - - /* - * Tracks states when equivalent states are merged. - */ - _ure_equiv_t *equiv; - ucs2_t equiv_used; - ucs2_t equiv_size; -} _ure_buffer_t; - -typedef struct { - ucs2_t symbol; - ucs2_t next_state; -} _ure_trans_t; - -typedef struct { - ucs2_t accepting; - ucs2_t ntrans; - _ure_trans_t *trans; -} _ure_dstate_t; - -typedef struct _ure_dfa_t { - unsigned long flags; - - _ure_symtab_t *syms; - ucs2_t nsyms; - - _ure_dstate_t *states; - ucs2_t nstates; - - _ure_trans_t *trans; - ucs2_t ntrans; -} _ure_dfa_t; - -/************************************************************************* - * - * Functions. - * - *************************************************************************/ - -static void -_ure_memmove(char *dest, char *src, unsigned long bytes) -{ - long i, j; - - i = (long) bytes; - j = i & 7; - i = (i + 7) >> 3; - - /* - * Do a memmove using Ye Olde Duff's Device for efficiency. - */ - if (src < dest) { - src += bytes; - dest += bytes; - - switch (j) { - case 0: do { - *--dest = *--src; - case 7: *--dest = *--src; - case 6: *--dest = *--src; - case 5: *--dest = *--src; - case 4: *--dest = *--src; - case 3: *--dest = *--src; - case 2: *--dest = *--src; - case 1: *--dest = *--src; - } while (--i > 0); - } - } else if (src > dest) { - switch (j) { - case 0: do { - *dest++ = *src++; - case 7: *dest++ = *src++; - case 6: *dest++ = *src++; - case 5: *dest++ = *src++; - case 4: *dest++ = *src++; - case 3: *dest++ = *src++; - case 2: *dest++ = *src++; - case 1: *dest++ = *src++; - } while (--i > 0); - } - } -} - -static void -_ure_push(ucs2_t v, _ure_buffer_t *b) -{ - _ure_stlist_t *s; - - if (b == 0) - return; - - /* - * If the `reducing' parameter is non-zero, check to see if the value - * passed is already on the stack. - */ - if (b->reducing != 0 && b->expr[v].onstack != 0) - return; - - s = &b->stack; - if (s->slist_used == s->slist_size) { - if (s->slist_size == 0) - s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); - else - s->slist = (ucs2_t *) realloc((char *) s->slist, - sizeof(ucs2_t) * (s->slist_size + 8)); - s->slist_size += 8; - } - s->slist[s->slist_used++] = v; - - /* - * If the `reducing' parameter is non-zero, flag the element as being on - * the stack. - */ - if (b->reducing != 0) - b->expr[v].onstack = 1; -} - -static ucs2_t -_ure_peek(_ure_buffer_t *b) -{ - if (b == 0 || b->stack.slist_used == 0) - return _URE_NOOP; - - return b->stack.slist[b->stack.slist_used - 1]; -} - -static ucs2_t -_ure_pop(_ure_buffer_t *b) -{ - ucs2_t v; - - if (b == 0 || b->stack.slist_used == 0) - return _URE_NOOP; - - v = b->stack.slist[--b->stack.slist_used]; - if (b->reducing) - b->expr[v].onstack = 0; - - return v; -} - -/************************************************************************* - * - * Start symbol parse functions. - * - *************************************************************************/ - -/* - * Parse a comma-separated list of integers that represent character - * properties. Combine them into a mask that is returned in the `mask' - * variable, and return the number of characters consumed. - */ -static unsigned long -_ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask, - _ure_buffer_t *b) -{ - unsigned long n, m; - ucs2_t *sp, *ep; - - sp = pp; - ep = sp + limit; - - for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) { - if (*sp == ',') { - /* - * Encountered a comma, so select the next character property flag - * and reset the number. - */ - m |= cclass_flags[n]; - n = 0; - } else if (*sp >= '0' && *sp <= '9') - /* - * Encountered a digit, so start or continue building the cardinal - * that represents the character property flag. - */ - n = (n * 10) + (*sp - '0'); - else - /* - * Encountered something that is not part of the property list. - * Indicate that we are done. - */ - break; - - /* - * If a property number greater than 32 occurs, then there is a - * problem. Most likely a missing comma separator. - */ - if (n > 32) - b->error = _URE_INVALID_PROPERTY; - } - - if (b->error == _URE_OK && n != 0) - m |= cclass_flags[n]; - - /* - * Set the mask that represents the group of character properties. - */ - *mask = m; - - /* - * Return the number of characters consumed. - */ - return sp - pp; -} - -/* - * Collect a hex number with 1 to 4 digits and return the number - * of characters used. - */ -static unsigned long -_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n) -{ - ucs2_t i; - ucs2_t *sp, *ep; - ucs4_t nn; - - sp = np; - ep = sp + limit; - - for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) { - if (*sp >= '0' && *sp <= '9') - nn = (nn << 4) + (*sp - '0'); - else if (*sp >= 'A' && *sp <= 'F') - nn = (nn << 4) + ((*sp - 'A') + 10); - else if (*sp >= 'a' && *sp <= 'f') - nn = (nn << 4) + ((*sp - 'a') + 10); - else - /* - * Encountered something that is not a hex digit. - */ - break; - } - - /* - * Assign the character code collected and return the number of - * characters used. - */ - *n = nn; - - return sp - np; -} - -/* - * Insert a range into a character class, removing duplicates and ordering - * them in increasing range-start order. - */ -static void -_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b) -{ - ucs2_t i; - ucs4_t tmp; - _ure_range_t *rp; - - /* - * If the `casefold' flag is set, then make sure both endpoints of the - * range are converted to lower case. - */ - if (b->flags & _URE_DFA_CASEFOLD) { - r->min_code = _ure_tolower(r->min_code); - r->max_code = _ure_tolower(r->max_code); - } - - /* - * Swap the range endpoints if they are not in increasing order. - */ - if (r->min_code > r->max_code) { - tmp = r->min_code; - r->min_code = r->max_code; - r->max_code = tmp; - } - - for (i = 0, rp = ccl->ranges; - i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ; - - /* - * Check for a duplicate. - */ - if (i < ccl->ranges_used && - r->min_code == rp->min_code && r->max_code == rp->max_code) - return; - - if (ccl->ranges_used == ccl->ranges_size) { - if (ccl->ranges_size == 0) - ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3); - else - ccl->ranges = (_ure_range_t *) - realloc((char *) ccl->ranges, - sizeof(_ure_range_t) * (ccl->ranges_size + 8)); - ccl->ranges_size += 8; - } - - rp = ccl->ranges + ccl->ranges_used; - - if (i < ccl->ranges_used) - _ure_memmove((char *) (rp + 1), (char *) rp, - sizeof(_ure_range_t) * (ccl->ranges_used - i)); - - ccl->ranges_used++; - rp->min_code = r->min_code; - rp->max_code = r->max_code; -} - -#define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\ -_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING) -#define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT) -#define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\ -_URE_OTHERPUNCT) -#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\ -_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM) -#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP) -#define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP) - -typedef void (*_ure_cclsetup_t)( - _ure_symtab_t *sym, - unsigned long mask, - _ure_buffer_t *b -); - -typedef struct { - ucs2_t key; - unsigned int len : 8; - unsigned int next : 8; - _ure_cclsetup_t func; - unsigned long mask; -} _ure_trie_t; - -static void -_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) -{ - sym->props |= mask; -} - -static void -_ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) -{ - _ure_range_t range; - - sym->props |= mask; - - /* - * Add the additional characters needed for handling isspace(). - */ - range.min_code = range.max_code = '\t'; - _ure_add_range(&sym->sym.ccl, &range, b); - range.min_code = range.max_code = '\r'; - _ure_add_range(&sym->sym.ccl, &range, b); - range.min_code = range.max_code = '\n'; - _ure_add_range(&sym->sym.ccl, &range, b); - range.min_code = range.max_code = '\f'; - _ure_add_range(&sym->sym.ccl, &range, b); - range.min_code = range.max_code = 0xfeff; - _ure_add_range(&sym->sym.ccl, &range, b); -} - -static void -_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) -{ - _ure_range_t range; - - /* - * Add the additional characters needed for handling isxdigit(). - */ - range.min_code = '0'; - range.max_code = '9'; - _ure_add_range(&sym->sym.ccl, &range, b); - range.min_code = 'A'; - range.max_code = 'F'; - _ure_add_range(&sym->sym.ccl, &range, b); - range.min_code = 'a'; - range.max_code = 'f'; - _ure_add_range(&sym->sym.ccl, &range, b); -} - -static const _ure_trie_t cclass_trie[] = { - {0x003a, 1, 1, 0, 0}, - {0x0061, 9, 10, 0, 0}, - {0x0063, 8, 19, 0, 0}, - {0x0064, 7, 24, 0, 0}, - {0x0067, 6, 29, 0, 0}, - {0x006c, 5, 34, 0, 0}, - {0x0070, 4, 39, 0, 0}, - {0x0073, 3, 49, 0, 0}, - {0x0075, 2, 54, 0, 0}, - {0x0078, 1, 59, 0, 0}, - {0x006c, 1, 11, 0, 0}, - {0x006e, 2, 13, 0, 0}, - {0x0070, 1, 16, 0, 0}, - {0x0075, 1, 14, 0, 0}, - {0x006d, 1, 15, 0, 0}, - {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK}, - {0x0068, 1, 17, 0, 0}, - {0x0061, 1, 18, 0, 0}, - {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK}, - {0x006e, 1, 20, 0, 0}, - {0x0074, 1, 21, 0, 0}, - {0x0072, 1, 22, 0, 0}, - {0x006c, 1, 23, 0, 0}, - {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL}, - {0x0069, 1, 25, 0, 0}, - {0x0067, 1, 26, 0, 0}, - {0x0069, 1, 27, 0, 0}, - {0x0074, 1, 28, 0, 0}, - {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT}, - {0x0072, 1, 30, 0, 0}, - {0x0061, 1, 31, 0, 0}, - {0x0070, 1, 32, 0, 0}, - {0x0068, 1, 33, 0, 0}, - {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK}, - {0x006f, 1, 35, 0, 0}, - {0x0077, 1, 36, 0, 0}, - {0x0065, 1, 37, 0, 0}, - {0x0072, 1, 38, 0, 0}, - {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER}, - {0x0072, 2, 41, 0, 0}, - {0x0075, 1, 45, 0, 0}, - {0x0069, 1, 42, 0, 0}, - {0x006e, 1, 43, 0, 0}, - {0x0074, 1, 44, 0, 0}, - {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK}, - {0x006e, 1, 46, 0, 0}, - {0x0063, 1, 47, 0, 0}, - {0x0074, 1, 48, 0, 0}, - {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK}, - {0x0070, 1, 50, 0, 0}, - {0x0061, 1, 51, 0, 0}, - {0x0063, 1, 52, 0, 0}, - {0x0065, 1, 53, 0, 0}, - {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK}, - {0x0070, 1, 55, 0, 0}, - {0x0070, 1, 56, 0, 0}, - {0x0065, 1, 57, 0, 0}, - {0x0072, 1, 58, 0, 0}, - {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER}, - {0x0064, 1, 60, 0, 0}, - {0x0069, 1, 61, 0, 0}, - {0x0067, 1, 62, 0, 0}, - {0x0069, 1, 63, 0, 0}, - {0x0074, 1, 64, 0, 0}, - {0x003a, 1, 65, _ure_xdigit_setup, 0}, -}; - -/* - * Probe for one of the POSIX colon delimited character classes in the static - * trie. - */ -static unsigned long -_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym, - _ure_buffer_t *b) -{ - int i; - unsigned long n; - const _ure_trie_t *tp; - ucs2_t *sp, *ep; - - /* - * If the number of characters left is less than 7, then this cannot be - * interpreted as one of the colon delimited classes. - */ - if (limit < 7) - return 0; - - sp = cp; - ep = sp + limit; - tp = cclass_trie; - for (i = 0; sp < ep && i < 8; i++, sp++) { - n = tp->len; - - for (; n > 0 && tp->key != *sp; tp++, n--) ; - - if (n == 0) - return 0; - - if (*sp == ':' && (i == 6 || i == 7)) { - sp++; - break; - } - if (sp + 1 < ep) - tp = cclass_trie + tp->next; - } - if (tp->func == 0) - return 0; - - (*tp->func)(sym, tp->mask, b); - - return sp - cp; -} - -/* - * Construct a list of ranges and return the number of characters consumed. - */ -static unsigned long -_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp, - _ure_buffer_t *b) -{ - int range_end; - unsigned long n; - ucs2_t *sp, *ep; - ucs4_t c, last; - _ure_ccl_t *cclp; - _ure_range_t range; - - sp = cp; - ep = sp + limit; - - if (*sp == '^') { - symp->type = _URE_NCCLASS; - sp++; - } else - symp->type = _URE_CCLASS; - - for (last = 0, range_end = 0; - b->error == _URE_OK && sp < ep && *sp != ']'; ) { - c = *sp++; - if (c == '\\') { - if (sp == ep) { - /* - * The EOS was encountered when expecting the reverse solidus - * to be followed by the character it is escaping. Set an - * error code and return the number of characters consumed up - * to this point. - */ - b->error = _URE_UNEXPECTED_EOS; - return sp - cp; - } - - c = *sp++; - switch (c) { - case 'a': - c = 0x07; - break; - case 'b': - c = 0x08; - break; - case 'f': - c = 0x0c; - break; - case 'n': - c = 0x0a; - break; - case 'r': - c = 0x0d; - break; - case 't': - c = 0x09; - break; - case 'v': - c = 0x0b; - break; - case 'p': - case 'P': - sp += _ure_prop_list(sp, ep - sp, &symp->props, b); - /* - * Invert the bit mask of the properties if this is a negated - * character class or if 'P' is used to specify a list of - * character properties that should *not* match in a - * character class. - */ - if (c == 'P') - symp->props = ~symp->props; - continue; - break; - case 'x': - case 'X': - case 'u': - case 'U': - if (sp < ep && - ((*sp >= '0' && *sp <= '9') || - (*sp >= 'A' && *sp <= 'F') || - (*sp >= 'a' && *sp <= 'f'))) - sp += _ure_hex(sp, ep - sp, &c); - } - } else if (c == ':') { - /* - * Probe for a POSIX colon delimited character class. - */ - sp--; - if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0) - sp++; - else { - sp += n; - continue; - } - } - - cclp = &symp->sym.ccl; - - /* - * Check to see if the current character is a low surrogate that needs - * to be combined with a preceding high surrogate. - */ - if (last != 0) { - if (c >= 0xdc00 && c <= 0xdfff) - /* - * Construct the UTF16 character code. - */ - c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff)); - else { - /* - * Add the isolated high surrogate to the range. - */ - if (range_end == 1) - range.max_code = last & 0xffff; - else - range.min_code = range.max_code = last & 0xffff; - - _ure_add_range(cclp, &range, b); - range_end = 0; - } - } - - /* - * Clear the last character code. - */ - last = 0; - - /* - * This slightly awkward code handles the different cases needed to - * construct a range. - */ - if (c >= 0xd800 && c <= 0xdbff) { - /* - * If the high surrogate is followed by a range indicator, simply - * add it as the range start. Otherwise, save it in case the next - * character is a low surrogate. - */ - if (*sp == '-') { - sp++; - range.min_code = c; - range_end = 1; - } else - last = c; - } else if (range_end == 1) { - range.max_code = c; - _ure_add_range(cclp, &range, b); - range_end = 0; - } else { - range.min_code = range.max_code = c; - if (*sp == '-') { - sp++; - range_end = 1; - } else - _ure_add_range(cclp, &range, b); - } - } - - if (sp < ep && *sp == ']') - sp++; - else - /* - * The parse was not terminated by the character class close symbol - * (']'), so set an error code. - */ - b->error = _URE_CCLASS_OPEN; - - return sp - cp; -} - -/* - * Probe for a low surrogate hex code. - */ -static unsigned long -_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c) -{ - ucs4_t i, code; - ucs2_t *sp, *ep; - - for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) { - if (*sp >= '0' && *sp <= '9') - code = (code << 4) + (*sp - '0'); - else if (*sp >= 'A' && *sp <= 'F') - code = (code << 4) + ((*sp - 'A') + 10); - else if (*sp >= 'a' && *sp <= 'f') - code = (code << 4) + ((*sp - 'a') + 10); - else - break; - } - - *c = code; - return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0; -} - -static unsigned long -_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp, - _ure_buffer_t *b) -{ - ucs4_t c; - ucs2_t *sp, *ep; - - sp = sym; - ep = sym + limit; - - if ((c = *sp++) == '\\') { - - if (sp == ep) { - /* - * The EOS was encountered when expecting the reverse solidus to - * be followed by the character it is escaping. Set an error code - * and return the number of characters consumed up to this point. - */ - b->error = _URE_UNEXPECTED_EOS; - return sp - sym; - } - - c = *sp++; - switch (c) { - case 'p': - case 'P': - symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS; - sp += _ure_prop_list(sp, ep - sp, &symp->props, b); - break; - case 'a': - symp->type = _URE_CHAR; - symp->sym.chr = 0x07; - break; - case 'b': - symp->type = _URE_CHAR; - symp->sym.chr = 0x08; - break; - case 'f': - symp->type = _URE_CHAR; - symp->sym.chr = 0x0c; - break; - case 'n': - symp->type = _URE_CHAR; - symp->sym.chr = 0x0a; - break; - case 'r': - symp->type = _URE_CHAR; - symp->sym.chr = 0x0d; - break; - case 't': - symp->type = _URE_CHAR; - symp->sym.chr = 0x09; - break; - case 'v': - symp->type = _URE_CHAR; - symp->sym.chr = 0x0b; - break; - case 'x': - case 'X': - case 'u': - case 'U': - /* - * Collect between 1 and 4 digits representing a UCS2 code. Fall - * through to the next case. - */ - if (sp < ep && - ((*sp >= '0' && *sp <= '9') || - (*sp >= 'A' && *sp <= 'F') || - (*sp >= 'a' && *sp <= 'f'))) - sp += _ure_hex(sp, ep - sp, &c); - /* FALLTHROUGH */ - default: - /* - * Simply add an escaped character here. - */ - symp->type = _URE_CHAR; - symp->sym.chr = c; - } - } else if (c == '^' || c == '$') - /* - * Handle the BOL and EOL anchors. This actually consists simply of - * setting a flag that indicates that the user supplied anchor match - * function should be called. This needs to be done instead of simply - * matching line/paragraph separators because beginning-of-text and - * end-of-text tests are needed as well. - */ - symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR; - else if (c == '[') - /* - * Construct a character class. - */ - sp += _ure_cclass(sp, ep - sp, symp, b); - else if (c == '.') - symp->type = _URE_ANY_CHAR; - else { - symp->type = _URE_CHAR; - symp->sym.chr = c; - } - - /* - * If the symbol type happens to be a character and is a high surrogate, - * then probe forward to see if it is followed by a low surrogate that - * needs to be added. - */ - if (sp < ep && symp->type == _URE_CHAR && - 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) { - - if (0xdc00 <= *sp && *sp <= 0xdfff) { - symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | - (*sp & 0x03ff)); - sp++; - } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' || - *(sp + 1) == 'u' || *(sp + 1) == 'U')) { - sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c); - if (0xdc00 <= c && c <= 0xdfff) { - /* - * Take into account the \[xu] in front of the hex code. - */ - sp += 2; - symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | - (c & 0x03ff)); - } - } - } - - /* - * Last, make sure any _URE_CHAR type symbols are changed to lower case if - * the `casefold' flag is set. - */ - if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR) - symp->sym.chr = _ure_tolower(symp->sym.chr); - - /* - * If the symbol constructed is anything other than one of the anchors, - * make sure the _URE_DFA_BLANKLINE flag is removed. - */ - if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR) - b->flags &= ~_URE_DFA_BLANKLINE; - - /* - * Return the number of characters consumed. - */ - return sp - sym; -} - -static int -_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b) -{ - if (a->type != b->type || a->mods != b->mods || a->props != b->props) - return 1; - - if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) { - if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used) - return 1; - if (a->sym.ccl.ranges_used > 0 && - memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges, - sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0) - return 1; - } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr) - return 1; - return 0; -} - -/* - * Construct a symbol, but only keep unique symbols. - */ -static ucs2_t -_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed, - _ure_buffer_t *b) -{ - ucs2_t i; - _ure_symtab_t *sp, symbol; - - /* - * Build the next symbol so we can test to see if it is already in the - * symbol table. - */ - (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t)); - *consumed = _ure_compile_symbol(sym, limit, &symbol, b); - - /* - * Check to see if the symbol exists. - */ - for (i = 0, sp = b->symtab; - i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ; - - if (i < b->symtab_used) { - /* - * Free up any ranges used for the symbol. - */ - if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) && - symbol.sym.ccl.ranges_size > 0) - free((char *) symbol.sym.ccl.ranges); - - return b->symtab[i].id; - } - - /* - * Need to add the new symbol. - */ - if (b->symtab_used == b->symtab_size) { - if (b->symtab_size == 0) - b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3); - else - b->symtab = (_ure_symtab_t *) - realloc((char *) b->symtab, - sizeof(_ure_symtab_t) * (b->symtab_size + 8)); - sp = b->symtab + b->symtab_size; - (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3); - b->symtab_size += 8; - } - - symbol.id = b->symtab_used++; - (void) memmove((char *) &b->symtab[symbol.id], (char *) &symbol, - sizeof(_ure_symtab_t)); - - return symbol.id; -} - -/************************************************************************* - * - * End symbol parse functions. - * - *************************************************************************/ - -static ucs2_t -_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b) -{ - ucs2_t i; - - if (b == 0) - return _URE_NOOP; - - /* - * Determine if the expression already exists or not. - */ - for (i = 0; i < b->expr_used; i++) { - if (b->expr[i].type == type && b->expr[i].lhs == lhs && - b->expr[i].rhs == rhs) - break; - } - if (i < b->expr_used) - return i; - - /* - * Need to add a new expression. - */ - if (b->expr_used == b->expr_size) { - if (b->expr_size == 0) - b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3); - else - b->expr = (_ure_elt_t *) - realloc((char *) b->expr, - sizeof(_ure_elt_t) * (b->expr_size + 8)); - b->expr_size += 8; - } - - b->expr[b->expr_used].onstack = 0; - b->expr[b->expr_used].type = type; - b->expr[b->expr_used].lhs = lhs; - b->expr[b->expr_used].rhs = rhs; - - return b->expr_used++; -} - -static unsigned char spmap[] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - -#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \ - (spmap[(cc) >> 3] & (1 << ((cc) & 7)))) - -/* - * Convert the regular expression into an NFA in a form that will be easy to - * reduce to a DFA. The starting state for the reduction will be returned. - */ -static ucs2_t -_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b) -{ - ucs2_t c, state, top, sym, *sp, *ep; - unsigned long used; - - state = _URE_NOOP; - - sp = re; - ep = sp + relen; - while (b->error == _URE_OK && sp < ep) { - c = *sp++; - switch (c) { - case '(': - _ure_push(_URE_PAREN, b); - break; - case ')': - /* - * Check for the case of too many close parentheses. - */ - if (_ure_peek(b) == _URE_NOOP) { - b->error = _URE_UNBALANCED_GROUP; - break; - } - - while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) - /* - * Make an expression with the AND or OR operator and its right - * hand side. - */ - state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); - - /* - * Remove the _URE_PAREN off the stack. - */ - (void) _ure_pop(b); - break; - case '*': - state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b); - break; - case '+': - state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b); - break; - case '?': - state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b); - break; - case '|': - while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) - /* - * Make an expression with the AND or OR operator and its right - * hand side. - */ - state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); - - _ure_push(state, b); - _ure_push(_URE_OR, b); - break; - default: - sp--; - sym = _ure_make_symbol(sp, ep - sp, &used, b); - sp += used; - state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b); - break; - } - - if (c != '(' && c != '|' && sp < ep && - (!_ure_isspecial(*sp) || *sp == '(')) { - _ure_push(state, b); - _ure_push(_URE_AND, b); - } - } - while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) - /* - * Make an expression with the AND or OR operator and its right - * hand side. - */ - state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); - - if (b->stack.slist_used > 0) - b->error = _URE_UNBALANCED_GROUP; - - return (b->error == _URE_OK) ? state : _URE_NOOP; -} - -static void -_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b) -{ - ucs2_t i, *stp; - _ure_symtab_t *sp; - - /* - * Locate the symbol in the symbol table so the state can be added. - * If the symbol doesn't exist, then a real problem exists. - */ - for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id; - i++, sp++) ; - - /* - * Now find out if the state exists in the symbol's state list. - */ - for (i = 0, stp = sp->states.slist; - i < sp->states.slist_used && state > *stp; i++, stp++) ; - - if (i == sp->states.slist_used || state < *stp) { - /* - * Need to add the state in order. - */ - if (sp->states.slist_used == sp->states.slist_size) { - if (sp->states.slist_size == 0) - sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); - else - sp->states.slist = (ucs2_t *) - realloc((char *) sp->states.slist, - sizeof(ucs2_t) * (sp->states.slist_size + 8)); - sp->states.slist_size += 8; - } - if (i < sp->states.slist_used) - (void) _ure_memmove((char *) (sp->states.slist + i + 1), - (char *) (sp->states.slist + i), - sizeof(ucs2_t) * (sp->states.slist_used - i)); - sp->states.slist[i] = state; - sp->states.slist_used++; - } -} - -static ucs2_t -_ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b) -{ - ucs2_t i; - _ure_state_t *sp; - - for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) { - if (sp->st.slist_used == nstates && - memcmp((char *) states, (char *) sp->st.slist, - sizeof(ucs2_t) * nstates) == 0) - break; - } - - if (i == b->states.states_used) { - /* - * Need to add a new DFA state (set of NFA states). - */ - if (b->states.states_used == b->states.states_size) { - if (b->states.states_size == 0) - b->states.states = (_ure_state_t *) - malloc(sizeof(_ure_state_t) << 3); - else - b->states.states = (_ure_state_t *) - realloc((char *) b->states.states, - sizeof(_ure_state_t) * (b->states.states_size + 8)); - sp = b->states.states + b->states.states_size; - (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3); - b->states.states_size += 8; - } - - sp = b->states.states + b->states.states_used++; - sp->id = i; - - if (sp->st.slist_used + nstates > sp->st.slist_size) { - if (sp->st.slist_size == 0) - sp->st.slist = (ucs2_t *) - malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates)); - else - sp->st.slist = (ucs2_t *) - realloc((char *) sp->st.slist, - sizeof(ucs2_t) * (sp->st.slist_used + nstates)); - sp->st.slist_size = sp->st.slist_used + nstates; - } - sp->st.slist_used = nstates; - (void) memmove((char *) sp->st.slist, (char *) states, - sizeof(ucs2_t) * nstates); - } - - /* - * Return the ID of the DFA state representing a group of NFA states. - */ - return i; -} - -static void -_ure_reduce(ucs2_t start, _ure_buffer_t *b) -{ - ucs2_t i, j, state, eval, syms, rhs; - ucs2_t s1, s2, ns1, ns2; - _ure_state_t *sp; - _ure_symtab_t *smp; - - b->reducing = 1; - - /* - * Add the starting state for the reduction. - */ - _ure_add_state(1, &start, b); - - /* - * Process each set of NFA states that get created. - */ - for (i = 0; i < b->states.states_used; i++) { - sp = b->states.states + i; - - /* - * Push the current states on the stack. - */ - for (j = 0; j < sp->st.slist_used; j++) - _ure_push(sp->st.slist[j], b); - - /* - * Reduce the NFA states. - */ - for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) { - state = b->stack.slist[j]; - eval = 1; - - /* - * This inner loop is the iterative equivalent of recursively - * reducing subexpressions generated as a result of a reduction. - */ - while (eval) { - switch (b->expr[state].type) { - case _URE_SYMBOL: - ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); - _ure_add_symstate(b->expr[state].lhs, ns1, b); - syms++; - eval = 0; - break; - case _URE_ONE: - sp->accepting = 1; - eval = 0; - break; - case _URE_QUEST: - s1 = b->expr[state].lhs; - ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); - state = _ure_make_expr(_URE_OR, ns1, s1, b); - break; - case _URE_PLUS: - s1 = b->expr[state].lhs; - ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b); - state = _ure_make_expr(_URE_AND, s1, ns1, b); - break; - case _URE_STAR: - s1 = b->expr[state].lhs; - ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); - ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b); - state = _ure_make_expr(_URE_OR, ns1, ns2, b); - break; - case _URE_OR: - s1 = b->expr[state].lhs; - s2 = b->expr[state].rhs; - _ure_push(s1, b); - _ure_push(s2, b); - eval = 0; - break; - case _URE_AND: - s1 = b->expr[state].lhs; - s2 = b->expr[state].rhs; - switch (b->expr[s1].type) { - case _URE_SYMBOL: - _ure_add_symstate(b->expr[s1].lhs, s2, b); - syms++; - eval = 0; - break; - case _URE_ONE: - state = s2; - break; - case _URE_QUEST: - ns1 = b->expr[s1].lhs; - ns2 = _ure_make_expr(_URE_AND, ns1, s2, b); - state = _ure_make_expr(_URE_OR, s2, ns2, b); - break; - case _URE_PLUS: - ns1 = b->expr[s1].lhs; - ns2 = _ure_make_expr(_URE_OR, s2, state, b); - state = _ure_make_expr(_URE_AND, ns1, ns2, b); - break; - case _URE_STAR: - ns1 = b->expr[s1].lhs; - ns2 = _ure_make_expr(_URE_AND, ns1, state, b); - state = _ure_make_expr(_URE_OR, s2, ns2, b); - break; - case _URE_OR: - ns1 = b->expr[s1].lhs; - ns2 = b->expr[s1].rhs; - ns1 = _ure_make_expr(_URE_AND, ns1, s2, b); - ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); - state = _ure_make_expr(_URE_OR, ns1, ns2, b); - break; - case _URE_AND: - ns1 = b->expr[s1].lhs; - ns2 = b->expr[s1].rhs; - ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); - state = _ure_make_expr(_URE_AND, ns1, ns2, b); - break; - } - } - } - } - - /* - * Clear the state stack. - */ - while (_ure_pop(b) != _URE_NOOP) ; - - /* - * Reset the state pointer because the reduction may have moved it - * during a reallocation. - */ - sp = b->states.states + i; - - /* - * Generate the DFA states for the symbols collected during the - * current reduction. - */ - if (sp->trans_used + syms > sp->trans_size) { - if (sp->trans_size == 0) - sp->trans = (_ure_elt_t *) - malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms)); - else - sp->trans = (_ure_elt_t *) - realloc((char *) sp->trans, - sizeof(_ure_elt_t) * (sp->trans_used + syms)); - sp->trans_size = sp->trans_used + syms; - } - - /* - * Go through the symbol table and generate the DFA state transitions - * for each symbol that has collected NFA states. - */ - for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) { - sp = b->states.states + i; - - if (smp->states.slist_used > 0) { - sp->trans[syms].lhs = smp->id; - rhs = _ure_add_state(smp->states.slist_used, - smp->states.slist, b); - /* - * Reset the state pointer in case the reallocation moves it - * in memory. - */ - sp = b->states.states + i; - sp->trans[syms].rhs = rhs; - - smp->states.slist_used = 0; - syms++; - } - } - - /* - * Set the number of transitions actually used. - */ - sp->trans_used = syms; - } - b->reducing = 0; -} - -static void -_ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b) -{ - ucs2_t tmp; - - l = b->states.states[l].id; - r = b->states.states[r].id; - - if (l == r) - return; - - if (l > r) { - tmp = l; - l = r; - r = tmp; - } - - /* - * Check to see if the equivalence pair already exists. - */ - for (tmp = 0; tmp < b->equiv_used && - (b->equiv[tmp].l != l || b->equiv[tmp].r != r); - tmp++) ; - - if (tmp < b->equiv_used) - return; - - if (b->equiv_used == b->equiv_size) { - if (b->equiv_size == 0) - b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3); - else - b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv, - sizeof(_ure_equiv_t) * - (b->equiv_size + 8)); - b->equiv_size += 8; - } - b->equiv[b->equiv_used].l = l; - b->equiv[b->equiv_used].r = r; - b->equiv_used++; -} - -/* - * Merge the DFA states that are equivalent. - */ -static void -_ure_merge_equiv(_ure_buffer_t *b) -{ - ucs2_t i, j, k, eq, done; - _ure_state_t *sp1, *sp2, *ls, *rs; - - for (i = 0; i < b->states.states_used; i++) { - sp1 = b->states.states + i; - if (sp1->id != i) - continue; - for (j = 0; j < i; j++) { - sp2 = b->states.states + j; - if (sp2->id != j) - continue; - b->equiv_used = 0; - _ure_add_equiv(i, j, b); - for (eq = 0, done = 0; eq < b->equiv_used; eq++) { - ls = b->states.states + b->equiv[eq].l; - rs = b->states.states + b->equiv[eq].r; - if (ls->accepting != rs->accepting || - ls->trans_used != rs->trans_used) { - done = 1; - break; - } - for (k = 0; k < ls->trans_used && - ls->trans[k].lhs == rs->trans[k].lhs; k++) ; - if (k < ls->trans_used) { - done = 1; - break; - } - - for (k = 0; k < ls->trans_used; k++) - _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b); - } - if (done == 0) - break; - } - for (eq = 0; j < i && eq < b->equiv_used; eq++) - b->states.states[b->equiv[eq].r].id = - b->states.states[b->equiv[eq].l].id; - } - - /* - * Renumber the states appropriately. - */ - for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used; - sp1++, i++) - sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id; -} - -/************************************************************************* - * - * API. - * - *************************************************************************/ - -ure_buffer_t -ure_buffer_create(void) -{ - ure_buffer_t b; - - b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t)); - - return b; -} - -void -ure_buffer_free(ure_buffer_t buf) -{ - unsigned long i; - - if (buf == 0) - return; - - if (buf->stack.slist_size > 0) - free((char *) buf->stack.slist); - - if (buf->expr_size > 0) - free((char *) buf->expr); - - for (i = 0; i < buf->symtab_size; i++) { - if (buf->symtab[i].states.slist_size > 0) - free((char *) buf->symtab[i].states.slist); - } - - if (buf->symtab_size > 0) - free((char *) buf->symtab); - - for (i = 0; i < buf->states.states_size; i++) { - if (buf->states.states[i].trans_size > 0) - free((char *) buf->states.states[i].trans); - if (buf->states.states[i].st.slist_size > 0) - free((char *) buf->states.states[i].st.slist); - } - - if (buf->states.states_size > 0) - free((char *) buf->states.states); - - if (buf->equiv_size > 0) - free((char *) buf->equiv); - - free((char *) buf); -} - -ure_dfa_t -ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf) -{ - ucs2_t i, j, state; - _ure_state_t *sp; - _ure_dstate_t *dsp; - _ure_trans_t *tp; - ure_dfa_t dfa; - - if (re == 0 || *re == 0 || relen == 0 || buf == 0) - return 0; - - /* - * Reset the various fields of the compilation buffer. Default the flags - * to indicate the presense of the "^$" pattern. If any other pattern - * occurs, then this flag will be removed. This is done to catch this - * special pattern and handle it specially when matching. - */ - buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0); - buf->reducing = 0; - buf->stack.slist_used = 0; - buf->expr_used = 0; - - for (i = 0; i < buf->symtab_used; i++) - buf->symtab[i].states.slist_used = 0; - buf->symtab_used = 0; - - for (i = 0; i < buf->states.states_used; i++) { - buf->states.states[i].st.slist_used = 0; - buf->states.states[i].trans_used = 0; - } - buf->states.states_used = 0; - - /* - * Construct the NFA. If this stage returns a 0, then an error occurred or - * an empty expression was passed. - */ - if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP) - return 0; - - /* - * Do the expression reduction to get the initial DFA. - */ - _ure_reduce(state, buf); - - /* - * Merge all the equivalent DFA states. - */ - _ure_merge_equiv(buf); - - /* - * Construct the minimal DFA. - */ - dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t)); - (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t)); - - dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE); - - /* - * Free up the NFA state groups and transfer the symbols from the buffer - * to the DFA. - */ - for (i = 0; i < buf->symtab_size; i++) { - if (buf->symtab[i].states.slist_size > 0) - free((char *) buf->symtab[i].states.slist); - } - dfa->syms = buf->symtab; - dfa->nsyms = buf->symtab_used; - - buf->symtab_used = buf->symtab_size = 0; - - /* - * Collect the total number of states and transitions needed for the DFA. - */ - for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; - i++, sp++) { - if (sp->id == state) { - dfa->nstates++; - dfa->ntrans += sp->trans_used; - state++; - } - } - - /* - * Allocate enough space for the states and transitions. - */ - dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) * - dfa->nstates); - dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans); - - /* - * Actually transfer the DFA states from the buffer. - */ - dsp = dfa->states; - tp = dfa->trans; - for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; - i++, sp++) { - if (sp->id == state) { - dsp->trans = tp; - dsp->ntrans = sp->trans_used; - dsp->accepting = sp->accepting; - - /* - * Add the transitions for the state. - */ - for (j = 0; j < dsp->ntrans; j++, tp++) { - tp->symbol = sp->trans[j].lhs; - tp->next_state = buf->states.states[sp->trans[j].rhs].id; - } - - dsp++; - state++; - } - } - - return dfa; -} - -void -ure_dfa_free(ure_dfa_t dfa) -{ - ucs2_t i; - - if (dfa == 0) - return; - - for (i = 0; i < dfa->nsyms; i++) { - if ((dfa->syms[i].type == _URE_CCLASS || - dfa->syms[i].type == _URE_NCCLASS) && - dfa->syms[i].sym.ccl.ranges_size > 0) - free((char *) dfa->syms[i].sym.ccl.ranges); - } - if (dfa->nsyms > 0) - free((char *) dfa->syms); - - if (dfa->nstates > 0) - free((char *) dfa->states); - if (dfa->ntrans > 0) - free((char *) dfa->trans); - free((char *) dfa); -} - -void -ure_write_dfa(ure_dfa_t dfa, FILE *out) -{ - ucs2_t i, j, k, h, l; - _ure_dstate_t *sp; - _ure_symtab_t *sym; - _ure_range_t *rp; - - if (dfa == 0 || out == 0) - return; - - /* - * Write all the different character classes. - */ - for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) { - if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) { - fprintf(out, "C%hd = ", sym->id); - if (sym->sym.ccl.ranges_used > 0) { - putc('[', out); - if (sym->type == _URE_NCCLASS) - putc('^', out); - } - if (sym->props != 0) { - if (sym->type == _URE_NCCLASS) - fprintf(out, "\\P"); - else - fprintf(out, "\\p"); - for (k = h = 0; k < 32; k++) { - if (sym->props & (1 << k)) { - if (h != 0) - putc(',', out); - fprintf(out, "%d", k + 1); - h = 1; - } - } - } - /* - * Dump the ranges. - */ - for (k = 0, rp = sym->sym.ccl.ranges; - k < sym->sym.ccl.ranges_used; k++, rp++) { - /* - * Check for UTF16 characters. - */ - if (0x10000 <= rp->min_code && - rp->min_code <= 0x10ffff) { - h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800); - l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00); - fprintf(out, "\\x%04hX\\x%04hX", h, l); - } else - fprintf(out, "\\x%04lX", - (unsigned long)(rp->min_code & 0xffff)); - if (rp->max_code != rp->min_code) { - putc('-', out); - if (rp->max_code >= 0x10000 && - rp->max_code <= 0x10ffff) { - h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800); - l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00); - fprintf(out, "\\x%04hX\\x%04hX", h, l); - } else - fprintf(out, "\\x%04lX", - (unsigned long)(rp->max_code & 0xffff)); - } - } - if (sym->sym.ccl.ranges_used > 0) - putc(']', out); - putc('\n', out); - } - } - - for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) { - fprintf(out, "S%hd = ", i); - if (sp->accepting) { - fprintf(out, "1 "); - if (sp->ntrans) - fprintf(out, "| "); - } - for (j = 0; j < sp->ntrans; j++) { - if (j > 0) - fprintf(out, "| "); - - sym = dfa->syms + sp->trans[j].symbol; - switch (sym->type) { - case _URE_CHAR: - if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) { - /* - * Take care of UTF16 characters. - */ - h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800); - l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00); - fprintf(out, "\\x%04hX\\x%04hX ", h, l); - } else - fprintf(out, "\\x%04lX ", - (unsigned long)(sym->sym.chr & 0xffff)); - break; - case _URE_ANY_CHAR: - fprintf(out, " "); - break; - case _URE_BOL_ANCHOR: - fprintf(out, " "); - break; - case _URE_EOL_ANCHOR: - fprintf(out, " "); - break; - case _URE_CCLASS: - case _URE_NCCLASS: - fprintf(out, "[C%hd] ", sym->id); - break; - } - fprintf(out, "S%hd", sp->trans[j].next_state); - if (j + 1 < sp->ntrans) - putc(' ', out); - } - putc('\n', out); - } -} - -#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\ - (cc) == 0x2029) - -int -ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen, - unsigned long *match_start, unsigned long *match_end) -{ - int i, j, matched, found; - unsigned long ms, me; - ucs4_t c; - ucs2_t *sp, *ep, *lp; - _ure_dstate_t *stp; - _ure_symtab_t *sym; - _ure_range_t *rp; - - if (dfa == 0 || text == 0) - return 0; - - /* - * Handle the special case of an empty string matching the "^$" pattern. - */ - if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) { - *match_start = *match_end = 0; - return 1; - } - - sp = text; - ep = sp + textlen; - - ms = me = ~0; - - stp = dfa->states; - - for (found = 0; found == 0 && sp < ep; ) { - lp = sp; - c = *sp++; - - /* - * Check to see if this is a high surrogate that should be - * combined with a following low surrogate. - */ - if (sp < ep && 0xd800 <= c && c <= 0xdbff && - 0xdc00 <= *sp && *sp <= 0xdfff) - c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff)); - - /* - * Determine if the character is non-spacing and should be skipped. - */ - if (_ure_matches_properties(_URE_NONSPACING, c) && - (flags & URE_IGNORE_NONSPACING)) { - sp++; - continue; - } - - if (dfa->flags & _URE_DFA_CASEFOLD) - c = _ure_tolower(c); - - /* - * See if one of the transitions matches. - */ - for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) { - sym = dfa->syms + stp->trans[i].symbol; - switch (sym->type) { - case _URE_ANY_CHAR: - if ((flags & URE_DOT_MATCHES_SEPARATORS) || - !_ure_issep(c)) - matched = 1; - break; - case _URE_CHAR: - if (c == sym->sym.chr) - matched = 1; - break; - case _URE_BOL_ANCHOR: - if (lp == text) { - sp = lp; - matched = 1; - } else if (_ure_issep(c)) { - if (c == '\r' && sp < ep && *sp == '\n') - sp++; - lp = sp; - matched = 1; - } - break; - case _URE_EOL_ANCHOR: - if (_ure_issep(c)) { - /* - * Put the pointer back before the separator so the match - * end position will be correct. This case will also - * cause the `sp' pointer to be advanced over the current - * separator once the match end point has been recorded. - */ - sp = lp; - matched = 1; - } - break; - case _URE_CCLASS: - case _URE_NCCLASS: - if (sym->props != 0) - matched = _ure_matches_properties(sym->props, c); - for (j = 0, rp = sym->sym.ccl.ranges; - j < sym->sym.ccl.ranges_used; j++, rp++) { - if (rp->min_code <= c && c <= rp->max_code) - matched = 1; - } - if (sym->type == _URE_NCCLASS) - matched = !matched; - break; - } - - if (matched) { - if (ms == ~0UL) - ms = lp - text; - else - me = sp - text; - stp = dfa->states + stp->trans[i].next_state; - - /* - * If the match was an EOL anchor, adjust the pointer past the - * separator that caused the match. The correct match - * position has been recorded already. - */ - if (sym->type == _URE_EOL_ANCHOR) { - /* - * Skip the character that caused the match. - */ - sp++; - - /* - * Handle the infamous CRLF situation. - */ - if (sp < ep && c == '\r' && *sp == '\n') - sp++; - } - } - } - - if (matched == 0) { - if (stp->accepting == 0) { - /* - * If the last state was not accepting, then reset - * and start over. - */ - stp = dfa->states; - ms = me = ~0; - } else - /* - * The last state was accepting, so terminate the matching - * loop to avoid more work. - */ - found = 1; - } else if (sp == ep) { - if (!stp->accepting) { - /* - * This ugly hack is to make sure the end-of-line anchors - * match when the source text hits the end. This is only done - * if the last subexpression matches. - */ - for (i = 0; found == 0 && i < stp->ntrans; i++) { - sym = dfa->syms + stp->trans[i].symbol; - if (sym->type ==_URE_EOL_ANCHOR) { - stp = dfa->states + stp->trans[i].next_state; - if (stp->accepting) { - me = sp - text; - found = 1; - } else - break; - } - } - } else { - /* - * Make sure any conditions that match all the way to the end - * of the string match. - */ - found = 1; - me = sp - text; - } - } - } - - if (found == 0) - ms = me = ~0; - - *match_start = ms; - *match_end = me; - - return (ms != ~0UL) ? 1 : 0; -} diff --git a/src/lib/krb5/unicode/ure/ure.h b/src/lib/krb5/unicode/ure/ure.h deleted file mode 100644 index b83c97e..0000000 --- a/src/lib/krb5/unicode/ure/ure.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 1998-2008 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ -/* Copyright 1997, 1998, 1999 Computing Research Labs, - * New Mexico State University - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT - * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This work is part of OpenLDAP Software . - * $OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.h,v 1.15 2008/01/07 23:20:05 kurt Exp $ - * $Id: ure.h,v 1.2 1999/09/21 15:47:44 mleisher Exp $ - */ - -#ifndef _h_ure -#define _h_ure - -#include "k5-int.h" - -#include - -/* - * Set of character class flags. - */ -#define _URE_NONSPACING 0x00000001 -#define _URE_COMBINING 0x00000002 -#define _URE_NUMDIGIT 0x00000004 -#define _URE_NUMOTHER 0x00000008 -#define _URE_SPACESEP 0x00000010 -#define _URE_LINESEP 0x00000020 -#define _URE_PARASEP 0x00000040 -#define _URE_CNTRL 0x00000080 -#define _URE_PUA 0x00000100 - -#define _URE_UPPER 0x00000200 -#define _URE_LOWER 0x00000400 -#define _URE_TITLE 0x00000800 -#define _URE_MODIFIER 0x00001000 -#define _URE_OTHERLETTER 0x00002000 -#define _URE_DASHPUNCT 0x00004000 -#define _URE_OPENPUNCT 0x00008000 -#define _URE_CLOSEPUNCT 0x00010000 -#define _URE_OTHERPUNCT 0x00020000 -#define _URE_MATHSYM 0x00040000 -#define _URE_CURRENCYSYM 0x00080000 -#define _URE_OTHERSYM 0x00100000 - -#define _URE_LTR 0x00200000 -#define _URE_RTL 0x00400000 - -#define _URE_EURONUM 0x00800000 -#define _URE_EURONUMSEP 0x01000000 -#define _URE_EURONUMTERM 0x02000000 -#define _URE_ARABNUM 0x04000000 -#define _URE_COMMONSEP 0x08000000 - -#define _URE_BLOCKSEP 0x10000000 -#define _URE_SEGMENTSEP 0x20000000 - -#define _URE_WHITESPACE 0x40000000 -#define _URE_OTHERNEUT 0x80000000 - -/* - * Error codes. - */ -#define _URE_OK 0 -#define _URE_UNEXPECTED_EOS -1 -#define _URE_CCLASS_OPEN -2 -#define _URE_UNBALANCED_GROUP -3 -#define _URE_INVALID_PROPERTY -4 - -/* - * Options that can be combined for searching. - */ -#define URE_IGNORE_NONSPACING 0x01 -#define URE_DOT_MATCHES_SEPARATORS 0x02 - -typedef krb5_ui_4 ucs4_t; -typedef krb5_ui_2 ucs2_t; - -/* - * Opaque type for memory used when compiling expressions. - */ -typedef struct _ure_buffer_t *ure_buffer_t; - -/* - * Opaque type for the minimal DFA used when matching. - */ -typedef struct _ure_dfa_t *ure_dfa_t; - -/************************************************************************* - * - * API. - * - *************************************************************************/ - -ure_buffer_t ure_buffer_create (void); - -void ure_buffer_free (ure_buffer_t buf); - -ure_dfa_t -ure_compile (ucs2_t *re, unsigned long relen, - int casefold, ure_buffer_t buf); - -void ure_dfa_free (ure_dfa_t dfa); - -void ure_write_dfa (ure_dfa_t dfa, FILE *out); - -int -ure_exec (ure_dfa_t dfa, int flags, ucs2_t *text, - unsigned long textlen, unsigned long *match_start, - unsigned long *match_end); - -/************************************************************************* - * - * Prototypes for stub functions used for URE. These need to be rewritten to - * use the Unicode support available on the system. - * - *************************************************************************/ - -ucs4_t _ure_tolower (ucs4_t c); - -int -_ure_matches_properties (unsigned long props, ucs4_t c); - -#endif /* _h_ure */ diff --git a/src/lib/krb5/unicode/ure/urestubs.c b/src/lib/krb5/unicode/ure/urestubs.c deleted file mode 100644 index 0f17951..0000000 --- a/src/lib/krb5/unicode/ure/urestubs.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright 1998-2008 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ -/* - * Copyright 1997, 1998, 1999 Computing Research Labs, - * New Mexico State University - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT - * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This work is part of OpenLDAP Software . - * $OpenLDAP: pkg/ldap/libraries/liblunicode/ure/urestubs.c,v 1.16 2008/01/07 23:20:05 kurt Exp $ - * $Id: urestubs.c,v 1.2 1999/09/21 15:47:44 mleisher Exp $" - */ - -#include "k5-int.h" - -#include "ure.h" - -#include "ucdata.h" - -/* - * This file contains stub routines needed by the URE package to test - * character properties and other Unicode implementation specific details. - */ - -/* - * This routine should return the lower case equivalent for the character or, - * if there is no lower case quivalent, the character itself. - */ -ucs4_t _ure_tolower(ucs4_t c) -{ - return uctoupper(c); -} - -static struct ucmaskmap { - unsigned long mask1; - unsigned long mask2; -} masks[32] = { - { UC_MN, 0 }, /* _URE_NONSPACING */ - { UC_MC, 0 }, /* _URE_COMBINING */ - { UC_ND, 0 }, /* _URE_NUMDIGIT */ - { UC_NL|UC_NO, 0 }, /* _URE_NUMOTHER */ - { UC_ZS, 0 }, /* _URE_SPACESEP */ - { UC_ZL, 0 }, /* _URE_LINESEP */ - { UC_ZP, 0 }, /* _URE_PARASEP */ - { UC_CC, 0 }, /* _URE_CNTRL */ - { UC_CO, 0 }, /* _URE_PUA */ - - { UC_LU, 0 }, /* _URE_UPPER */ - { UC_LL, 0 }, /* _URE_LOWER */ - { UC_LT, 0 }, /* _URE_TITLE */ - { UC_LM, 0 }, /* _URE_MODIFIER */ - { UC_LO, 0 }, /* _URE_OTHERLETTER */ - { UC_PD, 0 }, /* _URE_DASHPUNCT */ - { UC_PS, 0 }, /* _URE_OPENPUNCT */ - { UC_PC, 0 }, /* _URE_CLOSEPUNCT */ - { UC_PO, 0 }, /* _URE_OTHERPUNCT */ - { UC_SM, 0 }, /* _URE_MATHSYM */ - { UC_SC, 0 }, /* _URE_CURRENCYSYM */ - { UC_SO, 0 }, /* _URE_OTHERSYM */ - - { UC_L, 0 }, /* _URE_LTR */ - { UC_R, 0 }, /* _URE_RTL */ - - { 0, UC_EN }, /* _URE_EURONUM */ - { 0, UC_ES }, /* _URE_EURONUMSEP */ - { 0, UC_ET }, /* _URE_EURONUMTERM */ - { 0, UC_AN }, /* _URE_ARABNUM */ - { 0, UC_CS }, /* _URE_COMMONSEP */ - - { 0, UC_B }, /* _URE_BLOCKSEP */ - { 0, UC_S }, /* _URE_SEGMENTSEP */ - - { 0, UC_WS }, /* _URE_WHITESPACE */ - { 0, UC_ON } /* _URE_OTHERNEUT */ -}; - - -/* - * This routine takes a set of URE character property flags (see ure.h) along - * with a character and tests to see if the character has one or more of those - * properties. - */ -int -_ure_matches_properties(unsigned long props, ucs4_t c) -{ - int i; - unsigned long mask1=0, mask2=0; - - for( i=0; i<32; i++ ) { - if( props & (1 << i) ) { - mask1 |= masks[i].mask1; - mask2 |= masks[i].mask2; - } - } - - return ucisprop( mask1, mask2, c ); -} diff --git a/src/lib/krb5/unicode/utbm/README b/src/lib/krb5/unicode/utbm/README deleted file mode 100644 index 8c0212d..0000000 --- a/src/lib/krb5/unicode/utbm/README +++ /dev/null @@ -1,121 +0,0 @@ -# -# $Id: README,v 1.1 1999/09/21 15:45:17 mleisher Exp $ -# -# Copyright 1997, 1998, 1999 Computing Research Labs, -# New Mexico State University -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY -# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT -# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -# THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# - - Unicode and Boyer-Moore Searching - Version 0.2 - -UTBM (Unicode Tuned Boyer-Moore) is a simple package that provides tuned -Boyer-Moore searches on Unicode UCS2 text (handles high and low surrogates). - ---------------------------------------------------------------------------- - -Assumptions: - - o Search pattern and text already normalized in some fasion. - - o Upper, lower, and title case conversions are one-to-one. - - o For conversions between upper, lower, and title case, UCS2 characters - always convert to other UCS2 characters, and UTF-16 characters always - convert to other UTF-16 characters. - -Flags: - - UTBM provides three processing flags: - - o UTBM_CASEFOLD - search in a case-insensitive manner. - - o UTBM_IGNORE_NONSPACING - ignore non-spacing characters in the pattern and - the text. - - o UTBM_SPACE_COMPRESS - view as a *single space*, sequential groups of - U+2028, U+2029, '\n', '\r', '\t', and any - character identified as a space by the Unicode - support on the platform. - - This flag also causes all characters identified - as control by the Unicode support on the - platform to be ignored (except for '\n', '\r', - and '\t'). - ---------------------------------------------------------------------------- - -Before using UTBM ------------------ -Before UTBM is used, some functions need to be created. The "utbmstub.c" file -contains stubs that need to be rewritten so they work with the Unicode support -on the platform on which this package is being used. - -Using UTBM ----------- - -Sample pseudo-code fragment. - - utbm_pattern_t pat; - ucs2_t *pattern, *text; - unsigned long patternlen, textlen; - unsigned long flags, match_start, match_end; - - /* - * Allocate the dynamic storage needed for a search pattern. - */ - pat = utbm_create_pattern(); - - /* - * Set the search flags desired. - */ - flags = UTBM_CASEFOLD|UTBM_IGNORE_NONSPACING; - - /* - * Compile the search pattern. - */ - utbm_compile(pattern, patternlen, flags, pat); - - /* - * Find the first occurance of the search pattern in the text. - */ - if (utbm_exec(pat, text, textlen, &match_start, &match_end)) - printf("MATCH: %ld %ld\n", match_start, match_end); - - /* - * Free the dynamic storage used for the search pattern. - */ - ure_free_pattern(pat); - ---------------------------------------------------------------------------- - -Mark Leisher -2 May 1997 - -=========================================================================== - -CHANGES -------- - -Version: 0.2 -Date : 21 September 1999 -========================== - 1. Added copyright stuff and put in CVS. - diff --git a/src/lib/krb5/unicode/utbm/utbm.c b/src/lib/krb5/unicode/utbm/utbm.c deleted file mode 100644 index cc895e5..0000000 --- a/src/lib/krb5/unicode/utbm/utbm.c +++ /dev/null @@ -1,475 +0,0 @@ -/* - * Copyright 1998-2008 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ -/* Copyright 1997, 1998, 1999 Computing Research Labs, - * New Mexico State University - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT - * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This work is part of OpenLDAP Software . - * $OpenLDAP: pkg/ldap/libraries/liblunicode/utbm/utbm.c,v 1.9 2008/01/07 23:20:05 kurt Exp $ - * $Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $ - */ - -/* - * Assumptions: - * 1. Case conversions of UTF-16 characters must also be UTF-16 characters. - * 2. Case conversions are all one-to-one. - * 3. Text and pattern have already been normalized in some fashion. - */ - -#include -#include -#include -#include "utbm.h" - -/* - * Single pattern character. - */ -typedef struct { - ucs4_t lc; - ucs4_t uc; - ucs4_t tc; -} _utbm_char_t; - -typedef struct { - _utbm_char_t *ch; - unsigned long skip; -} _utbm_skip_t; - -typedef struct _utbm_pattern_t { - unsigned long flags; - - _utbm_char_t *pat; - unsigned long pat_used; - unsigned long pat_size; - unsigned long patlen; - - _utbm_skip_t *skip; - unsigned long skip_used; - unsigned long skip_size; - - unsigned long md4; -} _utbm_pattern_t; - -/************************************************************************* - * - * Support functions. - * - *************************************************************************/ - -/* - * Routine to look up the skip value for a character. - */ -static unsigned long -_utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end) -{ - unsigned long i; - ucs4_t c1, c2; - _utbm_skip_t *sp; - - if (start >= end) - return 0; - - c1 = *start; - c2 = (start + 1 < end) ? *(start + 1) : ~0; - if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) - c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); - - for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) { - if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) { - return ((unsigned long) (end - start) < sp->skip) ? - end - start : sp->skip; - } - } - return p->patlen; -} - -static int -_utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end, - unsigned long *match_start, unsigned long *match_end) -{ - int check_space; - ucs4_t c1, c2; - unsigned long count; - _utbm_char_t *cp; - - /* - * Set the potential match endpoint first. - */ - *match_end = (start - text) + 1; - - c1 = *start; - c2 = (start + 1 < end) ? *(start + 1) : ~0; - if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) { - c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); - /* - * Adjust the match end point to occur after the UTF-16 character. - */ - *match_end = *match_end + 1; - } - - if (pat->pat_used == 1) { - *match_start = start - text; - return 1; - } - - /* - * Compare backward. - */ - cp = pat->pat + (pat->pat_used - 1); - - for (count = pat->patlen; start > text && count > 0;) { - /* - * Ignore non-spacing characters if indicated. - */ - if (pat->flags & UTBM_IGNORE_NONSPACING) { - while (start > text && _utbm_nonspacing(c1)) { - c2 = *--start; - c1 = (start - 1 > text) ? *(start - 1) : ~0; - if (0xdc00 <= c2 && c2 <= 0xdfff && - 0xd800 <= c1 && c1 <= 0xdbff) { - c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); - start--; - } else - c1 = c2; - } - } - - /* - * Handle space compression if indicated. - */ - if (pat->flags & UTBM_SPACE_COMPRESS) { - check_space = 0; - while (start > text && - (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) { - check_space = _utbm_isspace(c1, 1); - c2 = *--start; - c1 = (start - 1 > text) ? *(start - 1) : ~0; - if (0xdc00 <= c2 && c2 <= 0xdfff && - 0xd800 <= c1 && c1 <= 0xdbff) { - c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); - start--; - } else - c1 = c2; - } - /* - * Handle things if space compression was indicated and one or - * more member characters were found. - */ - if (check_space) { - if (cp->uc != ' ') - return 0; - cp--; - count--; - } - } - - /* - * Handle the normal comparison cases. - */ - if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc))) - return 0; - - count -= (c1 >= 0x10000) ? 2 : 1; - if (count > 0) { - cp--; - - /* - * Get the next preceding character. - */ - if (start > text) { - c2 = *--start; - c1 = (start - 1 > text) ? *(start - 1) : ~0; - if (0xdc00 <= c2 && c2 <= 0xdfff && - 0xd800 <= c1 && c1 <= 0xdbff) { - c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); - start--; - } else - c1 = c2; - } - } - } - - /* - * Set the match start position. - */ - *match_start = start - text; - return 1; -} - -/************************************************************************* - * - * API. - * - *************************************************************************/ - -utbm_pattern_t -utbm_create_pattern(void) -{ - utbm_pattern_t p; - - p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t)); - (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t)); - return p; -} - -void -utbm_free_pattern(utbm_pattern_t pattern) -{ - if (pattern == 0) - return; - - if (pattern->pat_size > 0) - free((char *) pattern->pat); - - if (pattern->skip_size > 0) - free((char *) pattern->skip); - - free((char *) pattern); -} - -void -utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags, - utbm_pattern_t p) -{ - int have_space; - unsigned long i, j, k, slen; - _utbm_char_t *cp; - _utbm_skip_t *sp; - ucs4_t c1, c2, sentinel; - - if (p == 0 || pat == 0 || *pat == 0 || patlen == 0) - return; - - /* - * Reset the pattern buffer. - */ - p->patlen = p->pat_used = p->skip_used = 0; - - /* - * Set the flags. - */ - p->flags = flags; - - /* - * Initialize the extra skip flag. - */ - p->md4 = 1; - - /* - * Allocate more storage if necessary. - */ - if (patlen > p->pat_size) { - if (p->pat_size == 0) { - p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen); - p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen); - } else { - p->pat = (_utbm_char_t *) - realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen); - p->skip = (_utbm_skip_t *) - realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen); - } - p->pat_size = p->skip_size = patlen; - } - - /* - * Preprocess the pattern to remove controls (if specified) and determine - * case. - */ - for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) { - c1 = pat[i]; - c2 = (i + 1 < patlen) ? pat[i + 1] : ~0; - if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) - c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); - - /* - * Make sure the `have_space' flag is turned off if the character - * is not an appropriate one. - */ - if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS)) - have_space = 0; - - /* - * If non-spacing characters should be ignored, do it here. - */ - if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1)) - continue; - - /* - * Check if spaces and controls need to be compressed. - */ - if (flags & UTBM_SPACE_COMPRESS) { - if (_utbm_isspace(c1, 1)) { - if (!have_space) { - /* - * Add a space and set the flag. - */ - cp->uc = cp->lc = cp->tc = ' '; - cp++; - - /* - * Increase the real pattern length. - */ - p->patlen++; - sentinel = ' '; - have_space = 1; - } - continue; - } - - /* - * Ignore all control characters. - */ - if (_utbm_iscntrl(c1)) - continue; - } - - /* - * Add the character. - */ - if (flags & UTBM_CASEFOLD) { - cp->uc = _utbm_toupper(c1); - cp->lc = _utbm_tolower(c1); - cp->tc = _utbm_totitle(c1); - } else - cp->uc = cp->lc = cp->tc = c1; - - /* - * Set the sentinel character. - */ - sentinel = cp->uc; - - /* - * Move to the next character. - */ - cp++; - - /* - * Increase the real pattern length appropriately. - */ - p->patlen += (c1 >= 0x10000) ? 2 : 1; - - /* - * Increment the loop index for UTF-16 characters. - */ - i += (c1 >= 0x10000) ? 1 : 0; - - } - - /* - * Set the number of characters actually used. - */ - p->pat_used = cp - p->pat; - - /* - * Go through and construct the skip array and determine the actual length - * of the pattern in UCS2 terms. - */ - slen = p->patlen - 1; - cp = p->pat; - for (i = k = 0; i < p->pat_used; i++, cp++) { - /* - * Locate the character in the skip array. - */ - for (sp = p->skip, j = 0; - j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ; - - /* - * If the character is not found, set the new skip element and - * increase the number of skip elements. - */ - if (j == p->skip_used) { - sp->ch = cp; - p->skip_used++; - } - - /* - * Set the updated skip value. If the character is UTF-16 and is - * not the last one in the pattern, add one to its skip value. - */ - sp->skip = slen - k; - if (cp->uc >= 0x10000 && k + 2 < slen) - sp->skip++; - - /* - * Set the new extra skip for the sentinel character. - */ - if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) && - cp->uc == sentinel) - p->md4 = slen - k; - - /* - * Increase the actual index. - */ - k += (cp->uc >= 0x10000) ? 2 : 1; - } -} - -int -utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen, - unsigned long *match_start, unsigned long *match_end) -{ - unsigned long k; - ucs2_t *start, *end; - - if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 || - textlen < pat->patlen) - return 0; - - start = text + pat->patlen; - end = text + textlen; - - /* - * Adjust the start point if it points to a low surrogate. - */ - if (0xdc00 <= *start && *start <= 0xdfff && - 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) - start--; - - while (start < end) { - while ((k = _utbm_skip(pat, start, end))) { - start += k; - if (start < end && 0xdc00 <= *start && *start <= 0xdfff && - 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) - start--; - } - - if (start < end && - _utbm_match(pat, text, start, end, match_start, match_end)) - return 1; - - start += pat->md4; - if (start < end && 0xdc00 <= *start && *start <= 0xdfff && - 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) - start--; - } - return 0; -} diff --git a/src/lib/krb5/unicode/utbm/utbm.h b/src/lib/krb5/unicode/utbm/utbm.h deleted file mode 100644 index 1ab8b91..0000000 --- a/src/lib/krb5/unicode/utbm/utbm.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright 1998-2008 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ -/* Copyright 1997, 1998, 1999 Computing Research Labs, - * New Mexico State University - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT - * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This work is part of OpenLDAP Software . - * $OpenLDAP: pkg/ldap/libraries/liblunicode/utbm/utbm.h,v 1.10 2008/01/07 23:20:05 kurt Exp $ - * $Id: utbm.h,v 1.1 1999/09/21 15:45:18 mleisher Exp $ - */ - -#ifndef _h_utbm -#define _h_utbm - -#include "k5-int.h" - -/************************************************************************* - * - * Types. - * - *************************************************************************/ - -/* - * Fundamental character types. - */ -typedef krb5_ui_4 ucs4_t; -typedef krb5_ui_2 ucs2_t; - -/* - * An opaque type used for the search pattern. - */ -typedef struct _utbm_pattern_t *utbm_pattern_t; - -/************************************************************************* - * - * Flags. - * - *************************************************************************/ - -#define UTBM_CASEFOLD 0x01 -#define UTBM_IGNORE_NONSPACING 0x02 -#define UTBM_SPACE_COMPRESS 0x04 - -/************************************************************************* - * - * API. - * - *************************************************************************/ - -utbm_pattern_t utbm_create_pattern (void); - -void utbm_free_pattern (utbm_pattern_t pattern); - -void -utbm_compile (ucs2_t *pat, unsigned long patlen, - unsigned long flags, utbm_pattern_t pattern); - -int -utbm_exec (utbm_pattern_t pat, ucs2_t *text, - unsigned long textlen, unsigned long *match_start, - unsigned long *match_end); - -/************************************************************************* - * - * Prototypes for the stub functions needed. - * - *************************************************************************/ - -int _utbm_isspace (ucs4_t c, int compress); - -int _utbm_iscntrl (ucs4_t c); - -int _utbm_nonspacing (ucs4_t c); - -ucs4_t _utbm_tolower (ucs4_t c); - -ucs4_t _utbm_toupper (ucs4_t c); - -ucs4_t _utbm_totitle (ucs4_t c); - -#endif /* _h_utbm */ diff --git a/src/lib/krb5/unicode/utbm/utbmstub.c b/src/lib/krb5/unicode/utbm/utbmstub.c deleted file mode 100644 index 9a6f60a..0000000 --- a/src/lib/krb5/unicode/utbm/utbmstub.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright 1998-2008 The OpenLDAP Foundation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ -/* Copyright 1997, 1998, 1999 Computing Research Labs, - * New Mexico State University - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT - * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This work is part of OpenLDAP Software . - * $OpenLDAP: pkg/ldap/libraries/liblunicode/utbm/utbmstub.c,v 1.8 2008/01/07 23:20:05 kurt Exp $ - * $Id: utbmstub.c,v 1.1 1999/09/21 15:45:18 mleisher Exp $ - */ - -#include "utbm.h" - -/* - * This should be redefined to use the `isspace' function available in the - * Unicode support on the platform where this is being used. - */ -#define _platform_isspace(x) 0 - -/* - * Return non-zero for any character that should be considered the equivalent - * of a space character. Return zero otherwise. - */ -int -_utbm_isspace(ucs4_t c, int compress) -{ - if (compress) - return (c == 0x09 || c == 0x0a || c == 0x0d || - c == 0x2028 || c == 0x2029 || _platform_isspace(c)) ? 1 : 0; - - return _platform_isspace(c); - -} - -/* - * Return non-zero if the character is a control character, or zero otherwise. - */ -int -_utbm_iscntrl(ucs4_t c) -{ - return 0; -} - -/* - * Return non-zero if the character is a non-spacing character, or zero - * otherwise. - */ -int -_utbm_nonspacing(ucs4_t c) -{ - return 0; -} - -/* - * Convert a character to lower case. - */ -ucs4_t -_utbm_tolower(ucs4_t c) -{ - return c; -} - -/* - * Convert a character to upper case. - */ -ucs4_t -_utbm_toupper(ucs4_t c) -{ - return c; -} - -/* - * Convert a character to title case. - */ -ucs4_t -_utbm_totitle(ucs4_t c) -{ - return c; -} diff --git a/src/util/support/libkrb5support-fixed.exports b/src/util/support/libkrb5support-fixed.exports index df3c78f..0bafe1c 100644 --- a/src/util/support/libkrb5support-fixed.exports +++ b/src/util/support/libkrb5support-fixed.exports @@ -95,5 +95,4 @@ krb5int_ucs4_to_utf8 krb5int_utf8_to_ucs4 krb5int_utf8_lentab krb5int_utf8_mintab -krb5int_utf8_next krb5int_zap diff --git a/src/util/support/t_utf8.c b/src/util/support/t_utf8.c index 5832701..6493bae 100644 --- a/src/util/support/t_utf8.c +++ b/src/util/support/t_utf8.c @@ -49,13 +49,13 @@ #endif /* - * len is 0 for invalid encoding prefixes (krb5int_utf8_charlen2() partially + * len is 0 for invalid encoding prefixes (KRB5_UTF8_CHARLEN2() partially * enforces the validity of the first two bytes, based on masking the second * byte. It doesn't check whether bit 6 is 0, though, and doesn't catch the * range between U+110000 and U+13FFFF). * * ucs is 0 for invalid encodings (including ones with valid prefixes according - * to krb5int_utf8_charlen2(); krb5int_utf8_to_ucs4() will still fail on them + * to KRB5_UTF8_CHARLEN2(); krb5int_utf8_to_ucs4() will still fail on them * because it checks more things.) Code points above U+10FFFF are excluded by * the actual test code and remain in the table for possibly testing the old * implementation that didn't exclude them. @@ -129,7 +129,7 @@ test_decode(struct testcase *t, int high4) int len, status = 0; krb5_ucs4 u = 0; - len = krb5int_utf8_charlen2(t->p); + len = KRB5_UTF8_CHARLEN2(t->p, len); if (len != t->len) { printf("expected len=%d, got len=%d\n", t->len, len); status = 1; diff --git a/src/util/support/utf8.c b/src/util/support/utf8.c index dfbf12b..08bdcf9 100644 --- a/src/util/support/utf8.c +++ b/src/util/support/utf8.c @@ -53,50 +53,6 @@ #include "supp-int.h" /* - * return the number of bytes required to hold the - * NULL-terminated UTF-8 string NOT INCLUDING the - * termination. - */ -size_t krb5int_utf8_bytes(const char *p) -{ - size_t bytes; - - for (bytes = 0; p[bytes]; bytes++) - ; - - return bytes; -} - -size_t krb5int_utf8_chars(const char *p) -{ - /* could be optimized and could check for invalid sequences */ - size_t chars = 0; - - for ( ; *p ; KRB5_UTF8_INCR(p)) - chars++; - - return chars; -} - -size_t krb5int_utf8c_chars(const char *p, size_t length) -{ - /* could be optimized and could check for invalid sequences */ - size_t chars = 0; - const char *end = p + length; - - for ( ; p < end; KRB5_UTF8_INCR(p)) - chars++; - - return chars; -} - -/* return offset to next character */ -int krb5int_utf8_offset(const char *p) -{ - return KRB5_UTF8_NEXT(p) - p; -} - -/* * Returns length indicated by first byte. */ const char krb5int_utf8_lentab[] = { @@ -109,14 +65,6 @@ const char krb5int_utf8_lentab[] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -int krb5int_utf8_charlen(const char *p) -{ - if (!(*p & 0x80)) - return 1; - - return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80]; -} - /* * Make sure the UTF-8 char used the shortest possible encoding * returns charlen if valid, 0 if not. @@ -147,18 +95,6 @@ c krb5int_utf8_mintab[] = { (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00 }; #undef c -int krb5int_utf8_charlen2(const char *p) -{ - int i = KRB5_UTF8_CHARLEN(p); - - if (i > 2) { - if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1])) - i = 0; - } - - return i; -} - /* * Convert a UTF8 character to a UCS4 character. Return 0 on success, * -1 on failure. @@ -194,17 +130,6 @@ int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out) return 0; } -int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out) -{ - krb5_ucs4 ch; - - *out = 0; - if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF) - return -1; - *out = (krb5_ucs2) ch; - return 0; -} - /* conv UCS-4 to UTF-8 */ size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf) { @@ -241,271 +166,3 @@ size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf) return len; } - -size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf) -{ - return krb5int_ucs4_to_utf8((krb5_ucs4)c, buf); -} - -/* - * Advance to the next UTF-8 character - * - * Ignores length of multibyte character, instead rely on - * continuation markers to find start of next character. - * This allows for "resyncing" of when invalid characters - * are provided provided the start of the next character - * is appears within the 6 bytes examined. - */ -char *krb5int_utf8_next(const char *p) -{ - int i; - const unsigned char *u = (const unsigned char *) p; - - if (KRB5_UTF8_ISASCII(u)) { - return (char *) &p[1]; - } - - for (i = 1; i < 6; i++) { - if ((u[i] & 0xc0) != 0x80) { - return (char *) &p[i]; - } - } - - return (char *) &p[i]; -} - -/* - * Advance to the previous UTF-8 character - * - * Ignores length of multibyte character, instead rely on - * continuation markers to find start of next character. - * This allows for "resyncing" of when invalid characters - * are provided provided the start of the next character - * is appears within the 6 bytes examined. - */ -char *krb5int_utf8_prev(const char *p) -{ - int i; - const unsigned char *u = (const unsigned char *) p; - - for (i = -1; i>-6 ; i--) { - if ((u[i] & 0xc0 ) != 0x80) { - return (char *) &p[i]; - } - } - - return (char *) &p[i]; -} - -/* - * Copy one UTF-8 character from src to dst returning - * number of bytes copied. - * - * Ignores length of multibyte character, instead rely on - * continuation markers to find start of next character. - * This allows for "resyncing" of when invalid characters - * are provided provided the start of the next character - * is appears within the 6 bytes examined. - */ -int krb5int_utf8_copy(char* dst, const char *src) -{ - int i; - const unsigned char *u = (const unsigned char *) src; - - dst[0] = src[0]; - - if (KRB5_UTF8_ISASCII(u)) { - return 1; - } - - for (i=1; i<6; i++) { - if ((u[i] & 0xc0) != 0x80) { - return i; - } - dst[i] = src[i]; - } - - return i; -} - -#ifndef UTF8_ALPHA_CTYPE -/* - * UTF-8 ctype routines - * Only deals with characters < 0x80 (ie: US-ASCII) - */ - -int krb5int_utf8_isascii(const char * p) -{ - unsigned c = * (const unsigned char *) p; - - return KRB5_ASCII(c); -} - -int krb5int_utf8_isdigit(const char * p) -{ - unsigned c = * (const unsigned char *) p; - - if (!KRB5_ASCII(c)) - return 0; - - return KRB5_DIGIT( c ); -} - -int krb5int_utf8_isxdigit(const char * p) -{ - unsigned c = * (const unsigned char *) p; - - if (!KRB5_ASCII(c)) - return 0; - - return KRB5_HEX(c); -} - -int krb5int_utf8_isspace(const char * p) -{ - unsigned c = * (const unsigned char *) p; - - if (!KRB5_ASCII(c)) - return 0; - - switch(c) { - case ' ': - case '\t': - case '\n': - case '\r': - case '\v': - case '\f': - return 1; - } - - return 0; -} - -/* - * These are not needed by the C SDK and are - * not "good enough" for general use. - */ -int krb5int_utf8_isalpha(const char * p) -{ - unsigned c = * (const unsigned char *) p; - - if (!KRB5_ASCII(c)) - return 0; - - return KRB5_ALPHA(c); -} - -int krb5int_utf8_isalnum(const char * p) -{ - unsigned c = * (const unsigned char *) p; - - if (!KRB5_ASCII(c)) - return 0; - - return KRB5_ALNUM(c); -} -#endif - - -/* - * UTF-8 string routines - */ - -/* like strchr() */ -char *krb5int_utf8_strchr(const char *str, const char *chr) -{ - krb5_ucs4 chs, ch; - - if (krb5int_utf8_to_ucs4(chr, &ch) == -1) - return NULL; - for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) { - if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch) - return (char *)str; - } - - return NULL; -} - -/* like strcspn() but returns number of bytes, not characters */ -size_t krb5int_utf8_strcspn(const char *str, const char *set) -{ - const char *cstr, *cset; - krb5_ucs4 chstr, chset; - - for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) { - for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) { - if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0 - && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) - return cstr - str; - } - } - - return cstr - str; -} - -/* like strspn() but returns number of bytes, not characters */ -size_t krb5int_utf8_strspn(const char *str, const char *set) -{ - const char *cstr, *cset; - krb5_ucs4 chstr, chset; - - for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) { - for (cset = set; ; KRB5_UTF8_INCR(cset)) { - if (*cset == '\0') - return cstr - str; - if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0 - && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) - break; - } - } - - return cstr - str; -} - -/* like strpbrk(), replaces strchr() as well */ -char *krb5int_utf8_strpbrk(const char *str, const char *set) -{ - const char *cset; - krb5_ucs4 chstr, chset; - - for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) { - for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) { - if (krb5int_utf8_to_ucs4(str, &chstr) == 0 - && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) - return (char *)str; - } - } - - return NULL; -} - -/* like strtok_r(), not strtok() */ -char *krb5int_utf8_strtok(char *str, const char *sep, char **last) -{ - char *begin; - char *end; - - if (last == NULL) - return NULL; - - begin = str ? str : *last; - - begin += krb5int_utf8_strspn(begin, sep); - - if (*begin == '\0') { - *last = NULL; - return NULL; - } - - end = &begin[krb5int_utf8_strcspn(begin, sep)]; - - if (*end != '\0') { - char *next = KRB5_UTF8_NEXT(end); - *end = '\0'; - end = next; - } - - *last = end; - - return begin; -} -- cgit v1.1