aboutsummaryrefslogtreecommitdiff
path: root/winsup
diff options
context:
space:
mode:
authorCorinna Vinschen <corinna@vinschen.de>2023-02-20 22:50:17 +0100
committerCorinna Vinschen <corinna@vinschen.de>2023-02-20 22:50:17 +0100
commitce5aa098071304cfd3bd1bd535a7571089344b1a (patch)
treea796ecf228e6c325ea54fcdbcbe455c30137fbf6 /winsup
parent244faaea8e24c70a1d61b939623364e3bdfaa28c (diff)
downloadnewlib-ce5aa098071304cfd3bd1bd535a7571089344b1a.zip
newlib-ce5aa098071304cfd3bd1bd535a7571089344b1a.tar.gz
newlib-ce5aa098071304cfd3bd1bd535a7571089344b1a.tar.bz2
Cygwin: glob: implement collating symbol support
Allow the [.<sym>.] expression. This requires a string comparison rather than a character comparison. Introduce and use __wscollate_range_cmp. Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
Diffstat (limited to 'winsup')
-rw-r--r--winsup/cygwin/glob.cc115
-rw-r--r--winsup/cygwin/local_includes/collate.h1
-rw-r--r--winsup/cygwin/nlsfuncs.cc19
3 files changed, 98 insertions, 37 deletions
diff --git a/winsup/cygwin/glob.cc b/winsup/cygwin/glob.cc
index 4ef9479..6668178 100644
--- a/winsup/cygwin/glob.cc
+++ b/winsup/cygwin/glob.cc
@@ -160,6 +160,9 @@ typedef char Char;
#define M_SET META('[')
#define M_NAMED META(':')
#define M_EQUIV META('=')
+#define M_COLL(_ccnt) META('.' | ((_ccnt) << 8))
+#define M_COLL_P(_c) (((_c) & M_COLL_MASK) == META('.'))
+#define M_COLL_CNT(_c) (((_c) & ~M_COLL_MASK) >> 8)
#define ismeta(c) (((c)&M_QUOTE) != 0)
static int compare(const void *, const void *);
@@ -528,41 +531,61 @@ glob0(const Char *pattern, glob_t *pglob, size_t *limit)
*bufnext++ = M_SET;
if (c == NOT)
*bufnext++ = M_NOT;
- c = *qpatnext;
+ c = *qpatnext++;
do {
wint_t wclass[64];
Char ctype;
- ctype = check_classes_expr(qpatnext, wclass,
+ ctype = check_classes_expr(--qpatnext, wclass,
64);
- if (ctype) {
+ ++qpatnext;
+ if (ctype == COLON) {
wctype_t type;
-
- if (ctype == COLON) {
- char cclass[64];
-
- /* No worries, char classes are
- ASCII-only anyway */
- wcitoascii (cclass, wclass);
- if ((type = wctype (cclass))) {
- *bufnext++ = M_NAMED;
- *bufnext++ = CHAR (type);
- }
- } else if (ctype == EQUALS &&
- wclass[0] && !wclass[1]) {
+ char cclass[64];
+
+ /* No worries, char classes are
+ ASCII-only anyway */
+ wcitoascii (cclass, wclass);
+ if ((type = wctype (cclass))) {
+ *bufnext++ = M_NAMED;
+ *bufnext++ = CHAR (type);
+ }
+ continue;
+ }
+ if (ctype == EQUALS) {
+ if (wclass[0] && !wclass[1]) {
*bufnext++ = M_EQUIV;
*bufnext++ = CHAR (wclass[0]);
}
- /* TODO: [. is ignored yet */
- qpatnext++;
continue;
}
- *bufnext++ = CHAR(c);
+ if (ctype == DOT &&
+ is_unicode_coll_elem (wclass)) {
+ *bufnext++ =
+ M_COLL (wcilen (wclass));
+ wint_t *wcp = wclass;
+ while ((*bufnext++ = *wcp++))
+ ;
+ --bufnext; /* drop NUL */
+ } else
+ *bufnext++ = CHAR(c);
if (*qpatnext == RANGE &&
(c = qpatnext[1]) != RBRACKET) {
*bufnext++ = M_RNG;
- *bufnext++ = CHAR(c);
- qpatnext += 2;
+
+ ctype = check_classes_expr(++qpatnext,
+ wclass, 64);
+ if (ctype == DOT &&
+ is_unicode_coll_elem (wclass)) {
+ *bufnext++ =
+ M_COLL (wcilen (wclass));
+ wint_t *wcp = wclass;
+ while ((*bufnext++ = *wcp++))
+ ;
+ --bufnext; /* drop NUL */
+ } else
+ *bufnext++ = CHAR(c);
+ ++qpatnext;
}
} while ((c = *qpatnext++) != RBRACKET);
pglob->gl_flags |= GLOB_MAGCHAR;
@@ -849,11 +872,12 @@ static int
match(Char *name, Char *pat, Char *patend)
{
int ok, negate_range;
- Char c, k;
+ Char *c, *k;
+ size_t k_len;
while (pat < patend) {
- c = *pat++;
- switch (c & M_MASK) {
+ c = pat++;
+ switch (*c & M_MASK) {
case M_ALL:
if (pat == patend)
return(1);
@@ -868,36 +892,53 @@ match(Char *name, Char *pat, Char *patend)
break;
case M_SET:
ok = 0;
- if ((k = *name++) == EOS)
+ if (*(k = name) == EOS)
return(0);
+ k_len = next_unicode_char (k);
+ name += k_len;
if ((negate_range = ((*pat & M_MASK) == M_NOT)) != EOS)
++pat;
- while (((c = *pat++) & M_MASK) != M_END)
- if ((c & M_MASK) == M_NAMED) {
- if (iswctype (k, *pat++))
+ while ((*(c = pat++) & M_MASK) != M_END) {
+ size_t len1 = 1, len2 = 1;
+
+ if ((*c & M_MASK) == M_NAMED) {
+ if (iswctype (*k, *pat++))
ok = 1;
- } else if ((c & M_MASK) == M_EQUIV) {
- if (is_unicode_equiv (k, *pat++))
+ continue;
+ }
+ if ((*c & M_MASK) == M_EQUIV) {
+ if (is_unicode_equiv (*k, *pat++))
ok = 1;
- } else if ((*pat & M_MASK) == M_RNG) {
+ continue;
+ }
+ if (M_COLL_P(*c)) {
+ len1 = M_COLL_CNT(*c);
+ ++c;
+ pat += len1;
+ }
+ if ((*pat & M_MASK) == M_RNG) {
+ if (M_COLL_P(pat[1]))
+ len2 = M_COLL_CNT(*++pat);
#ifdef __CYGWIN__
if ((!__get_current_collate_locale ()->lcid) ?
#else
if (__collate_load_error ?
#endif
- CCHAR(c) <= CCHAR(k) && CCHAR(k) <= CCHAR(pat[1]) :
- __wcollate_range_cmp(CCHAR(c), CCHAR(k)) <= 0
- && __wcollate_range_cmp(CCHAR(k), CCHAR(pat[1])) <= 0
+ *c <= *k && *k <= pat[1] :
+ __wscollate_range_cmp(c, k, len1, k_len) <= 0
+ && __wscollate_range_cmp(k, pat + 1, k_len, len2) <= 0
)
ok = 1;
- pat += 2;
- } else if (c == k)
+ pat += len2 + 1;
+ } else if (len1 == k_len &&
+ wcincmp (c, k, len1) == 0)
ok = 1;
+ }
if (ok == negate_range)
return(0);
break;
default:
- if (Cchar(*name++) != Cchar(c))
+ if (Cchar(*name++) != Cchar(*c))
return(0);
break;
}
diff --git a/winsup/cygwin/local_includes/collate.h b/winsup/cygwin/local_includes/collate.h
index 7b4c72d..498d5e1 100644
--- a/winsup/cygwin/local_includes/collate.h
+++ b/winsup/cygwin/local_includes/collate.h
@@ -14,6 +14,7 @@ extern "C" {
extern const int __collate_load_error;
extern int __wcollate_range_cmp (wint_t, wint_t);
+extern int __wscollate_range_cmp (wint_t *, wint_t *, size_t, size_t);
int is_unicode_equiv (wint_t, wint_t);
diff --git a/winsup/cygwin/nlsfuncs.cc b/winsup/cygwin/nlsfuncs.cc
index 20143f1..eb9948d 100644
--- a/winsup/cygwin/nlsfuncs.cc
+++ b/winsup/cygwin/nlsfuncs.cc
@@ -1195,6 +1195,25 @@ __wcollate_range_cmp (wint_t c1, wint_t c2)
return wcscoll (s1, s2);
}
+/* Not so much BSD. Used from glob.cc, fnmatch.c and regcomp.c.
+
+ Either arg may be from pattern space or the tested string (see glob.cc match()).
+ c1len and c2len are the lengths of c1 and c2, counted in wint_t units. */
+extern "C" int
+__wscollate_range_cmp (wint_t *c1, wint_t *c2,
+ size_t c1len, size_t c2len)
+{
+ wchar_t s1[c1len * 2 + 1] = { 0 }; /* # of chars if all are surrogates */
+ wchar_t s2[c2len * 2 + 1] = { 0 };
+
+ wcintowcs (s1, c1, c1len);
+ wcintowcs (s2, c2, c2len);
+ return wcscoll_l (s1, s2, __get_current_locale ());
+}
+
+const size_t ce_size = sizeof collating_element / sizeof *collating_element;
+const size_t ce_e_size = sizeof *collating_element;
+
/* Check if UTF-32 input character `test' is in the same equivalence class
as UTF-32 character 'eqv'.
Note that we only recognize input in Unicode normalization form C, that