Cygwin: glob: implement collating symbol support

Allow the [.<sym>.] expression. This requires a string comparison rather than a character comparison. Introduce and use __wscollate_range_cmp. Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
author: Corinna Vinschen <corinna@vinschen.de> 2023-02-20 22:50:17 +0100
committer: Corinna Vinschen <corinna@vinschen.de> 2023-02-20 22:50:17 +0100
commit: ce5aa098071304cfd3bd1bd535a7571089344b1a (patch)
tree: a796ecf228e6c325ea54fcdbcbe455c30137fbf6 /winsup
parent: 244faaea8e24c70a1d61b939623364e3bdfaa28c (diff)
download: newlib-ce5aa098071304cfd3bd1bd535a7571089344b1a.zip
newlib-ce5aa098071304cfd3bd1bd535a7571089344b1a.tar.gz
newlib-ce5aa098071304cfd3bd1bd535a7571089344b1a.tar.bz2
3 files changed, 98 insertions, 37 deletions
diff --git a/winsup/cygwin/glob.cc b/winsup/cygwin/glob.cc
index 4ef9479..6668178 100644
--- a/winsup/cygwin/glob.cc
+++ b/winsup/cygwin/glob.cc
@@ -160,6 +160,9 @@ typedef char Char;
 #define	M_SET		META('[')
 #define	M_NAMED		META(':')
 #define	M_EQUIV		META('=')
+#define	M_COLL(_ccnt)	META('.' | ((_ccnt) << 8))
+#define M_COLL_P(_c)	(((_c) & M_COLL_MASK) == META('.'))
+#define M_COLL_CNT(_c)	(((_c) & ~M_COLL_MASK) >> 8)
 #define	ismeta(c)	(((c)&M_QUOTE) != 0)
 
 static int	 compare(const void *, const void *);
@@ -528,41 +531,61 @@ glob0(const Char *pattern, glob_t *pglob, size_t *limit)
 			*bufnext++ = M_SET;
 			if (c == NOT)
 				*bufnext++ = M_NOT;
-			c = *qpatnext;
+			c = *qpatnext++;
 			do {
 				wint_t wclass[64];
 				Char ctype;
 
-				ctype = check_classes_expr(qpatnext, wclass,
+				ctype = check_classes_expr(--qpatnext, wclass,
 							   64);
-				if (ctype) {
+				++qpatnext;
+				if (ctype == COLON) {
 					wctype_t type;
-
-					if (ctype == COLON) {
-					    char cclass[64];
-
-					    /* No worries, char classes are
-					       ASCII-only anyway */
-					    wcitoascii (cclass, wclass);
-					    if ((type = wctype (cclass))) {
-						*bufnext++ = M_NAMED;
-						*bufnext++ = CHAR (type);
-					    }
-					} else if (ctype == EQUALS &&
-						   wclass[0] && !wclass[1]) {
+					char cclass[64];
+
+					/* No worries, char classes are
+					   ASCII-only anyway */
+					wcitoascii (cclass, wclass);
+					if ((type = wctype (cclass))) {
+					    *bufnext++ = M_NAMED;
+					    *bufnext++ = CHAR (type);
+					}
+					continue;
+				}
+				if (ctype == EQUALS) {
+					if (wclass[0] && !wclass[1]) {
 					    *bufnext++ = M_EQUIV;
 					    *bufnext++ = CHAR (wclass[0]);
 					}
-					/* TODO: [. is ignored yet */
-					qpatnext++;
 					continue;
 				}
-				*bufnext++ = CHAR(c);
+				if (ctype == DOT &&
+				    is_unicode_coll_elem (wclass)) {
+					*bufnext++ =
+					    M_COLL (wcilen (wclass));
+					wint_t *wcp = wclass;
+					while ((*bufnext++ = *wcp++))
+					    ;
+					--bufnext; /* drop NUL */
+				} else
+					*bufnext++ = CHAR(c);
 				if (*qpatnext == RANGE &&
 				    (c = qpatnext[1]) != RBRACKET) {
 					*bufnext++ = M_RNG;
-					*bufnext++ = CHAR(c);
-					qpatnext += 2;
+
+					ctype = check_classes_expr(++qpatnext,
+								   wclass, 64);
+					if (ctype == DOT &&
+					    is_unicode_coll_elem (wclass)) {
+						*bufnext++ =
+						    M_COLL (wcilen (wclass));
+						wint_t *wcp = wclass;
+						while ((*bufnext++ = *wcp++))
+						    ;
+						--bufnext; /* drop NUL */
+					} else
+						*bufnext++ = CHAR(c);
+					++qpatnext;
 				}
 			} while ((c = *qpatnext++) != RBRACKET);
 			pglob->gl_flags |= GLOB_MAGCHAR;
@@ -849,11 +872,12 @@ static int
 match(Char *name, Char *pat, Char *patend)
 {
 	int ok, negate_range;
-	Char c, k;
+	Char *c, *k;
+	size_t k_len;
 
 	while (pat < patend) {
-		c = *pat++;
-		switch (c & M_MASK) {
+		c = pat++;
+		switch (*c & M_MASK) {
 		case M_ALL:
 			if (pat == patend)
 				return(1);
@@ -868,36 +892,53 @@ match(Char *name, Char *pat, Char *patend)
 			break;
 		case M_SET:
 			ok = 0;
-			if ((k = *name++) == EOS)
+			if (*(k = name) == EOS)
 				return(0);
+			k_len = next_unicode_char (k);
+			name += k_len;
 			if ((negate_range = ((*pat & M_MASK) == M_NOT)) != EOS)
 				++pat;
-			while (((c = *pat++) & M_MASK) != M_END)
-				if ((c & M_MASK) == M_NAMED) {
-					if (iswctype (k, *pat++))
+			while ((*(c = pat++) & M_MASK) != M_END) {
+				size_t len1 = 1, len2 = 1;
+
+				if ((*c & M_MASK) == M_NAMED) {
+					if (iswctype (*k, *pat++))
 						ok = 1;
-				} else if ((c & M_MASK) == M_EQUIV) {
-					if (is_unicode_equiv (k, *pat++))
+					continue;
+				}
+				if ((*c & M_MASK) == M_EQUIV) {
+					if (is_unicode_equiv (*k, *pat++))
 						ok = 1;
-				} else if ((*pat & M_MASK) == M_RNG) {
+					continue;
+				}
+				if (M_COLL_P(*c)) {
+					len1 = M_COLL_CNT(*c);
+					++c;
+					pat += len1;
+				}
+				if ((*pat & M_MASK) == M_RNG) {
+					if (M_COLL_P(pat[1]))
+						len2 = M_COLL_CNT(*++pat);
 #ifdef __CYGWIN__
 					if ((!__get_current_collate_locale ()->lcid) ?
 #else
 					if (__collate_load_error ?
 #endif
-					    CCHAR(c) <= CCHAR(k) && CCHAR(k) <= CCHAR(pat[1]) :
-					       __wcollate_range_cmp(CCHAR(c), CCHAR(k)) <= 0
-					    && __wcollate_range_cmp(CCHAR(k), CCHAR(pat[1])) <= 0
+					    *c <= *k && *k <= pat[1] :
+					       __wscollate_range_cmp(c, k, len1, k_len) <= 0
+					    && __wscollate_range_cmp(k, pat + 1, k_len, len2) <= 0
 					   )
 						ok = 1;
-					pat += 2;
-				} else if (c == k)
+					pat += len2 + 1;
+				} else if (len1 == k_len &&
+					   wcincmp (c, k, len1) == 0)
 					ok = 1;
+			}
 			if (ok == negate_range)
 				return(0);
 			break;
 		default:
-			if (Cchar(*name++) != Cchar(c))
+			if (Cchar(*name++) != Cchar(*c))
 				return(0);
 			break;
 		}
diff --git a/winsup/cygwin/local_includes/collate.h b/winsup/cygwin/local_includes/collate.h
index 7b4c72d..498d5e1 100644
--- a/winsup/cygwin/local_includes/collate.h
+++ b/winsup/cygwin/local_includes/collate.h
@@ -14,6 +14,7 @@ extern "C" {
 extern const int __collate_load_error;
 
 extern int __wcollate_range_cmp (wint_t, wint_t);
+extern int __wscollate_range_cmp (wint_t *, wint_t *, size_t, size_t);
 
 int is_unicode_equiv (wint_t, wint_t);
 
diff --git a/winsup/cygwin/nlsfuncs.cc b/winsup/cygwin/nlsfuncs.cc
index 20143f1..eb9948d 100644
--- a/winsup/cygwin/nlsfuncs.cc
+++ b/winsup/cygwin/nlsfuncs.cc
@@ -1195,6 +1195,25 @@ __wcollate_range_cmp (wint_t c1, wint_t c2)
   return wcscoll (s1, s2);
 }
 
+/* Not so much BSD.  Used from glob.cc, fnmatch.c and regcomp.c.
+
+   First arg is always from pattern space, second arg is the tested string.
+   c1len and c2len are the lengths of the first and second arg. */
+extern "C" int
+__wscollate_range_cmp (wint_t *c1, wint_t *c2,
+		       size_t c1len, size_t c2len)
+{
+  wchar_t s1[c1len * 2 + 1] = { 0 };	/* # of chars if all are surrogates */
+  wchar_t s2[c2len * 2 + 1] = { 0 };
+
+  wcintowcs (s1, c1, c1len);
+  wcintowcs (s2, c2, c2len);
+  return wcscoll_l (s1, s2, __get_current_locale ());
+}
+
+const size_t ce_size = sizeof collating_element / sizeof *collating_element;
+const size_t ce_e_size = sizeof *collating_element;
+
 /* Check if UTF-32 input character `test' is in the same equivalence class
    as UTF-32 character 'eqv'.
    Note that we only recognize input in Unicode normalization form C, that
author	Corinna Vinschen <corinna@vinschen.de>	2023-02-20 22:50:17 +0100
committer	Corinna Vinschen <corinna@vinschen.de>	2023-02-20 22:50:17 +0100
commit	ce5aa098071304cfd3bd1bd535a7571089344b1a (patch)
tree	a796ecf228e6c325ea54fcdbcbe455c30137fbf6 /winsup
parent	244faaea8e24c70a1d61b939623364e3bdfaa28c (diff)
download	newlib-ce5aa098071304cfd3bd1bd535a7571089344b1a.zip newlib-ce5aa098071304cfd3bd1bd535a7571089344b1a.tar.gz newlib-ce5aa098071304cfd3bd1bd535a7571089344b1a.tar.bz2