aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Steve Bennett <steveb@workware.net.au> 2025-08-08 19:57:57 +1000
committer Steve Bennett <steveb@workware.net.au> 2025-08-08 20:07:50 +1000
commit eeff606bb7860465b56782ba2ab2847f71963b59 (patch)
tree 168d8a4cec99629c2f81cb5e8601b6c923f7e6ba
parent cd31c05cabd79aa1430213c7b9b240f245e6b773 (diff)
download jimtcl-regsub-all-match-empty.zip
jimtcl-regsub-all-match-empty.tar.gz
jimtcl-regsub-all-match-empty.tar.bz2
regsub -all: don't loop forever when matching everywhere (branch: regsub-all-match-empty)
Although "" and "x*" both match the empty string, the former correctly exited, while the latter looped forever. Match Tcl here by advancing by one char after an empty match in both cases; the difference is that in the latter case ("x*") the end of string is also matched, while in the former ("") it is not. Also prevent both cases from slicing a UTF-8 char into bytes.

Fixes: #353
Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r-- jim-regexp.c 37
-rw-r--r-- tests/regexp.test 10
-rw-r--r-- tests/regexp2.test 6
3 files changed, 37 insertions, 16 deletions
diff --git a/jim-regexp.c b/jim-regexp.c
index d19867e..f47f235 100644
--- a/jim-regexp.c
+++ b/jim-regexp.c
@@ -485,7 +485,13 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
n = source_len - offset;
p = source_str + offset;
- do {
+ while (1) {
+ /* To match Tcl, an empty pattern does not match at the end
+ * of the string.
+ */
+ if (n == 0 && !pattern[0]) {
+ break;
+ }
int match = jim_regexec(regex, p, MAX_SUB_MATCHES, pmatch, regexec_flags);
if (match >= REG_BADPAT) {
@@ -584,23 +590,22 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
break;
}
- /* If the pattern is empty, need to step forwards */
- if (pattern[0] == '\0' && n) {
- /* Need to copy the char we are moving over */
- Jim_AppendString(interp, resultObj, p, 1);
- p++;
- n--;
- }
-
+ regexec_flags = 0;
if (pmatch[0].rm_eo == pmatch[0].rm_so) {
- /* The match did not advance the string, so set REG_NOTBOL to force the next match */
- regexec_flags = REG_NOTBOL;
- }
- else {
- regexec_flags = 0;
+ /* Matched a zero length string. Need to avoid matching the same position again */
+ if (pattern[0] == '^') {
+ /* An anchored search sets REG_BOL */
+ regexec_flags = REG_NOTBOL;
+ }
+ else {
+ /* A non-anchored search advances by one char */
+ int charlen = utf8_charlen(p[0]);
+ Jim_AppendString(interp, resultObj, p, charlen);
+ p += charlen;
+ n -= charlen;
+ }
}
-
- } while (n);
+ }
/*
* Copy the portion of the string after the last match to the
diff --git a/tests/regexp.test b/tests/regexp.test
index f7c589d..0d86d6b 100644
--- a/tests/regexp.test
+++ b/tests/regexp.test
@@ -666,6 +666,16 @@ test regexp-21.15 {Replace literal backslash} {
set value
} "\\abc\\def"
+test regexp-21.16 {Replace nothing} {
+ regsub -all {x*} anything !
+} {!a!n!y!t!h!i!n!g!}
+
+test regexp-21.17 {Replace nothing via empty pattern} {
+ # Interestingly in this case Tcl does not match
+ # at end of string while the previous case does
+ regsub -all {} anything !
+} {!a!n!y!t!h!i!n!g}
+
test regexp-22.1 {char range} {
regexp -all -inline {[a-c]+} "defaaghbcadfbaacccd"
} {aa bca baaccc}
diff --git a/tests/regexp2.test b/tests/regexp2.test
index 4d915c2..3f357b6 100644
--- a/tests/regexp2.test
+++ b/tests/regexp2.test
@@ -810,6 +810,12 @@ test regexpComp-21.10 {regexp command compiling tests} {
# list [regsub -all "" "" bar str] $str
# }
#} {0 {}}
+test regexpComp-21.12 {regexp empty pattern with utf8} utf8 {
+ # Make sure the second char isn't sliced up
+ evalInProc {
+ regsub -all "" a\u0442bc !
+ }
+} "!a!\u0442!b!c"
# We can forgive the underlying regexp engine for not supporting this.
# Why not use this instead? "((^X)*|\$)"