diff options
author | Steve Bennett <steveb@workware.net.au> | 2025-08-08 19:57:57 +1000 |
---|---|---|
committer | Steve Bennett <steveb@workware.net.au> | 2025-08-08 20:07:50 +1000 |
commit | eeff606bb7860465b56782ba2ab2847f71963b59 (patch) | |
tree | 168d8a4cec99629c2f81cb5e8601b6c923f7e6ba | |
parent | cd31c05cabd79aa1430213c7b9b240f245e6b773 (diff) | |
download | jimtcl-regsub-all-match-empty.zip jimtcl-regsub-all-match-empty.tar.gz jimtcl-regsub-all-match-empty.tar.bz2 |
regsub -all: don't loop forever when matching everywhere [regsub-all-match-empty]
Although "" and "x*" both match the empty string, the former
correctly exited, while the latter looped forever.
Match Tcl here by advancing by one char in both cases; however,
with "x*" the end of the string is also matched, while with ""
it is not.
Also prevent both cases from slicing a utf-8 char into bytes.
Fixes: #353
Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r-- | jim-regexp.c | 37 | ||||
-rw-r--r-- | tests/regexp.test | 10 | ||||
-rw-r--r-- | tests/regexp2.test | 6 |
3 files changed, 37 insertions, 16 deletions
diff --git a/jim-regexp.c b/jim-regexp.c index d19867e..f47f235 100644 --- a/jim-regexp.c +++ b/jim-regexp.c @@ -485,7 +485,13 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) n = source_len - offset; p = source_str + offset; - do { + while (1) { + /* To match Tcl, an empty pattern does not match at the end + * of the string. + */ + if (n == 0 && !pattern[0]) { + break; + } int match = jim_regexec(regex, p, MAX_SUB_MATCHES, pmatch, regexec_flags); if (match >= REG_BADPAT) { @@ -584,23 +590,22 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) break; } - /* If the pattern is empty, need to step forwards */ - if (pattern[0] == '\0' && n) { - /* Need to copy the char we are moving over */ - Jim_AppendString(interp, resultObj, p, 1); - p++; - n--; - } - + regexec_flags = 0; if (pmatch[0].rm_eo == pmatch[0].rm_so) { - /* The match did not advance the string, so set REG_NOTBOL to force the next match */ - regexec_flags = REG_NOTBOL; - } - else { - regexec_flags = 0; + /* Matched a zero length string. Need to avoid matching the same position again */ + if (pattern[0] == '^') { + /* An anchored search sets REG_BOL */ + regexec_flags = REG_NOTBOL; + } + else { + /* A non-anchored search advances by one char */ + int charlen = utf8_charlen(p[0]); + Jim_AppendString(interp, resultObj, p, charlen); + p += charlen; + n -= charlen; + } } - - } while (n); + } /* * Copy the portion of the string after the last match to the diff --git a/tests/regexp.test b/tests/regexp.test index f7c589d..0d86d6b 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -666,6 +666,16 @@ test regexp-21.15 {Replace literal backslash} { set value } "\\abc\\def" +test regexp-21.16 {Replace nothing} { + regsub -all {x*} anything ! +} {!a!n!y!t!h!i!n!g!} + +test regexp-21.17 {Replace nothing via empty pattern} { + # Interestingly in this case Tcl does not match + # at end of string while the previous case does + regsub -all {} anything ! 
+} {!a!n!y!t!h!i!n!g} + test regexp-22.1 {char range} { regexp -all -inline {[a-c]+} "defaaghbcadfbaacccd" } {aa bca baaccc} diff --git a/tests/regexp2.test b/tests/regexp2.test index 4d915c2..3f357b6 100644 --- a/tests/regexp2.test +++ b/tests/regexp2.test @@ -810,6 +810,12 @@ test regexpComp-21.10 {regexp command compiling tests} { # list [regsub -all "" "" bar str] $str # } #} {0 {}} +test regexpComp-21.12 {regexp empty pattern with utf8} utf8 { + # Make sure the second char isn't sliced up + evalInProc { + regsub -all "" a\u0442bc ! + } +} "!a!\u0442!b!c" # We can forgive the underlying regexp engine for not supporting this. # Why not use this instead? "((^X)*|\$)" |