diff options
-rw-r--r-- | jim-regexp.c | 37 | ||||
-rw-r--r-- | tests/regexp.test | 10 | ||||
-rw-r--r-- | tests/regexp2.test | 6 |
3 files changed, 37 insertions, 16 deletions
diff --git a/jim-regexp.c b/jim-regexp.c index d19867e..f47f235 100644 --- a/jim-regexp.c +++ b/jim-regexp.c @@ -485,7 +485,13 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) n = source_len - offset; p = source_str + offset; - do { + while (1) { + /* To match Tcl, an empty pattern does not match at the end + * of the string. + */ + if (n == 0 && !pattern[0]) { + break; + } int match = jim_regexec(regex, p, MAX_SUB_MATCHES, pmatch, regexec_flags); if (match >= REG_BADPAT) { @@ -584,23 +590,22 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) break; } - /* If the pattern is empty, need to step forwards */ - if (pattern[0] == '\0' && n) { - /* Need to copy the char we are moving over */ - Jim_AppendString(interp, resultObj, p, 1); - p++; - n--; - } - + regexec_flags = 0; if (pmatch[0].rm_eo == pmatch[0].rm_so) { - /* The match did not advance the string, so set REG_NOTBOL to force the next match */ - regexec_flags = REG_NOTBOL; - } - else { - regexec_flags = 0; + /* Matched a zero length string. Need to avoid matching the same position again */ + if (pattern[0] == '^') { + /* An anchored search sets REG_BOL */ + regexec_flags = REG_NOTBOL; + } + else { + /* A non-anchored search advances by one char */ + int charlen = utf8_charlen(p[0]); + Jim_AppendString(interp, resultObj, p, charlen); + p += charlen; + n -= charlen; + } } - - } while (n); + } /* * Copy the portion of the string after the last match to the diff --git a/tests/regexp.test b/tests/regexp.test index f7c589d..0d86d6b 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -666,6 +666,16 @@ test regexp-21.15 {Replace literal backslash} { set value } "\\abc\\def" +test regexp-21.16 {Replace nothing} { + regsub -all {x*} anything ! +} {!a!n!y!t!h!i!n!g!} + +test regexp-21.17 {Replace nothing via empty pattern} { + # Interestingly in this case Tcl does not match + # at end of string while the previous case does + regsub -all {} anything ! +} {!a!n!y!t!h!i!n!g} + test regexp-22.1 {char range} { regexp -all -inline {[a-c]+} "defaaghbcadfbaacccd" } {aa bca baaccc} diff --git a/tests/regexp2.test b/tests/regexp2.test index 4d915c2..3f357b6 100644 --- a/tests/regexp2.test +++ b/tests/regexp2.test @@ -810,6 +810,12 @@ test regexpComp-21.10 {regexp command compiling tests} { # list [regsub -all "" "" bar str] $str # } #} {0 {}} +test regexpComp-21.12 {regexp empty pattern with utf8} utf8 { + # Make sure the second char isn't sliced up + evalInProc { + regsub -all "" a\u0442bc ! + } +} "!a!\u0442!b!c" # We can forgive the underlying regexp engine for not supporting this. # Why not use this instead? "((^X)*|\$)" |