From adb3252e5b213579ad26f101c25461c78829569c Mon Sep 17 00:00:00 2001 From: Steve Bennett Date: Sat, 27 Jan 2024 10:39:28 +1000 Subject: regexp: implement regsub -command Per Tcl 8.7 Signed-off-by: Steve Bennett --- jim-regexp.c | 147 +++++++++++++++++++++++++++++++++++++----------------- jim_tcl.txt | 23 ++++++++- tests/regexp.test | 43 +++++++++++++++- 3 files changed, 165 insertions(+), 48 deletions(-) diff --git a/jim-regexp.c b/jim-regexp.c index 1486c3a..f370e5e 100644 --- a/jim-regexp.c +++ b/jim-regexp.c @@ -348,27 +348,30 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) int regcomp_flags = 0; int regexec_flags = 0; int opt_all = 0; + int opt_command = 0; int offset = 0; regex_t *regex; const char *p; - int result; + int result = JIM_OK; regmatch_t pmatch[MAX_SUB_MATCHES + 1]; int num_matches = 0; int i, j, n; Jim_Obj *varname; Jim_Obj *resultObj; + Jim_Obj *cmd_prefix = NULL; + Jim_Obj *regcomp_obj = NULL; const char *source_str; int source_len; - const char *replace_str; + const char *replace_str = NULL; int replace_len; const char *pattern; int option; enum { - OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_START, OPT_END + OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_START, OPT_COMMAND, OPT_END }; static const char * const options[] = { - "-nocase", "-line", "-all", "-start", "--", NULL + "-nocase", "-line", "-all", "-start", "-command", "--", NULL }; if (argc < 4) { @@ -412,20 +415,39 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) return JIM_ERR; } break; + + case OPT_COMMAND: + opt_command = 1; + break; } } if (argc - i != 3 && argc - i != 4) { goto wrongNumArgs; } - regex = SetRegexpFromAny(interp, argv[i], regcomp_flags); + /* Need to ensure that this is unshared, so just duplicate it always */ + regcomp_obj = Jim_DuplicateObj(interp, argv[i]); + Jim_IncrRefCount(regcomp_obj); + regex = SetRegexpFromAny(interp, regcomp_obj, regcomp_flags); if (!regex) { + Jim_DecrRefCount(interp, regcomp_obj); return JIM_ERR; } pattern = Jim_String(argv[i]); source_str = Jim_GetString(argv[i + 1], &source_len); - replace_str = Jim_GetString(argv[i + 2], &replace_len); + if (opt_command) { + cmd_prefix = argv[i + 2]; + if (Jim_ListLength(interp, cmd_prefix) == 0) { + Jim_SetResultString(interp, "command prefix must be a list of at least one element", -1); + Jim_DecrRefCount(interp, regcomp_obj); + return JIM_ERR; + } + Jim_IncrRefCount(cmd_prefix); + } + else { + replace_str = Jim_GetString(argv[i + 2], &replace_len); + } varname = argv[i + 3]; /* Create the result string */ @@ -482,44 +504,67 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) */ Jim_AppendString(interp, resultObj, p, pmatch[0].rm_so); - /* - * Append the subSpec (replace_str) argument to the variable, making appropriate - * substitutions. This code is a bit hairy because of the backslash - * conventions and because the code saves up ranges of characters in - * subSpec to reduce the number of calls to Jim_SetVar. - */ - - for (j = 0; j < replace_len; j++) { - int idx; - int c = replace_str[j]; + if (opt_command) { + /* construct the command as a list */ + Jim_Obj *cmdListObj = Jim_DuplicateObj(interp, cmd_prefix); + for (j = 0; j < MAX_SUB_MATCHES; j++) { + if (pmatch[j].rm_so == -1) { + break; + } + else { + Jim_Obj *srcObj = Jim_NewStringObj(interp, p + pmatch[j].rm_so, pmatch[j].rm_eo - pmatch[j].rm_so); + Jim_ListAppendElement(interp, cmdListObj, srcObj); + } + } + Jim_IncrRefCount(cmdListObj); - if (c == '&') { - idx = 0; + result = Jim_EvalObj(interp, cmdListObj); + Jim_DecrRefCount(interp, cmdListObj); + if (result != JIM_OK) { + goto cmd_error; } - else if (c == '\\' && j < replace_len) { - c = replace_str[++j]; - if ((c >= '0') && (c <= '9')) { - idx = c - '0'; + Jim_AppendString(interp, resultObj, Jim_String(Jim_GetResult(interp)), -1); + } + else { + /* + * Append the subSpec (replace_str) argument to the variable, making appropriate + * substitutions. This code is a bit hairy because of the backslash + * conventions and because the code saves up ranges of characters in + * subSpec to reduce the number of calls to Jim_SetVar. + */ + + for (j = 0; j < replace_len; j++) { + int idx; + int c = replace_str[j]; + + if (c == '&') { + idx = 0; } - else if ((c == '\\') || (c == '&')) { - Jim_AppendString(interp, resultObj, replace_str + j, 1); - continue; + else if (c == '\\' && j < replace_len) { + c = replace_str[++j]; + if ((c >= '0') && (c <= '9')) { + idx = c - '0'; + } + else if ((c == '\\') || (c == '&')) { + Jim_AppendString(interp, resultObj, replace_str + j, 1); + continue; + } + else { + /* If the replacement is a trailing backslash, just replace with a backslash, otherwise + * with the literal backslash and the following character + */ + Jim_AppendString(interp, resultObj, replace_str + j - 1, (j == replace_len) ? 1 : 2); + continue; + } } else { - /* If the replacement is a trailing backslash, just replace with a backslash, otherwise - * with the literal backslash and the following character - */ - Jim_AppendString(interp, resultObj, replace_str + j - 1, (j == replace_len) ? 1 : 2); + Jim_AppendString(interp, resultObj, replace_str + j, 1); continue; } - } - else { - Jim_AppendString(interp, resultObj, replace_str + j, 1); - continue; - } - if ((idx < MAX_SUB_MATCHES) && pmatch[idx].rm_so != -1 && pmatch[idx].rm_eo != -1) { - Jim_AppendString(interp, resultObj, p + pmatch[idx].rm_so, - pmatch[idx].rm_eo - pmatch[idx].rm_so); + if ((idx < MAX_SUB_MATCHES) && pmatch[idx].rm_so != -1 && pmatch[idx].rm_eo != -1) { + Jim_AppendString(interp, resultObj, p + pmatch[idx].rm_so, + pmatch[idx].rm_eo - pmatch[idx].rm_so); + } } } @@ -560,22 +605,34 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) */ Jim_AppendString(interp, resultObj, p, -1); - /* And now set or return the result variable */ - if (argc - i == 4) { - result = Jim_SetVariable(interp, varname, resultObj); +cmd_error: + if (result == JIM_OK) { + /* And now set or return the result variable */ + if (argc - i == 4) { + result = Jim_SetVariable(interp, varname, resultObj); - if (result == JIM_OK) { - Jim_SetResultInt(interp, num_matches); + if (result == JIM_OK) { + Jim_SetResultInt(interp, num_matches); + } + else { + Jim_FreeObj(interp, resultObj); + } } else { - Jim_FreeObj(interp, resultObj); + Jim_SetResult(interp, resultObj); + result = JIM_OK; } } else { - Jim_SetResult(interp, resultObj); - result = JIM_OK; + Jim_FreeObj(interp, resultObj); + } + + if (opt_command) { + Jim_DecrRefCount(interp, cmd_prefix); } + Jim_DecrRefCount(interp, regcomp_obj); + return result; } diff --git a/jim_tcl.txt b/jim_tcl.txt index df94e9a..8684ed9 100644 --- a/jim_tcl.txt +++ b/jim_tcl.txt @@ -62,6 +62,7 @@ Changes since 0.82 6. `socket` , `open` and `aio accept` now support '-noclose' 7. Add support for hinting with `history hints` 8. Support for `proc` statics by reference (lexical closure) rather than by value +9. `regsub` now supports '-command' (per Tcl 8.7) Changes between 0.81 and 0.82 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -3840,7 +3841,7 @@ The following switches modify the behaviour of +'regexp'+ regsub ~~~~~~ -+*regsub ?-nocase? ?-all? ?-line? ?-start* 'offset'? ?*--*? 'exp string subSpec ?varName?'+ ++*regsub ?-nocase? ?-all? ?-line? ?-command? ?-start* 'offset'? ?*--*? 'exp string subSpec ?varName?'+ This command matches the regular expression +'exp'+ against +'string'+ using the rules described in REGULAR EXPRESSIONS @@ -3897,6 +3898,26 @@ The following switches modify the behaviour of +'regsub'+ function, and the +$+ anchor matches the empty string before any newline in the string in addition to its normal function. ++*-command*+:: + + Changes the handling of +'subSpec'+ so that it is not treated + as a template for a substitution string and the substrings +*&*+ + and +*\n*+ no longer have special meaning. Instead +'subSpec'+ must + be a command prefix, that is, a non-empty list. The substring + of string that matches +'exp'+, and then each substring that matches + each capturing sub-RE within +'exp'+, are appended as additional + elements to that list. (The items appended to the list are much + like what `regexp -inline` would return). The completed list is + then evaluated as a Tcl command, and the result of that command + is the substitution string. Any error or exception from command + evaluation becomes an error or exception from the regsub command. + :: + If +*-all*+ is not also given, the command callback will be invoked + at most once (exactly when the regular expression matches). If + +*-all*+ is given, the command callback will be invoked for each + matched location, in sequence. The exact location indices that + matched are not made available to the script. + +*-start* 'offset'+:: Specifies a character index offset into the string at which to start matching the regular expression. +'offset'+ will be diff --git a/tests/regexp.test b/tests/regexp.test index a7c8a4a..7aeb72e 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -365,9 +365,9 @@ test regexp-11.3 {regsub errors} { test regexp-11.4 {regsub errors} { list [catch {regsub a b c d e f} msg] $msg } {1 {wrong # args: should be "regsub ?-switch ...? exp string subSpec ?varName?"}} -test regexp-11.5 {regsub errors} jim { +test regexp-11.5 {regsub errors} -constraints jim -body { list [catch {regsub -gorp a b c} msg] $msg -} {1 {bad switch "-gorp": must be --, -all, -line, -nocase, or -start}} +} -result {1 {bad switch "-gorp": must be --, -all, -command, -line, -nocase, or -start}} test regexp-11.6 {regsub errors} { catch {regsub -nocase a( b c d} msg } 1 @@ -742,6 +742,45 @@ test regexp-22.17 {\d in set} { regexp -all -inline {[a\d]+} "a0ac\[b a\]44c\tb-1aa7" } {a0a a 44 1aa7} +test regexp-27.1 {regsub -command} { + regsub -command {.x.} {abcxdef} {string length} +} ab3ef +test regexp-27.2 {regsub -command} { + regsub -command {.x.} {abcxdefxghi} {string length} +} ab3efxghi +test regexp-27.3 {regsub -command} { + set x 0 + regsub -all -command {(.)} abcde {apply {args {string cat [incr ::x] [lindex $args 1]}}} +} 1a2b3c4d5e +test regexp-27.4 {regsub -command} -body { + regsub -command {.x.} {abcxdef} error +} -returnCodes error -result cxd +test regexp-27.5 {regsub -command} { + regsub -command {(.)(.)} {abcdef} {list ,} +} {, ab a bcdef} +test regexp-27.6 {regsub -command} { + regsub -command -all {(.)(.)} {abcdef} {list ,} +} {, ab a b, cd c d, ef e f} +test regexp-27.7 {regsub -command representation smash} { + set ::s {123=456 789} + regsub -command -all {\d+} $::s {apply {n { + expr {[llength $::s] + $n} + }}} +} {125=458 791} +test regexp-27.8 {regsub -command representation smash} { + set ::t {apply {n { + expr {[llength [lindex $::t 1 1 1]] + $n} + }}} + regsub -command -all {\d+} "123=456 789" $::t +} {131=464 797} +test regexp-27.11 {regsub -command error cases} -returnCodes error -body { + regsub -command . abc {} +} -result {command prefix must be a list of at least one element} +test regexp-27.12 {regsub -command representation smash} { + set s {list (.+)} + regsub -command $s {list list} $s +} {(.+) {list list} list} + # Tests resulting from bugs reported by users test reg-31.1 {[[:xdigit:]] behaves correctly when followed by [[:space:]]} { set str {2:::DebugWin32} -- cgit v1.1