diff options
author | Steve Bennett <steveb@workware.net.au> | 2025-08-09 20:41:45 +1000 |
---|---|---|
committer | Steve Bennett <steveb@workware.net.au> | 2025-08-13 08:05:25 +1000 |
commit | 00b27e1bcf275bfebe62ff882ef17f3401bdbea0 (patch) | |
tree | 1d9d1041c5fd38713d7cb8b116b0f3858a23563e | |
parent | 3c89e1bffa6fa0d7e66657a0ab7a29a1066dc536 (diff) | |
download | jimtcl-00b27e1bcf275bfebe62ff882ef17f3401bdbea0.zip jimtcl-00b27e1bcf275bfebe62ff882ef17f3401bdbea0.tar.gz jimtcl-00b27e1bcf275bfebe62ff882ef17f3401bdbea0.tar.bz2 |
regexp, regsub: add support for -lineanchor and -linestop
Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r-- | jim-regexp.c | 53 | ||||
-rw-r--r-- | jimregexp.c | 6 | ||||
-rw-r--r-- | jimregexp.h | 5 | ||||
-rw-r--r-- | tests/regexp.test | 74 | ||||
-rw-r--r-- | tests/regexp2.test | 36 |
5 files changed, 97 insertions, 77 deletions
diff --git a/jim-regexp.c b/jim-regexp.c index cf06e68..c1909c0 100644 --- a/jim-regexp.c +++ b/jim-regexp.c @@ -122,6 +122,7 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) int opt_indices = 0; int opt_all = 0; int opt_inline = 0; + int opt_lineanchor = 0; regex_t *regex; int match, i, j; int offset = 0; @@ -137,10 +138,10 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) int eflags = 0; int option; enum { - OPT_INDICES, OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_INLINE, OPT_START, OPT_EXPANDED, OPT_END + OPT_INDICES, OPT_NOCASE, OPT_LINE, OPT_LINESTOP, OPT_LINEANCHOR, OPT_ALL, OPT_INLINE, OPT_START, OPT_EXPANDED, OPT_END }; static const char * const options[] = { - "-indices", "-nocase", "-line", "-all", "-inline", "-start", "-expanded", "--", NULL + "-indices", "-nocase", "-line", "-linestop", "-lineanchor", "-all", "-inline", "-start", "-expanded", "--", NULL }; for (i = 1; i < argc; i++) { @@ -167,8 +168,20 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) case OPT_LINE: regcomp_flags |= REG_NEWLINE; + opt_lineanchor = 1; break; +#ifdef REG_NEWLINE_STOP + case OPT_LINESTOP: + regcomp_flags |= REG_NEWLINE_STOP; + break; +#endif +#ifdef REG_NEWLINE_ANCHOR + case OPT_LINEANCHOR: + regcomp_flags |= REG_NEWLINE_ANCHOR; + opt_lineanchor = 1; + break; +#endif case OPT_ALL: opt_all = 1; break; @@ -186,14 +199,15 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) } break; - case OPT_EXPANDED: #ifdef REG_EXPANDED + case OPT_EXPANDED: regcomp_flags |= REG_EXPANDED; break; -#else +#endif + default: + /* Could get here if -linestop or -lineanchor or -expanded is not supported */ Jim_SetResultFormatted(interp, "not supported: %#s", argv[i]); return JIM_ERR; -#endif } } if (argc - i < 2) { @@ -313,7 +327,7 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) } } - if (opt_all && (pattern[0] != '^' || (regcomp_flags & REG_NEWLINE)) && *source_str) { + if (opt_all && (pattern[0] != '^' || opt_lineanchor) && *source_str) { if (pmatch[0].rm_eo) { offset += utf8_strlen(source_str, pmatch[0].rm_eo); source_str += pmatch[0].rm_eo; @@ -369,10 +383,10 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) const char *pattern; int option; enum { - OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_START, OPT_COMMAND, OPT_EXPANDED, OPT_END + OPT_NOCASE, OPT_LINE, OPT_LINESTOP, OPT_LINEANCHOR, OPT_ALL, OPT_START, OPT_COMMAND, OPT_EXPANDED, OPT_END }; static const char * const options[] = { - "-nocase", "-line", "-all", "-start", "-command", "-expanded", "--", NULL + "-nocase", "-line", "-linestop", "-lineanchor", "-all", "-start", "-command", "-expanded", "--", NULL }; for (i = 1; i < argc; i++) { @@ -397,6 +411,16 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) regcomp_flags |= REG_NEWLINE; break; +#ifdef REG_NEWLINE_STOP + case OPT_LINESTOP: + regcomp_flags |= REG_NEWLINE_STOP; + break; +#endif +#ifdef REG_NEWLINE_ANCHOR + case OPT_LINEANCHOR: + regcomp_flags |= REG_NEWLINE_ANCHOR; + break; +#endif case OPT_ALL: opt_all = 1; break; @@ -414,14 +438,16 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) opt_command = 1; break; - case OPT_EXPANDED: #ifdef REG_EXPANDED + case OPT_EXPANDED: regcomp_flags |= REG_EXPANDED; break; -#else +#endif + + default: + /* Could get here if -linestop or -lineanchor or -expanded is not supported */ Jim_SetResultFormatted(interp, "not supported: %#s", argv[i]); return JIM_ERR; -#endif } } if (argc - i != 3 && argc - i != 4) { @@ -583,11 +609,6 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) break; } - /* An anchored pattern without -line must be done */ - if ((regcomp_flags & REG_NEWLINE) == 0 && pattern[0] == '^') { - break; - } - regexec_flags = 0; if (pmatch[0].rm_eo == pmatch[0].rm_so) { /* Matched a zero length string. Need to avoid matching the same position again */ diff --git a/jimregexp.c b/jimregexp.c index ca755c2..86f99ce 100644 --- a/jimregexp.c +++ b/jimregexp.c @@ -1234,7 +1234,7 @@ int jim_regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t } if (*string) { nextline: - if (preg->cflags & REG_NEWLINE) { + if (preg->cflags & REG_NEWLINE_ANCHOR) { /* Try the next anchor? */ string = strchr(string, '\n'); if (string) { @@ -1369,12 +1369,12 @@ static const char *str_find(const char *string, int c, int nocase) /** * Returns true if 'ch' is an end-of-line char. * - * In REG_NEWLINE mode, \n is considered EOL in + * In REG_NEWLINE_STOP mode, \n is considered EOL in * addition to \0 */ static int reg_iseol(regex_t *preg, int ch) { - if (preg->cflags & REG_NEWLINE) { + if (preg->cflags & REG_NEWLINE_STOP) { return ch == '\0' || ch == '\n'; } else { diff --git a/jimregexp.h b/jimregexp.h index 86a94f6..b6058c7 100644 --- a/jimregexp.h +++ b/jimregexp.h @@ -70,8 +70,11 @@ typedef struct regexp { typedef regexp regex_t; #define REG_EXTENDED 0 -#define REG_NEWLINE 1 #define REG_ICASE 2 +#define REG_NEWLINE_ANCHOR 4 +#define REG_NEWLINE_STOP 8 +/* REG_NEWLINE is POSIX */ +#define REG_NEWLINE (REG_NEWLINE_ANCHOR | REG_NEWLINE_STOP) #define REG_NOTBOL 16 #define REG_EXPANDED 32 diff --git a/tests/regexp.test b/tests/regexp.test index 681b793..b911014 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -37,12 +37,12 @@ test regexp-1.5 {basic regexp operation} { test regexp-1.6 {basic regexp operation} regexp_are { list [catch {regexp {} abc} msg] $msg } {0 1} -#test regexp-1.7 {regexp utf compliance} { -# # if not UTF-8 aware, result is "0 1" -# set foo "\u4e4eb q" -# regexp "\u4e4eb q" "a\u4e4eb qw\u5e4e\x4e wq" bar -# list [string compare $foo $bar] [regexp 4 $bar] -#} {0 0} +test regexp-1.7 {regexp utf compliance} { + # if not UTF-8 aware, result is "0 1" + set foo "\u4e4eb q" + regexp "\u4e4eb q" "a\u4e4eb qw\u5e4e\x4e wq" bar + list [string compare $foo $bar] [regexp 4 $bar] +} {0 0} test regexp-2.1 {getting substrings back from regexp} { @@ -201,9 +201,9 @@ test regexp-6.2 {regexp errors} -body { list [catch {regexp -nocase a} msg] $msg } -result {1 {wrong # args: should be "regexp ?-option ...? exp string ?matchVar? ?subMatchVar ...?"}} -test regexp-6.3 {regexp errors} -constraints jim -body { +test regexp-6.3 {regexp errors} -body { list [catch {regexp -gorp a} msg] $msg -} -result {1 {bad option "-gorp": must be -all, -expanded, -indices, -inline, -line, -nocase, -start, or --}} +} -result {1 {bad option "-gorp": must be -all, -expanded, -indices, -inline, -line, -lineanchor, -linestop, -nocase, -start, or --}} test regexp-6.4 {regexp errors} { catch {regexp a( b} msg @@ -283,12 +283,12 @@ test regexp-7.16 {basic regsub operation} { set foo xxx list [regsub x "" y foo] $foo } {0 {}} -#test regexp-7.17 {regsub utf compliance} { -# # if not UTF-8 aware, result is "0 1" -# set foo "xyz555ijka\u4e4ebpqr" -# regsub a\u4e4eb xyza\u4e4ebijka\u4e4ebpqr 555 bar -# list [string compare $foo $bar] [regexp 4 $bar] -#} {0 0} +test regexp-7.17 {regsub utf compliance} { + # if not UTF-8 aware, result is "0 1" + set foo "xyz555ijka\u4e4ebpqr" + regsub a\u4e4eb xyza\u4e4ebijka\u4e4ebpqr 555 bar + list [string compare $foo $bar] [regexp 4 $bar] +} {0 0} test regexp-8.1 {case conversion in regsub} { list [regsub -nocase a(a+) xaAAaAAay & foo] $foo @@ -348,14 +348,14 @@ test regexp-10.3 {newline sensitivity in regsub} { set foo xxx list [regsub -line {^a.*b$} "dabc\naxyb\nxb" 123 foo] $foo } "1 {dabc\n123\nxb}" -#test regexp-10.4 {partial newline sensitivity in regsub} { -# set foo xxx -# list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo -#} "1 {da\n123}" -#test regexp-10.5 {inverse partial newline sensitivity in regsub} { -# set foo xxx -# list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo -#} "1 {da\nb123\nxb}" +test regexp-10.4 {partial newline sensitivity in regsub} { + set foo xxx + list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo +} "1 {da\n123}" +test regexp-10.5 {inverse partial newline sensitivity in regsub} { + set foo xxx + list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo +} "1 {da\nb123\nxb}" test regexp-11.1 {regsub errors} { list [catch {regsub a b} msg] $msg @@ -370,13 +370,9 @@ test regexp-11.4 {regsub errors} { list [catch {regsub a b c d e f} msg] $msg } {1 {wrong # args: should be "regsub ?-option ...? exp string subSpec ?varName?"}} -test regexp-11.5 {regsub errors} -constraints jim -body { - list [catch {regsub -gorp a b c} msg] $msg -} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -nocase, -start, or --}} - -test regexp-11.5 {regsub errors} -constraints tcl -body { +test regexp-11.5 {regsub errors} -body { list [catch {regsub -gorp a b c} msg] $msg -} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}} +} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -lineanchor, -linestop, -nocase, -start, or --}} test regexp-11.6 {regsub errors} { catch {regsub -nocase a( b c d} msg @@ -605,14 +601,14 @@ test regexp-19.1 {regsub null replacement} { list $result [string length $result] } "\0a\0hel\0a\0lo\0a\0 14" -#test regexp-20.1 {regsub shared object shimmering} { -# # Bug #461322 -# set a abcdefghijklmnopqurstuvwxyz -# set b $a -# set c abcdefghijklmnopqurstuvwxyz0123456789 -# regsub $a $c $b d -# list $d [string length $d] [string bytelength $d] -#} [list abcdefghijklmnopqurstuvwxyz0123456789 37 37] +test regexp-20.1 {regsub shared object shimmering} { + # Bug #461322 + set a abcdefghijklmnopqurstuvwxyz + set b $a + set c abcdefghijklmnopqurstuvwxyz0123456789 + regsub $a $c $b d + list $d [string length $d] [string bytelength $d] +} [list abcdefghijklmnopqurstuvwxyz0123456789 37 37] #test regexp-20.2 {regsub shared object shimmering with -about} { # eval regexp -about abc #} {0 {}} @@ -653,9 +649,9 @@ test regexp-21.9 {regexp works with empty string offset} { regexp -start 3 -- \$ {123} } {1} -#test regexp-21.10 {multiple matches handle newlines} { -# regsub -all -lineanchor -- {^#[^\n]*\n} "#one\n#two\n#three\n" foo\n -#} "foo\nfoo\nfoo\n" +test regexp-21.10 {multiple matches handle newlines} { + regsub -all -lineanchor -- {^#[^\n]*\n} "#one\n#two\n#three\n" foo\n +} "foo\nfoo\nfoo\n" test regexp-21.11 {multiple matches handle newlines} { regsub -all -line -- ^ "a\nb\nc" \# diff --git a/tests/regexp2.test b/tests/regexp2.test index 2b8e2bc..c965cf9 100644 --- a/tests/regexp2.test +++ b/tests/regexp2.test @@ -481,18 +481,18 @@ test regexpComp-10.3 {newline sensitivity in regsub} { list [regsub -line {^a.*b$} "dabc\naxyb\nxb" 123 foo] $foo } } "1 {dabc\n123\nxb}" -#test regexpComp-10.4 {partial newline sensitivity in regsub} { -# evalInProc { -# set foo xxx -# list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo -# } -#} "1 {da\n123}" -#test regexpComp-10.5 {inverse partial newline sensitivity in regsub} { -# evalInProc { -# set foo xxx -# list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo -# } -#} "1 {da\nb123\nxb}" +test regexpComp-10.4 {partial newline sensitivity in regsub} { + evalInProc { + set foo xxx + list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo + } +} "1 {da\n123}" +test regexpComp-10.5 {inverse partial newline sensitivity in regsub} { + evalInProc { + set foo xxx + list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo + } +} "1 {da\nb123\nxb}" test regexpComp-10.6 {\Z only matching end of string with -line} { evalInProc { set foo xxx @@ -528,7 +528,7 @@ test regexpComp-11.5 {regsub errors} -body { evalInProc { list [catch {regsub -gorp a b c} msg] $msg } -} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -nocase, -start, or --}} +} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -lineanchor, -linestop, -nocase, -start, or --}} test regexpComp-11.6 {regsub errors} { evalInProc { @@ -807,11 +807,11 @@ test regexpComp-21.10 {regexp command compiling tests} { } } {3 barfbarobaro} # This useless expression fails. Jim returns "bar" -#test regexpComp-21.11 {regexp command compiling tests} { -# evalInProc { -# list [regsub -all "" "" bar str] $str -# } -#} {0 {}} +test regexpComp-21.11 {regexp command compiling tests} { + evalInProc { + list [regsub -all "" "" bar str] $str + } +} {0 {}} test regexpComp-21.12 {regexp empty pattern with utf8} utf8 { # Make sure the second char isn't sliced up evalInProc { |