regexp, regsub: add support for -lineanchor and -linestop

Signed-off-by: Steve Bennett <steveb@workware.net.au>
author: Steve Bennett <steveb@workware.net.au> 2025-08-09 20:41:45 +1000
committer: Steve Bennett <steveb@workware.net.au> 2025-08-13 08:05:25 +1000
commit: 00b27e1bcf275bfebe62ff882ef17f3401bdbea0 (patch)
tree: 1d9d1041c5fd38713d7cb8b116b0f3858a23563e
parent: 3c89e1bffa6fa0d7e66657a0ab7a29a1066dc536 (diff)
download: jimtcl-00b27e1bcf275bfebe62ff882ef17f3401bdbea0.zip
jimtcl-00b27e1bcf275bfebe62ff882ef17f3401bdbea0.tar.gz
jimtcl-00b27e1bcf275bfebe62ff882ef17f3401bdbea0.tar.bz2
5 files changed, 97 insertions, 77 deletions
diff --git a/jim-regexp.c b/jim-regexp.c
index cf06e68..c1909c0 100644
--- a/jim-regexp.c
+++ b/jim-regexp.c
@@ -122,6 +122,7 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
     int opt_indices = 0;
     int opt_all = 0;
     int opt_inline = 0;
+    int opt_lineanchor = 0;
     regex_t *regex;
     int match, i, j;
     int offset = 0;
@@ -137,10 +138,10 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
     int eflags = 0;
     int option;
     enum {
-        OPT_INDICES,  OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_INLINE, OPT_START, OPT_EXPANDED, OPT_END
+        OPT_INDICES,  OPT_NOCASE, OPT_LINE, OPT_LINESTOP, OPT_LINEANCHOR, OPT_ALL, OPT_INLINE, OPT_START, OPT_EXPANDED, OPT_END
     };
     static const char * const options[] = {
-        "-indices", "-nocase", "-line", "-all", "-inline", "-start", "-expanded", "--", NULL
+        "-indices", "-nocase", "-line", "-linestop", "-lineanchor", "-all", "-inline", "-start", "-expanded", "--", NULL
     };
 
     for (i = 1; i < argc; i++) {
@@ -167,8 +168,20 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
 
             case OPT_LINE:
                 regcomp_flags |= REG_NEWLINE;
+                opt_lineanchor = 1;
                 break;
 
+#ifdef REG_NEWLINE_STOP
+            case OPT_LINESTOP:
+                regcomp_flags |= REG_NEWLINE_STOP;
+                break;
+#endif
+#ifdef REG_NEWLINE_ANCHOR
+            case OPT_LINEANCHOR:
+                regcomp_flags |= REG_NEWLINE_ANCHOR;
+                opt_lineanchor = 1;
+                break;
+#endif
             case OPT_ALL:
                 opt_all = 1;
                 break;
@@ -186,14 +199,15 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
                 }
                 break;
 
-            case OPT_EXPANDED:
 #ifdef REG_EXPANDED
+            case OPT_EXPANDED:
                 regcomp_flags |= REG_EXPANDED;
                 break;
-#else
+#endif
+            default:
+                /* Could get here if -linestop or -lineanchor or -expanded is not supported */
                 Jim_SetResultFormatted(interp, "not supported: %#s", argv[i]);
                 return JIM_ERR;
-#endif
         }
     }
     if (argc - i < 2) {
@@ -313,7 +327,7 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
         }
     }
 
-    if (opt_all && (pattern[0] != '^' || (regcomp_flags & REG_NEWLINE)) && *source_str) {
+    if (opt_all && (pattern[0] != '^' || opt_lineanchor) && *source_str) {
         if (pmatch[0].rm_eo) {
             offset += utf8_strlen(source_str, pmatch[0].rm_eo);
             source_str += pmatch[0].rm_eo;
@@ -369,10 +383,10 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
     const char *pattern;
     int option;
     enum {
-        OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_START, OPT_COMMAND, OPT_EXPANDED, OPT_END
+        OPT_NOCASE, OPT_LINE, OPT_LINESTOP, OPT_LINEANCHOR, OPT_ALL, OPT_START, OPT_COMMAND, OPT_EXPANDED, OPT_END
     };
     static const char * const options[] = {
-        "-nocase", "-line", "-all", "-start", "-command", "-expanded", "--", NULL
+        "-nocase", "-line", "-linestop", "-lineanchor", "-all", "-start", "-command", "-expanded", "--", NULL
     };
 
     for (i = 1; i < argc; i++) {
@@ -397,6 +411,16 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
                 regcomp_flags |= REG_NEWLINE;
                 break;
 
+#ifdef REG_NEWLINE_STOP
+            case OPT_LINESTOP:
+                regcomp_flags |= REG_NEWLINE_STOP;
+                break;
+#endif
+#ifdef REG_NEWLINE_ANCHOR
+            case OPT_LINEANCHOR:
+                regcomp_flags |= REG_NEWLINE_ANCHOR;
+                break;
+#endif
             case OPT_ALL:
                 opt_all = 1;
                 break;
@@ -414,14 +438,16 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
                 opt_command = 1;
                 break;
 
-            case OPT_EXPANDED:
 #ifdef REG_EXPANDED
+            case OPT_EXPANDED:
                 regcomp_flags |= REG_EXPANDED;
                 break;
-#else
+#endif
+
+            default:
+                /* Could get here if -linestop or -lineanchor or -expanded is not supported */
                 Jim_SetResultFormatted(interp, "not supported: %#s", argv[i]);
                 return JIM_ERR;
-#endif
         }
     }
     if (argc - i != 3 && argc - i != 4) {
@@ -583,11 +609,6 @@ int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
             break;
         }
 
-        /* An anchored pattern without -line must be done */
-        if ((regcomp_flags & REG_NEWLINE) == 0 && pattern[0] == '^') {
-            break;
-        }
-
         regexec_flags = 0;
         if (pmatch[0].rm_eo == pmatch[0].rm_so) {
             /* Matched a zero length string. Need to avoid matching the same position again */
diff --git a/jimregexp.c b/jimregexp.c
index ca755c2..86f99ce 100644
--- a/jimregexp.c
+++ b/jimregexp.c
@@ -1234,7 +1234,7 @@ int jim_regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t
 			}
 			if (*string) {
 nextline:
-				if (preg->cflags & REG_NEWLINE) {
+				if (preg->cflags & REG_NEWLINE_ANCHOR) {
 					/* Try the next anchor? */
 					string = strchr(string, '\n');
 					if (string) {
@@ -1369,12 +1369,12 @@ static const char *str_find(const char *string, int c, int nocase)
 /**
  * Returns true if 'ch' is an end-of-line char.
  *
- * In REG_NEWLINE mode, \n is considered EOL in
+ * In REG_NEWLINE_STOP mode, \n is considered EOL in
  * addition to \0
  */
 static int reg_iseol(regex_t *preg, int ch)
 {
-	if (preg->cflags & REG_NEWLINE) {
+	if (preg->cflags & REG_NEWLINE_STOP) {
 		return ch == '\0' || ch == '\n';
 	}
 	else {
diff --git a/jimregexp.h b/jimregexp.h
index 86a94f6..b6058c7 100644
--- a/jimregexp.h
+++ b/jimregexp.h
@@ -70,8 +70,11 @@ typedef struct regexp {
 typedef regexp regex_t;
 
 #define REG_EXTENDED 0
-#define REG_NEWLINE 1
 #define REG_ICASE 2
+#define REG_NEWLINE_ANCHOR 4
+#define REG_NEWLINE_STOP 8
+/* REG_NEWLINE is POSIX */
+#define REG_NEWLINE (REG_NEWLINE_ANCHOR | REG_NEWLINE_STOP)
 
 #define REG_NOTBOL 16
 #define REG_EXPANDED 32
diff --git a/tests/regexp.test b/tests/regexp.test
index 681b793..b911014 100644
--- a/tests/regexp.test
+++ b/tests/regexp.test
@@ -37,12 +37,12 @@ test regexp-1.5 {basic regexp operation} {
 test regexp-1.6 {basic regexp operation} regexp_are {
     list [catch {regexp {} abc} msg] $msg
 } {0 1}
-#test regexp-1.7 {regexp utf compliance} {
-#    # if not UTF-8 aware, result is "0 1"
-#    set foo "\u4e4eb q"
-#    regexp "\u4e4eb q" "a\u4e4eb qw\u5e4e\x4e wq" bar
-#    list [string compare $foo $bar] [regexp 4 $bar]
-#} {0 0}
+test regexp-1.7 {regexp utf compliance} {
+    # if not UTF-8 aware, result is "0 1"
+    set foo "\u4e4eb q"
+    regexp "\u4e4eb q" "a\u4e4eb qw\u5e4e\x4e wq" bar
+    list [string compare $foo $bar] [regexp 4 $bar]
+} {0 0}
 
 
 test regexp-2.1 {getting substrings back from regexp} {
@@ -201,9 +201,9 @@ test regexp-6.2 {regexp errors} -body {
     list [catch {regexp -nocase a} msg] $msg
 } -result {1 {wrong # args: should be "regexp ?-option ...? exp string ?matchVar? ?subMatchVar ...?"}}
 
-test regexp-6.3 {regexp errors} -constraints jim -body {
+test regexp-6.3 {regexp errors} -body {
     list [catch {regexp -gorp a} msg] $msg
-} -result {1 {bad option "-gorp": must be -all, -expanded, -indices, -inline, -line, -nocase, -start, or --}}
+} -result {1 {bad option "-gorp": must be -all, -expanded, -indices, -inline, -line, -lineanchor, -linestop, -nocase, -start, or --}}
 
 test regexp-6.4 {regexp errors} {
     catch {regexp a( b} msg
@@ -283,12 +283,12 @@ test regexp-7.16 {basic regsub operation} {
     set foo xxx
     list [regsub x "" y foo] $foo
 } {0 {}}
-#test regexp-7.17 {regsub utf compliance} {
-#    # if not UTF-8 aware, result is "0 1"
-#    set foo "xyz555ijka\u4e4ebpqr"
-#    regsub a\u4e4eb xyza\u4e4ebijka\u4e4ebpqr 555 bar
-#    list [string compare $foo $bar] [regexp 4 $bar]
-#} {0 0}
+test regexp-7.17 {regsub utf compliance} {
+    # if not UTF-8 aware, result is "0 1"
+    set foo "xyz555ijka\u4e4ebpqr"
+    regsub a\u4e4eb xyza\u4e4ebijka\u4e4ebpqr 555 bar
+    list [string compare $foo $bar] [regexp 4 $bar]
+} {0 0}
 
 test regexp-8.1 {case conversion in regsub} {
     list [regsub -nocase a(a+) xaAAaAAay & foo] $foo
@@ -348,14 +348,14 @@ test regexp-10.3 {newline sensitivity in regsub} {
     set foo xxx
     list [regsub -line {^a.*b$} "dabc\naxyb\nxb" 123 foo] $foo
 } "1 {dabc\n123\nxb}"
-#test regexp-10.4 {partial newline sensitivity in regsub} {
-#    set foo xxx
-#    list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo
-#} "1 {da\n123}"
-#test regexp-10.5 {inverse partial newline sensitivity in regsub} {
-#    set foo xxx
-#    list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo
-#} "1 {da\nb123\nxb}"
+test regexp-10.4 {partial newline sensitivity in regsub} {
+    set foo xxx
+    list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo
+} "1 {da\n123}"
+test regexp-10.5 {inverse partial newline sensitivity in regsub} {
+    set foo xxx
+    list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo
+} "1 {da\nb123\nxb}"
 
 test regexp-11.1 {regsub errors} {
     list [catch {regsub a b} msg] $msg
@@ -370,13 +370,9 @@ test regexp-11.4 {regsub errors} {
     list [catch {regsub a b c d e f} msg] $msg
 } {1 {wrong # args: should be "regsub ?-option ...? exp string subSpec ?varName?"}}
 
-test regexp-11.5 {regsub errors} -constraints jim -body {
-    list [catch {regsub -gorp a b c} msg] $msg
-} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -nocase, -start, or --}}
-
-test regexp-11.5 {regsub errors} -constraints tcl -body {
+test regexp-11.5 {regsub errors} -body {
     list [catch {regsub -gorp a b c} msg] $msg
-} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}}
+} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -lineanchor, -linestop, -nocase, -start, or --}}
 
 test regexp-11.6 {regsub errors} {
     catch {regsub -nocase a( b c d} msg
@@ -605,14 +601,14 @@ test regexp-19.1 {regsub null replacement} {
     list $result [string length $result]
 } "\0a\0hel\0a\0lo\0a\0 14"
 
-#test regexp-20.1 {regsub shared object shimmering} {
-#    # Bug #461322
-#    set a abcdefghijklmnopqurstuvwxyz 
-#    set b $a 
-#    set c abcdefghijklmnopqurstuvwxyz0123456789 
-#    regsub $a $c $b d 
-#    list $d [string length $d] [string bytelength $d]
-#} [list abcdefghijklmnopqurstuvwxyz0123456789 37 37]
+test regexp-20.1 {regsub shared object shimmering} {
+    # Bug #461322
+    set a abcdefghijklmnopqurstuvwxyz 
+    set b $a 
+    set c abcdefghijklmnopqurstuvwxyz0123456789 
+    regsub $a $c $b d 
+    list $d [string length $d] [string bytelength $d]
+} [list abcdefghijklmnopqurstuvwxyz0123456789 37 37]
 #test regexp-20.2 {regsub shared object shimmering with -about} {
 #    eval regexp -about abc
 #} {0 {}}
@@ -653,9 +649,9 @@ test regexp-21.9 {regexp works with empty string offset} {
     regexp -start 3 -- \$ {123}
 } {1}
 
-#test regexp-21.10 {multiple matches handle newlines} {
-#    regsub -all -lineanchor -- {^#[^\n]*\n} "#one\n#two\n#three\n" foo\n
-#} "foo\nfoo\nfoo\n"
+test regexp-21.10 {multiple matches handle newlines} {
+    regsub -all -lineanchor -- {^#[^\n]*\n} "#one\n#two\n#three\n" foo\n
+} "foo\nfoo\nfoo\n"
 
 test regexp-21.11 {multiple matches handle newlines} {
     regsub -all -line -- ^ "a\nb\nc" \#
diff --git a/tests/regexp2.test b/tests/regexp2.test
index 2b8e2bc..c965cf9 100644
--- a/tests/regexp2.test
+++ b/tests/regexp2.test
@@ -481,18 +481,18 @@ test regexpComp-10.3 {newline sensitivity in regsub} {
 	list [regsub -line {^a.*b$} "dabc\naxyb\nxb" 123 foo] $foo
     }
 } "1 {dabc\n123\nxb}"
-#test regexpComp-10.4 {partial newline sensitivity in regsub} {
-#    evalInProc {
-#	set foo xxx
-#	list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo
-#    }
-#} "1 {da\n123}"
-#test regexpComp-10.5 {inverse partial newline sensitivity in regsub} {
-#    evalInProc {
-#	set foo xxx
-#	list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo
-#    }
-#} "1 {da\nb123\nxb}"
+test regexpComp-10.4 {partial newline sensitivity in regsub} {
+    evalInProc {
+	set foo xxx
+	list [regsub -lineanchor {^a.*b$} "da\naxyb\nxb" 123 foo] $foo
+    }
+} "1 {da\n123}"
+test regexpComp-10.5 {inverse partial newline sensitivity in regsub} {
+    evalInProc {
+	set foo xxx
+	list [regsub -linestop {a.*b} "da\nbaxyb\nxb" 123 foo] $foo
+    }
+} "1 {da\nb123\nxb}"
 test regexpComp-10.6 {\Z only matching end of string with -line} {
     evalInProc {
 	set foo xxx
@@ -528,7 +528,7 @@ test regexpComp-11.5 {regsub errors} -body {
     evalInProc {
 	list [catch {regsub -gorp a b c} msg] $msg
     }
-} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -nocase, -start, or --}}
+} -result {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -lineanchor, -linestop, -nocase, -start, or --}}
 
 test regexpComp-11.6 {regsub errors} {
     evalInProc {
@@ -807,11 +807,11 @@ test regexpComp-21.10 {regexp command compiling tests} {
     }
 } {3 barfbarobaro}
 # This useless expression fails. Jim returns "bar"
-#test regexpComp-21.11 {regexp command compiling tests} {
-#    evalInProc {
-#	list [regsub -all "" "" bar str] $str
-#    }
-#} {0 {}}
+test regexpComp-21.11 {regexp command compiling tests} {
+    evalInProc {
+	list [regsub -all "" "" bar str] $str
+    }
+} {0 {}}
 test regexpComp-21.12 {regexp empty pattern with utf8} utf8 {
     # Make sure the second char isn't sliced up
     evalInProc {
author	Steve Bennett <steveb@workware.net.au>	2025-08-09 20:41:45 +1000
committer	Steve Bennett <steveb@workware.net.au>	2025-08-13 08:05:25 +1000
commit	00b27e1bcf275bfebe62ff882ef17f3401bdbea0 (patch)
tree	1d9d1041c5fd38713d7cb8b116b0f3858a23563e
parent	3c89e1bffa6fa0d7e66657a0ab7a29a1066dc536 (diff)
download	jimtcl-00b27e1bcf275bfebe62ff882ef17f3401bdbea0.zip jimtcl-00b27e1bcf275bfebe62ff882ef17f3401bdbea0.tar.gz jimtcl-00b27e1bcf275bfebe62ff882ef17f3401bdbea0.tar.bz2