aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Steve Bennett <steveb@workware.net.au> 2019-10-27 21:17:42 +1000
committer Steve Bennett <steveb@workware.net.au> 2019-10-27 21:17:42 +1000
commit 9eba9e87fc3ed257bf82c1bc7e2f77dba1aaf9a5 (patch)
tree a439f53d186ac7431935b91c4b52a8f7d38eb7dc
parent 38c389a29e8ff582ba32c7737bb90f0f0df12078 (diff)
download jimtcl-9eba9e87fc3ed257bf82c1bc7e2f77dba1aaf9a5.zip
jimtcl-9eba9e87fc3ed257bf82c1bc7e2f77dba1aaf9a5.tar.gz
jimtcl-9eba9e87fc3ed257bf82c1bc7e2f77dba1aaf9a5.tar.bz2
regexp -indices should return character indices
Not byte indices.
Reported-by: dbohdan <dbohdan@dbohdan.com>
Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r-- jim-regexp.c 15
-rw-r--r-- tests/regexp2.test 4
2 files changed, 11 insertions, 8 deletions
diff --git a/jim-regexp.c b/jim-regexp.c
index 81f3207..3134598 100644
--- a/jim-regexp.c
+++ b/jim-regexp.c
@@ -281,16 +281,15 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
}
}
else {
- int len = pmatch[j].rm_eo - pmatch[j].rm_so;
-
if (opt_indices) {
- Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp,
- offset + pmatch[j].rm_so));
- Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp,
- offset + pmatch[j].rm_so + len - 1));
+ /* rm_so and rm_eo are byte offsets from source_str. We need char offsets, so count chars from source_str for both */
+ int so = utf8_strlen(source_str, pmatch[j].rm_so);
+ int eo = utf8_strlen(source_str, pmatch[j].rm_eo);
+ Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, offset + so));
+ Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, offset + eo - 1));
}
else {
- Jim_AppendString(interp, resultObj, source_str + pmatch[j].rm_so, len);
+ Jim_AppendString(interp, resultObj, source_str + pmatch[j].rm_so, pmatch[j].rm_eo - pmatch[j].rm_so);
}
}
@@ -311,7 +310,7 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
try_next_match:
if (opt_all && (pattern[0] != '^' || (regcomp_flags & REG_NEWLINE)) && *source_str) {
if (pmatch[0].rm_eo) {
- offset += pmatch[0].rm_eo;
+ offset += utf8_strlen(source_str, pmatch[0].rm_eo);
source_str += pmatch[0].rm_eo;
}
else {
diff --git a/tests/regexp2.test b/tests/regexp2.test
index 76735e6..1aee8cd 100644
--- a/tests/regexp2.test
+++ b/tests/regexp2.test
@@ -633,6 +633,7 @@ test regexpComp-16.5 {regexp -start with utf8} utf8 {
test regexpComp-16.6 {regexp -start with utf8} utf8 {
regsub -start 1 . \u0442\u0435\u0441\u0442 x
} \u0442x\u0441\u0442
+
test regexpComp-17.1 {regexp -inline} {
regexp -inline b ababa
} {b}
@@ -654,6 +655,9 @@ test regexpComp-17.6 {regexp -inline no matches} {
test regexpComp-17.7 {regexp -inline, no matchvars allowed} {
list [catch {regexp -inline b abc match} msg] $msg
} {1 {regexp match variables not allowed when using -inline}}
+test regexpComp-17.8 {regexp -indices utf8} utf8 {
+ regexp -all -inline -start 1 -indices . \u0442\u0435\u0441\u0442
+} {{1 1} {2 2} {3 3}}
test regexpComp-18.1 {regexp -all} {
regexp -all b bbbbb