diff options
Diffstat (limited to 'libphobos/src/std/regex/package.d')
-rw-r--r-- | libphobos/src/std/regex/package.d | 1735 |
1 files changed, 1735 insertions, 0 deletions
diff --git a/libphobos/src/std/regex/package.d b/libphobos/src/std/regex/package.d new file mode 100644 index 0000000..bfc7d7f --- /dev/null +++ b/libphobos/src/std/regex/package.d @@ -0,0 +1,1735 @@ +/++ + $(LINK2 https://en.wikipedia.org/wiki/Regular_expression, Regular expressions) + are a commonly used method of pattern matching + on strings, with $(I regex) being a catchy word for a pattern in this domain + specific language. Typical problems usually solved by regular expressions + include validation of user input and the ubiquitous find $(AMP) replace + in text processing utilities. + +$(SCRIPT inhibitQuickIndex = 1;) +$(BOOKTABLE, +$(TR $(TH Category) $(TH Functions)) +$(TR $(TD Matching) $(TD + $(LREF bmatch) + $(LREF match) + $(LREF matchAll) + $(LREF matchFirst) +)) +$(TR $(TD Building) $(TD + $(LREF ctRegex) + $(LREF escaper) + $(LREF _regex) +)) +$(TR $(TD Replace) $(TD + $(LREF replace) + $(LREF replaceAll) + $(LREF replaceAllInto) + $(LREF replaceFirst) + $(LREF replaceFirstInto) +)) +$(TR $(TD Split) $(TD + $(LREF split) + $(LREF splitter) +)) +$(TR $(TD Objects) $(TD + $(LREF Captures) + $(LREF Regex) + $(LREF RegexException) + $(LREF RegexMatch) + $(LREF Splitter) + $(LREF StaticRegex) +)) +) + + $(SECTION Synopsis) + --- + import std.regex; + import std.stdio; + void main() + { + // Print out all possible dd/mm/yy(yy) dates found in user input. + auto r = regex(r"\b[0-9][0-9]?/[0-9][0-9]?/[0-9][0-9](?:[0-9][0-9])?\b"); + foreach (line; stdin.byLine) + { + // matchAll() returns a range that can be iterated + // to get all subsequent matches. + foreach (c; matchAll(line, r)) + writeln(c.hit); + } + } + ... + + // Create a static regex at compile-time, which contains fast native code. + auto ctr = ctRegex!(`^.*/([^/]+)/?$`); + + // It works just like a normal regex: + auto c2 = matchFirst("foo/bar", ctr); // First match found here, if any + assert(!c2.empty); // Be sure to check if there is a match before examining contents! + assert(c2[1] == "bar"); // Captures is a range of submatches: 0 = full match. + + ... + // multi-pattern regex + auto multi = regex([`\d+,\d+`,`(a-z]+):(\d+)`]); + auto m = "abc:43 12,34".matchAll(multi); + assert(m.front.whichPattern == 2); + assert(m.front[1] == "abc"); + assert(m.front[2] == "43"); + m.popFront(); + assert(m.front.whichPattern == 1); + assert(m.front[1] == "12"); + ... + + // The result of the `matchAll/matchFirst` is directly testable with if/assert/while. + // e.g. test if a string consists of letters: + assert(matchFirst("Letter", `^\p{L}+$`)); + --- + + $(SECTION Syntax and general information) + The general usage guideline is to keep regex complexity on the side of simplicity, + as its capabilities reside in purely character-level manipulation. + As such it's ill-suited for tasks involving higher level invariants + like matching an integer number $(U bounded) in an [a,b] interval. + Checks of this sort of are better addressed by additional post-processing. + + The basic syntax shouldn't surprise experienced users of regular expressions. + For an introduction to $(D std.regex) see a + $(HTTP dlang.org/regular-expression.html, short tour) of the module API + and its abilities. + + There are other web resources on regular expressions to help newcomers, + and a good $(HTTP www.regular-expressions.info, reference with tutorial) + can easily be found. + + This library uses a remarkably common ECMAScript syntax flavor + with the following extensions: + $(UL + $(LI Named subexpressions, with Python syntax. ) + $(LI Unicode properties such as Scripts, Blocks and common binary properties e.g Alphabetic, White_Space, Hex_Digit etc.) + $(LI Arbitrary length and complexity lookbehind, including lookahead in lookbehind and vise-versa.) + ) + + $(REG_START Pattern syntax ) + $(I std.regex operates on codepoint level, + 'character' in this table denotes a single Unicode codepoint.) + $(REG_TABLE + $(REG_TITLE Pattern element, Semantics ) + $(REG_TITLE Atoms, Match single characters ) + $(REG_ROW any character except [{|*+?()^$, Matches the character itself. ) + $(REG_ROW ., In single line mode matches any character. + Otherwise it matches any character except '\n' and '\r'. ) + $(REG_ROW [class], Matches a single character + that belongs to this character class. ) + $(REG_ROW [^class], Matches a single character that + does $(U not) belong to this character class.) + $(REG_ROW \cC, Matches the control character corresponding to letter C) + $(REG_ROW \xXX, Matches a character with hexadecimal value of XX. ) + $(REG_ROW \uXXXX, Matches a character with hexadecimal value of XXXX. ) + $(REG_ROW \U00YYYYYY, Matches a character with hexadecimal value of YYYYYY. ) + $(REG_ROW \f, Matches a formfeed character. ) + $(REG_ROW \n, Matches a linefeed character. ) + $(REG_ROW \r, Matches a carriage return character. ) + $(REG_ROW \t, Matches a tab character. ) + $(REG_ROW \v, Matches a vertical tab character. ) + $(REG_ROW \d, Matches any Unicode digit. ) + $(REG_ROW \D, Matches any character except Unicode digits. ) + $(REG_ROW \w, Matches any word character (note: this includes numbers).) + $(REG_ROW \W, Matches any non-word character.) + $(REG_ROW \s, Matches whitespace, same as \p{White_Space}.) + $(REG_ROW \S, Matches any character except those recognized as $(I \s ). ) + $(REG_ROW \\, Matches \ character. ) + $(REG_ROW \c where c is one of [|*+?(), Matches the character c itself. ) + $(REG_ROW \p{PropertyName}, Matches a character that belongs + to the Unicode PropertyName set. + Single letter abbreviations can be used without surrounding {,}. ) + $(REG_ROW \P{PropertyName}, Matches a character that does not belong + to the Unicode PropertyName set. + Single letter abbreviations can be used without surrounding {,}. ) + $(REG_ROW \p{InBasicLatin}, Matches any character that is part of + the BasicLatin Unicode $(U block).) + $(REG_ROW \P{InBasicLatin}, Matches any character except ones in + the BasicLatin Unicode $(U block).) + $(REG_ROW \p{Cyrillic}, Matches any character that is part of + Cyrillic $(U script).) + $(REG_ROW \P{Cyrillic}, Matches any character except ones in + Cyrillic $(U script).) + $(REG_TITLE Quantifiers, Specify repetition of other elements) + $(REG_ROW *, Matches previous character/subexpression 0 or more times. + Greedy version - tries as many times as possible.) + $(REG_ROW *?, Matches previous character/subexpression 0 or more times. + Lazy version - stops as early as possible.) + $(REG_ROW +, Matches previous character/subexpression 1 or more times. + Greedy version - tries as many times as possible.) + $(REG_ROW +?, Matches previous character/subexpression 1 or more times. + Lazy version - stops as early as possible.) + $(REG_ROW {n}, Matches previous character/subexpression exactly n times. ) + $(REG_ROW {n$(COMMA)}, Matches previous character/subexpression n times or more. + Greedy version - tries as many times as possible. ) + $(REG_ROW {n$(COMMA)}?, Matches previous character/subexpression n times or more. + Lazy version - stops as early as possible.) + $(REG_ROW {n$(COMMA)m}, Matches previous character/subexpression n to m times. + Greedy version - tries as many times as possible, but no more than m times. ) + $(REG_ROW {n$(COMMA)m}?, Matches previous character/subexpression n to m times. + Lazy version - stops as early as possible, but no less then n times.) + $(REG_TITLE Other, Subexpressions $(AMP) alternations ) + $(REG_ROW (regex), Matches subexpression regex, + saving matched portion of text for later retrieval. ) + $(REG_ROW (?#comment), An inline comment that is ignored while matching.) + $(REG_ROW (?:regex), Matches subexpression regex, + $(U not) saving matched portion of text. Useful to speed up matching. ) + $(REG_ROW A|B, Matches subexpression A, or failing that, matches B. ) + $(REG_ROW (?P$(LT)name$(GT)regex), Matches named subexpression + regex labeling it with name 'name'. + When referring to a matched portion of text, + names work like aliases in addition to direct numbers. + ) + $(REG_TITLE Assertions, Match position rather than character ) + $(REG_ROW ^, Matches at the begining of input or line (in multiline mode).) + $(REG_ROW $, Matches at the end of input or line (in multiline mode). ) + $(REG_ROW \b, Matches at word boundary. ) + $(REG_ROW \B, Matches when $(U not) at word boundary. ) + $(REG_ROW (?=regex), Zero-width lookahead assertion. + Matches at a point where the subexpression + regex could be matched starting from the current position. + ) + $(REG_ROW (?!regex), Zero-width negative lookahead assertion. + Matches at a point where the subexpression + regex could $(U not) be matched starting from the current position. + ) + $(REG_ROW (?<=regex), Zero-width lookbehind assertion. Matches at a point + where the subexpression regex could be matched ending + at the current position (matching goes backwards). + ) + $(REG_ROW (?<!regex), Zero-width negative lookbehind assertion. + Matches at a point where the subexpression regex could $(U not) + be matched ending at the current position (matching goes backwards). + ) + ) + + $(REG_START Character classes ) + $(REG_TABLE + $(REG_TITLE Pattern element, Semantics ) + $(REG_ROW Any atom, Has the same meaning as outside of a character class.) + $(REG_ROW a-z, Includes characters a, b, c, ..., z. ) + $(REG_ROW [a||b]$(COMMA) [a--b]$(COMMA) [a~~b]$(COMMA) [a$(AMP)$(AMP)b], + Where a, b are arbitrary classes, means union, set difference, + symmetric set difference, and intersection respectively. + $(I Any sequence of character class elements implicitly forms a union.) ) + ) + + $(REG_START Regex flags ) + $(REG_TABLE + $(REG_TITLE Flag, Semantics ) + $(REG_ROW g, Global regex, repeat over the whole input. ) + $(REG_ROW i, Case insensitive matching. ) + $(REG_ROW m, Multi-line mode, match ^, $ on start and end line separators + as well as start and end of input.) + $(REG_ROW s, Single-line mode, makes . match '\n' and '\r' as well. ) + $(REG_ROW x, Free-form syntax, ignores whitespace in pattern, + useful for formatting complex regular expressions. ) + ) + + $(SECTION Unicode support) + + This library provides full Level 1 support* according to + $(HTTP unicode.org/reports/tr18/, UTS 18). Specifically: + $(UL + $(LI 1.1 Hex notation via any of \uxxxx, \U00YYYYYY, \xZZ.) + $(LI 1.2 Unicode properties.) + $(LI 1.3 Character classes with set operations.) + $(LI 1.4 Word boundaries use the full set of "word" characters.) + $(LI 1.5 Using simple casefolding to match case + insensitively across the full range of codepoints.) + $(LI 1.6 Respecting line breaks as any of + \u000A | \u000B | \u000C | \u000D | \u0085 | \u2028 | \u2029 | \u000D\u000A.) + $(LI 1.7 Operating on codepoint level.) + ) + *With exception of point 1.1.1, as of yet, normalization of input + is expected to be enforced by user. + + $(SECTION Replace format string) + + A set of functions in this module that do the substitution rely + on a simple format to guide the process. In particular the table below + applies to the $(D format) argument of + $(LREF replaceFirst) and $(LREF replaceAll). + + The format string can reference parts of match using the following notation. + $(REG_TABLE + $(REG_TITLE Format specifier, Replaced by ) + $(REG_ROW $$(AMP), the whole match. ) + $(REG_ROW $(DOLLAR)$(BACKTICK), part of input $(I preceding) the match. ) + $(REG_ROW $', part of input $(I following) the match. ) + $(REG_ROW $$, '$' character. ) + $(REG_ROW \c $(COMMA) where c is any character, the character c itself. ) + $(REG_ROW \\, '\' character. ) + $(REG_ROW $(DOLLAR)1 .. $(DOLLAR)99, submatch number 1 to 99 respectively. ) + ) + + $(SECTION Slicing and zero memory allocations orientation) + + All matches returned by pattern matching functionality in this library + are slices of the original input. The notable exception is the $(D replace) + family of functions that generate a new string from the input. + + In cases where producing the replacement is the ultimate goal + $(LREF replaceFirstInto) and $(LREF replaceAllInto) could come in handy + as functions that avoid allocations even for replacement. + + Copyright: Copyright Dmitry Olshansky, 2011- + + License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0). + + Authors: Dmitry Olshansky, + + API and utility constructs are modeled after the original $(D std.regex) + by Walter Bright and Andrei Alexandrescu. + + Source: $(PHOBOSSRC std/_regex/_package.d) + +Macros: + REG_ROW = $(TR $(TD $(I $1 )) $(TD $+) ) + REG_TITLE = $(TR $(TD $(B $1)) $(TD $(B $2)) ) + REG_TABLE = <table border="1" cellspacing="0" cellpadding="5" > $0 </table> + REG_START = <h3><div align="center"> $0 </div></h3> + SECTION = <h3><a id="$1" href="#$1" class="anchor">$0</a></h3> + S_LINK = <a href="#$1">$+</a> + +/ +module std.regex; + +import std.range.primitives, std.traits; +import std.regex.internal.ir; +import std.regex.internal.thompson; //TODO: get rid of this dependency +import std.typecons; // : Flag, Yes, No; + +/++ + $(D Regex) object holds regular expression pattern in compiled form. + + Instances of this object are constructed via calls to $(D regex). + This is an intended form for caching and storage of frequently + used regular expressions. + + Example: + + Test if this object doesn't contain any compiled pattern. + --- + Regex!char r; + assert(r.empty); + r = regex(""); // Note: "" is a valid regex pattern. + assert(!r.empty); + --- + + Getting a range of all the named captures in the regex. + ---- + import std.range; + import std.algorithm; + + auto re = regex(`(?P<name>\w+) = (?P<var>\d+)`); + auto nc = re.namedCaptures; + static assert(isRandomAccessRange!(typeof(nc))); + assert(!nc.empty); + assert(nc.length == 2); + assert(nc.equal(["name", "var"])); + assert(nc[0] == "name"); + assert(nc[1..$].equal(["var"])); + ---- ++/ +public alias Regex(Char) = std.regex.internal.ir.Regex!(Char); + +/++ + A $(D StaticRegex) is $(D Regex) object that contains D code specially + generated at compile-time to speed up matching. + + Implicitly convertible to normal $(D Regex), + however doing so will result in losing this additional capability. ++/ +public alias StaticRegex(Char) = std.regex.internal.ir.StaticRegex!(Char); + +/++ + Compile regular expression pattern for the later execution. + Returns: $(D Regex) object that works on inputs having + the same character width as $(D pattern). + + Params: + pattern = A single regular expression to match. + patterns = An array of regular expression strings. + The resulting `Regex` object will match any expression; + use $(LREF whichPattern) to know which. + flags = The _attributes (g, i, m and x accepted) + + Throws: $(D RegexException) if there were any errors during compilation. ++/ +@trusted public auto regex(S)(S[] patterns, const(char)[] flags="") +if (isSomeString!(S)) +{ + import std.array : appender; + import std.functional : memoize; + enum cacheSize = 8; //TODO: invent nice interface to control regex caching + S pat; + if (patterns.length > 1) + { + auto app = appender!S(); + foreach (i, p; patterns) + { + if (i != 0) + app.put("|"); + app.put("(?:"); + app.put(patterns[i]); + // terminator for the pattern + // to detect if the pattern unexpectedly ends + app.put("\\"); + app.put(cast(dchar)(privateUseStart+i)); + app.put(")"); + // another one to return correct whichPattern + // for all of potential alternatives in the patterns[i] + app.put("\\"); + app.put(cast(dchar)(privateUseStart+i)); + } + pat = app.data; + } + else + pat = patterns[0]; + + if (__ctfe) + return regexImpl(pat, flags); + return memoize!(regexImpl!S, cacheSize)(pat, flags); +} + +///ditto +@trusted public auto regex(S)(S pattern, const(char)[] flags="") +if (isSomeString!(S)) +{ + return regex([pattern], flags); +} + +/// +@system unittest +{ + // multi-pattern regex example + auto multi = regex([`([a-z]+):(\d+)`, `(\d+),\d+`]); // multi regex + auto m = "abc:43 12,34".matchAll(multi); + assert(m.front.whichPattern == 1); + assert(m.front[1] == "abc"); + assert(m.front[2] == "43"); + m.popFront(); + assert(m.front.whichPattern == 2); + assert(m.front[1] == "12"); +} + +public auto regexImpl(S)(S pattern, const(char)[] flags="") +if (isSomeString!(S)) +{ + import std.regex.internal.parser : Parser, CodeGen; + auto parser = Parser!(Unqual!(typeof(pattern)), CodeGen)(pattern, flags); + auto r = parser.program; + return r; +} + + +template ctRegexImpl(alias pattern, string flags=[]) +{ + import std.regex.internal.backtracking, std.regex.internal.parser; + enum r = regex(pattern, flags); + alias Char = BasicElementOf!(typeof(pattern)); + enum source = ctGenRegExCode(r); + alias Matcher = BacktrackingMatcher!(true); + @trusted bool func(ref Matcher!Char matcher) + { + debug(std_regex_ctr) pragma(msg, source); + mixin(source); + } + enum nr = StaticRegex!Char(r, &func); +} + +/++ + Compile regular expression using CTFE + and generate optimized native machine code for matching it. + + Returns: StaticRegex object for faster matching. + + Params: + pattern = Regular expression + flags = The _attributes (g, i, m and x accepted) ++/ +public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).nr; + +enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R)) + || is(RegEx == StaticRegex!(BasicElementOf!R)); + + +/++ + $(D Captures) object contains submatches captured during a call + to $(D match) or iteration over $(D RegexMatch) range. + + First element of range is the whole match. ++/ +@trusted public struct Captures(R, DIndex = size_t) +if (isSomeString!R) +{//@trusted because of union inside + alias DataIndex = DIndex; + alias String = R; +private: + import std.conv : text; + R _input; + int _nMatch; + enum smallString = 3; + enum SMALL_MASK = 0x8000_0000, REF_MASK= 0x1FFF_FFFF; + union + { + Group!DataIndex[] big_matches; + Group!DataIndex[smallString] small_matches; + } + uint _f, _b; + uint _refcount; // ref count or SMALL MASK + num groups + NamedGroup[] _names; + + this()(R input, uint n, NamedGroup[] named) + { + _input = input; + _names = named; + newMatches(n); + _b = n; + _f = 0; + } + + this(alias Engine)(ref RegexMatch!(R,Engine) rmatch) + { + _input = rmatch._input; + _names = rmatch._engine.re.dict; + immutable n = rmatch._engine.re.ngroup; + newMatches(n); + _b = n; + _f = 0; + } + + @property inout(Group!DataIndex[]) matches() inout + { + return (_refcount & SMALL_MASK) ? small_matches[0 .. _refcount & 0xFF] : big_matches; + } + + void newMatches(uint n) + { + import core.stdc.stdlib : calloc; + import std.exception : enforce; + if (n > smallString) + { + auto p = cast(Group!DataIndex*) enforce( + calloc(Group!DataIndex.sizeof,n), + "Failed to allocate Captures struct" + ); + big_matches = p[0 .. n]; + _refcount = 1; + } + else + { + _refcount = SMALL_MASK | n; + } + } + + bool unique() + { + return (_refcount & SMALL_MASK) || _refcount == 1; + } + +public: + this(this) + { + if (!(_refcount & SMALL_MASK)) + { + _refcount++; + } + } + ~this() + { + import core.stdc.stdlib : free; + if (!(_refcount & SMALL_MASK)) + { + if (--_refcount == 0) + { + free(big_matches.ptr); + big_matches = null; + } + } + } + ///Slice of input prior to the match. + @property R pre() + { + return _nMatch == 0 ? _input[] : _input[0 .. matches[0].begin]; + } + + ///Slice of input immediately after the match. + @property R post() + { + return _nMatch == 0 ? _input[] : _input[matches[0].end .. $]; + } + + ///Slice of matched portion of input. + @property R hit() + { + assert(_nMatch, "attempted to get hit of an empty match"); + return _input[matches[0].begin .. matches[0].end]; + } + + ///Range interface. + @property R front() + { + assert(_nMatch, "attempted to get front of an empty match"); + return _input[matches[_f].begin .. matches[_f].end]; + } + + ///ditto + @property R back() + { + assert(_nMatch, "attempted to get back of an empty match"); + return _input[matches[_b - 1].begin .. matches[_b - 1].end]; + } + + ///ditto + void popFront() + { + assert(!empty); + ++_f; + } + + ///ditto + void popBack() + { + assert(!empty); + --_b; + } + + ///ditto + @property bool empty() const { return _nMatch == 0 || _f >= _b; } + + ///ditto + inout(R) opIndex()(size_t i) inout + { + assert(_f + i < _b,text("requested submatch number ", i," is out of range")); + assert(matches[_f + i].begin <= matches[_f + i].end, + text("wrong match: ", matches[_f + i].begin, "..", matches[_f + i].end)); + return _input[matches[_f + i].begin .. matches[_f + i].end]; + } + + /++ + Explicit cast to bool. + Useful as a shorthand for !(x.empty) in if and assert statements. + + --- + import std.regex; + + assert(!matchFirst("nothing", "something")); + --- + +/ + + @safe bool opCast(T:bool)() const nothrow { return _nMatch != 0; } + + /++ + Number of pattern matched counting, where 1 - the first pattern. + Returns 0 on no match. + +/ + + @safe @property int whichPattern() const nothrow { return _nMatch; } + + /// + @system unittest + { + import std.regex; + assert(matchFirst("abc", "[0-9]+", "[a-z]+").whichPattern == 2); + } + + /++ + Lookup named submatch. + + --- + import std.regex; + import std.range; + + auto c = matchFirst("a = 42;", regex(`(?P<var>\w+)\s*=\s*(?P<value>\d+);`)); + assert(c["var"] == "a"); + assert(c["value"] == "42"); + popFrontN(c, 2); + //named groups are unaffected by range primitives + assert(c["var"] =="a"); + assert(c.front == "42"); + ---- + +/ + R opIndex(String)(String i) /*const*/ //@@@BUG@@@ + if (isSomeString!String) + { + size_t index = lookupNamedGroup(_names, i); + return _input[matches[index].begin .. matches[index].end]; + } + + ///Number of matches in this object. + @property size_t length() const { return _nMatch == 0 ? 0 : _b - _f; } + + ///A hook for compatibility with original std.regex. + @property ref captures(){ return this; } +} + +/// +@system unittest +{ + import std.range.primitives : popFrontN; + + auto c = matchFirst("@abc#", regex(`(\w)(\w)(\w)`)); + assert(c.pre == "@"); // Part of input preceding match + assert(c.post == "#"); // Immediately after match + assert(c.hit == c[0] && c.hit == "abc"); // The whole match + assert(c[2] == "b"); + assert(c.front == "abc"); + c.popFront(); + assert(c.front == "a"); + assert(c.back == "c"); + c.popBack(); + assert(c.back == "b"); + popFrontN(c, 2); + assert(c.empty); + + assert(!matchFirst("nothing", "something")); +} + +/++ + A regex engine state, as returned by $(D match) family of functions. + + Effectively it's a forward range of Captures!R, produced + by lazily searching for matches in a given input. + + $(D alias Engine) specifies an engine type to use during matching, + and is automatically deduced in a call to $(D match)/$(D bmatch). ++/ +@trusted public struct RegexMatch(R, alias Engine = ThompsonMatcher) +if (isSomeString!R) +{ +private: + import core.stdc.stdlib : malloc, free; + alias Char = BasicElementOf!R; + alias EngineType = Engine!Char; + EngineType _engine; + R _input; + Captures!(R,EngineType.DataIndex) _captures; + void[] _memory;//is ref-counted + + this(RegEx)(R input, RegEx prog) + { + import std.exception : enforce; + _input = input; + immutable size = EngineType.initialMemory(prog)+size_t.sizeof; + _memory = (enforce(malloc(size), "malloc failed")[0 .. size]); + scope(failure) free(_memory.ptr); + *cast(size_t*)_memory.ptr = 1; + _engine = EngineType(prog, Input!Char(input), _memory[size_t.sizeof..$]); + static if (is(RegEx == StaticRegex!(BasicElementOf!R))) + _engine.nativeFn = prog.nativeFn; + _captures = Captures!(R,EngineType.DataIndex)(this); + _captures._nMatch = _engine.match(_captures.matches); + debug(std_regex_allocation) writefln("RefCount (ctor): %x %d", _memory.ptr, counter); + } + + @property ref size_t counter(){ return *cast(size_t*)_memory.ptr; } +public: + this(this) + { + if (_memory.ptr) + { + ++counter; + debug(std_regex_allocation) writefln("RefCount (postblit): %x %d", + _memory.ptr, *cast(size_t*)_memory.ptr); + } + } + + ~this() + { + if (_memory.ptr && --*cast(size_t*)_memory.ptr == 0) + { + debug(std_regex_allocation) writefln("RefCount (dtor): %x %d", + _memory.ptr, *cast(size_t*)_memory.ptr); + free(cast(void*)_memory.ptr); + } + } + + ///Shorthands for front.pre, front.post, front.hit. + @property R pre() + { + return _captures.pre; + } + + ///ditto + @property R post() + { + return _captures.post; + } + + ///ditto + @property R hit() + { + return _captures.hit; + } + + /++ + Functionality for processing subsequent matches of global regexes via range interface: + --- + import std.regex; + auto m = matchAll("Hello, world!", regex(`\w+`)); + assert(m.front.hit == "Hello"); + m.popFront(); + assert(m.front.hit == "world"); + m.popFront(); + assert(m.empty); + --- + +/ + @property auto front() + { + return _captures; + } + + ///ditto + void popFront() + { + import std.exception : enforce; + if (counter != 1) + {//do cow magic first + counter--;//we abandon this reference + immutable size = EngineType.initialMemory(_engine.re)+size_t.sizeof; + _memory = (enforce(malloc(size), "malloc failed")[0 .. size]); + _engine = _engine.dupTo(_memory[size_t.sizeof .. size]); + counter = 1;//points to new chunk + } + + if (!_captures.unique) + { + // has external references - allocate new space + _captures.newMatches(_engine.re.ngroup); + } + _captures._nMatch = _engine.match(_captures.matches); + } + + ///ditto + auto save(){ return this; } + + ///Test if this match object is empty. + @property bool empty() const { return _captures._nMatch == 0; } + + ///Same as !(x.empty), provided for its convenience in conditional statements. + T opCast(T:bool)(){ return !empty; } + + /// Same as .front, provided for compatibility with original std.regex. + @property auto captures() inout { return _captures; } + +} + +private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re) +{ + import core.stdc.stdlib : malloc, free; + import std.exception : enforce; + alias Char = BasicElementOf!R; + alias EngineType = Engine!Char; + + size_t size = EngineType.initialMemory(re); + void[] memory = enforce(malloc(size), "malloc failed")[0 .. size]; + scope(exit) free(memory.ptr); + auto captures = Captures!(R, EngineType.DataIndex)(input, re.ngroup, re.dict); + auto engine = EngineType(re, Input!Char(input), memory); + static if (is(RegEx == StaticRegex!(BasicElementOf!R))) + engine.nativeFn = re.nativeFn; + captures._nMatch = engine.match(captures.matches); + return captures; +} + +private auto matchMany(alias Engine, RegEx, R)(R input, RegEx re) +{ + re.flags |= RegexOption.global; + return RegexMatch!(R, Engine)(input, re); +} + +@system unittest +{ + //sanity checks for new API + auto re = regex("abc"); + assert(!"abc".matchOnce!(ThompsonMatcher)(re).empty); + assert("abc".matchOnce!(ThompsonMatcher)(re)[0] == "abc"); +} + + +private enum isReplaceFunctor(alias fun, R) = + __traits(compiles, (Captures!R c) { fun(c); }); + +// the lowest level - just stuff replacements into the sink +private @trusted void replaceCapturesInto(alias output, Sink, R, T) + (ref Sink sink, R input, T captures) +if (isOutputRange!(Sink, dchar) && isSomeString!R) +{ + if (captures.empty) + { + sink.put(input); + return; + } + sink.put(captures.pre); + // a hack to get around bogus errors, should be simply output(captures, sink) + // "is a nested function and cannot be accessed from" + static if (isReplaceFunctor!(output, R)) + sink.put(output(captures)); //"mutator" type of function + else + output(captures, sink); //"output" type of function + sink.put(captures.post); +} + +// ditto for a range of captures +private void replaceMatchesInto(alias output, Sink, R, T) + (ref Sink sink, R input, T matches) +if (isOutputRange!(Sink, dchar) && isSomeString!R) +{ + size_t offset = 0; + foreach (cap; matches) + { + sink.put(cap.pre[offset .. $]); + // same hack, see replaceCapturesInto + static if (isReplaceFunctor!(output, R)) + sink.put(output(cap)); //"mutator" type of function + else + output(cap, sink); //"output" type of function + offset = cap.pre.length + cap.hit.length; + } + sink.put(input[offset .. $]); +} + +// a general skeleton of replaceFirst +private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re) +if (isSomeString!R && isRegexFor!(RegEx, R)) +{ + import std.array : appender; + auto data = matchFirst(input, re); + if (data.empty) + return input; + auto app = appender!(R)(); + replaceCapturesInto!output(app, input, data); + return app.data; +} + +// ditto for replaceAll +// the method parameter allows old API to ride on the back of the new one +private R replaceAllWith(alias output, + alias method=matchAll, R, RegEx)(R input, RegEx re) +if (isSomeString!R && isRegexFor!(RegEx, R)) +{ + import std.array : appender; + auto matches = method(input, re); //inout(C)[] fails + if (matches.empty) + return input; + auto app = appender!(R)(); + replaceMatchesInto!output(app, input, matches); + return app.data; +} + + +/++ + Start matching $(D input) to regex pattern $(D re), + using Thompson NFA matching scheme. + + The use of this function is $(RED discouraged) - use either of + $(LREF matchAll) or $(LREF matchFirst). + + Delegating the kind of operation + to "g" flag is soon to be phased out along with the + ability to choose the exact matching scheme. The choice of + matching scheme to use depends highly on the pattern kind and + can done automatically on case by case basis. + + Returns: a $(D RegexMatch) object holding engine state after first match. ++/ + +public auto match(R, RegEx)(R input, RegEx re) +if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) +{ + import std.regex.internal.thompson : ThompsonMatcher; + return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, re); +} + +///ditto +public auto match(R, String)(R input, String re) +if (isSomeString!R && isSomeString!String) +{ + import std.regex.internal.thompson : ThompsonMatcher; + return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, regex(re)); +} + +public auto match(R, RegEx)(R input, RegEx re) +if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) +{ + import std.regex.internal.backtracking : BacktrackingMatcher; + return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re); +} + +/++ + Find the first (leftmost) slice of the $(D input) that + matches the pattern $(D re). This function picks the most suitable + regular expression engine depending on the pattern properties. + + $(D re) parameter can be one of three types: + $(UL + $(LI Plain string(s), in which case it's compiled to bytecode before matching. ) + $(LI Regex!char (wchar/dchar) that contains a pattern in the form of + compiled bytecode. ) + $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of + compiled native machine code. ) + ) + + Returns: + $(LREF Captures) containing the extent of a match together with all submatches + if there was a match, otherwise an empty $(LREF Captures) object. ++/ +public auto matchFirst(R, RegEx)(R input, RegEx re) +if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) +{ + import std.regex.internal.thompson : ThompsonMatcher; + return matchOnce!ThompsonMatcher(input, re); +} + +///ditto +public auto matchFirst(R, String)(R input, String re) +if (isSomeString!R && isSomeString!String) +{ + import std.regex.internal.thompson : ThompsonMatcher; + return matchOnce!ThompsonMatcher(input, regex(re)); +} + +///ditto +public auto matchFirst(R, String)(R input, String[] re...) +if (isSomeString!R && isSomeString!String) +{ + import std.regex.internal.thompson : ThompsonMatcher; + return matchOnce!ThompsonMatcher(input, regex(re)); +} + +public auto matchFirst(R, RegEx)(R input, RegEx re) +if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) +{ + import std.regex.internal.backtracking : BacktrackingMatcher; + return matchOnce!(BacktrackingMatcher!true)(input, re); +} + +/++ + Initiate a search for all non-overlapping matches to the pattern $(D re) + in the given $(D input). The result is a lazy range of matches generated + as they are encountered in the input going left to right. + + This function picks the most suitable regular expression engine + depending on the pattern properties. + + $(D re) parameter can be one of three types: + $(UL + $(LI Plain string(s), in which case it's compiled to bytecode before matching. ) + $(LI Regex!char (wchar/dchar) that contains a pattern in the form of + compiled bytecode. ) + $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of + compiled native machine code. ) + ) + + Returns: + $(LREF RegexMatch) object that represents matcher state + after the first match was found or an empty one if not present. ++/ +public auto matchAll(R, RegEx)(R input, RegEx re) +if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) +{ + import std.regex.internal.thompson : ThompsonMatcher; + return matchMany!ThompsonMatcher(input, re); +} + +///ditto +public auto matchAll(R, String)(R input, String re) +if (isSomeString!R && isSomeString!String) +{ + import std.regex.internal.thompson : ThompsonMatcher; + return matchMany!ThompsonMatcher(input, regex(re)); +} + +///ditto +public auto matchAll(R, String)(R input, String[] re...) +if (isSomeString!R && isSomeString!String) +{ + import std.regex.internal.thompson : ThompsonMatcher; + return matchMany!ThompsonMatcher(input, regex(re)); +} + +public auto matchAll(R, RegEx)(R input, RegEx re) +if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) +{ + import std.regex.internal.backtracking : BacktrackingMatcher; + return matchMany!(BacktrackingMatcher!true)(input, re); +} + +// another set of tests just to cover the new API +@system unittest +{ + import std.algorithm.comparison : equal; + import std.algorithm.iteration : map; + import std.conv : to; + + foreach (String; AliasSeq!(string, wstring, const(dchar)[])) + { + auto str1 = "blah-bleh".to!String(); + auto pat1 = "bl[ae]h".to!String(); + auto mf = matchFirst(str1, pat1); + assert(mf.equal(["blah".to!String()])); + auto mAll = matchAll(str1, pat1); + assert(mAll.equal!((a,b) => a.equal(b)) + ([["blah".to!String()], ["bleh".to!String()]])); + + auto str2 = "1/03/12 - 3/03/12".to!String(); + auto pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]); + auto mf2 = matchFirst(str2, pat2); + assert(mf2.equal(["1/03/12", "1", "03", "12"].map!(to!String)())); + auto mAll2 = matchAll(str2, pat2); + assert(mAll2.front.equal(mf2)); + mAll2.popFront(); + assert(mAll2.front.equal(["3/03/12", "3", "03", "12"].map!(to!String)())); + mf2.popFrontN(3); + assert(mf2.equal(["12".to!String()])); + + auto ctPat = ctRegex!(`(?P<Quot>\d+)/(?P<Denom>\d+)`.to!String()); + auto str = "2 + 34/56 - 6/1".to!String(); + auto cmf = matchFirst(str, ctPat); + assert(cmf.equal(["34/56", "34", "56"].map!(to!String)())); + assert(cmf["Quot"] == "34".to!String()); + assert(cmf["Denom"] == "56".to!String()); + + auto cmAll = matchAll(str, ctPat); + assert(cmAll.front.equal(cmf)); + cmAll.popFront(); + assert(cmAll.front.equal(["6/1", "6", "1"].map!(to!String)())); + } +} + +/++ + Start matching of $(D input) to regex pattern $(D re), + using traditional $(LINK2 https://en.wikipedia.org/wiki/Backtracking, + backtracking) matching scheme. + + The use of this function is $(RED discouraged) - use either of + $(LREF matchAll) or $(LREF matchFirst). + + Delegating the kind of operation + to "g" flag is soon to be phased out along with the + ability to choose the exact matching scheme. The choice of + matching scheme to use depends highly on the pattern kind and + can done automatically on case by case basis. + + Returns: a $(D RegexMatch) object holding engine + state after first match. + ++/ +public auto bmatch(R, RegEx)(R input, RegEx re) +if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) +{ + import std.regex.internal.backtracking : BacktrackingMatcher; + return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, re); +} + +///ditto +public auto bmatch(R, String)(R input, String re) +if (isSomeString!R && isSomeString!String) +{ + import std.regex.internal.backtracking : BacktrackingMatcher; + return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, regex(re)); +} + +public auto bmatch(R, RegEx)(R input, RegEx re) +if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) +{ + import std.regex.internal.backtracking : BacktrackingMatcher; + return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re); +} + +// produces replacement string from format using captures for substitution +package void replaceFmt(R, Capt, OutR) + (R format, Capt captures, OutR sink, bool ignoreBadSubs = false) +if (isOutputRange!(OutR, ElementEncodingType!R[]) && + isOutputRange!(OutR, ElementEncodingType!(Capt.String)[])) +{ + import std.algorithm.searching : find; + import std.ascii : isDigit, isAlpha; + import std.conv : text, parse; + import std.exception : enforce; + enum State { Normal, Dollar } + auto state = State.Normal; + size_t offset; +L_Replace_Loop: + while (!format.empty) + final switch (state) + { + case State.Normal: + for (offset = 0; offset < format.length; offset++)//no decoding + { + if (format[offset] == '$') + { + state = State.Dollar; + sink.put(format[0 .. offset]); + format = format[offset+1 .. $];//ditto + continue L_Replace_Loop; + } + } + sink.put(format[0 .. offset]); + format = format[offset .. $]; + break; + case State.Dollar: + if (isDigit(format[0])) + { + uint digit = parse!uint(format); + enforce(ignoreBadSubs || digit < captures.length, text("invalid submatch number ", digit)); + if (digit < captures.length) + sink.put(captures[digit]); + } + else if (format[0] == '{') + { + auto x = find!(a => !isAlpha(a))(format[1..$]); + enforce(!x.empty && x[0] == '}', "no matching '}' in replacement format"); + auto name = format[1 .. $ - x.length]; + format = x[1..$]; + enforce(!name.empty, "invalid name in ${...} replacement format"); + sink.put(captures[name]); + } + else if (format[0] == '&') + { + sink.put(captures[0]); + format = format[1 .. $]; + } + else if (format[0] == '`') + { + sink.put(captures.pre); + format = format[1 .. $]; + } + else if (format[0] == '\'') + { + sink.put(captures.post); + format = format[1 .. $]; + } + else if (format[0] == '$') + { + sink.put(format[0 .. 1]); + format = format[1 .. $]; + } + state = State.Normal; + break; + } + enforce(state == State.Normal, "invalid format string in regex replace"); +} + +/++ + Construct a new string from $(D input) by replacing the first match with + a string generated from it according to the $(D format) specifier. + + To replace all matches use $(LREF replaceAll). + + Params: + input = string to search + re = compiled regular expression to use + format = _format string to generate replacements from, + see $(S_LINK Replace _format string, the _format string). + + Returns: + A string of the same type with the first match (if any) replaced. + If no match is found returns the input string itself. ++/ +public R replaceFirst(R, C, RegEx)(R input, RegEx re, const(C)[] format) +if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) +{ + return replaceFirstWith!((m, sink) => replaceFmt(format, m, sink))(input, re); +} + +/// +@system unittest +{ + assert(replaceFirst("noon", regex("n"), "[$&]") == "[n]oon"); +} + +/++ + This is a general replacement tool that construct a new string by replacing + matches of pattern $(D re) in the $(D input). Unlike the other overload + there is no format string instead captures are passed to + to a user-defined functor $(D fun) that returns a new string + to use as replacement. + + This version replaces the first match in $(D input), + see $(LREF replaceAll) to replace the all of the matches. + + Returns: + A new string of the same type as $(D input) with all matches + replaced by return values of $(D fun). If no matches found + returns the $(D input) itself. ++/ +public R replaceFirst(alias fun, R, RegEx)(R input, RegEx re) +if (isSomeString!R && isRegexFor!(RegEx, R)) +{ + return replaceFirstWith!((m, sink) => sink.put(fun(m)))(input, re); +} + +/// +@system unittest +{ + import std.conv : to; + string list = "#21 out of 46"; + string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1)) + (list, regex(`[0-9]+`)); + assert(newList == "#22 out of 46"); +} + +/++ + A variation on $(LREF replaceFirst) that instead of allocating a new string + on each call outputs the result piece-wise to the $(D sink). In particular + this enables efficient construction of a final output incrementally. + + Like in $(LREF replaceFirst) family of functions there is an overload + for the substitution guided by the $(D format) string + and the one with the user defined callback. ++/ +public @trusted void replaceFirstInto(Sink, R, C, RegEx) + (ref Sink sink, R input, RegEx re, const(C)[] format) +if (isOutputRange!(Sink, dchar) && isSomeString!R + && is(C : dchar) && isRegexFor!(RegEx, R)) + { + replaceCapturesInto!((m, sink) => replaceFmt(format, m, sink)) + (sink, input, matchFirst(input, re)); + } + +///ditto +public @trusted void replaceFirstInto(alias fun, Sink, R, RegEx) + (Sink sink, R input, RegEx re) +if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R)) +{ + replaceCapturesInto!fun(sink, input, matchFirst(input, re)); +} + +/// +@system unittest +{ + import std.array; + string m1 = "first message\n"; + string m2 = "second message\n"; + auto result = appender!string(); + replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1"); + //equivalent of the above with user-defined callback + replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`)); + assert(result.data == "first\nsecond\n"); +} + +//examples for replaceFirst +@system unittest +{ + import std.conv; + string list = "#21 out of 46"; + string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1)) + (list, regex(`[0-9]+`)); + assert(newList == "#22 out of 46"); + import std.array; + string m1 = "first message\n"; + string m2 = "second message\n"; + auto result = appender!string(); + replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1"); + //equivalent of the above with user-defined callback + replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`)); + assert(result.data == "first\nsecond\n"); +} + +/++ + Construct a new string from $(D input) by replacing all of the + fragments that match a pattern $(D re) with a string generated + from the match according to the $(D format) specifier. + + To replace only the first match use $(LREF replaceFirst). + + Params: + input = string to search + re = compiled regular expression to use + format = _format string to generate replacements from, + see $(S_LINK Replace _format string, the _format string). + + Returns: + A string of the same type as $(D input) with the all + of the matches (if any) replaced. + If no match is found returns the input string itself. ++/ +public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format) +if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) +{ + return replaceAllWith!((m, sink) => replaceFmt(format, m, sink))(input, re); +} + +/// +@system unittest +{ + // insert comma as thousands delimiter + auto re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g"); + assert(replaceAll("12000 + 42100 = 54100", re, ",") == "12,000 + 42,100 = 54,100"); +} + +/++ + This is a general replacement tool that construct a new string by replacing + matches of pattern $(D re) in the $(D input). Unlike the other overload + there is no format string instead captures are passed to + to a user-defined functor $(D fun) that returns a new string + to use as replacement. + + This version replaces all of the matches found in $(D input), + see $(LREF replaceFirst) to replace the first match only. + + Returns: + A new string of the same type as $(D input) with all matches + replaced by return values of $(D fun). If no matches found + returns the $(D input) itself. + + Params: + input = string to search + re = compiled regular expression + fun = delegate to use ++/ +public @trusted R replaceAll(alias fun, R, RegEx)(R input, RegEx re) +if (isSomeString!R && isRegexFor!(RegEx, R)) +{ + return replaceAllWith!((m, sink) => sink.put(fun(m)))(input, re); +} + +/// +@system unittest +{ + string baz(Captures!(string) m) + { + import std.string : toUpper; + return toUpper(m.hit); + } + // Capitalize the letters 'a' and 'r': + auto s = replaceAll!(baz)("Strap a rocket engine on a chicken.", + regex("[ar]")); + assert(s == "StRAp A Rocket engine on A chicken."); +} + +/++ + A variation on $(LREF replaceAll) that instead of allocating a new string + on each call outputs the result piece-wise to the $(D sink). In particular + this enables efficient construction of a final output incrementally. + + As with $(LREF replaceAll) there are 2 overloads - one with a format string, + the other one with a user defined functor. ++/ +public @trusted void replaceAllInto(Sink, R, C, RegEx) + (Sink sink, R input, RegEx re, const(C)[] format) +if (isOutputRange!(Sink, dchar) && isSomeString!R + && is(C : dchar) && isRegexFor!(RegEx, R)) + { + replaceMatchesInto!((m, sink) => replaceFmt(format, m, sink)) + (sink, input, matchAll(input, re)); + } + +///ditto +public @trusted void replaceAllInto(alias fun, Sink, R, RegEx) + (Sink sink, R input, RegEx re) +if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R)) +{ + replaceMatchesInto!fun(sink, input, matchAll(input, re)); +} + +/// +@system unittest +{ + // insert comma as thousands delimiter in fifty randomly produced big numbers + import std.array, std.conv, std.random, std.range; + static re = regex(`(?<=\d)(?=(\d\d\d)+\b)`, "g"); + auto sink = appender!(char [])(); + enum ulong min = 10UL ^^ 10, max = 10UL ^^ 19; + foreach (i; 0 .. 50) + { + sink.clear(); + replaceAllInto(sink, text(uniform(min, max)), re, ","); + foreach (pos; iota(sink.data.length - 4, 0, -4)) + assert(sink.data[pos] == ','); + } +} + +// exercise all of the replace APIs +@system unittest +{ + import std.array : appender; + import std.conv; + // try and check first/all simple substitution + foreach (S; AliasSeq!(string, wstring, dstring, char[], wchar[], dchar[])) + { + S s1 = "curt trial".to!S(); + S s2 = "round dome".to!S(); + S t1F = "court trial".to!S(); + S t2F = "hound dome".to!S(); + S t1A = "court trial".to!S(); + S t2A = "hound home".to!S(); + auto re1 = regex("curt".to!S()); + auto re2 = regex("[dr]o".to!S()); + + assert(replaceFirst(s1, re1, "court") == t1F); + assert(replaceFirst(s2, re2, "ho") == t2F); + assert(replaceAll(s1, re1, "court") == t1A); + assert(replaceAll(s2, re2, "ho") == t2A); + + auto rep1 = replaceFirst!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1); + assert(rep1 == t1F); + assert(replaceFirst!(cap => "ho".to!S())(s2, re2) == t2F); + auto rep1A = replaceAll!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1); + assert(rep1A == t1A); + assert(replaceAll!(cap => "ho".to!S())(s2, re2) == t2A); + + auto sink = appender!S(); + replaceFirstInto(sink, s1, re1, "court"); + assert(sink.data == t1F); + replaceFirstInto(sink, s2, re2, "ho"); + assert(sink.data == t1F~t2F); + replaceAllInto(sink, s1, re1, "court"); + assert(sink.data == t1F~t2F~t1A); + replaceAllInto(sink, s2, re2, "ho"); + assert(sink.data == t1F~t2F~t1A~t2A); + } +} + +/++ + Old API for replacement, operation depends on flags of pattern $(D re). + With "g" flag it performs the equivalent of $(LREF replaceAll) otherwise it + works the same as $(LREF replaceFirst). + + The use of this function is $(RED discouraged), please use $(LREF replaceAll) + or $(LREF replaceFirst) explicitly. ++/ +public R replace(alias scheme = match, R, C, RegEx)(R input, RegEx re, const(C)[] format) +if (isSomeString!R && isRegexFor!(RegEx, R)) +{ + return replaceAllWith!((m, sink) => replaceFmt(format, m, sink), match)(input, re); +} + +///ditto +public R replace(alias fun, R, RegEx)(R input, RegEx re) +if (isSomeString!R && isRegexFor!(RegEx, R)) +{ + return replaceAllWith!(fun, match)(input, re); +} + +/** +Splits a string `r` using a regular expression `pat` as a separator. + +Params: + keepSeparators = flag to specify if the matches should be in the resulting range + r = the string to split + pat = the pattern to split on +Returns: + A lazy range of strings +*/ +public struct Splitter(Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, alias RegEx = Regex) +if (isSomeString!Range && isRegexFor!(RegEx, Range)) +{ +private: + Range _input; + size_t _offset; + alias Rx = typeof(match(Range.init,RegEx.init)); + Rx _match; + + static if (keepSeparators) bool onMatch = false; + + @trusted this(Range input, RegEx separator) + {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted + _input = input; + separator.flags |= RegexOption.global; + if (_input.empty) + { + //there is nothing to match at all, make _offset > 0 + _offset = 1; + } + else + { + _match = Rx(_input, separator); + + static if (keepSeparators) + if (_match.pre.empty) + popFront(); + } + } + +public: + auto ref opSlice() + { + return this.save; + } + + ///Forward range primitives. + @property Range front() + { + import std.algorithm.comparison : min; + + assert(!empty && _offset <= _match.pre.length + && _match.pre.length <= _input.length); + + static if (keepSeparators) + { + if (!onMatch) + return _input[_offset .. min($, _match.pre.length)]; + else + return _match.hit(); + } + else + { + return _input[_offset .. min($, _match.pre.length)]; + } + } + + ///ditto + @property bool empty() + { + static if (keepSeparators) + return _offset >= _input.length; + else + return _offset > _input.length; + } + + ///ditto + void popFront() + { + assert(!empty); + if (_match.empty) + { + //No more separators, work is done here + _offset = _input.length + 1; + } + else + { + static if (keepSeparators) + { + if (!onMatch) + { + //skip past the separator + _offset = _match.pre.length; + } + else + { + _offset += _match.hit.length; + _match.popFront(); + } + + onMatch = !onMatch; + } + else + { + //skip past the separator + _offset = _match.pre.length + _match.hit.length; + _match.popFront(); + } + } + } + + ///ditto + @property auto save() + { + return this; + } +} + +/// ditto +public Splitter!(keepSeparators, Range, RegEx) splitter( + Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx)(Range r, RegEx pat) +if ( + is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range)) +{ + return Splitter!(keepSeparators, Range, RegEx)(r, pat); +} + +/// +@system unittest +{ + import std.algorithm.comparison : equal; + auto s1 = ", abc, de, fg, hi, "; + assert(equal(splitter(s1, regex(", *")), + ["", "abc", "de", "fg", "hi", ""])); +} + +/// Split on a pattern, but keep the matches in the resulting range +@system unittest +{ + import std.algorithm.comparison : equal; + import std.typecons : Yes; + + auto pattern = regex(`([\.,])`); + + assert("2003.04.05" + .splitter!(Yes.keepSeparators)(pattern) + .equal(["2003", ".", "04", ".", "05"])); + + assert(",1,2,3" + .splitter!(Yes.keepSeparators)(pattern) + .equal([",", "1", ",", "2", ",", "3"])); +} + +///An eager version of $(D splitter) that creates an array with splitted slices of $(D input). +public @trusted String[] split(String, RegEx)(String input, RegEx rx) +if (isSomeString!String && isRegexFor!(RegEx, String)) +{ + import std.array : appender; + auto a = appender!(String[])(); + foreach (e; splitter(input, rx)) + a.put(e); + return a.data; +} + +///Exception object thrown in case of errors during regex compilation. +public alias RegexException = std.regex.internal.ir.RegexException; + +/++ + A range that lazily produces a string output escaped + to be used inside of a regular expression. ++/ +auto escaper(Range)(Range r) +{ + import std.algorithm.searching : find; + static immutable escapables = [Escapables]; + static struct Escaper // template to deduce attributes + { + Range r; + bool escaped; + + @property ElementType!Range front(){ + if (escaped) + return '\\'; + else + return r.front; + } + + @property bool empty(){ return r.empty; } + + void popFront(){ + if (escaped) escaped = false; + else + { + r.popFront(); + if (!r.empty && !escapables.find(r.front).empty) + escaped = true; + } + } + + @property auto save(){ return Escaper(r.save, escaped); } + } + + bool escaped = !r.empty && !escapables.find(r.front).empty; + return Escaper(r, escaped); +} + +/// +@system unittest +{ + import std.algorithm.comparison; + import std.regex; + string s = `This is {unfriendly} to *regex*`; + assert(s.escaper.equal(`This is \{unfriendly\} to \*regex\*`)); +} + +@system unittest +{ + import std.algorithm.comparison; + import std.conv; + foreach (S; AliasSeq!(string, wstring, dstring)) + { + auto s = "^".to!S; + assert(s.escaper.equal(`\^`)); + auto s2 = ""; + assert(s2.escaper.equal("")); + } +} |