diff options
Diffstat (limited to 'libphobos/src/std/regex/internal')
-rw-r--r-- | libphobos/src/std/regex/internal/backtracking.d | 1495 | ||||
-rw-r--r-- | libphobos/src/std/regex/internal/generator.d | 187 | ||||
-rw-r--r-- | libphobos/src/std/regex/internal/ir.d | 788 | ||||
-rw-r--r-- | libphobos/src/std/regex/internal/kickstart.d | 579 | ||||
-rw-r--r-- | libphobos/src/std/regex/internal/parser.d | 1751 | ||||
-rw-r--r-- | libphobos/src/std/regex/internal/tests.d | 1120 | ||||
-rw-r--r-- | libphobos/src/std/regex/internal/thompson.d | 1188 |
7 files changed, 7108 insertions, 0 deletions
diff --git a/libphobos/src/std/regex/internal/backtracking.d b/libphobos/src/std/regex/internal/backtracking.d new file mode 100644 index 0000000..ffc9779 --- /dev/null +++ b/libphobos/src/std/regex/internal/backtracking.d @@ -0,0 +1,1495 @@ +/* + Implementation of backtracking std.regex engine. + Contains both compile-time and run-time versions. +*/ +module std.regex.internal.backtracking; + +package(std.regex): + +import core.stdc.stdlib, std.range.primitives, std.traits, std.typecons; +import std.regex.internal.ir; + +/+ + BacktrackingMatcher implements backtracking scheme of matching + regular expressions. ++/ +template BacktrackingMatcher(bool CTregex) +{ + @trusted struct BacktrackingMatcher(Char, Stream = Input!Char) + if (is(Char : dchar)) + { + alias DataIndex = Stream.DataIndex; + struct State + {//top bit in pc is set if saved along with matches + DataIndex index; + uint pc, counter, infiniteNesting; + } + static assert(State.sizeof % size_t.sizeof == 0); + enum stateSize = State.sizeof / size_t.sizeof; + enum initialStack = 1 << 11; // items in a block of segmented stack + alias String = const(Char)[]; + alias RegEx = Regex!Char; + alias MatchFn = bool function (ref BacktrackingMatcher!(Char, Stream)); + RegEx re; //regex program + static if (CTregex) + MatchFn nativeFn; //native code for that program + //Stream state + Stream s; + DataIndex index; + dchar front; + bool exhausted; + //backtracking machine state + uint pc, counter; + DataIndex lastState = 0; //top of state stack + static if (!CTregex) + uint infiniteNesting; + size_t[] memory; + Trace[] merge; + static struct Trace + { + ulong mask; + size_t offset; + + bool mark(size_t idx) + { + immutable d = idx - offset; + if (d < 64) // including overflow + { + immutable p = mask & (1UL << d); + mask |= 1UL << d; + return p != 0; + } + else + { + offset = idx; + mask = 1; + return false; + } + } + } + //local slice of matches, global for backref + Group!DataIndex[] matches, backrefed; + + static if (__traits(hasMember,Stream, "search")) + { + enum kicked = true; + } + else + enum kicked = false; + + static size_t initialMemory(const ref RegEx re) + { + return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; + } + + static size_t stackSize(const ref RegEx re) + { + size_t itemSize = stateSize + + re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof; + return initialStack * itemSize + 2; + } + + @property bool atStart(){ return index == 0; } + + @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } + + void next() + { + if (!s.nextChar(front, index)) + index = s.lastIndex; + } + + void search() + { + static if (kicked) + { + if (!s.search(re.kickstart, front, index)) + { + index = s.lastIndex; + } + } + else + next(); + } + + // + void newStack() + { + auto chunk = mallocArray!(size_t)(stackSize(re)); + chunk[0] = cast(size_t)(memory.ptr); + chunk[1] = lastState; + memory = chunk[2..$]; + lastState = 0; + } + + bool prevStack() + { + // pointer to previous block + size_t* prev = cast(size_t*) memory.ptr[-2]; + if (!prev) + { + // The last segment is freed in RegexMatch + return false; + } + else + { + import core.stdc.stdlib : free; + // memory used in previous block + size_t size = memory.ptr[-1]; + free(memory.ptr-2); + memory = prev[0 .. size]; + lastState = size; + return true; + } + } + + void initExternalMemory(void[] memBlock) + { + merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); + merge[] = Trace.init; + memory = cast(size_t[]) memBlock; + memory[0] = 0; // hidden pointer + memory[1] = 0; // used size + memory = memory[2..$]; + } + + void initialize(ref RegEx program, Stream stream, void[] memBlock) + { + re = program; + s = stream; + exhausted = false; + initExternalMemory(memBlock); + backrefed = null; + } + + auto dupTo(void[] memory) + { + typeof(this) tmp = this; + tmp.initExternalMemory(memory); + return tmp; + } + + this(ref RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) + { + initialize(program, stream, memBlock); + front = ch; + index = idx; + } + + this(ref RegEx program, Stream stream, void[] memBlock) + { + initialize(program, stream, memBlock); + next(); + } + + auto fwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock) + { + alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); + alias BackMatcher = BackMatcherTempl!(Char, Stream); + auto fwdMatcher = BackMatcher(matcher.re, s, memBlock, front, index); + return fwdMatcher; + } + + auto bwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock) + { + alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); + alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index))); + auto fwdMatcher = + BackMatcher(matcher.re, s.loopBack(index), memBlock); + return fwdMatcher; + } + + // + int matchFinalize() + { + immutable start = index; + immutable val = matchImpl(); + if (val) + {//stream is updated here + matches[0].begin = start; + matches[0].end = index; + if (!(re.flags & RegexOption.global) || atEnd) + exhausted = true; + if (start == index)//empty match advances input + next(); + return val; + } + else + return 0; + } + + //lookup next match, fill matches with indices into input + int match(Group!DataIndex[] matches) + { + debug(std_regex_matcher) + { + writeln("------------------------------------------"); + } + if (exhausted) //all matches collected + return false; + this.matches = matches; + if (re.flags & RegexInfo.oneShot) + { + exhausted = true; + const DataIndex start = index; + immutable m = matchImpl(); + if (m) + { + matches[0].begin = start; + matches[0].end = index; + } + return m; + } + static if (kicked) + { + if (!re.kickstart.empty) + { + for (;;) + { + immutable val = matchFinalize(); + if (val) + return val; + else + { + if (atEnd) + break; + search(); + if (atEnd) + { + exhausted = true; + return matchFinalize(); + } + } + } + exhausted = true; + return 0; //early return + } + } + //no search available - skip a char at a time + for (;;) + { + immutable val = matchFinalize(); + if (val) + return val; + else + { + if (atEnd) + break; + next(); + if (atEnd) + { + exhausted = true; + return matchFinalize(); + } + } + } + exhausted = true; + return 0; + } + + /+ + match subexpression against input, + results are stored in matches + +/ + int matchImpl() + { + static if (CTregex && is(typeof(nativeFn(this)))) + { + debug(std_regex_ctr) writeln("using C-T matcher"); + return nativeFn(this); + } + else + { + pc = 0; + counter = 0; + lastState = 0; + matches[] = Group!DataIndex.init; + auto start = s._index; + debug(std_regex_matcher) + writeln("Try match starting at ", s[index .. s.lastIndex]); + for (;;) + { + debug(std_regex_matcher) + writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s", + pc, counter, disassemble(re.ir, pc, re.dict), + front, s._index); + switch (re.ir[pc].code) + { + case IR.OrChar://assumes IRL!(OrChar) == 1 + if (atEnd) + goto L_backtrack; + uint len = re.ir[pc].sequence; + uint end = pc + len; + if (re.ir[pc].data != front && re.ir[pc+1].data != front) + { + for (pc = pc+2; pc < end; pc++) + if (re.ir[pc].data == front) + break; + if (pc == end) + goto L_backtrack; + } + pc = end; + next(); + break; + case IR.Char: + if (atEnd || front != re.ir[pc].data) + goto L_backtrack; + pc += IRL!(IR.Char); + next(); + break; + case IR.Any: + if (atEnd) + goto L_backtrack; + pc += IRL!(IR.Any); + next(); + break; + case IR.CodepointSet: + if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front)) + goto L_backtrack; + next(); + pc += IRL!(IR.CodepointSet); + break; + case IR.Trie: + if (atEnd || !re.matchers[re.ir[pc].data][front]) + goto L_backtrack; + next(); + pc += IRL!(IR.Trie); + break; + case IR.Wordboundary: + dchar back; + DataIndex bi; + //at start & end of input + if (atStart && wordMatcher[front]) + { + pc += IRL!(IR.Wordboundary); + break; + } + else if (atEnd && s.loopBack(index).nextChar(back, bi) + && wordMatcher[back]) + { + pc += IRL!(IR.Wordboundary); + break; + } + else if (s.loopBack(index).nextChar(back, bi)) + { + immutable af = wordMatcher[front]; + immutable ab = wordMatcher[back]; + if (af ^ ab) + { + pc += IRL!(IR.Wordboundary); + break; + } + } + goto L_backtrack; + case IR.Notwordboundary: + dchar back; + DataIndex bi; + //at start & end of input + if (atStart && wordMatcher[front]) + goto L_backtrack; + else if (atEnd && s.loopBack(index).nextChar(back, bi) + && wordMatcher[back]) + goto L_backtrack; + else if (s.loopBack(index).nextChar(back, bi)) + { + immutable af = wordMatcher[front]; + immutable ab = wordMatcher[back]; + if (af ^ ab) + goto L_backtrack; + } + pc += IRL!(IR.Wordboundary); + break; + case IR.Bof: + if (atStart) + pc += IRL!(IR.Bol); + else + goto L_backtrack; + break; + case IR.Bol: + dchar back; + DataIndex bi; + if (atStart) + pc += IRL!(IR.Bol); + else if (s.loopBack(index).nextChar(back,bi) + && endOfLine(back, front == '\n')) + { + pc += IRL!(IR.Bol); + } + else + goto L_backtrack; + break; + case IR.Eof: + if (atEnd) + pc += IRL!(IR.Eol); + else + goto L_backtrack; + break; + case IR.Eol: + dchar back; + DataIndex bi; + debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]); + //no matching inside \r\n + if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) + && back == '\r'))) + { + pc += IRL!(IR.Eol); + } + else + goto L_backtrack; + break; + case IR.InfiniteStart, IR.InfiniteQStart: + pc += re.ir[pc].data + IRL!(IR.InfiniteStart); + //now pc is at end IR.Infinite(Q)End + uint len = re.ir[pc].data; + if (re.ir[pc].code == IR.InfiniteEnd) + { + pushState(pc+IRL!(IR.InfiniteEnd), counter); + pc -= len; + } + else + { + pushState(pc - len, counter); + pc += IRL!(IR.InfiniteEnd); + } + break; + case IR.InfiniteBloomStart: + pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart); + //now pc is at end IR.InfiniteBloomEnd + immutable len = re.ir[pc].data; + immutable filterIdx = re.ir[pc+2].raw; + if (re.filters[filterIdx][front]) + pushState(pc+IRL!(IR.InfiniteBloomEnd), counter); + pc -= len; + break; + case IR.RepeatStart, IR.RepeatQStart: + pc += re.ir[pc].data + IRL!(IR.RepeatStart); + break; + case IR.RepeatEnd: + case IR.RepeatQEnd: + if (merge[re.ir[pc + 1].raw+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + //len, step, min, max + immutable len = re.ir[pc].data; + immutable step = re.ir[pc+2].raw; + immutable min = re.ir[pc+3].raw; + immutable max = re.ir[pc+4].raw; + if (counter < min) + { + counter += step; + pc -= len; + } + else if (counter < max) + { + if (re.ir[pc].code == IR.RepeatEnd) + { + pushState(pc + IRL!(IR.RepeatEnd), counter%step); + counter += step; + pc -= len; + } + else + { + pushState(pc - len, counter + step); + counter = counter%step; + pc += IRL!(IR.RepeatEnd); + } + } + else + { + counter = counter%step; + pc += IRL!(IR.RepeatEnd); + } + break; + case IR.InfiniteEnd: + case IR.InfiniteQEnd: + debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); + if (merge[re.ir[pc + 1].raw+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + immutable len = re.ir[pc].data; + if (re.ir[pc].code == IR.InfiniteEnd) + { + pushState(pc + IRL!(IR.InfiniteEnd), counter); + pc -= len; + } + else + { + pushState(pc-len, counter); + pc += IRL!(IR.InfiniteEnd); + } + break; + case IR.InfiniteBloomEnd: + debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); + if (merge[re.ir[pc + 1].raw+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + immutable len = re.ir[pc].data; + immutable filterIdx = re.ir[pc+2].raw; + if (re.filters[filterIdx][front]) + { + infiniteNesting--; + pushState(pc + IRL!(IR.InfiniteBloomEnd), counter); + infiniteNesting++; + } + pc -= len; + break; + case IR.OrEnd: + if (merge[re.ir[pc + 1].raw+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + pc += IRL!(IR.OrEnd); + break; + case IR.OrStart: + pc += IRL!(IR.OrStart); + goto case; + case IR.Option: + immutable len = re.ir[pc].data; + if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one + { + pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch + } + pc += IRL!(IR.Option); + break; + case IR.GotoEndOr: + pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr); + break; + case IR.GroupStart: + immutable n = re.ir[pc].data; + matches[n].begin = index; + debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index); + pc += IRL!(IR.GroupStart); + break; + case IR.GroupEnd: + immutable n = re.ir[pc].data; + matches[n].end = index; + debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index); + pc += IRL!(IR.GroupEnd); + break; + case IR.LookaheadStart: + case IR.NeglookaheadStart: + immutable len = re.ir[pc].data; + auto save = index; + immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; + auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; + scope(exit) free(mem.ptr); + static if (Stream.isLoopback) + { + auto matcher = bwdMatcher(this, mem); + } + else + { + auto matcher = fwdMatcher(this, mem); + } + matcher.matches = matches[ms .. me]; + matcher.backrefed = backrefed.empty ? matches : backrefed; + matcher.re.ir = re.ir[ + pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) + ]; + immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart); + s.reset(save); + next(); + if (!match) + goto L_backtrack; + else + { + pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd); + } + break; + case IR.LookbehindStart: + case IR.NeglookbehindStart: + immutable len = re.ir[pc].data; + immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; + auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; + scope(exit) free(mem.ptr); + static if (Stream.isLoopback) + { + alias Matcher = BacktrackingMatcher!(Char, Stream); + auto matcher = Matcher(re, s, mem, front, index); + } + else + { + alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); + auto matcher = Matcher(re, s.loopBack(index), mem); + } + matcher.matches = matches[ms .. me]; + matcher.re.ir = re.ir[ + pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) + ]; + matcher.backrefed = backrefed.empty ? matches : backrefed; + immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart); + if (!match) + goto L_backtrack; + else + { + pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd); + } + break; + case IR.Backref: + immutable n = re.ir[pc].data; + auto referenced = re.ir[pc].localRef + ? s[matches[n].begin .. matches[n].end] + : s[backrefed[n].begin .. backrefed[n].end]; + while (!atEnd && !referenced.empty && front == referenced.front) + { + next(); + referenced.popFront(); + } + if (referenced.empty) + pc++; + else + goto L_backtrack; + break; + case IR.Nop: + pc += IRL!(IR.Nop); + break; + case IR.LookaheadEnd: + case IR.NeglookaheadEnd: + case IR.LookbehindEnd: + case IR.NeglookbehindEnd: + case IR.End: + // cleanup stale stack blocks if any + while (prevStack()) {} + return re.ir[pc].data; + default: + debug printBytecode(re.ir[0..$]); + assert(0); + L_backtrack: + if (!popState()) + { + s.reset(start); + return 0; + } + } + } + } + assert(0); + } + + @property size_t stackAvail() + { + return memory.length - lastState; + } + + void stackPush(T)(T val) + if (!isDynamicArray!T) + { + *cast(T*)&memory[lastState] = val; + enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; + lastState += delta; + debug(std_regex_matcher) writeln("push element SP= ", lastState); + } + + void stackPush(T)(T[] val) + { + static assert(T.sizeof % size_t.sizeof == 0); + (cast(T*)&memory[lastState])[0 .. val.length] + = val[0..$]; + lastState += val.length*(T.sizeof/size_t.sizeof); + debug(std_regex_matcher) writeln("push array SP= ", lastState); + } + + void stackPop(T)(ref T val) + if (!isDynamicArray!T) + { + enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; + lastState -= delta; + val = *cast(T*)&memory[lastState]; + debug(std_regex_matcher) writeln("pop element SP= ", lastState); + } + + void stackPop(T)(T[] val) + { + stackPop(val); // call ref version + } + void stackPop(T)(ref T[] val) + { + lastState -= val.length*(T.sizeof/size_t.sizeof); + val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length]; + debug(std_regex_matcher) writeln("pop array SP= ", lastState); + } + + static if (!CTregex) + { + //helper function, saves engine state + void pushState(uint pc, uint counter) + { + if (stateSize + 2 * matches.length > stackAvail) + { + newStack(); + } + *cast(State*)&memory[lastState] = + State(index, pc, counter, infiniteNesting); + lastState += stateSize; + memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[]; + lastState += 2*matches.length; + debug(std_regex_matcher) + writefln("Saved(pc=%s) front: %s src: %s", + pc, front, s[index .. s.lastIndex]); + } + + //helper function, restores engine state + bool popState() + { + if (!lastState && !prevStack()) + return false; + lastState -= 2*matches.length; + auto pm = cast(size_t[]) matches; + pm[] = memory[lastState .. lastState + 2 * matches.length]; + lastState -= stateSize; + State* state = cast(State*)&memory[lastState]; + index = state.index; + pc = state.pc; + counter = state.counter; + infiniteNesting = state.infiniteNesting; + debug(std_regex_matcher) + { + writefln("Restored matches", front, s[index .. s.lastIndex]); + foreach (i, m; matches) + writefln("Sub(%d) : %s..%s", i, m.begin, m.end); + } + s.reset(index); + next(); + debug(std_regex_matcher) + writefln("Backtracked (pc=%s) front: %s src: %s", + pc, front, s[index .. s.lastIndex]); + return true; + } + } + } +} + +//very shitty string formatter, $$ replaced with next argument converted to string +@trusted string ctSub( U...)(string format, U args) +{ + import std.conv : to; + bool seenDollar; + foreach (i, ch; format) + { + if (ch == '$') + { + if (seenDollar) + { + static if (args.length > 0) + { + return format[0 .. i - 1] ~ to!string(args[0]) + ~ ctSub(format[i + 1 .. $], args[1 .. $]); + } + else + assert(0); + } + else + seenDollar = true; + } + else + seenDollar = false; + + } + return format; +} + +alias Sequence(int B, int E) = staticIota!(B, E); + +struct CtContext +{ + import std.conv : to, text; + //dirty flags + bool counter; + //to mark the portion of matches to save + int match, total_matches; + int reserved; + CodepointSet[] charsets; + + + //state of codegenerator + static struct CtState + { + string code; + int addr; + } + + this(Char)(Regex!Char re) + { + match = 1; + reserved = 1; //first match is skipped + total_matches = re.ngroup; + charsets = re.charsets; + } + + CtContext lookaround(uint s, uint e) + { + CtContext ct; + ct.total_matches = e - s; + ct.match = 1; + return ct; + } + + //restore state having current context + string restoreCode() + { + string text; + //stack is checked in L_backtrack + text ~= counter + ? " + stackPop(counter);" + : " + counter = 0;"; + if (match < total_matches) + { + text ~= ctSub(" + stackPop(matches[$$..$$]);", reserved, match); + text ~= ctSub(" + matches[$$..$] = typeof(matches[0]).init;", match); + } + else + text ~= ctSub(" + stackPop(matches[$$..$]);", reserved); + return text; + } + + //save state having current context + string saveCode(uint pc, string count_expr="counter") + { + string text = ctSub(" + if (stackAvail < $$*(Group!(DataIndex)).sizeof/size_t.sizeof + $$) + { + newStack(); + }", match - reserved, cast(int) counter + 2); + if (match < total_matches) + text ~= ctSub(" + stackPush(matches[$$..$$]);", reserved, match); + else + text ~= ctSub(" + stackPush(matches[$$..$]);", reserved); + text ~= counter ? ctSub(" + stackPush($$);", count_expr) : ""; + text ~= ctSub(" + stackPush(index); stackPush($$); \n", pc); + return text; + } + + // + CtState ctGenBlock(Bytecode[] ir, int addr) + { + CtState result; + result.addr = addr; + while (!ir.empty) + { + auto n = ctGenGroup(ir, result.addr); + result.code ~= n.code; + result.addr = n.addr; + } + return result; + } + + // + CtState ctGenGroup(ref Bytecode[] ir, int addr) + { + import std.algorithm.comparison : max; + auto bailOut = "goto L_backtrack;"; + auto nextInstr = ctSub("goto case $$;", addr+1); + CtState r; + assert(!ir.empty); + switch (ir[0].code) + { + case IR.InfiniteStart, IR.InfiniteBloomStart,IR.InfiniteQStart, IR.RepeatStart, IR.RepeatQStart: + immutable infLoop = + ir[0].code == IR.InfiniteStart || ir[0].code == IR.InfiniteQStart || + ir[0].code == IR.InfiniteBloomStart; + + counter = counter || + ir[0].code == IR.RepeatStart || ir[0].code == IR.RepeatQStart; + immutable len = ir[0].data; + auto nir = ir[ir[0].length .. ir[0].length+len]; + r = ctGenBlock(nir, addr+1); + //start/end codegen + //r.addr is at last test+ jump of loop, addr+1 is body of loop + nir = ir[ir[0].length + len .. $]; + r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr) ~ r.code; + r.code ~= ctGenFixupCode(nir, r.addr, addr+1); + r.addr += 2; //account end instruction + restore state + ir = nir; + break; + case IR.OrStart: + immutable len = ir[0].data; + auto nir = ir[ir[0].length .. ir[0].length+len]; + r = ctGenAlternation(nir, addr); + ir = ir[ir[0].length + len .. $]; + assert(ir[0].code == IR.OrEnd); + ir = ir[ir[0].length..$]; + break; + case IR.LookaheadStart: + case IR.NeglookaheadStart: + case IR.LookbehindStart: + case IR.NeglookbehindStart: + immutable len = ir[0].data; + immutable behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart; + immutable negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart; + string fwdType = "typeof(fwdMatcher(matcher, []))"; + string bwdType = "typeof(bwdMatcher(matcher, []))"; + string fwdCreate = "fwdMatcher(matcher, mem)"; + string bwdCreate = "bwdMatcher(matcher, mem)"; + immutable start = IRL!(IR.LookbehindStart); + immutable end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd); + CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context + auto slice = ir[start .. end]; + r.code ~= ctSub(` + case $$: //fake lookaround "atom" + static if (typeof(matcher.s).isLoopback) + alias Lookaround = $$; + else + alias Lookaround = $$; + static bool matcher_$$(ref Lookaround matcher) @trusted + { + //(neg)lookaround piece start + $$ + //(neg)lookaround piece ends + } + auto save = index; + auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; + scope(exit) free(mem.ptr); + static if (typeof(matcher.s).isLoopback) + auto lookaround = $$; + else + auto lookaround = $$; + lookaround.matches = matches[$$..$$]; + lookaround.backrefed = backrefed.empty ? matches : backrefed; + lookaround.nativeFn = &matcher_$$; //hookup closure's binary code + int match = $$; + s.reset(save); + next(); + if (match) + $$ + else + $$`, addr, + behind ? fwdType : bwdType, behind ? bwdType : fwdType, + addr, context.ctGenRegEx(slice), + behind ? fwdCreate : bwdCreate, behind ? bwdCreate : fwdCreate, + ir[1].raw, ir[2].raw, //start - end of matches slice + addr, + negative ? "!lookaround.matchImpl()" : "lookaround.matchImpl()", + nextInstr, bailOut); + ir = ir[end .. $]; + r.addr = addr + 1; + break; + case IR.LookaheadEnd: case IR.NeglookaheadEnd: + case IR.LookbehindEnd: case IR.NeglookbehindEnd: + ir = ir[IRL!(IR.LookaheadEnd) .. $]; + r.addr = addr; + break; + default: + assert(ir[0].isAtom, text(ir[0].mnemonic)); + r = ctGenAtom(ir, addr); + } + return r; + } + + //generate source for bytecode contained in OrStart ... OrEnd + CtState ctGenAlternation(Bytecode[] ir, int addr) + { + CtState[] pieces; + CtState r; + enum optL = IRL!(IR.Option); + for (;;) + { + assert(ir[0].code == IR.Option); + auto len = ir[0].data; + if (optL+len < ir.length && ir[optL+len].code == IR.Option)//not a last option + { + auto nir = ir[optL .. optL+len-IRL!(IR.GotoEndOr)]; + r = ctGenBlock(nir, addr+2);//space for Option + restore state + //r.addr+1 to account GotoEndOr at end of branch + r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr+1) ~ r.code; + addr = r.addr+1;//leave space for GotoEndOr + pieces ~= r; + ir = ir[optL + len .. $]; + } + else + { + pieces ~= ctGenBlock(ir[optL..$], addr); + addr = pieces[$-1].addr; + break; + } + } + r = pieces[0]; + for (uint i = 1; i < pieces.length; i++) + { + r.code ~= ctSub(` + case $$: + goto case $$; `, pieces[i-1].addr, addr); + r.code ~= pieces[i].code; + } + r.addr = addr; + return r; + } + + // generate fixup code for instruction in ir, + // fixup means it has an alternative way for control flow + string ctGenFixupCode(Bytecode[] ir, int addr, int fixup) + { + return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version + } + string ctGenFixupCode(ref Bytecode[] ir, int addr, int fixup) + { + string r; + string testCode; + r = ctSub(` + case $$: debug(std_regex_matcher) writeln("#$$");`, + addr, addr); + switch (ir[0].code) + { + case IR.InfiniteStart, IR.InfiniteQStart, IR.InfiniteBloomStart: + r ~= ctSub( ` + goto case $$;`, fixup); + ir = ir[ir[0].length..$]; + break; + case IR.InfiniteEnd: + testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1); + r ~= ctSub( ` + if (merge[$$+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + + $$ + { + $$ + } + goto case $$; + case $$: //restore state and go out of loop + $$ + goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup, + addr+1, restoreCode()); + ir = ir[ir[0].length..$]; + break; + case IR.InfiniteBloomEnd: + //TODO: check bloom filter and skip on failure + testCode = ctQuickTest(ir[IRL!(IR.InfiniteBloomEnd) .. $],addr + 1); + r ~= ctSub( ` + if (merge[$$+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + + $$ + { + $$ + } + goto case $$; + case $$: //restore state and go out of loop + $$ + goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup, + addr+1, restoreCode()); + ir = ir[ir[0].length..$]; + break; + case IR.InfiniteQEnd: + testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1); + auto altCode = testCode.length ? ctSub("else goto case $$;", fixup) : ""; + r ~= ctSub( ` + if (merge[$$+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + + $$ + { + $$ + goto case $$; + } + $$ + case $$://restore state and go inside loop + $$ + goto case $$;`, ir[1].raw, + testCode, saveCode(addr+1), addr+2, altCode, + addr+1, restoreCode(), fixup); + ir = ir[ir[0].length..$]; + break; + case IR.RepeatStart, IR.RepeatQStart: + r ~= ctSub( ` + goto case $$;`, fixup); + ir = ir[ir[0].length..$]; + break; + case IR.RepeatEnd, IR.RepeatQEnd: + //len, step, min, max + immutable len = ir[0].data; + immutable step = ir[2].raw; + immutable min = ir[3].raw; + immutable max = ir[4].raw; + r ~= ctSub(` + if (merge[$$+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + if (counter < $$) + { + debug(std_regex_matcher) writeln("RepeatEnd min case pc=", $$); + counter += $$; + goto case $$; + }`, ir[1].raw, min, addr, step, fixup); + if (ir[0].code == IR.RepeatEnd) + { + string counter_expr = ctSub("counter % $$", step); + r ~= ctSub(` + else if (counter < $$) + { + $$ + counter += $$; + goto case $$; + }`, max, saveCode(addr+1, counter_expr), step, fixup); + } + else + { + string counter_expr = ctSub("counter % $$", step); + r ~= ctSub(` + else if (counter < $$) + { + $$ + counter = counter % $$; + goto case $$; + }`, max, saveCode(addr+1,counter_expr), step, addr+2); + } + r ~= ctSub(` + else + { + counter = counter % $$; + goto case $$; + } + case $$: //restore state + $$ + goto case $$;`, step, addr+2, addr+1, restoreCode(), + ir[0].code == IR.RepeatEnd ? addr+2 : fixup ); + ir = ir[ir[0].length..$]; + break; + case IR.Option: + r ~= ctSub( ` + { + $$ + } + goto case $$; + case $$://restore thunk to go to the next group + $$ + goto case $$;`, saveCode(addr+1), addr+2, + addr+1, restoreCode(), fixup); + ir = ir[ir[0].length..$]; + break; + default: + assert(0, text(ir[0].mnemonic)); + } + return r; + } + + + string ctQuickTest(Bytecode[] ir, int id) + { + uint pc = 0; + while (pc < ir.length && ir[pc].isAtom) + { + if (ir[pc].code == IR.GroupStart || ir[pc].code == IR.GroupEnd) + { + pc++; + } + else if (ir[pc].code == IR.Backref) + break; + else + { + auto code = ctAtomCode(ir[pc..$], -1); + return ctSub(` + int test_$$() + { + $$ //$$ + } + if (test_$$() >= 0)`, id, code.ptr ? code : "return 0;", + ir[pc].mnemonic, id); + } + } + return ""; + } + + //process & generate source for simple bytecodes at front of ir using address addr + CtState ctGenAtom(ref Bytecode[] ir, int addr) + { + CtState result; + result.code = ctAtomCode(ir, addr); + ir.popFrontN(ir[0].code == IR.OrChar ? ir[0].sequence : ir[0].length); + result.addr = addr + 1; + return result; + } + + //D code for atom at ir using address addr, addr < 0 means quickTest + string ctAtomCode(Bytecode[] ir, int addr) + { + string code; + string bailOut, nextInstr; + if (addr < 0) + { + bailOut = "return -1;"; + nextInstr = "return 0;"; + } + else + { + bailOut = "goto L_backtrack;"; + nextInstr = ctSub("goto case $$;", addr+1); + code ~= ctSub( ` + case $$: debug(std_regex_matcher) writeln("#$$"); + `, addr, addr); + } + switch (ir[0].code) + { + case IR.OrChar://assumes IRL!(OrChar) == 1 + code ~= ctSub(` + if (atEnd) + $$`, bailOut); + immutable len = ir[0].sequence; + for (uint i = 0; i < len; i++) + { + code ~= ctSub( ` + if (front == $$) + { + $$ + $$ + }`, ir[i].data, addr >= 0 ? "next();" :"", nextInstr); + } + code ~= ctSub( ` + $$`, bailOut); + break; + case IR.Char: + code ~= ctSub( ` + if (atEnd || front != $$) + $$ + $$ + $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); + break; + case IR.Any: + code ~= ctSub( ` + if (atEnd || (!(re.flags & RegexOption.singleline) + && (front == '\r' || front == '\n'))) + $$ + $$ + $$`, bailOut, addr >= 0 ? "next();" :"",nextInstr); + break; + case IR.CodepointSet: + if (charsets.length) + { + string name = `func_`~to!string(addr+1); + string funcCode = charsets[ir[0].data].toSourceCode(name); + code ~= ctSub( ` + static $$ + if (atEnd || !$$(front)) + $$ + $$ + $$`, funcCode, name, bailOut, addr >= 0 ? "next();" :"", nextInstr); + } + else + code ~= ctSub( ` + if (atEnd || !re.charsets[$$].scanFor(front)) + $$ + $$ + $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); + break; + case IR.Trie: + if (charsets.length && charsets[ir[0].data].byInterval.length <= 8) + goto case IR.CodepointSet; + code ~= ctSub( ` + if (atEnd || !re.matchers[$$][front]) + $$ + $$ + $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); + break; + case IR.Wordboundary: + code ~= ctSub( ` + dchar back; + DataIndex bi; + if (atStart && wordMatcher[front]) + { + $$ + } + else if (atEnd && s.loopBack(index).nextChar(back, bi) + && wordMatcher[back]) + { + $$ + } + else if (s.loopBack(index).nextChar(back, bi)) + { + bool af = wordMatcher[front]; + bool ab = wordMatcher[back]; + if (af ^ ab) + { + $$ + } + } + $$`, nextInstr, nextInstr, nextInstr, bailOut); + break; + case IR.Notwordboundary: + code ~= ctSub( ` + dchar back; + DataIndex bi; + //at start & end of input + if (atStart && wordMatcher[front]) + $$ + else if (atEnd && s.loopBack(index).nextChar(back, bi) + && wordMatcher[back]) + $$ + else if (s.loopBack(index).nextChar(back, bi)) + { + bool af = wordMatcher[front]; + bool ab = wordMatcher[back]; + if (af ^ ab) + $$ + } + $$`, bailOut, bailOut, bailOut, nextInstr); + + break; + case IR.Bol: + code ~= ctSub(` + dchar back; + DataIndex bi; + if (atStart || (s.loopBack(index).nextChar(back,bi) + && endOfLine(back, front == '\n'))) + { + debug(std_regex_matcher) writeln("BOL matched"); + $$ + } + else + $$`, nextInstr, bailOut); + + break; + case IR.Bof: + code ~= ctSub(` + if (atStart) + { + debug(std_regex_matcher) writeln("BOF matched"); + $$ + } + else + $$`, nextInstr, bailOut); + break; + case IR.Eol: + code ~= ctSub(` + dchar back; + DataIndex bi; + debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]); + //no matching inside \r\n + if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) + && back == '\r'))) + { + debug(std_regex_matcher) writeln("EOL matched"); + $$ + } + else + $$`, nextInstr, bailOut); + break; + case IR.Eof: + code ~= ctSub(` + if (atEnd) + { + debug(std_regex_matcher) writeln("BOF matched"); + $$ + } + else + $$`, nextInstr, bailOut); + break; + case IR.GroupStart: + code ~= ctSub(` + matches[$$].begin = index; + $$`, ir[0].data, nextInstr); + match = ir[0].data+1; + break; + case IR.GroupEnd: + code ~= ctSub(` + matches[$$].end = index; + $$`, ir[0].data, nextInstr); + break; + case IR.Backref: + string mStr = "auto referenced = "; + mStr ~= ir[0].localRef + ? ctSub("s[matches[$$].begin .. matches[$$].end];", + ir[0].data, ir[0].data) + : ctSub("s[backrefed[$$].begin .. backrefed[$$].end];", + ir[0].data, ir[0].data); + code ~= ctSub( ` + $$ + while (!atEnd && !referenced.empty && front == referenced.front) + { + next(); + referenced.popFront(); + } + if (referenced.empty) + $$ + else + $$`, mStr, nextInstr, bailOut); + break; + case IR.Nop: + case IR.End: + break; + default: + assert(0, text(ir[0].mnemonic, " is not supported yet")); + } + return code; + } + + //generate D code for the whole regex + public string ctGenRegEx(Bytecode[] ir) + { + auto bdy = ctGenBlock(ir, 0); + auto r = ` + import core.stdc.stdlib; + with(matcher) + { + pc = 0; + counter = 0; + lastState = 0; + matches[] = Group!DataIndex.init; + auto start = s._index;`; + r ~= ` + goto StartLoop; + debug(std_regex_matcher) writeln("Try CT matching starting at ",s[index .. s.lastIndex]); + L_backtrack: + if (lastState || prevStack()) + { + stackPop(pc); + stackPop(index); + s.reset(index); + next(); + } + else + { + s.reset(start); + return false; + } + StartLoop: + switch (pc) + { + `; + r ~= bdy.code; + r ~= ctSub(` + case $$: break;`,bdy.addr); + r ~= ` + default: + assert(0); + } + // cleanup stale stack blocks + while (prevStack()) {} + return true; + } + `; + return r; + } + +} + +string ctGenRegExCode(Char)(Regex!Char re) +{ + auto context = CtContext(re); + return context.ctGenRegEx(re.ir); +} diff --git a/libphobos/src/std/regex/internal/generator.d b/libphobos/src/std/regex/internal/generator.d new file mode 100644 index 0000000..6831e59 --- /dev/null +++ b/libphobos/src/std/regex/internal/generator.d @@ -0,0 +1,187 @@ +/* + Generators - components that generate strings for a given regex pattern. + + For the moment undocumented, and is subject to change. +*/ +module std.regex.internal.generator; + +/* + Useful utility for self-testing, an infinite range of string samples + that _have_ to match given compiled regex. + Caveats: supports only a simple subset of bytecode. +*/ +@trusted private struct SampleGenerator(Char) +{ + import std.array : appender, Appender; + import std.format : formattedWrite; + import std.random : Xorshift; + import std.regex.internal.ir : Regex, IR, IRL; + import std.utf : isValidDchar, byChar; + Regex!Char re; + Appender!(char[]) app; + uint limit, seed; + Xorshift gen; + //generator for pattern r, with soft maximum of threshold elements + //and a given random seed + this(ref Regex!Char r, uint threshold, uint randomSeed) + { + re = r; + limit = threshold; + seed = randomSeed; + app = appender!(Char[])(); + compose(); + } + + uint rand(uint x) + { + uint r = gen.front % x; + gen.popFront(); + return r; + } + + void compose() + { + uint pc = 0, counter = 0, dataLenOld = uint.max; + for (;;) + { + switch (re.ir[pc].code) + { + case IR.Char: + formattedWrite(app,"%s", cast(dchar) re.ir[pc].data); + pc += IRL!(IR.Char); + break; + case IR.OrChar: + uint len = re.ir[pc].sequence; + formattedWrite(app, "%s", cast(dchar) re.ir[pc + rand(len)].data); + pc += len; + break; + case IR.CodepointSet: + case IR.Trie: + auto set = re.charsets[re.ir[pc].data]; + auto x = rand(cast(uint) set.byInterval.length); + auto y = rand(set.byInterval[x].b - set.byInterval[x].a); + formattedWrite(app, "%s", cast(dchar)(set.byInterval[x].a+y)); + pc += IRL!(IR.CodepointSet); + break; + case IR.Any: + uint x; + do + { + x = rand(0x11_000); + }while (x == '\r' || x == '\n' || !isValidDchar(x)); + formattedWrite(app, "%s", cast(dchar) x); + pc += IRL!(IR.Any); + break; + case IR.GotoEndOr: + pc += IRL!(IR.GotoEndOr)+re.ir[pc].data; + assert(re.ir[pc].code == IR.OrEnd); + goto case; + case IR.OrEnd: + pc += IRL!(IR.OrEnd); + break; + case IR.OrStart: + pc += IRL!(IR.OrStart); + goto case; + case IR.Option: + uint next = pc + re.ir[pc].data + IRL!(IR.Option); + uint nOpt = 0; + //queue next Option + while (re.ir[next].code == IR.Option) + { + nOpt++; + next += re.ir[next].data + IRL!(IR.Option); + } + nOpt++; + nOpt = rand(nOpt); + for (;nOpt; nOpt--) + { + pc += re.ir[pc].data + IRL!(IR.Option); + } + assert(re.ir[pc].code == IR.Option); + pc += IRL!(IR.Option); + break; + case IR.RepeatStart:case IR.RepeatQStart: + pc += IRL!(IR.RepeatStart)+re.ir[pc].data; + goto case IR.RepeatEnd; + case IR.RepeatEnd: + case IR.RepeatQEnd: + uint len = re.ir[pc].data; + uint step = re.ir[pc+2].raw; + uint min = re.ir[pc+3].raw; + if (counter < min) + { + counter += step; + pc -= len; + break; + } + uint max = re.ir[pc+4].raw; + if (counter < max) + { + if (app.data.length < limit && rand(3) > 0) + { + pc -= len; + counter += step; + } + else + { + counter = counter%step; + pc += IRL!(IR.RepeatEnd); + } + } + else + { + counter = counter%step; + pc += IRL!(IR.RepeatEnd); + } + break; + case IR.InfiniteStart, IR.InfiniteBloomStart, IR.InfiniteQStart: + pc += re.ir[pc].data + IRL!(IR.InfiniteStart); + goto case IR.InfiniteEnd; //both Q and non-Q + case IR.InfiniteEnd, IR.InfiniteBloomEnd, IR.InfiniteQEnd: + uint len = re.ir[pc].data; + if (app.data.length == dataLenOld) + { + pc += IRL!(IR.InfiniteEnd); + break; + } + dataLenOld = cast(uint) app.data.length; + if (app.data.length < limit && rand(3) > 0) + pc = pc - len; + else + pc = pc + re.ir[pc].length; + break; + case IR.GroupStart, IR.GroupEnd: + pc += IRL!(IR.GroupStart); + break; + case IR.Bol, IR.Wordboundary, IR.Notwordboundary: + case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: + default: + return; + } + } + } + + @property Char[] front() + { + return app.data; + } + + enum empty = false; + + void popFront() + { + app.shrinkTo(0); + compose(); + } +} + +@system unittest +{ + import std.range, std.regex; + auto re = regex(`P[a-z]{3,}q`); + auto gen = SampleGenerator!char(re, 20, 3141592); + static assert(isInputRange!(typeof(gen))); + //@@@BUG@@@ somehow gen.take(1_000) doesn't work + foreach (v; take(gen, 1_000)) + assert(v.match(re)); +} diff --git a/libphobos/src/std/regex/internal/ir.d b/libphobos/src/std/regex/internal/ir.d new file mode 100644 index 0000000..28b1998 --- /dev/null +++ b/libphobos/src/std/regex/internal/ir.d @@ -0,0 +1,788 @@ +/* + Implementation of std.regex IR, an intermediate representation + of a regular expression pattern. + + This is a common ground between frontend regex component (parser) + and backend components - generators, matchers and other "filters". +*/ +module std.regex.internal.ir; + +package(std.regex): + +import std.exception, std.meta, std.range.primitives, std.traits, std.uni; + +debug(std_regex_parser) import std.stdio; +// just a common trait, may be moved elsewhere +alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range); + +enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD'; + +// heuristic value determines maximum CodepointSet length suitable for linear search +enum maxCharsetUsed = 6; + +// another variable to tweak behavior of caching generated Tries for character classes +enum maxCachedMatchers = 8; + +alias Trie = CodepointSetTrie!(13, 8); +alias makeTrie = codepointSetTrie!(13, 8); + +CharMatcher[CodepointSet] matcherCache; + +//accessor with caching +@trusted CharMatcher getMatcher(CodepointSet set) +{// @@@BUG@@@ 6357 almost all properties of AA are not @safe + if (__ctfe || maxCachedMatchers == 0) + return CharMatcher(set); + else + { + auto p = set in matcherCache; + if (p) + return *p; + if (matcherCache.length == maxCachedMatchers) + { + // flush enmatchers in trieCache + matcherCache = null; + } + return (matcherCache[set] = CharMatcher(set)); + } +} + +@trusted auto memoizeExpr(string expr)() +{ + if (__ctfe) + return mixin(expr); + alias T = typeof(mixin(expr)); + static T slot; + static bool initialized; + if (!initialized) + { + slot = mixin(expr); + initialized = true; + } + return slot; +} + +//property for \w character class +@property CodepointSet wordCharacter() +{ + return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc + | unicode.Me | unicode.Nd | unicode.Pc")(); +} + +@property CharMatcher wordMatcher() +{ + return memoizeExpr!("CharMatcher(wordCharacter)")(); +} + +// some special Unicode white space characters +private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029'; + +// Characters that need escaping in string posed as regular expressions +alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-', + ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~'); + +//Regular expression engine/parser options: +// global - search all nonoverlapping matches in input +// casefold - case insensitive matching, do casefolding on match in unicode mode +// freeform - ignore whitespace in pattern, to match space use [ ] or \s +// multiline - switch ^, $ detect start and end of linesinstead of just start and end of input +enum RegexOption: uint { + global = 0x1, + casefold = 0x2, + freeform = 0x4, + nonunicode = 0x8, + multiline = 0x10, + singleline = 0x20 +} +//do not reorder this list +alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's'); +static assert( RegexOption.max < 0x80); +// flags that allow guide execution of engine +enum RegexInfo : uint { oneShot = 0x80 } + +// IR bit pattern: 0b1_xxxxx_yy +// where yy indicates class of instruction, xxxxx for actual operation code +// 00: atom, a normal instruction +// 01: open, opening of a group, has length of contained IR in the low bits +// 10: close, closing of a group, has length of contained IR in the low bits +// 11 unused +// +// Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version +// Possible changes: +//* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2}) +//* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops +// (like lookaround), or make it easier to identify hotspots. + +enum IR:uint { + Char = 0b1_00000_00, //a character + Any = 0b1_00001_00, //any character + CodepointSet = 0b1_00010_00, //a most generic CodepointSet [...] + Trie = 0b1_00011_00, //CodepointSet implemented as Trie + //match with any of a consecutive OrChar's in this sequence + //(used for case insensitive match) + //OrChar holds in upper two bits of data total number of OrChars in this _sequence_ + //the drawback of this representation is that it is difficult + // to detect a jump in the middle of it + OrChar = 0b1_00100_00, + Nop = 0b1_00101_00, //no operation (padding) + End = 0b1_00110_00, //end of program + Bol = 0b1_00111_00, //beginning of a line ^ + Eol = 0b1_01000_00, //end of a line $ + Wordboundary = 0b1_01001_00, //boundary of a word + Notwordboundary = 0b1_01010_00, //not a word boundary + Backref = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index) + GroupStart = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit)) + GroupEnd = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit)) + Option = 0b1_01110_00, //start of an option within an alternation x | y (length) + GotoEndOr = 0b1_01111_00, //end of an option (length of the rest) + Bof = 0b1_10000_00, //begining of "file" (string) ^ + Eof = 0b1_10001_00, //end of "file" (string) $ + //... any additional atoms here + + OrStart = 0b1_00000_01, //start of alternation group (length) + OrEnd = 0b1_00000_10, //end of the or group (length,mergeIndex) + //with this instruction order + //bit mask 0b1_00001_00 could be used to test/set greediness + InfiniteStart = 0b1_00001_01, //start of an infinite repetition x* (length) + InfiniteEnd = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex) + InfiniteQStart = 0b1_00010_01, //start of a non eager infinite repetition x*? (length) + InfiniteQEnd = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex) + InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length) + InfiniteBloomEnd = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex) + RepeatStart = 0b1_00100_01, //start of a {n,m} repetition (length) + RepeatEnd = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep) + RepeatQStart = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length) + RepeatQEnd = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep) + + // + LookaheadStart = 0b1_00110_01, //begin of the lookahead group (length) + LookaheadEnd = 0b1_00110_10, //end of a lookahead group (length) + NeglookaheadStart = 0b1_00111_01, //start of a negative lookahead (length) + NeglookaheadEnd = 0b1_00111_10, //end of a negative lookahead (length) + LookbehindStart = 0b1_01000_01, //start of a lookbehind (length) + LookbehindEnd = 0b1_01000_10, //end of a lookbehind (length) + NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length) + NeglookbehindEnd = 0b1_01001_10, //end of negative lookbehind (length) +} + +//a shorthand for IR length - full length of specific opcode evaluated at compile time +template IRL(IR code) +{ + enum uint IRL = lengthOfIR(code); +} +static assert(IRL!(IR.LookaheadStart) == 3); + +//how many parameters follow the IR, should be optimized fixing some IR bits +int immediateParamsIR(IR i){ + switch (i) + { + case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd: + return 1; // merge table index + case IR.InfiniteBloomEnd: + return 2; // bloom filter index + merge table index + case IR.RepeatEnd, IR.RepeatQEnd: + return 4; + case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: + return 2; // start-end of captures used + default: + return 0; + } +} + +//full length of IR instruction inlcuding all parameters that might follow it +int lengthOfIR(IR i) +{ + return 1 + immediateParamsIR(i); +} + +//full length of the paired IR instruction inlcuding all parameters that might follow it +int lengthOfPairedIR(IR i) +{ + return 1 + immediateParamsIR(pairedIR(i)); +} + +//if the operation has a merge point (this relies on the order of the ops) +bool hasMerge(IR i) +{ + return (i&0b11)==0b10 && i <= IR.RepeatQEnd; +} + +//is an IR that opens a "group" +bool isStartIR(IR i) +{ + return (i&0b11)==0b01; +} + +//is an IR that ends a "group" +bool isEndIR(IR i) +{ + return (i&0b11)==0b10; +} + +//is a standalone IR +bool isAtomIR(IR i) +{ + return (i&0b11)==0b00; +} + +//makes respective pair out of IR i, swapping start/end bits of instruction +IR pairedIR(IR i) +{ + assert(isStartIR(i) || isEndIR(i)); + return cast(IR)(i ^ 0b11); +} + +//encoded IR instruction +struct Bytecode +{ + uint raw; + //natural constraints + enum maxSequence = 2+4; + enum maxData = 1 << 22; + enum maxRaw = 1 << 31; + + this(IR code, uint data) + { + assert(data < (1 << 22) && code < 256); + raw = code << 24 | data; + } + + this(IR code, uint data, uint seq) + { + assert(data < (1 << 22) && code < 256 ); + assert(seq >= 2 && seq < maxSequence); + raw = code << 24 | (seq - 2)<<22 | data; + } + + //store raw data + static Bytecode fromRaw(uint data) + { + Bytecode t; + t.raw = data; + return t; + } + + //bit twiddling helpers + //0-arg template due to @@@BUG@@@ 10985 + @property uint data()() const { return raw & 0x003f_ffff; } + + @property void data()(uint val) + { + raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff); + } + + //ditto + //0-arg template due to @@@BUG@@@ 10985 + @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); } + + //ditto + //0-arg template due to @@@BUG@@@ 10985 + @property IR code()() const { return cast(IR)(raw >> 24); } + + //ditto + @property bool hotspot() const { return hasMerge(code); } + + //test the class of this instruction + @property bool isAtom() const { return isAtomIR(code); } + + //ditto + @property bool isStart() const { return isStartIR(code); } + + //ditto + @property bool isEnd() const { return isEndIR(code); } + + //number of arguments for this instruction + @property int args() const { return immediateParamsIR(code); } + + //mark this GroupStart or GroupEnd as referenced in backreference + void setBackrefence() + { + assert(code == IR.GroupStart || code == IR.GroupEnd); + raw = raw | 1 << 23; + } + + //is referenced + @property bool backreference() const + { + assert(code == IR.GroupStart || code == IR.GroupEnd); + return cast(bool)(raw & 1 << 23); + } + + //mark as local reference (for backrefs in lookarounds) + void setLocalRef() + { + assert(code == IR.Backref); + raw = raw | 1 << 23; + } + + //is a local ref + @property bool localRef() const + { + assert(code == IR.Backref); + return cast(bool)(raw & 1 << 23); + } + + //human readable name of instruction + @trusted @property string mnemonic()() const + {//@@@BUG@@@ to is @system + import std.conv : to; + return to!string(code); + } + + //full length of instruction + @property uint length() const + { + return lengthOfIR(code); + } + + //full length of respective start/end of this instruction + @property uint pairedLength() const + { + return lengthOfPairedIR(code); + } + + //returns bytecode of paired instruction (assuming this one is start or end) + @property Bytecode paired() const + {//depends on bit and struct layout order + assert(isStart || isEnd); + return Bytecode.fromRaw(raw ^ 0b11 << 24); + } + + //gets an index into IR block of the respective pair + uint indexOfPair(uint pc) const + { + assert(isStart || isEnd); + return isStart ? pc + data + length : pc - data - lengthOfPairedIR(code); + } +} + +static assert(Bytecode.sizeof == 4); + + +//index entry structure for name --> number of submatch +struct NamedGroup +{ + string name; + uint group; +} + +//holds pair of start-end markers for a submatch +struct Group(DataIndex) +{ + DataIndex begin, end; + @trusted string toString()() const + { + import std.array : appender; + import std.format : formattedWrite; + auto a = appender!string(); + formattedWrite(a, "%s..%s", begin, end); + return a.data; + } +} + +//debugging tool, prints out instruction along with opcodes +@trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[]) +{ + import std.array : appender; + import std.format : formattedWrite; + auto output = appender!string(); + formattedWrite(output,"%s", irb[pc].mnemonic); + switch (irb[pc].code) + { + case IR.Char: + formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data); + break; + case IR.OrChar: + formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence); + break; + case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart, + IR.Option, IR.GotoEndOr, IR.OrStart: + //forward-jump instructions + uint len = irb[pc].data; + formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart)); + break; + case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions + uint len = irb[pc].data; + formattedWrite(output, " pc=>%u min=%u max=%u step=%u", + pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw); + break; + case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto + uint len = irb[pc].data; + formattedWrite(output, " pc=>%u", pc-len); + break; + case IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto + uint len = irb[pc].data; + formattedWrite(output, " pc=>%u", pc-len); + break; + case IR.GroupStart, IR.GroupEnd: + uint n = irb[pc].data; + string name; + foreach (v;dict) + if (v.group == n) + { + name = "'"~v.name~"'"; + break; + } + formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""), + name, n); + break; + case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: + uint len = irb[pc].data; + uint start = irb[pc+1].raw, end = irb[pc+2].raw; + formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end); + break; + case IR.Backref: case IR.CodepointSet: case IR.Trie: + uint n = irb[pc].data; + formattedWrite(output, " %u", n); + if (irb[pc].code == IR.Backref) + formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global"); + break; + default://all data-free instructions + } + if (irb[pc].hotspot) + formattedWrite(output, " Hotspot %u", irb[pc+1].raw); + return output.data; +} + +//disassemble the whole chunk +@trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[]) +{ + import std.stdio : writeln; + for (uint pc=0; pc<slice.length; pc += slice[pc].length) + writeln("\t", disassemble(slice, pc, dict)); +} + +/++ + $(D Regex) object holds regular expression pattern in compiled form. + Instances of this object are constructed via calls to $(D regex). + This is an intended form for caching and storage of frequently + used regular expressions. ++/ +struct Regex(Char) +{ + //temporary workaround for identifier lookup + CodepointSet[] charsets; // + Bytecode[] ir; //compiled bytecode of pattern + + + @safe @property bool empty() const nothrow { return ir is null; } + + @safe @property auto namedCaptures() + { + static struct NamedGroupRange + { + private: + NamedGroup[] groups; + size_t start; + size_t end; + public: + this(NamedGroup[] g, size_t s, size_t e) + { + assert(s <= e); + assert(e <= g.length); + groups = g; + start = s; + end = e; + } + + @property string front() { return groups[start].name; } + @property string back() { return groups[end-1].name; } + @property bool empty() { return start >= end; } + @property size_t length() { return end - start; } + alias opDollar = length; + @property NamedGroupRange save() + { + return NamedGroupRange(groups, start, end); + } + void popFront() { assert(!empty); start++; } + void popBack() { assert(!empty); end--; } + string opIndex()(size_t i) + { + assert(start + i < end, + "Requested named group is out of range."); + return groups[start+i].name; + } + NamedGroupRange opSlice(size_t low, size_t high) { + assert(low <= high); + assert(start + high <= end); + return NamedGroupRange(groups, start + low, start + high); + } + NamedGroupRange opSlice() { return this.save; } + } + return NamedGroupRange(dict, 0, dict.length); + } + +package(std.regex): + import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency + NamedGroup[] dict; // maps name -> user group number + uint ngroup; // number of internal groups + uint maxCounterDepth; // max depth of nested {n,m} repetitions + uint hotspotTableSize; // number of entries in merge table + uint threadCount; // upper bound on number of Thompson VM threads + uint flags; // global regex flags + public const(CharMatcher)[] matchers; // tables that represent character sets + public const(BitTable)[] filters; // bloom filters for conditional loops + uint[] backrefed; // bit array of backreferenced submatches + Kickstart!Char kickstart; + + //bit access helper + uint isBackref(uint n) + { + if (n/32 >= backrefed.length) + return 0; + return backrefed[n / 32] & (1 << (n & 31)); + } + + //check if searching is not needed + void checkIfOneShot() + { + L_CheckLoop: + for (uint i = 0; i < ir.length; i += ir[i].length) + { + switch (ir[i].code) + { + case IR.Bof: + flags |= RegexInfo.oneShot; + break L_CheckLoop; + case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof, + IR.Wordboundary, IR.Notwordboundary: + break; + default: + break L_CheckLoop; + } + } + } + + //print out disassembly a program's IR + @trusted debug(std_regex_parser) void print() const + {//@@@BUG@@@ write is system + for (uint i = 0; i < ir.length; i += ir[i].length) + { + writefln("%d\t%s ", i, disassemble(ir, i, dict)); + } + writeln("Total merge table size: ", hotspotTableSize); + writeln("Max counter nesting depth: ", maxCounterDepth); + } + +} + +//@@@BUG@@@ (unreduced) - public makes it inaccessible in std.regex.package (!) +/*public*/ struct StaticRegex(Char) +{ +package(std.regex): + import std.regex.internal.backtracking : BacktrackingMatcher; + alias Matcher = BacktrackingMatcher!(true); + alias MatchFn = bool function(ref Matcher!Char) @trusted; + MatchFn nativeFn; +public: + Regex!Char _regex; + alias _regex this; + this(Regex!Char re, MatchFn fn) + { + _regex = re; + nativeFn = fn; + + } + +} + +// The stuff below this point is temporarrily part of IR module +// but may need better place in the future (all internals) +package(std.regex): + +//Simple UTF-string abstraction compatible with stream interface +struct Input(Char) +if (is(Char :dchar)) +{ + import std.utf : decode; + alias DataIndex = size_t; + enum bool isLoopback = false; + alias String = const(Char)[]; + String _origin; + size_t _index; + + //constructs Input object out of plain string + this(String input, size_t idx = 0) + { + _origin = input; + _index = idx; + } + + //codepoint at current stream position + pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos) + { + pos = _index; + // DMD's inliner hates multiple return functions + // but can live with single statement if/else bodies + bool n = !(_index == _origin.length); + if (n) + res = decode(_origin, _index); + return n; + } + @property bool atEnd(){ + return _index == _origin.length; + } + bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos) + { + size_t idx = kick.search(_origin, _index); + _index = idx; + return nextChar(res, pos); + } + + //index of at End position + @property size_t lastIndex(){ return _origin.length; } + + //support for backtracker engine, might not be present + void reset(size_t index){ _index = index; } + + String opSlice(size_t start, size_t end){ return _origin[start .. end]; } + + auto loopBack(size_t index){ return BackLooper!Input(this, index); } +} + +struct BackLooperImpl(Input) +{ + import std.utf : strideBack; + alias DataIndex = size_t; + alias String = Input.String; + enum bool isLoopback = true; + String _origin; + size_t _index; + this(Input input, size_t index) + { + _origin = input._origin; + _index = index; + } + @trusted bool nextChar(ref dchar res,ref size_t pos) + { + pos = _index; + if (_index == 0) + return false; + + res = _origin[0.._index].back; + _index -= strideBack(_origin, _index); + + return true; + } + @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); } + auto loopBack(size_t index){ return Input(_origin, index); } + + //support for backtracker engine, might not be present + //void reset(size_t index){ _index = index ? index-std.utf.strideBack(_origin, index) : 0; } + void reset(size_t index){ _index = index; } + + String opSlice(size_t start, size_t end){ return _origin[end .. start]; } + //index of at End position + @property size_t lastIndex(){ return 0; } +} + +template BackLooper(E) +{ + static if (is(E : BackLooperImpl!U, U)) + { + alias BackLooper = U; + } + else + { + alias BackLooper = BackLooperImpl!E; + } +} + +//both helpers below are internal, on its own are quite "explosive" +//unsafe, no initialization of elements +@system T[] mallocArray(T)(size_t len) +{ + import core.stdc.stdlib : malloc; + return (cast(T*) malloc(len * T.sizeof))[0 .. len]; +} + +//very unsafe, no initialization +@system T[] arrayInChunk(T)(size_t len, ref void[] chunk) +{ + auto ret = (cast(T*) chunk.ptr)[0 .. len]; + chunk = chunk[len * T.sizeof .. $]; + return ret; +} + +// +@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name) +{//equal is @system? + import std.algorithm.comparison : equal; + import std.algorithm.iteration : map; + import std.conv : text; + import std.range : assumeSorted; + + auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length; + enforce(fnd < dict.length && equal(dict[fnd].name, name), + text("no submatch named ", name)); + return dict[fnd].group; +} + +//whether ch is one of unicode newline sequences +//0-arg template due to @@@BUG@@@ 10985 +bool endOfLine()(dchar front, bool seenCr) +{ + return ((front == '\n') ^ seenCr) || front == '\r' + || front == NEL || front == LS || front == PS; +} + +// +//0-arg template due to @@@BUG@@@ 10985 +bool startOfLine()(dchar back, bool seenNl) +{ + return ((back == '\r') ^ seenNl) || back == '\n' + || back == NEL || back == LS || back == PS; +} + +///Exception object thrown in case of errors during regex compilation. +public class RegexException : Exception +{ + mixin basicExceptionCtors; +} + +// simple 128-entry bit-table used with a hash function +struct BitTable { + uint[4] filter; + + this(CodepointSet set){ + foreach (iv; set.byInterval) + { + foreach (v; iv.a .. iv.b) + add(v); + } + } + + void add()(dchar ch){ + immutable i = index(ch); + filter[i >> 5] |= 1<<(i & 31); + } + // non-zero -> might be present, 0 -> absent + bool opIndex()(dchar ch) const{ + immutable i = index(ch); + return (filter[i >> 5]>>(i & 31)) & 1; + } + + static uint index()(dchar ch){ + return ((ch >> 7) ^ ch) & 0x7F; + } +} + +struct CharMatcher { + BitTable ascii; // fast path for ASCII + Trie trie; // slow path for Unicode + + this(CodepointSet set) + { + auto asciiSet = set & unicode.ASCII; + ascii = BitTable(asciiSet); + trie = makeTrie(set); + } + + bool opIndex()(dchar ch) const + { + if (ch < 0x80) + return ascii[ch]; + else + return trie[ch]; + } +} diff --git a/libphobos/src/std/regex/internal/kickstart.d b/libphobos/src/std/regex/internal/kickstart.d new file mode 100644 index 0000000..f303b43 --- /dev/null +++ b/libphobos/src/std/regex/internal/kickstart.d @@ -0,0 +1,579 @@ +/* + Kickstart is a coarse-grained "filter" engine that finds likely matches + to be verified by full-blown matcher. +*/ +module std.regex.internal.kickstart; + +package(std.regex): + +import std.range.primitives, std.utf; +import std.regex.internal.ir; + +//utility for shiftOr, returns a minimum number of bytes to test in a Char +uint effectiveSize(Char)() +{ + static if (is(Char == char)) + return 1; + else static if (is(Char == wchar)) + return 2; + else static if (is(Char == dchar)) + return 3; + else + static assert(0); +} + +/* + Kickstart engine using ShiftOr algorithm, + a bit parallel technique for inexact string searching. +*/ +struct ShiftOr(Char) +{ +private: + uint[] table; + uint fChar; + uint n_length; + enum charSize = effectiveSize!Char(); + //maximum number of chars in CodepointSet to process + enum uint charsetThreshold = 32_000; + static struct ShiftThread + { + uint[] tab; + uint mask; + uint idx; + uint pc, counter, hops; + this(uint newPc, uint newCounter, uint[] table) + { + pc = newPc; + counter = newCounter; + mask = 1; + idx = 0; + hops = 0; + tab = table; + } + + void setMask(uint idx, uint mask) + { + tab[idx] |= mask; + } + + void setInvMask(uint idx, uint mask) + { + tab[idx] &= ~mask; + } + + void set(alias setBits = setInvMask)(dchar ch) + { + static if (charSize == 3) + { + uint val = ch, tmask = mask; + setBits(val&0xFF, tmask); + tmask <<= 1; + val >>= 8; + setBits(val&0xFF, tmask); + tmask <<= 1; + val >>= 8; + assert(val <= 0x10); + setBits(val, tmask); + tmask <<= 1; + } + else + { + Char[dchar.sizeof/Char.sizeof] buf; + uint tmask = mask; + size_t total = encode(buf, ch); + for (size_t i = 0; i < total; i++, tmask<<=1) + { + static if (charSize == 1) + setBits(buf[i], tmask); + else static if (charSize == 2) + { + setBits(buf[i]&0xFF, tmask); + tmask <<= 1; + setBits(buf[i]>>8, tmask); + } + } + } + } + void add(dchar ch){ return set!setInvMask(ch); } + void advance(uint s) + { + mask <<= s; + idx += s; + } + @property bool full(){ return !mask; } + } + + static ShiftThread fork(ShiftThread t, uint newPc, uint newCounter) + { + ShiftThread nt = t; + nt.pc = newPc; + nt.counter = newCounter; + return nt; + } + + @trusted static ShiftThread fetch(ref ShiftThread[] worklist) + { + auto t = worklist[$-1]; + worklist.length -= 1; + if (!__ctfe) + cast(void) worklist.assumeSafeAppend(); + return t; + } + + static uint charLen(uint ch) + { + assert(ch <= 0x10FFFF); + return codeLength!Char(cast(dchar) ch)*charSize; + } + +public: + @trusted this(ref Regex!Char re, uint[] memory) + { + static import std.algorithm.comparison; + import std.algorithm.searching : countUntil; + import std.conv : text; + import std.range : assumeSorted; + assert(memory.length == 256); + fChar = uint.max; + // FNV-1a flavored hash (uses 32bits at a time) + ulong hash(uint[] tab) + { + ulong h = 0xcbf29ce484222325; + foreach (v; tab) + { + h ^= v; + h *= 0x100000001b3; + } + return h; + } + L_FindChar: + for (size_t i = 0;;) + { + switch (re.ir[i].code) + { + case IR.Char: + fChar = re.ir[i].data; + static if (charSize != 3) + { + Char[dchar.sizeof/Char.sizeof] buf; + encode(buf, fChar); + fChar = buf[0]; + } + fChar = fChar & 0xFF; + break L_FindChar; + case IR.GroupStart, IR.GroupEnd: + i += IRL!(IR.GroupStart); + break; + case IR.Bof, IR.Bol, IR.Wordboundary, IR.Notwordboundary: + i += IRL!(IR.Bol); + break; + default: + break L_FindChar; + } + } + table = memory; + table[] = uint.max; + alias MergeTab = bool[ulong]; + // use reasonably complex hash to identify equivalent tables + auto merge = new MergeTab[re.hotspotTableSize]; + ShiftThread[] trs; + ShiftThread t = ShiftThread(0, 0, table); + //locate first fixed char if any + n_length = 32; + for (;;) + { + L_Eval_Thread: + for (;;) + { + switch (re.ir[t.pc].code) + { + case IR.Char: + uint s = charLen(re.ir[t.pc].data); + if (t.idx+s > n_length) + goto L_StopThread; + t.add(re.ir[t.pc].data); + t.advance(s); + t.pc += IRL!(IR.Char); + break; + case IR.OrChar://assumes IRL!(OrChar) == 1 + uint len = re.ir[t.pc].sequence; + uint end = t.pc + len; + uint[Bytecode.maxSequence] s; + uint numS; + for (uint i = 0; i < len; i++) + { + auto x = charLen(re.ir[t.pc+i].data); + if (countUntil(s[0 .. numS], x) < 0) + s[numS++] = x; + } + for (uint i = t.pc; i < end; i++) + { + t.add(re.ir[i].data); + } + for (uint i = 0; i < numS; i++) + { + auto tx = fork(t, t.pc + len, t.counter); + if (tx.idx + s[i] <= n_length) + { + tx.advance(s[i]); + trs ~= tx; + } + } + if (!trs.empty) + t = fetch(trs); + else + goto L_StopThread; + break; + case IR.CodepointSet: + case IR.Trie: + auto set = re.charsets[re.ir[t.pc].data]; + uint[4] s; + uint numS; + static if (charSize == 3) + { + s[0] = charSize; + numS = 1; + } + else + { + + static if (charSize == 1) + static immutable codeBounds = [0x0, 0x7F, 0x80, 0x7FF, 0x800, 0xFFFF, 0x10000, 0x10FFFF]; + else //== 2 + static immutable codeBounds = [0x0, 0xFFFF, 0x10000, 0x10FFFF]; + uint[] arr = new uint[set.byInterval.length * 2]; + size_t ofs = 0; + foreach (ival; set.byInterval) + { + arr[ofs++] = ival.a; + arr[ofs++] = ival.b; + } + auto srange = assumeSorted!"a <= b"(arr); + for (uint i = 0; i < codeBounds.length/2; i++) + { + auto start = srange.lowerBound(codeBounds[2*i]).length; + auto end = srange.lowerBound(codeBounds[2*i+1]).length; + if (end > start || (end == start && (end & 1))) + s[numS++] = (i+1)*charSize; + } + } + if (numS == 0 || t.idx + s[numS-1] > n_length) + goto L_StopThread; + auto chars = set.length; + if (chars > charsetThreshold) + goto L_StopThread; + foreach (ch; set.byCodepoint) + { + //avoid surrogate pairs + if (0xD800 <= ch && ch <= 0xDFFF) + continue; + t.add(ch); + } + for (uint i = 0; i < numS; i++) + { + auto tx = fork(t, t.pc + IRL!(IR.CodepointSet), t.counter); + tx.advance(s[i]); + trs ~= tx; + } + if (!trs.empty) + t = fetch(trs); + else + goto L_StopThread; + break; + case IR.Any: + goto L_StopThread; + + case IR.GotoEndOr: + t.pc += IRL!(IR.GotoEndOr)+re.ir[t.pc].data; + assert(re.ir[t.pc].code == IR.OrEnd); + goto case; + case IR.OrEnd: + auto slot = re.ir[t.pc+1].raw+t.counter; + auto val = hash(t.tab); + if (val in merge[slot]) + goto L_StopThread; // merge equivalent + merge[slot][val] = true; + t.pc += IRL!(IR.OrEnd); + break; + case IR.OrStart: + t.pc += IRL!(IR.OrStart); + goto case; + case IR.Option: + uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option); + //queue next Option + if (re.ir[next].code == IR.Option) + { + trs ~= fork(t, next, t.counter); + } + t.pc += IRL!(IR.Option); + break; + case IR.RepeatStart:case IR.RepeatQStart: + t.pc += IRL!(IR.RepeatStart)+re.ir[t.pc].data; + goto case IR.RepeatEnd; + case IR.RepeatEnd: + case IR.RepeatQEnd: + auto slot = re.ir[t.pc+1].raw+t.counter; + auto val = hash(t.tab); + if (val in merge[slot]) + goto L_StopThread; // merge equivalent + merge[slot][val] = true; + uint len = re.ir[t.pc].data; + uint step = re.ir[t.pc+2].raw; + uint min = re.ir[t.pc+3].raw; + if (t.counter < min) + { + t.counter += step; + t.pc -= len; + break; + } + uint max = re.ir[t.pc+4].raw; + if (t.counter < max) + { + trs ~= fork(t, t.pc - len, t.counter + step); + t.counter = t.counter%step; + t.pc += IRL!(IR.RepeatEnd); + } + else + { + t.counter = t.counter%step; + t.pc += IRL!(IR.RepeatEnd); + } + break; + case IR.InfiniteStart, IR.InfiniteQStart: + t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); + goto case IR.InfiniteEnd; //both Q and non-Q + case IR.InfiniteEnd: + case IR.InfiniteQEnd: + auto slot = re.ir[t.pc+1].raw+t.counter; + auto val = hash(t.tab); + if (val in merge[slot]) + goto L_StopThread; // merge equivalent + merge[slot][val] = true; + uint len = re.ir[t.pc].data; + uint pc1, pc2; //branches to take in priority order + if (++t.hops == 32) + goto L_StopThread; + pc1 = t.pc + IRL!(IR.InfiniteEnd); + pc2 = t.pc - len; + trs ~= fork(t, pc2, t.counter); + t.pc = pc1; + break; + case IR.GroupStart, IR.GroupEnd: + t.pc += IRL!(IR.GroupStart); + break; + case IR.Bof, IR.Bol, IR.Wordboundary, IR.Notwordboundary: + t.pc += IRL!(IR.Bol); + break; + case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: + t.pc += IRL!(IR.LookaheadStart) + IRL!(IR.LookaheadEnd) + re.ir[t.pc].data; + break; + default: + L_StopThread: + assert(re.ir[t.pc].code >= 0x80, text(re.ir[t.pc].code)); + debug (fred_search) writeln("ShiftOr stumbled on ",re.ir[t.pc].mnemonic); + n_length = std.algorithm.comparison.min(t.idx, n_length); + break L_Eval_Thread; + } + } + if (trs.empty) + break; + t = fetch(trs); + } + debug(std_regex_search) + { + writeln("Min length: ", n_length); + } + } + + @property bool empty() const { return n_length == 0; } + + @property uint length() const{ return n_length/charSize; } + + // lookup compatible bit pattern in haystack, return starting index + // has a useful trait: if supplied with valid UTF indexes, + // returns only valid UTF indexes + // (that given the haystack in question is valid UTF string) + @trusted size_t search(const(Char)[] haystack, size_t idx) + {//@BUG: apparently assumes little endian machines + import core.stdc.string : memchr; + import std.conv : text; + assert(!empty); + auto p = cast(const(ubyte)*)(haystack.ptr+idx); + uint state = uint.max; + uint limit = 1u<<(n_length - 1u); + debug(std_regex_search) writefln("Limit: %32b",limit); + if (fChar != uint.max) + { + const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length); + const orginalAlign = cast(size_t) p & (Char.sizeof-1); + while (p != end) + { + if (!~state) + {//speed up seeking first matching place + for (;;) + { + assert(p <= end, text(p," vs ", end)); + p = cast(ubyte*) memchr(p, fChar, end - p); + if (!p) + return haystack.length; + if ((cast(size_t) p & (Char.sizeof-1)) == orginalAlign) + break; + if (++p == end) + return haystack.length; + } + state = ~1u; + assert((cast(size_t) p & (Char.sizeof-1)) == orginalAlign); + static if (charSize == 3) + { + state = (state << 1) | table[p[1]]; + state = (state << 1) | table[p[2]]; + p += 4; + } + else + p++; + //first char is tested, see if that's all + if (!(state & limit)) + return (p-cast(ubyte*) haystack.ptr)/Char.sizeof + -length; + } + else + {//have some bits/states for possible matches, + //use the usual shift-or cycle + static if (charSize == 3) + { + state = (state << 1) | table[p[0]]; + state = (state << 1) | table[p[1]]; + state = (state << 1) | table[p[2]]; + p += 4; + } + else + { + state = (state << 1) | table[p[0]]; + p++; + } + if (!(state & limit)) + return (p-cast(ubyte*) haystack.ptr)/Char.sizeof + -length; + } + debug(std_regex_search) writefln("State: %32b", state); + } + } + else + { + //normal path, partially unrolled for char/wchar + static if (charSize == 3) + { + const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length); + while (p != end) + { + state = (state << 1) | table[p[0]]; + state = (state << 1) | table[p[1]]; + state = (state << 1) | table[p[2]]; + p += 4; + if (!(state & limit))//division rounds down for dchar + return (p-cast(ubyte*) haystack.ptr)/Char.sizeof + -length; + } + } + else + { + auto len = cast(ubyte*)(haystack.ptr + haystack.length) - p; + size_t i = 0; + if (len & 1) + { + state = (state << 1) | table[p[i++]]; + if (!(state & limit)) + return idx+i/Char.sizeof-length; + } + while (i < len) + { + state = (state << 1) | table[p[i++]]; + if (!(state & limit)) + return idx+i/Char.sizeof + -length; + state = (state << 1) | table[p[i++]]; + if (!(state & limit)) + return idx+i/Char.sizeof + -length; + debug(std_regex_search) writefln("State: %32b", state); + } + } + } + return haystack.length; + } + + @system debug static void dump(uint[] table) + {//@@@BUG@@@ writef(ln) is @system + import std.stdio : writefln; + for (size_t i = 0; i < table.length; i += 4) + { + writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]); + } + } +} + +@system unittest +{ + import std.conv, std.regex; + @trusted void test_fixed(alias Kick)() + { + foreach (i, v; AliasSeq!(char, wchar, dchar)) + { + alias Char = v; + alias String = immutable(v)[]; + auto r = regex(to!String(`abc$`)); + auto kick = Kick!Char(r, new uint[256]); + assert(kick.length == 3, text(Kick.stringof," ",v.stringof, " == ", kick.length)); + auto r2 = regex(to!String(`(abc){2}a+`)); + kick = Kick!Char(r2, new uint[256]); + assert(kick.length == 7, text(Kick.stringof,v.stringof," == ", kick.length)); + auto r3 = regex(to!String(`\b(a{2}b{3}){2,4}`)); + kick = Kick!Char(r3, new uint[256]); + assert(kick.length == 10, text(Kick.stringof,v.stringof," == ", kick.length)); + auto r4 = regex(to!String(`\ba{2}c\bxyz`)); + kick = Kick!Char(r4, new uint[256]); + assert(kick.length == 6, text(Kick.stringof,v.stringof, " == ", kick.length)); + auto r5 = regex(to!String(`\ba{2}c\b`)); + kick = Kick!Char(r5, new uint[256]); + size_t x = kick.search("aabaacaa", 0); + assert(x == 3, text(Kick.stringof,v.stringof," == ", kick.length)); + x = kick.search("aabaacaa", x+1); + assert(x == 8, text(Kick.stringof,v.stringof," == ", kick.length)); + } + } + @trusted void test_flex(alias Kick)() + { + foreach (i, v; AliasSeq!(char, wchar, dchar)) + { + alias Char = v; + alias String = immutable(v)[]; + auto r = regex(to!String(`abc[a-z]`)); + auto kick = Kick!Char(r, new uint[256]); + auto x = kick.search(to!String("abbabca"), 0); + assert(x == 3, text("real x is ", x, " ",v.stringof)); + + auto r2 = regex(to!String(`(ax|bd|cdy)`)); + String s2 = to!String("abdcdyabax"); + kick = Kick!Char(r2, new uint[256]); + x = kick.search(s2, 0); + assert(x == 1, text("real x is ", x)); + x = kick.search(s2, x+1); + assert(x == 3, text("real x is ", x)); + x = kick.search(s2, x+1); + assert(x == 8, text("real x is ", x)); + auto rdot = regex(to!String(`...`)); + kick = Kick!Char(rdot, new uint[256]); + assert(kick.length == 0); + auto rN = regex(to!String(`a(b+|c+)x`)); + kick = Kick!Char(rN, new uint[256]); + assert(kick.length == 3, to!string(kick.length)); + assert(kick.search("ababx",0) == 2); + assert(kick.search("abaacba",0) == 3);//expected inexact + + } + } + test_fixed!(ShiftOr)(); + test_flex!(ShiftOr)(); +} + +alias Kickstart = ShiftOr; diff --git a/libphobos/src/std/regex/internal/parser.d b/libphobos/src/std/regex/internal/parser.d new file mode 100644 index 0000000..8cabae5 --- /dev/null +++ b/libphobos/src/std/regex/internal/parser.d @@ -0,0 +1,1751 @@ +//Written in the D programming language +/* + Regular expression pattern parser. +*/ +module std.regex.internal.parser; + +static import std.ascii; +import std.range.primitives, std.uni, std.meta, + std.traits, std.typecons, std.exception; +import std.regex.internal.ir; + +// package relevant info from parser into a regex object +auto makeRegex(S, CG)(Parser!(S, CG) p) +{ + Regex!(BasicElementOf!S) re; + auto g = p.g; + with(re) + { + ir = g.ir; + dict = g.dict; + ngroup = g.ngroup; + maxCounterDepth = g.counterDepth; + flags = p.re_flags; + charsets = g.charsets; + matchers = g.matchers; + backrefed = g.backrefed; + re.postprocess(); + debug(std_regex_parser) + { + __ctfe || print(); + } + //@@@BUG@@@ (not reduced) + //somehow just using validate _collides_ with std.utf.validate (!) + version (assert) re.validateRe(); + } + return re; +} + +// helper for unittest +auto makeRegex(S)(S arg) +if (isSomeString!S) +{ + return makeRegex(Parser!(S, CodeGen)(arg, "")); +} + +@system unittest +{ + import std.algorithm.comparison : equal; + auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`); + auto nc = re.namedCaptures; + static assert(isRandomAccessRange!(typeof(nc))); + assert(!nc.empty); + assert(nc.length == 2); + assert(nc.equal(["name", "var"])); + assert(nc[0] == "name"); + assert(nc[1..$].equal(["var"])); + + re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`); + nc = re.namedCaptures; + assert(nc.length == 1); + assert(nc[0] == "named"); + assert(nc.front == "named"); + assert(nc.back == "named"); + + re = makeRegex(`(\w+) (\w+)`); + nc = re.namedCaptures; + assert(nc.empty); + + re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`); + nc = re.namedCaptures; + auto cp = nc.save; + assert(nc.equal(cp)); + nc.popFront(); + assert(nc.equal(cp[1..$])); + nc.popBack(); + assert(nc.equal(cp[1 .. $ - 1])); +} + + +@trusted void reverseBytecode()(Bytecode[] code) +{ + Bytecode[] rev = new Bytecode[code.length]; + uint revPc = cast(uint) rev.length; + Stack!(Tuple!(uint, uint, uint)) stack; + uint start = 0; + uint end = cast(uint) code.length; + for (;;) + { + for (uint pc = start; pc < end; ) + { + immutable len = code[pc].length; + if (code[pc].code == IR.GotoEndOr) + break; //pick next alternation branch + if (code[pc].isAtom) + { + rev[revPc - len .. revPc] = code[pc .. pc + len]; + revPc -= len; + pc += len; + } + else if (code[pc].isStart || code[pc].isEnd) + { + //skip over other embedded lookbehinds they are reversed + if (code[pc].code == IR.LookbehindStart + || code[pc].code == IR.NeglookbehindStart) + { + immutable blockLen = len + code[pc].data + + code[pc].pairedLength; + rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen]; + pc += blockLen; + revPc -= blockLen; + continue; + } + immutable second = code[pc].indexOfPair(pc); + immutable secLen = code[second].length; + rev[revPc - secLen .. revPc] = code[second .. second + secLen]; + revPc -= secLen; + if (code[pc].code == IR.OrStart) + { + //we pass len bytes forward, but secLen in reverse + immutable revStart = revPc - (second + len - secLen - pc); + uint r = revStart; + uint i = pc + IRL!(IR.OrStart); + while (code[i].code == IR.Option) + { + if (code[i - 1].code != IR.OrStart) + { + assert(code[i - 1].code == IR.GotoEndOr); + rev[r - 1] = code[i - 1]; + } + rev[r] = code[i]; + auto newStart = i + IRL!(IR.Option); + auto newEnd = newStart + code[i].data; + auto newRpc = r + code[i].data + IRL!(IR.Option); + if (code[newEnd].code != IR.OrEnd) + { + newRpc--; + } + stack.push(tuple(newStart, newEnd, newRpc)); + r += code[i].data + IRL!(IR.Option); + i += code[i].data + IRL!(IR.Option); + } + pc = i; + revPc = revStart; + assert(code[pc].code == IR.OrEnd); + } + else + pc += len; + } + } + if (stack.empty) + break; + start = stack.top[0]; + end = stack.top[1]; + revPc = stack.top[2]; + stack.pop(); + } + code[] = rev[]; +} + +//test if a given string starts with hex number of maxDigit that's a valid codepoint +//returns it's value and skips these maxDigit chars on success, throws on failure +dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit) +{ + //std.conv.parse is both @system and bogus + enforce(str.length >= maxDigit,"incomplete escape sequence"); + uint val; + for (int k = 0; k < maxDigit; k++) + { + immutable current = str[k];//accepts ascii only, so it's OK to index directly + if ('0' <= current && current <= '9') + val = val * 16 + current - '0'; + else if ('a' <= current && current <= 'f') + val = val * 16 + current -'a' + 10; + else if ('A' <= current && current <= 'F') + val = val * 16 + current - 'A' + 10; + else + throw new Exception("invalid escape sequence"); + } + enforce(val <= 0x10FFFF, "invalid codepoint"); + str = str[maxDigit..$]; + return val; +} + +@system unittest //BUG canFind is system +{ + import std.algorithm.searching : canFind; + string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; + string[] hex = [ "01", "ff", "00af", "10FFFF" ]; + int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; + foreach (v; non_hex) + assert(collectException(parseUniHex(v, v.length)).msg + .canFind("invalid escape sequence")); + foreach (i, v; hex) + assert(parseUniHex(v, v.length) == value[i]); + string over = "0011FFFF"; + assert(collectException(parseUniHex(over, over.length)).msg + .canFind("invalid codepoint")); +} + +auto caseEnclose(CodepointSet set) +{ + auto cased = set & unicode.LC; + foreach (dchar ch; cased.byCodepoint) + { + foreach (c; simpleCaseFoldings(ch)) + set |= c; + } + return set; +} + +/+ + fetch codepoint set corresponding to a name (InBlock or binary property) ++/ +@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold) +{ + CodepointSet s = unicode(name); + //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC) + if (casefold) + s = caseEnclose(s); + if (negated) + s = s.inverted; + return s; +} + +//basic stack, just in case it gets used anywhere else then Parser +@trusted struct Stack(T) +{ + T[] data; + @property bool empty(){ return data.empty; } + + @property size_t length(){ return data.length; } + + void push(T val){ data ~= val; } + + T pop() + { + assert(!empty); + auto val = data[$ - 1]; + data = data[0 .. $ - 1]; + if (!__ctfe) + cast(void) data.assumeSafeAppend(); + return val; + } + + @property ref T top() + { + assert(!empty); + return data[$ - 1]; + } +} + +struct CodeGen +{ + Bytecode[] ir; // resulting bytecode + Stack!(uint) fixupStack; // stack of opened start instructions + NamedGroup[] dict; // maps name -> user group number + Stack!(uint) groupStack; // stack of current number of group + uint nesting = 0; // group nesting level and repetitions step + uint lookaroundNest = 0; // nesting of lookaround + uint counterDepth = 0; // current depth of nested counted repetitions + CodepointSet[] charsets; // sets for char classes + const(CharMatcher)[] matchers; // matchers for char classes + uint[] backrefed; // bitarray for groups refered by backref + uint ngroup; // final number of groups (of all patterns) + + void start(uint length) + { + if (!__ctfe) + ir.reserve((length*5+2)/4); + fixupStack.push(0); + groupStack.push(1);//0 - whole match + } + + //mark referenced groups for latter processing + void markBackref(uint n) + { + if (n/32 >= backrefed.length) + backrefed.length = n/32 + 1; + backrefed[n / 32] |= 1 << (n & 31); + } + + bool isOpenGroup(uint n) + { + import std.algorithm.searching : canFind; + // walk the fixup stack and see if there are groups labeled 'n' + // fixup '0' is reserved for alternations + return fixupStack.data[1..$]. + canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)(); + } + + void put(Bytecode code) + { + enforce(ir.length < maxCompiledLength, + "maximum compiled pattern length is exceeded"); + ir ~= code; + } + + void putRaw(uint number) + { + enforce(ir.length < maxCompiledLength, + "maximum compiled pattern length is exceeded"); + ir ~= Bytecode.fromRaw(number); + } + + //try to generate optimal IR code for this CodepointSet + @trusted void charsetToIr(CodepointSet set) + {//@@@BUG@@@ writeln is @system + uint chars = cast(uint) set.length; + if (chars < Bytecode.maxSequence) + { + switch (chars) + { + case 1: + put(Bytecode(IR.Char, set.byCodepoint.front)); + break; + case 0: + throw new RegexException("empty CodepointSet not allowed"); + default: + foreach (ch; set.byCodepoint) + put(Bytecode(IR.OrChar, ch, chars)); + } + } + else + { + import std.algorithm.searching : countUntil; + const ivals = set.byInterval; + immutable n = charsets.countUntil(set); + if (n >= 0) + { + if (ivals.length*2 > maxCharsetUsed) + put(Bytecode(IR.Trie, cast(uint) n)); + else + put(Bytecode(IR.CodepointSet, cast(uint) n)); + return; + } + if (ivals.length*2 > maxCharsetUsed) + { + auto t = getMatcher(set); + put(Bytecode(IR.Trie, cast(uint) matchers.length)); + matchers ~= t; + debug(std_regex_allocation) writeln("Trie generated"); + } + else + { + put(Bytecode(IR.CodepointSet, cast(uint) charsets.length)); + matchers ~= CharMatcher.init; + } + charsets ~= set; + assert(charsets.length == matchers.length); + } + } + + void genLogicGroup() + { + nesting++; + pushFixup(length); + put(Bytecode(IR.Nop, 0)); + } + + void genGroup() + { + nesting++; + pushFixup(length); + immutable nglob = groupStack.top++; + enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded"); + put(Bytecode(IR.GroupStart, nglob)); + } + + void genNamedGroup(string name) + { + import std.array : insertInPlace; + import std.range : assumeSorted; + nesting++; + pushFixup(length); + immutable nglob = groupStack.top++; + enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded"); + auto t = NamedGroup(name, nglob); + auto d = assumeSorted!"a.name < b.name"(dict); + immutable ind = d.lowerBound(t).length; + insertInPlace(dict, ind, t); + put(Bytecode(IR.GroupStart, nglob)); + } + + //generate code for start of lookaround: (?= (?! (?<= (?<! + void genLookaround(IR opcode) + { + nesting++; + pushFixup(length); + put(Bytecode(opcode, 0)); + put(Bytecode.fromRaw(0)); + put(Bytecode.fromRaw(0)); + groupStack.push(0); + lookaroundNest++; + enforce(lookaroundNest <= maxLookaroundDepth, + "maximum lookaround depth is exceeded"); + } + + void endPattern(uint num) + { + import std.algorithm.comparison : max; + put(Bytecode(IR.End, num)); + ngroup = max(ngroup, groupStack.top); + groupStack.top = 1; // reset group counter + } + + //fixup lookaround with start at offset fix and append a proper *-End opcode + void fixLookaround(uint fix) + { + lookaroundNest--; + ir[fix] = Bytecode(ir[fix].code, + cast(uint) ir.length - fix - IRL!(IR.LookaheadStart)); + auto g = groupStack.pop(); + assert(!groupStack.empty); + ir[fix+1] = Bytecode.fromRaw(groupStack.top); + //groups are cumulative across lookarounds + ir[fix+2] = Bytecode.fromRaw(groupStack.top+g); + groupStack.top += g; + if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart) + { + reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]); + } + put(ir[fix].paired); + } + + // repetition of {1,1} + void fixRepetition(uint offset) + { + import std.algorithm.mutation : copy; + immutable replace = ir[offset].code == IR.Nop; + if (replace) + { + copy(ir[offset + 1 .. $], ir[offset .. $ - 1]); + ir.length -= 1; + } + } + + // repetition of {x,y} + void fixRepetition(uint offset, uint min, uint max, bool greedy) + { + static import std.algorithm.comparison; + import std.algorithm.mutation : copy; + import std.array : insertInPlace; + immutable replace = ir[offset].code == IR.Nop; + immutable len = cast(uint) ir.length - offset - replace; + if (max != infinite) + { + if (min != 1 || max != 1) + { + Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); + if (replace) + ir[offset] = op; + else + insertInPlace(ir, offset, op); + put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); + put(Bytecode.init); //hotspot + putRaw(1); + putRaw(min); + putRaw(max); + counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); + } + } + else if (min) //&& max is infinite + { + if (min != 1) + { + Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); + if (replace) + ir[offset] = op; + else + insertInPlace(ir, offset, op); + offset += 1;//so it still points to the repeated block + put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); + put(Bytecode.init); //hotspot + putRaw(1); + putRaw(min); + putRaw(min); + counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); + } + else if (replace) + { + copy(ir[offset+1 .. $], ir[offset .. $-1]); + ir.length -= 1; + } + put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len)); + enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded"); + ir ~= ir[offset .. offset+len]; + //IR.InfinteX is always a hotspot + put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); + put(Bytecode.init); //merge index + } + else//vanila {0,inf} + { + Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len); + if (replace) + ir[offset] = op; + else + insertInPlace(ir, offset, op); + //IR.InfinteX is always a hotspot + put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); + put(Bytecode.init); //merge index + } + } + + void fixAlternation() + { + import std.array : insertInPlace; + uint fix = fixupStack.top; + if (ir.length > fix && ir[fix].code == IR.Option) + { + ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix); + put(Bytecode(IR.GotoEndOr, 0)); + fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option + put(Bytecode(IR.Option, 0)); + return; + } + uint len, orStart; + //start a new option + if (fixupStack.length == 1) + {//only root entry, effectively no fixup + len = cast(uint) ir.length + IRL!(IR.GotoEndOr); + orStart = 0; + } + else + {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length + len = cast(uint) ir.length - fix - (ir[fix].length - 1); + orStart = fix + ir[fix].length; + } + insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len)); + assert(ir[orStart].code == IR.OrStart); + put(Bytecode(IR.GotoEndOr, 0)); + fixupStack.push(orStart); //fixup for StartOR + fixupStack.push(cast(uint) ir.length); //for second Option + put(Bytecode(IR.Option, 0)); + } + + // finalizes IR.Option, fix points to the first option of sequence + void finishAlternation(uint fix) + { + enforce(ir[fix].code == IR.Option, "no matching ')'"); + ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart)); + fix = fixupStack.pop(); + enforce(ir[fix].code == IR.OrStart, "no matching ')'"); + ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart)); + put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart))); + uint pc = fix + IRL!(IR.OrStart); + while (ir[pc].code == IR.Option) + { + pc = pc + ir[pc].data; + if (ir[pc].code != IR.GotoEndOr) + break; + ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd))); + pc += IRL!(IR.GotoEndOr); + } + put(Bytecode.fromRaw(0)); + } + + // returns: (flag - repetition possible?, fixup of the start of this "group") + Tuple!(bool, uint) onClose() + { + nesting--; + uint fix = popFixup(); + switch (ir[fix].code) + { + case IR.GroupStart: + put(Bytecode(IR.GroupEnd, ir[fix].data)); + return tuple(true, fix); + case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: + assert(lookaroundNest); + fixLookaround(fix); + return tuple(false, 0u); + case IR.Option: //| xxx ) + //two fixups: last option + full OR + finishAlternation(fix); + fix = topFixup; + switch (ir[fix].code) + { + case IR.GroupStart: + popFixup(); + put(Bytecode(IR.GroupEnd, ir[fix].data)); + return tuple(true, fix); + case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: + assert(lookaroundNest); + fix = popFixup(); + fixLookaround(fix); + return tuple(false, 0u); + default://(?:xxx) + popFixup(); + return tuple(true, fix); + } + default://(?:xxx) + return tuple(true, fix); + } + } + + uint popFixup(){ return fixupStack.pop(); } + + void pushFixup(uint val){ return fixupStack.push(val); } + + @property uint topFixup(){ return fixupStack.top; } + + @property size_t fixupLength(){ return fixupStack.data.length; } + + @property uint length(){ return cast(uint) ir.length; } +} + +// safety limits +enum maxGroupNumber = 2^^19; +enum maxLookaroundDepth = 16; +// *Bytecode.sizeof, i.e. 1Mb of bytecode alone +enum maxCompiledLength = 2^^18; +// amounts to up to 4 Mb of auxilary table for matching +enum maxCumulativeRepetitionLength = 2^^20; +// marker to indicate infinite repetition +enum infinite = ~0u; + +struct Parser(R, Generator) +if (isForwardRange!R && is(ElementType!R : dchar)) +{ + dchar _current; + bool empty; + R pat, origin; //keep full pattern for pretty printing error messages + uint re_flags = 0; //global flags e.g. multiline + internal ones + Generator g; + + @trusted this(S)(R pattern, S flags) + if (isSomeString!S) + { + pat = origin = pattern; + //reserve slightly more then avg as sampled from unittests + parseFlags(flags); + _current = ' ';//a safe default for freeform parsing + next(); + g.start(cast(uint) pat.length); + try + { + parseRegex(); + } + catch (Exception e) + { + error(e.msg);//also adds pattern location + } + g.endPattern(1); + } + + @property dchar current(){ return _current; } + + bool _next() + { + if (pat.empty) + { + empty = true; + return false; + } + _current = pat.front; + pat.popFront(); + return true; + } + + void skipSpace() + { + while (isWhite(current) && _next()){ } + } + + bool next() + { + if (re_flags & RegexOption.freeform) + { + immutable r = _next(); + skipSpace(); + return r; + } + else + return _next(); + } + + //parsing number with basic overflow check + uint parseDecimal() + { + uint r = 0; + while (std.ascii.isDigit(current)) + { + if (r >= (uint.max/10)) + error("Overflow in decimal number"); + r = 10*r + cast(uint)(current-'0'); + if (!next()) + break; + } + return r; + } + + //parse control code of form \cXXX, c assumed to be the current symbol + dchar parseControlCode() + { + enforce(next(), "Unfinished escape sequence"); + enforce(('a' <= current && current <= 'z') || ('A' <= current && current <= 'Z'), + "Only letters are allowed after \\c"); + return current & 0x1f; + } + + // + @trusted void parseFlags(S)(S flags) + {//@@@BUG@@@ text is @system + import std.conv : text; + foreach (ch; flags)//flags are ASCII anyway + { + L_FlagSwitch: + switch (ch) + { + + foreach (i, op; __traits(allMembers, RegexOption)) + { + case RegexOptionNames[i]: + if (re_flags & mixin("RegexOption."~op)) + throw new RegexException(text("redundant flag specified: ",ch)); + re_flags |= mixin("RegexOption."~op); + break L_FlagSwitch; + } + default: + throw new RegexException(text("unknown regex flag '",ch,"'")); + } + } + } + + //parse and store IR for regex pattern + @trusted void parseRegex() + { + uint fix;//fixup pointer + + while (!empty) + { + debug(std_regex_parser) + __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data); + switch (current) + { + case '(': + next(); + if (current == '?') + { + next(); + switch (current) + { + case '#': + for (;;) + { + if (!next()) + error("Unexpected end of pattern"); + if (current == ')') + { + next(); + break; + } + } + break; + case ':': + g.genLogicGroup(); + next(); + break; + case '=': + g.genLookaround(IR.LookaheadStart); + next(); + break; + case '!': + g.genLookaround(IR.NeglookaheadStart); + next(); + break; + case 'P': + next(); + if (current != '<') + error("Expected '<' in named group"); + string name; + if (!next() || !(isAlpha(current) || current == '_')) + error("Expected alpha starting a named group"); + name ~= current; + while (next() && (isAlpha(current) || + current == '_' || std.ascii.isDigit(current))) + { + name ~= current; + } + if (current != '>') + error("Expected '>' closing named group"); + next(); + g.genNamedGroup(name); + break; + case '<': + next(); + if (current == '=') + g.genLookaround(IR.LookbehindStart); + else if (current == '!') + g.genLookaround(IR.NeglookbehindStart); + else + error("'!' or '=' expected after '<'"); + next(); + break; + default: + uint enableFlags, disableFlags; + bool enable = true; + do + { + switch (current) + { + case 's': + if (enable) + enableFlags |= RegexOption.singleline; + else + disableFlags |= RegexOption.singleline; + break; + case 'x': + if (enable) + enableFlags |= RegexOption.freeform; + else + disableFlags |= RegexOption.freeform; + break; + case 'i': + if (enable) + enableFlags |= RegexOption.casefold; + else + disableFlags |= RegexOption.casefold; + break; + case 'm': + if (enable) + enableFlags |= RegexOption.multiline; + else + disableFlags |= RegexOption.multiline; + break; + case '-': + if (!enable) + error(" unexpected second '-' in flags"); + enable = false; + break; + default: + error(" 's', 'x', 'i', 'm' or '-' expected after '(?' "); + } + next(); + }while (current != ')'); + next(); + re_flags |= enableFlags; + re_flags &= ~disableFlags; + } + } + else + { + g.genGroup(); + } + break; + case ')': + enforce(g.nesting, "Unmatched ')'"); + next(); + auto pair = g.onClose(); + if (pair[0]) + parseQuantifier(pair[1]); + break; + case '|': + next(); + g.fixAlternation(); + break; + default://no groups or whatever + immutable start = g.length; + parseAtom(); + parseQuantifier(start); + } + } + + if (g.fixupLength != 1) + { + fix = g.popFixup(); + g.finishAlternation(fix); + enforce(g.fixupLength == 1, "no matching ')'"); + } + } + + + //parse and store IR for atom-quantifier pair + @trusted void parseQuantifier(uint offset) + {//copy is @system + if (empty) + return g.fixRepetition(offset); + uint min, max; + switch (current) + { + case '*': + min = 0; + max = infinite; + break; + case '?': + min = 0; + max = 1; + break; + case '+': + min = 1; + max = infinite; + break; + case '{': + enforce(next(), "Unexpected end of regex pattern"); + enforce(std.ascii.isDigit(current), "First number required in repetition"); + min = parseDecimal(); + if (current == '}') + max = min; + else if (current == ',') + { + next(); + if (std.ascii.isDigit(current)) + max = parseDecimal(); + else if (current == '}') + max = infinite; + else + error("Unexpected symbol in regex pattern"); + skipSpace(); + if (current != '}') + error("Unmatched '{' in regex pattern"); + } + else + error("Unexpected symbol in regex pattern"); + if (min > max) + error("Illegal {n,m} quantifier"); + break; + default: + g.fixRepetition(offset); + return; + } + bool greedy = true; + //check only if we managed to get new symbol + if (next() && current == '?') + { + greedy = false; + next(); + } + g.fixRepetition(offset, min, max, greedy); + } + + //parse and store IR for atom + void parseAtom() + { + if (empty) + return; + switch (current) + { + case '*', '?', '+', '|', '{', '}': + error("'*', '+', '?', '{', '}' not allowed in atom"); + break; + case '.': + if (re_flags & RegexOption.singleline) + g.put(Bytecode(IR.Any, 0)); + else + { + CodepointSet set; + g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted); + } + next(); + break; + case '[': + parseCharset(); + break; + case '\\': + enforce(_next(), "Unfinished escape sequence"); + parseEscape(); + break; + case '^': + if (re_flags & RegexOption.multiline) + g.put(Bytecode(IR.Bol, 0)); + else + g.put(Bytecode(IR.Bof, 0)); + next(); + break; + case '$': + if (re_flags & RegexOption.multiline) + g.put(Bytecode(IR.Eol, 0)); + else + g.put(Bytecode(IR.Eof, 0)); + next(); + break; + default: + //FIXME: getCommonCasing in new std uni + if (re_flags & RegexOption.casefold) + { + auto range = simpleCaseFoldings(current); + assert(range.length <= 5); + if (range.length == 1) + g.put(Bytecode(IR.Char, range.front)); + else + foreach (v; range) + g.put(Bytecode(IR.OrChar, v, cast(uint) range.length)); + } + else + g.put(Bytecode(IR.Char, current)); + next(); + } + } + + + + //CodepointSet operations relatively in order of priority + enum Operator:uint { + Open = 0, Negate, Difference, SymDifference, Intersection, Union, None + } + + //parse unit of CodepointSet spec, most notably escape sequences and char ranges + //also fetches next set operation + Tuple!(CodepointSet,Operator) parseCharTerm() + { + enum State{ Start, Char, Escape, CharDash, CharDashEscape, + PotentialTwinSymbolOperator } + Operator op = Operator.None; + dchar last; + CodepointSet set; + State state = State.Start; + + static void addWithFlags(ref CodepointSet set, uint ch, uint re_flags) + { + if (re_flags & RegexOption.casefold) + { + auto range = simpleCaseFoldings(ch); + foreach (v; range) + set |= v; + } + else + set |= ch; + } + + static Operator twinSymbolOperator(dchar symbol) + { + switch (symbol) + { + case '|': + return Operator.Union; + case '-': + return Operator.Difference; + case '~': + return Operator.SymDifference; + case '&': + return Operator.Intersection; + default: + assert(false); + } + } + + L_CharTermLoop: + for (;;) + { + final switch (state) + { + case State.Start: + switch (current) + { + case '|': + case '-': + case '~': + case '&': + state = State.PotentialTwinSymbolOperator; + last = current; + break; + case '[': + op = Operator.Union; + goto case; + case ']': + break L_CharTermLoop; + case '\\': + state = State.Escape; + break; + default: + state = State.Char; + last = current; + } + break; + case State.Char: + // xxx last current xxx + switch (current) + { + case '|': + case '~': + case '&': + // then last is treated as normal char and added as implicit union + state = State.PotentialTwinSymbolOperator; + addWithFlags(set, last, re_flags); + last = current; + break; + case '-': // still need more info + state = State.CharDash; + break; + case '\\': + set |= last; + state = State.Escape; + break; + case '[': + op = Operator.Union; + goto case; + case ']': + addWithFlags(set, last, re_flags); + break L_CharTermLoop; + default: + state = State.Char; + addWithFlags(set, last, re_flags); + last = current; + } + break; + case State.PotentialTwinSymbolOperator: + // xxx last current xxxx + // where last = [|-&~] + if (current == last) + { + op = twinSymbolOperator(last); + next();//skip second twin char + break L_CharTermLoop; + } + goto case State.Char; + case State.Escape: + // xxx \ current xxx + switch (current) + { + case 'f': + last = '\f'; + state = State.Char; + break; + case 'n': + last = '\n'; + state = State.Char; + break; + case 'r': + last = '\r'; + state = State.Char; + break; + case 't': + last = '\t'; + state = State.Char; + break; + case 'v': + last = '\v'; + state = State.Char; + break; + case 'c': + last = parseControlCode(); + state = State.Char; + break; + foreach (val; Escapables) + { + case val: + } + last = current; + state = State.Char; + break; + case 'p': + set.add(parseUnicodePropertySpec(false)); + state = State.Start; + continue L_CharTermLoop; //next char already fetched + case 'P': + set.add(parseUnicodePropertySpec(true)); + state = State.Start; + continue L_CharTermLoop; //next char already fetched + case 'x': + last = parseUniHex(pat, 2); + state = State.Char; + break; + case 'u': + last = parseUniHex(pat, 4); + state = State.Char; + break; + case 'U': + last = parseUniHex(pat, 8); + state = State.Char; + break; + case 'd': + set.add(unicode.Nd); + state = State.Start; + break; + case 'D': + set.add(unicode.Nd.inverted); + state = State.Start; + break; + case 's': + set.add(unicode.White_Space); + state = State.Start; + break; + case 'S': + set.add(unicode.White_Space.inverted); + state = State.Start; + break; + case 'w': + set.add(wordCharacter); + state = State.Start; + break; + case 'W': + set.add(wordCharacter.inverted); + state = State.Start; + break; + default: + if (current >= privateUseStart && current <= privateUseEnd) + enforce(false, "no matching ']' found while parsing character class"); + enforce(false, "invalid escape sequence"); + } + break; + case State.CharDash: + // xxx last - current xxx + switch (current) + { + case '[': + op = Operator.Union; + goto case; + case ']': + //means dash is a single char not an interval specifier + addWithFlags(set, last, re_flags); + addWithFlags(set, '-', re_flags); + break L_CharTermLoop; + case '-'://set Difference again + addWithFlags(set, last, re_flags); + op = Operator.Difference; + next();//skip '-' + break L_CharTermLoop; + case '\\': + state = State.CharDashEscape; + break; + default: + enforce(last <= current, "inverted range"); + if (re_flags & RegexOption.casefold) + { + for (uint ch = last; ch <= current; ch++) + addWithFlags(set, ch, re_flags); + } + else + set.add(last, current + 1); + state = State.Start; + } + break; + case State.CharDashEscape: + //xxx last - \ current xxx + uint end; + switch (current) + { + case 'f': + end = '\f'; + break; + case 'n': + end = '\n'; + break; + case 'r': + end = '\r'; + break; + case 't': + end = '\t'; + break; + case 'v': + end = '\v'; + break; + foreach (val; Escapables) + { + case val: + } + end = current; + break; + case 'c': + end = parseControlCode(); + break; + case 'x': + end = parseUniHex(pat, 2); + break; + case 'u': + end = parseUniHex(pat, 4); + break; + case 'U': + end = parseUniHex(pat, 8); + break; + default: + if (current >= privateUseStart && current <= privateUseEnd) + enforce(false, "no matching ']' found while parsing character class"); + error("invalid escape sequence"); + } + // Lookahead to check if it's a \T + // where T is sub-pattern terminator in multi-pattern scheme + if (end == '\\' && !pat.empty) + { + if (pat.front >= privateUseStart && pat.front <= privateUseEnd) + enforce(false, "invalid escape sequence"); + } + enforce(last <= end,"inverted range"); + set.add(last, end + 1); + state = State.Start; + break; + } + enforce(next(), "unexpected end of CodepointSet"); + } + return tuple(set, op); + } + + alias ValStack = Stack!(CodepointSet); + alias OpStack = Stack!(Operator); + + //parse and store IR for CodepointSet + void parseCharset() + { + const save = re_flags; + re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did + parseCharsetImpl(); + re_flags = save; + // Last next() in parseCharsetImp is executed w/o freeform flag + if (re_flags & RegexOption.freeform) skipSpace(); + } + + void parseCharsetImpl() + { + ValStack vstack; + OpStack opstack; + import std.functional : unaryFun; + // + static bool apply(Operator op, ref ValStack stack) + { + switch (op) + { + case Operator.Negate: + enforce(!stack.empty, "no operand for '^'"); + stack.top = stack.top.inverted; + break; + case Operator.Union: + auto s = stack.pop();//2nd operand + enforce(!stack.empty, "no operand for '||'"); + stack.top.add(s); + break; + case Operator.Difference: + auto s = stack.pop();//2nd operand + enforce(!stack.empty, "no operand for '--'"); + stack.top.sub(s); + break; + case Operator.SymDifference: + auto s = stack.pop();//2nd operand + enforce(!stack.empty, "no operand for '~~'"); + stack.top ~= s; + break; + case Operator.Intersection: + auto s = stack.pop();//2nd operand + enforce(!stack.empty, "no operand for '&&'"); + stack.top.intersect(s); + break; + default: + return false; + } + return true; + } + static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack) + { + while (cond(opstack.top)) + { + if (!apply(opstack.pop(),vstack)) + return false;//syntax error + if (opstack.empty) + return false; + } + return true; + } + + L_CharsetLoop: + do + { + switch (current) + { + case '[': + opstack.push(Operator.Open); + enforce(next(), "unexpected end of character class"); + if (current == '^') + { + opstack.push(Operator.Negate); + enforce(next(), "unexpected end of character class"); + } + else if (current == ']') // []...] is special cased + { + enforce(next(), "wrong character set"); + auto pair = parseCharTerm(); + pair[0].add(']', ']'+1); + if (pair[1] != Operator.None) + { + if (opstack.top == Operator.Union) + unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); + opstack.push(pair[1]); + } + vstack.push(pair[0]); + } + break; + case ']': + enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack), + "character class syntax error"); + enforce(!opstack.empty, "unmatched ']'"); + opstack.pop(); + if (!next() || opstack.empty) + break L_CharsetLoop; + auto pair = parseCharTerm(); + if (!pair[0].empty)//not only operator e.g. -- or ~~ + { + vstack.top.add(pair[0]);//apply union + } + if (pair[1] != Operator.None) + { + if (opstack.top == Operator.Union) + unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); + opstack.push(pair[1]); + } + break; + // + default://yet another pair of term(op)? + auto pair = parseCharTerm(); + if (pair[1] != Operator.None) + { + if (opstack.top == Operator.Union) + unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); + opstack.push(pair[1]); + } + vstack.push(pair[0]); + } + }while (!empty || !opstack.empty); + while (!opstack.empty) + { + enforce(opstack.top != Operator.Open, + "no matching ']' found while parsing character class"); + apply(opstack.pop(), vstack); + } + assert(vstack.length == 1); + g.charsetToIr(vstack.top); + } + + //parse and generate IR for escape stand alone escape sequence + @trusted void parseEscape() + {//accesses array of appender + import std.algorithm.iteration : sum; + switch (current) + { + case 'f': next(); g.put(Bytecode(IR.Char, '\f')); break; + case 'n': next(); g.put(Bytecode(IR.Char, '\n')); break; + case 'r': next(); g.put(Bytecode(IR.Char, '\r')); break; + case 't': next(); g.put(Bytecode(IR.Char, '\t')); break; + case 'v': next(); g.put(Bytecode(IR.Char, '\v')); break; + + case 'd': + next(); + g.charsetToIr(unicode.Nd); + break; + case 'D': + next(); + g.charsetToIr(unicode.Nd.inverted); + break; + case 'b': next(); g.put(Bytecode(IR.Wordboundary, 0)); break; + case 'B': next(); g.put(Bytecode(IR.Notwordboundary, 0)); break; + case 's': + next(); + g.charsetToIr(unicode.White_Space); + break; + case 'S': + next(); + g.charsetToIr(unicode.White_Space.inverted); + break; + case 'w': + next(); + g.charsetToIr(wordCharacter); + break; + case 'W': + next(); + g.charsetToIr(wordCharacter.inverted); + break; + case 'p': case 'P': + auto CodepointSet = parseUnicodePropertySpec(current == 'P'); + g.charsetToIr(CodepointSet); + break; + case 'x': + immutable code = parseUniHex(pat, 2); + next(); + g.put(Bytecode(IR.Char,code)); + break; + case 'u': case 'U': + immutable code = parseUniHex(pat, current == 'u' ? 4 : 8); + next(); + g.put(Bytecode(IR.Char, code)); + break; + case 'c': //control codes + Bytecode code = Bytecode(IR.Char, parseControlCode()); + next(); + g.put(code); + break; + case '0': + next(); + g.put(Bytecode(IR.Char, 0));//NUL character + break; + case '1': .. case '9': + uint nref = cast(uint) current - '0'; + immutable maxBackref = sum(g.groupStack.data); + enforce(nref < maxBackref, "Backref to unseen group"); + //perl's disambiguation rule i.e. + //get next digit only if there is such group number + while (nref < maxBackref && next() && std.ascii.isDigit(current)) + { + nref = nref * 10 + current - '0'; + } + if (nref >= maxBackref) + nref /= 10; + enforce(!g.isOpenGroup(nref), "Backref to open group"); + uint localLimit = maxBackref - g.groupStack.top; + if (nref >= localLimit) + { + g.put(Bytecode(IR.Backref, nref-localLimit)); + g.ir[$-1].setLocalRef(); + } + else + g.put(Bytecode(IR.Backref, nref)); + g.markBackref(nref); + break; + default: + // Lookahead to check if it's a \T + // where T is sub-pattern terminator in multi-pattern scheme + if (current == '\\' && !pat.empty) + { + if (pat.front >= privateUseStart && current <= privateUseEnd) + enforce(false, "invalid escape sequence"); + } + if (current >= privateUseStart && current <= privateUseEnd) + { + g.endPattern(current - privateUseStart + 1); + break; + } + auto op = Bytecode(IR.Char, current); + next(); + g.put(op); + } + } + + //parse and return a CodepointSet for \p{...Property...} and \P{...Property..}, + //\ - assumed to be processed, p - is current + CodepointSet parseUnicodePropertySpec(bool negated) + { + enum MAX_PROPERTY = 128; + char[MAX_PROPERTY] result; + uint k = 0; + enforce(next(), "eof parsing unicode property spec"); + if (current == '{') + { + while (k < MAX_PROPERTY && next() && current !='}' && current !=':') + if (current != '-' && current != ' ' && current != '_') + result[k++] = cast(char) std.ascii.toLower(current); + enforce(k != MAX_PROPERTY, "invalid property name"); + enforce(current == '}', "} expected "); + } + else + {//single char properties e.g.: \pL, \pN ... + enforce(current < 0x80, "invalid property name"); + result[k++] = cast(char) current; + } + auto s = getUnicodeSet(result[0 .. k], negated, + cast(bool)(re_flags & RegexOption.casefold)); + enforce(!s.empty, "unrecognized unicode property spec"); + next(); + return s; + } + + // + @trusted void error(string msg) + { + import std.array : appender; + import std.format : formattedWrite; + auto app = appender!string(); + formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`", + msg, origin[0..$-pat.length], pat); + throw new RegexException(app.data); + } + + alias Char = BasicElementOf!R; + + @property program() + { + return makeRegex(this); + } +} + +/+ + Postproces the IR, then optimize. ++/ +@trusted void postprocess(Char)(ref Regex!Char zis) +{//@@@BUG@@@ write is @system + with(zis) + { + struct FixedStack(T) + { + T[] arr; + uint _top; + //this(T[] storage){ arr = storage; _top = -1; } + @property ref T top(){ assert(!empty); return arr[_top]; } + void push(T x){ arr[++_top] = x; } + T pop() { assert(!empty); return arr[_top--]; } + @property bool empty(){ return _top == -1; } + } + auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1); + counterRange.push(1); + ulong cumRange = 0; + for (uint i = 0; i < ir.length; i += ir[i].length) + { + if (ir[i].hotspot) + { + assert(i + 1 < ir.length, + "unexpected end of IR while looking for hotspot"); + ir[i+1] = Bytecode.fromRaw(hotspotTableSize); + hotspotTableSize += counterRange.top; + } + switch (ir[i].code) + { + case IR.RepeatStart, IR.RepeatQStart: + uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart)); + assert(ir[repEnd].code == ir[i].paired.code); + immutable max = ir[repEnd + 4].raw; + ir[repEnd+2].raw = counterRange.top; + ir[repEnd+3].raw *= counterRange.top; + ir[repEnd+4].raw *= counterRange.top; + ulong cntRange = cast(ulong)(max)*counterRange.top; + cumRange += cntRange; + enforce(cumRange < maxCumulativeRepetitionLength, + "repetition length limit is exceeded"); + counterRange.push(cast(uint) cntRange + counterRange.top); + threadCount += counterRange.top; + break; + case IR.RepeatEnd, IR.RepeatQEnd: + threadCount += counterRange.top; + counterRange.pop(); + break; + case IR.GroupStart: + if (isBackref(ir[i].data)) + ir[i].setBackrefence(); + threadCount += counterRange.top; + break; + case IR.GroupEnd: + if (isBackref(ir[i].data)) + ir[i].setBackrefence(); + threadCount += counterRange.top; + break; + default: + threadCount += counterRange.top; + } + } + checkIfOneShot(); + if (!(flags & RegexInfo.oneShot)) + kickstart = Kickstart!Char(zis, new uint[](256)); + debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount); + optimize(zis); + } +} + +void fixupBytecode()(Bytecode[] ir) +{ + Stack!uint fixups; + + with(IR) for (uint i=0; i<ir.length; i+= ir[i].length) + { + if (ir[i].isStart || ir[i].code == Option) + fixups.push(i); + else if (ir[i].code == OrEnd) + { + // Alternatives need more care + auto j = fixups.pop(); // last Option + ir[j].data = i - j - ir[j].length; + j = fixups.pop(); // OrStart + ir[j].data = i - j - ir[j].length; + ir[i].data = ir[j].data; + + // fixup all GotoEndOrs + j = j + IRL!(OrStart); + assert(ir[j].code == Option); + for (;;) + { + auto next = j + ir[j].data + IRL!(Option); + if (ir[next].code == IR.OrEnd) + break; + ir[next - IRL!(GotoEndOr)].data = i - next; + j = next; + } + } + else if (ir[i].code == GotoEndOr) + { + auto j = fixups.pop(); // Option + ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option + } + else if (ir[i].isEnd) + { + auto j = fixups.pop(); + ir[i].data = i - j - ir[j].length; + ir[j].data = ir[i].data; + } + } + assert(fixups.empty); +} + +void optimize(Char)(ref Regex!Char zis) +{ + import std.array : insertInPlace; + CodepointSet nextSet(uint idx) + { + CodepointSet set; + with(zis) with(IR) + Outer: + for (uint i = idx; i < ir.length; i += ir[i].length) + { + switch (ir[i].code) + { + case Char: + set.add(ir[i].data, ir[i].data+1); + goto default; + //TODO: OrChar + case Trie, CodepointSet: + set = zis.charsets[ir[i].data]; + goto default; + case GroupStart,GroupEnd: + break; + default: + break Outer; + } + } + return set; + } + + with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length) + { + if (ir[i].code == InfiniteEnd) + { + auto set = nextSet(i+IRL!(InfiniteEnd)); + if (!set.empty && set.length < 10_000) + { + ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data); + ir[i - ir[i].data - IRL!(InfiniteStart)] = + Bytecode(InfiniteBloomStart, ir[i].data); + ir.insertInPlace(i+IRL!(InfiniteEnd), + Bytecode.fromRaw(cast(uint) zis.filters.length)); + zis.filters ~= BitTable(set); + fixupBytecode(ir); + } + } + } +} + +//IR code validator - proper nesting, illegal instructions, etc. +@trusted void validateRe(Char)(ref Regex!Char zis) +{//@@@BUG@@@ text is @system + import std.conv : text; + with(zis) + { + for (uint pc = 0; pc < ir.length; pc += ir[pc].length) + { + if (ir[pc].isStart || ir[pc].isEnd) + { + immutable dest = ir[pc].indexOfPair(pc); + assert(dest < ir.length, text("Wrong length in opcode at pc=", + pc, " ", dest, " vs ", ir.length)); + assert(ir[dest].paired == ir[pc], + text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest)); + } + else if (ir[pc].isAtom) + { + + } + else + assert(0, text("Unknown type of instruction at pc=", pc)); + } + } +} diff --git a/libphobos/src/std/regex/internal/tests.d b/libphobos/src/std/regex/internal/tests.d new file mode 100644 index 0000000..1c4f295 --- /dev/null +++ b/libphobos/src/std/regex/internal/tests.d @@ -0,0 +1,1120 @@ +/* + Regualar expressions package test suite. +*/ +module std.regex.internal.tests; + +package(std.regex): + +import std.conv, std.exception, std.meta, std.range, + std.typecons, std.regex; + +import std.regex.internal.parser : Escapables; // characters that need escaping + +alias Sequence(int B, int E) = staticIota!(B, E); + +@safe unittest +{//sanity checks + regex("(a|b)*"); + regex(`(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*(.*)\s*#`); + regex("abc|edf|ighrg"); + auto r1 = regex("abc"); + auto r2 = regex("(gylba)"); + assert(match("abcdef", r1).hit == "abc"); + assert(!match("wida",r2)); + assert(bmatch("abcdef", r1).hit == "abc"); + assert(!bmatch("wida", r2)); + assert(match("abc", "abc".dup)); + assert(bmatch("abc", "abc".dup)); + Regex!char rc; + assert(rc.empty); + rc = regex("test"); + assert(!rc.empty); +} + +/* The test vectors in this file are altered from Henry Spencer's regexp + test code. His copyright notice is: + + Copyright (c) 1986 by University of Toronto. + Written by Henry Spencer. Not derived from licensed software. + + Permission is granted to anyone to use this software for any + purpose on any computer system, and to redistribute it freely, + subject to the following restrictions: + + 1. The author is not responsible for the consequences of use of + this software, no matter how awful, even if they arise + from defects in it. + + 2. The origin of this software must not be misrepresented, either + by explicit claim or by omission. + + 3. Altered versions must be plainly marked as such, and must not + be misrepresented as being the original software. + + + */ + +@safe unittest +{ + struct TestVectors + { + string pattern; + string input; + string result; + string format; + string replace; + string flags; + } + + static immutable TestVectors[] tv = [ + TestVectors( "a\\b", "a", "y", "$&", "a" ), + TestVectors( "(a)b\\1", "abaab","y", "$&", "aba" ), + TestVectors( "()b\\1", "aaab", "y", "$&", "b" ), + TestVectors( "abc", "abc", "y", "$&", "abc" ), + TestVectors( "abc", "xbc", "n", "-", "-" ), + TestVectors( "abc", "axc", "n", "-", "-" ), + TestVectors( "abc", "abx", "n", "-", "-" ), + TestVectors( "abc", "xabcy","y", "$&", "abc" ), + TestVectors( "abc", "ababc","y", "$&", "abc" ), + TestVectors( "ab*c", "abc", "y", "$&", "abc" ), + TestVectors( "ab*bc", "abc", "y", "$&", "abc" ), + TestVectors( "ab*bc", "abbc", "y", "$&", "abbc" ), + TestVectors( "ab*bc", "abbbbc","y", "$&", "abbbbc" ), + TestVectors( "ab+bc", "abbc", "y", "$&", "abbc" ), + TestVectors( "ab+bc", "abc", "n", "-", "-" ), + TestVectors( "ab+bc", "abq", "n", "-", "-" ), + TestVectors( "ab+bc", "abbbbc","y", "$&", "abbbbc" ), + TestVectors( "ab?bc", "abbc", "y", "$&", "abbc" ), + TestVectors( "ab?bc", "abc", "y", "$&", "abc" ), + TestVectors( "ab?bc", "abbbbc","n", "-", "-" ), + TestVectors( "ab?c", "abc", "y", "$&", "abc" ), + TestVectors( "^abc$", "abc", "y", "$&", "abc" ), + TestVectors( "^abc$", "abcc", "n", "-", "-" ), + TestVectors( "^abc", "abcc", "y", "$&", "abc" ), + TestVectors( "^abc$", "aabc", "n", "-", "-" ), + TestVectors( "abc$", "aabc", "y", "$&", "abc" ), + TestVectors( "^", "abc", "y", "$&", "" ), + TestVectors( "$", "abc", "y", "$&", "" ), + TestVectors( "a.c", "abc", "y", "$&", "abc" ), + TestVectors( "a.c", "axc", "y", "$&", "axc" ), + TestVectors( "a.*c", "axyzc","y", "$&", "axyzc" ), + TestVectors( "a.*c", "axyzd","n", "-", "-" ), + TestVectors( "a[bc]d", "abc", "n", "-", "-" ), + TestVectors( "a[bc]d", "abd", "y", "$&", "abd" ), + TestVectors( "a[b-d]e", "abd", "n", "-", "-" ), + TestVectors( "a[b-d]e", "ace", "y", "$&", "ace" ), + TestVectors( "a[b-d]", "aac", "y", "$&", "ac" ), + TestVectors( "a[-b]", "a-", "y", "$&", "a-" ), + TestVectors( "a[b-]", "a-", "y", "$&", "a-" ), + TestVectors( "a[b-a]", "-", "c", "-", "-" ), + TestVectors( "a[]b", "-", "c", "-", "-" ), + TestVectors( "a[", "-", "c", "-", "-" ), + TestVectors( "a]", "a]", "y", "$&", "a]" ), + TestVectors( "a[\\]]b", "a]b", "y", "$&", "a]b" ), + TestVectors( "a[^bc]d", "aed", "y", "$&", "aed" ), + TestVectors( "a[^bc]d", "abd", "n", "-", "-" ), + TestVectors( "a[^-b]c", "adc", "y", "$&", "adc" ), + TestVectors( "a[^-b]c", "a-c", "n", "-", "-" ), + TestVectors( "a[^\\]b]c", "adc", "y", "$&", "adc" ), + TestVectors( "ab|cd", "abc", "y", "$&", "ab" ), + TestVectors( "ab|cd", "abcd", "y", "$&", "ab" ), + TestVectors( "()ef", "def", "y", "$&-$1", "ef-" ), + TestVectors( "()*", "-", "y", "-", "-" ), + TestVectors( "*a", "-", "c", "-", "-" ), + TestVectors( "^*", "-", "y", "-", "-" ), + TestVectors( "$*", "-", "y", "-", "-" ), + TestVectors( "(*)b", "-", "c", "-", "-" ), + TestVectors( "$b", "b", "n", "-", "-" ), + TestVectors( "a\\", "-", "c", "-", "-" ), + TestVectors( "a\\(b", "a(b", "y", "$&-$1", "a(b-" ), + TestVectors( "a\\(*b", "ab", "y", "$&", "ab" ), + TestVectors( "a\\(*b", "a((b", "y", "$&", "a((b" ), + TestVectors( "a\\\\b", "a\\b", "y", "$&", "a\\b" ), + TestVectors( "abc)", "-", "c", "-", "-" ), + TestVectors( "(abc", "-", "c", "-", "-" ), + TestVectors( "((a))", "abc", "y", "$&-$1-$2", "a-a-a" ), + TestVectors( "(a)b(c)", "abc", "y", "$&-$1-$2", "abc-a-c" ), + TestVectors( "a+b+c", "aabbabc","y", "$&", "abc" ), + TestVectors( "a**", "-", "c", "-", "-" ), + TestVectors( "a*?a", "aa", "y", "$&", "a" ), + TestVectors( "(a*)*", "aaa", "y", "-", "-" ), + TestVectors( "(a*)+", "aaa", "y", "-", "-" ), + TestVectors( "(a|)*", "-", "y", "-", "-" ), + TestVectors( "(a*|b)*", "aabb", "y", "-", "-" ), + TestVectors( "(a|b)*", "ab", "y", "$&-$1", "ab-b" ), + TestVectors( "(a+|b)*", "ab", "y", "$&-$1", "ab-b" ), + TestVectors( "(a+|b)+", "ab", "y", "$&-$1", "ab-b" ), + TestVectors( "(a+|b)?", "ab", "y", "$&-$1", "a-a" ), + TestVectors( "[^ab]*", "cde", "y", "$&", "cde" ), + TestVectors( "(^)*", "-", "y", "-", "-" ), + TestVectors( "(ab|)*", "-", "y", "-", "-" ), + TestVectors( ")(", "-", "c", "-", "-" ), + TestVectors( "", "abc", "y", "$&", "" ), + TestVectors( "abc", "", "n", "-", "-" ), + TestVectors( "a*", "", "y", "$&", "" ), + TestVectors( "([abc])*d", "abbbcd", "y", "$&-$1", "abbbcd-c" ), + TestVectors( "([abc])*bcd", "abcd", "y", "$&-$1", "abcd-a" ), + TestVectors( "a|b|c|d|e", "e", "y", "$&", "e" ), + TestVectors( "(a|b|c|d|e)f", "ef", "y", "$&-$1", "ef-e" ), + TestVectors( "((a*|b))*", "aabb", "y", "-", "-" ), + TestVectors( "abcd*efg", "abcdefg", "y", "$&", "abcdefg" ), + TestVectors( "ab*", "xabyabbbz", "y", "$&", "ab" ), + TestVectors( "ab*", "xayabbbz", "y", "$&", "a" ), + TestVectors( "(ab|cd)e", "abcde", "y", "$&-$1", "cde-cd" ), + TestVectors( "[abhgefdc]ij", "hij", "y", "$&", "hij" ), + TestVectors( "^(ab|cd)e", "abcde", "n", "x$1y", "xy" ), + TestVectors( "(abc|)ef", "abcdef", "y", "$&-$1", "ef-" ), + TestVectors( "(a|b)c*d", "abcd", "y", "$&-$1", "bcd-b" ), + TestVectors( "(ab|ab*)bc", "abc", "y", "$&-$1", "abc-a" ), + TestVectors( "a([bc]*)c*", "abc", "y", "$&-$1", "abc-bc" ), + TestVectors( "a([bc]*)(c*d)", "abcd", "y", "$&-$1-$2", "abcd-bc-d" ), + TestVectors( "a([bc]+)(c*d)", "abcd", "y", "$&-$1-$2", "abcd-bc-d" ), + TestVectors( "a([bc]*)(c+d)", "abcd", "y", "$&-$1-$2", "abcd-b-cd" ), + TestVectors( "a[bcd]*dcdcde", "adcdcde", "y", "$&", "adcdcde" ), + TestVectors( "a[bcd]+dcdcde", "adcdcde", "n", "-", "-" ), + TestVectors( "(ab|a)b*c", "abc", "y", "$&-$1", "abc-ab" ), + TestVectors( "((a)(b)c)(d)", "abcd", "y", "$1-$2-$3-$4", "abc-a-b-d" ), + TestVectors( "[a-zA-Z_][a-zA-Z0-9_]*", "alpha", "y", "$&", "alpha" ), + TestVectors( "^a(bc+|b[eh])g|.h$", "abh", "y", "$&-$1", "bh-" ), + TestVectors( "(bc+d$|ef*g.|h?i(j|k))", "effgz", "y", "$&-$1-$2", "effgz-effgz-" ), + TestVectors( "(bc+d$|ef*g.|h?i(j|k))", "ij", "y", "$&-$1-$2", "ij-ij-j" ), + TestVectors( "(bc+d$|ef*g.|h?i(j|k))", "effg", "n", "-", "-" ), + TestVectors( "(bc+d$|ef*g.|h?i(j|k))", "bcdd", "n", "-", "-" ), + TestVectors( "(bc+d$|ef*g.|h?i(j|k))", "reffgz", "y", "$&-$1-$2", "effgz-effgz-" ), + TestVectors( "(((((((((a)))))))))", "a", "y", "$&", "a" ), + TestVectors( "multiple words of text", "uh-uh", "n", "-", "-" ), + TestVectors( "multiple words", "multiple words, yeah", "y", "$&", "multiple words" ), + TestVectors( "(.*)c(.*)", "abcde", "y", "$&-$1-$2", "abcde-ab-de" ), + TestVectors( "\\((.*), (.*)\\)", "(a, b)", "y", "($2, $1)", "(b, a)" ), + TestVectors( "abcd", "abcd", "y", "$&-&-$$$&", "abcd-&-$abcd" ), + TestVectors( "a(bc)d", "abcd", "y", "$1-$$1-$$$1", "bc-$1-$bc" ), + TestVectors( "[k]", "ab", "n", "-", "-" ), + TestVectors( "[ -~]*", "abc", "y", "$&", "abc" ), + TestVectors( "[ -~ -~]*", "abc", "y", "$&", "abc" ), + TestVectors( "[ -~ -~ -~]*", "abc", "y", "$&", "abc" ), + TestVectors( "[ -~ -~ -~ -~]*", "abc", "y", "$&", "abc" ), + TestVectors( "[ -~ -~ -~ -~ -~]*", "abc", "y", "$&", "abc" ), + TestVectors( "[ -~ -~ -~ -~ -~ -~]*", "abc", "y", "$&", "abc" ), + TestVectors( "[ -~ -~ -~ -~ -~ -~ -~]*", "abc", "y", "$&", "abc" ), + TestVectors( "a{2}", "candy", "n", "", "" ), + TestVectors( "a{2}", "caandy", "y", "$&", "aa" ), + TestVectors( "a{2}", "caaandy", "y", "$&", "aa" ), + TestVectors( "a{2,}", "candy", "n", "", "" ), + TestVectors( "a{2,}", "caandy", "y", "$&", "aa" ), + TestVectors( "a{2,}", "caaaaaandy", "y", "$&", "aaaaaa" ), + TestVectors( "a{1,3}", "cndy", "n", "", "" ), + TestVectors( "a{1,3}", "candy", "y", "$&", "a" ), + TestVectors( "a{1,3}", "caandy", "y", "$&", "aa" ), + TestVectors( "a{1,3}", "caaaaaandy", "y", "$&", "aaa" ), + TestVectors( "e?le?", "angel", "y", "$&", "el" ), + TestVectors( "e?le?", "angle", "y", "$&", "le" ), + TestVectors( "\\bn\\w", "noonday", "y", "$&", "no" ), + TestVectors( "\\wy\\b", "possibly yesterday", "y", "$&", "ly" ), + TestVectors( "\\w\\Bn", "noonday", "y", "$&", "on" ), + TestVectors( "y\\B\\w", "possibly yesterday", "y", "$&", "ye" ), + TestVectors( "\\cJ", "abc\ndef", "y", "$&", "\n" ), + TestVectors( "\\d", "B2 is", "y", "$&", "2" ), + TestVectors( "\\D", "B2 is", "y", "$&", "B" ), + TestVectors( "\\s\\w*", "foo bar", "y", "$&", " bar" ), + TestVectors( "\\S\\w*", "foo bar", "y", "$&", "foo" ), + TestVectors( "abc", "ababc", "y", "$&", "abc" ), + TestVectors( "apple(,)\\sorange\\1", "apple, orange, cherry, peach", "y", "$&", "apple, orange," ), + TestVectors( "(\\w+)\\s(\\w+)", "John Smith", "y", "$2, $1", "Smith, John" ), + TestVectors( "\\n\\f\\r\\t\\v", "abc\n\f\r\t\vdef", "y", "$&", "\n\f\r\t\v" ), + TestVectors( ".*c", "abcde", "y", "$&", "abc" ), + TestVectors( "^\\w+((;|=)\\w+)+$", "some=host=tld", "y", "$&-$1-$2", "some=host=tld-=tld-=" ), + TestVectors( "^\\w+((\\.|-)\\w+)+$", "some.host.tld", "y", "$&-$1-$2", "some.host.tld-.tld-." ), + TestVectors( "q(a|b)*q", "xxqababqyy", "y", "$&-$1", "qababq-b" ), + TestVectors( "^(a)(b){0,1}(c*)", "abcc", "y", "$1 $2 $3", "a b cc" ), + TestVectors( "^(a)((b){0,1})(c*)", "abcc", "y", "$1 $2 $3", "a b b" ), + TestVectors( "^(a)(b)?(c*)", "abcc", "y", "$1 $2 $3", "a b cc" ), + TestVectors( "^(a)((b)?)(c*)", "abcc", "y", "$1 $2 $3", "a b b" ), + TestVectors( "^(a)(b){0,1}(c*)", "acc", "y", "$1 $2 $3", "a cc" ), + TestVectors( "^(a)((b){0,1})(c*)", "acc", "y", "$1 $2 $3", "a " ), + TestVectors( "^(a)(b)?(c*)", "acc", "y", "$1 $2 $3", "a cc" ), + TestVectors( "^(a)((b)?)(c*)", "acc", "y", "$1 $2 $3", "a " ), + TestVectors( "(?:ab){3}", "_abababc","y", "$&-$1", "ababab-" ), + TestVectors( "(?:a(?:x)?)+", "aaxaxx", "y", "$&-$1-$2", "aaxax--" ), + TestVectors( `\W\w\W`, "aa b!ca", "y", "$&", " b!"), +//more repetitions: + TestVectors( "(?:a{2,4}b{1,3}){1,2}", "aaabaaaabbb", "y", "$&", "aaabaaaabbb" ), + TestVectors( "(?:a{2,4}b{1,3}){1,2}?", "aaabaaaabbb", "y", "$&", "aaab" ), +//groups: + TestVectors( "(abc)|(edf)|(xyz)", "xyz", "y", "$1-$2-$3","--xyz"), + TestVectors( "(?P<q>\\d+)/(?P<d>\\d+)", "2/3", "y", "${d}/${q}", "3/2"), +//set operations: + TestVectors( "[a-z--d-f]", " dfa", "y", "$&", "a"), + TestVectors( "[abc[pq--acq]]{2}", "bqpaca", "y", "$&", "pa"), + TestVectors( "[a-z9&&abc0-9]{3}", "z90a0abc", "y", "$&", "abc"), + TestVectors( "[0-9a-f~~0-5a-z]{2}", "g0a58x", "y", "$&", "8x"), + TestVectors( "[abc[pq]xyz[rs]]{4}", "cqxr", "y", "$&", "cqxr"), + TestVectors( "[abcdf--[ab&&[bcd]][acd]]", "abcdefgh", "y", "$&", "f"), + TestVectors( "[a-c||d-f]+", "abcdef", "y", "$&", "abcdef"), + TestVectors( "[a-f--a-c]+", "abcdef", "y", "$&", "def"), + TestVectors( "[a-c&&b-f]+", "abcdef", "y", "$&", "bc"), + TestVectors( "[a-c~~b-f]+", "abcdef", "y", "$&", "a"), +//unicode blocks & properties: + TestVectors( `\P{Inlatin1suppl ement}`, "\u00c2!", "y", "$&", "!"), + TestVectors( `\p{InLatin-1 Supplement}\p{in-mathematical-operators}\P{Inlatin1suppl ement}`, + "\u00c2\u2200\u00c3\u2203.", "y", "$&", "\u00c3\u2203."), + TestVectors( `[-+*/\p{in-mathematical-operators}]{2}`, "a+\u2212", "y", "$&", "+\u2212"), + TestVectors( `\p{Ll}+`, "XabcD", "y", "$&", "abc"), + TestVectors( `\p{Lu}+`, "абвГДЕ", "y", "$&", "ГДЕ"), + TestVectors( `^\p{Currency Symbol}\p{Sc}`, "$₤", "y", "$&", "$₤"), + TestVectors( `\p{Common}\p{Thai}`, "!ฆ", "y", "$&", "!ฆ"), + TestVectors( `[\d\s]*\D`, "12 \t3\U00001680\u0F20_2", "y", "$&", "12 \t3\U00001680\u0F20_"), + TestVectors( `[c-wф]фф`, "ффф", "y", "$&", "ффф"), +//case insensitive: + TestVectors( `^abcdEf$`, "AbCdEF", "y", "$&", "AbCdEF", "i"), + TestVectors( `Русский язык`, "рУсскИй ЯзЫк", "y", "$&", "рУсскИй ЯзЫк", "i"), + TestVectors( `ⒶⒷⓒ` , "ⓐⓑⒸ", "y", "$&", "ⓐⓑⒸ", "i"), + TestVectors( "\U00010400{2}", "\U00010428\U00010400 ", "y", "$&", "\U00010428\U00010400", "i"), + TestVectors( `[adzУ-Я]{4}`, "DzюЯ", "y", "$&", "DzюЯ", "i"), + TestVectors( `\p{L}\p{Lu}{10}`, "абвгдеЖЗИКЛ", "y", "$&", "абвгдеЖЗИКЛ", "i"), + TestVectors( `(?:Dåb){3}`, "DåbDÅBdÅb", "y", "$&", "DåbDÅBdÅb", "i"), +//escapes: + TestVectors( `\u0041\u005a\U00000065\u0001`, "AZe\u0001", "y", "$&", "AZe\u0001"), + TestVectors( `\u`, "", "c", "-", "-"), + TestVectors( `\U`, "", "c", "-", "-"), + TestVectors( `\u003`, "", "c", "-", "-"), + TestVectors( `[\x00-\x7f]{4}`, "\x00\x09ab", "y", "$&", "\x00\x09ab"), + TestVectors( `[\cJ\cK\cA-\cD]{3}\cQ`, "\x01\x0B\x0A\x11", "y", "$&", "\x01\x0B\x0A\x11"), + TestVectors( `\r\n\v\t\f\\`, "\r\n\v\t\f\\", "y", "$&", "\r\n\v\t\f\\"), + TestVectors( `[\u0003\u0001]{2}`, "\u0001\u0003", "y", "$&", "\u0001\u0003"), + TestVectors( `^[\u0020-\u0080\u0001\n-\r]{8}`, "abc\u0001\v\f\r\n", "y", "$&", "abc\u0001\v\f\r\n"), + TestVectors( `\w+\S\w+`, "ab7!44c", "y", "$&", "ab7!44c"), + TestVectors( `\b\w+\b`, " abde4 ", "y", "$&", "abde4"), + TestVectors( `\b\w+\b`, " abde4", "y", "$&", "abde4"), + TestVectors( `\b\w+\b`, "abde4 ", "y", "$&", "abde4"), + TestVectors( `\pL\pS`, "a\u02DA", "y", "$&", "a\u02DA"), + TestVectors( `\pX`, "", "c", "-", "-"), +// ^, $, \b, \B, multiline : + TestVectors( `\r.*?$`, "abc\r\nxy", "y", "$&", "\r\nxy", "sm"), + TestVectors( `^a$^b$`, "a\r\nb\n", "n", "$&", "-", "m"), + TestVectors( `^a$\r\n^b$`,"a\r\nb\n", "y", "$&", "a\r\nb", "m"), + TestVectors( `^$`, "\r\n", "y", "$&", "", "m"), + TestVectors( `^a$\nx$`, "a\nx\u2028","y", "$&", "a\nx", "m"), + TestVectors( `^a$\nx$`, "a\nx\u2029","y", "$&", "a\nx", "m"), + TestVectors( `^a$\nx$`, "a\nx\u0085","y", "$&", "a\nx","m"), + TestVectors( `^x$`, "\u2028x", "y", "$&", "x", "m"), + TestVectors( `^x$`, "\u2029x", "y", "$&", "x", "m"), + TestVectors( `^x$`, "\u0085x", "y", "$&", "x", "m"), + TestVectors( `\b^.`, "ab", "y", "$&", "a"), + TestVectors( `\B^.`, "ab", "n", "-", "-"), + TestVectors( `^ab\Bc\B`, "\r\nabcd", "y", "$&", "abc", "m"), + TestVectors( `^.*$`, "12345678", "y", "$&", "12345678"), + +// luckily obtained regression on incremental matching in backtracker + TestVectors( `^(?:(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*([^ ]*)\s*#|# (?:\w|_)+=((?:\w|_)+))`, + "0020 ; White_Space # ", "y", "$1-$2-$3", "--0020"), +//lookahead + TestVectors( "(foo.)(?=(bar))", "foobar foodbar", "y", "$&-$1-$2", "food-food-bar" ), + TestVectors( `\b(\d+)[a-z](?=\1)`, "123a123", "y", "$&-$1", "123a-123" ), + TestVectors( `\$(?!\d{3})\w+`, "$123 $abc", "y", "$&", "$abc"), + TestVectors( `(abc)(?=(ed(f))\3)`, "abcedff", "y", "-", "-"), + TestVectors( `\b[A-Za-z0-9.]+(?=(@(?!gmail)))`, "a@gmail,x@com", "y", "$&-$1", "x-@"), + TestVectors( `x()(abc)(?=(d)(e)(f)\2)`, "xabcdefabc", "y", "$&", "xabc"), + TestVectors( `x()(abc)(?=(d)(e)(f)()\3\4\5)`, "xabcdefdef", "y", "$&", "xabc"), +//lookback + TestVectors( `(?<=(ab))\d`, "12ba3ab4", "y", "$&-$1", "4-ab", "i"), + TestVectors( `\w(?<!\d)\w`, "123ab24", "y", "$&", "ab"), + TestVectors( `(?<=Dåb)x\w`, "DåbDÅBxdÅb", "y", "$&", "xd", "i"), + TestVectors( `(?<=(ab*c))x`, "abbbbcxac", "y", "$&-$1", "x-abbbbc"), + TestVectors( `(?<=(ab*?c))x`, "abbbbcxac", "y", "$&-$1", "x-abbbbc"), + TestVectors( `(?<=(a.*?c))x`, "ababbcxac", "y", "$&-$1", "x-abbc"), + TestVectors( `(?<=(a{2,4}b{1,3}))x`, "yyaaaabx", "y", "$&-$1", "x-aaaab"), + TestVectors( `(?<=((?:a{2,4}b{1,3}){1,2}))x`, "aabbbaaaabx", "y", "$&-$1", "x-aabbbaaaab"), + TestVectors( `(?<=((?:a{2,4}b{1,3}){1,2}?))x`, "aabbbaaaabx", "y", "$&-$1", "x-aaaab"), + TestVectors( `(?<=(abc|def|aef))x`, "abcx", "y", "$&-$1", "x-abc"), + TestVectors( `(?<=(abc|def|aef))x`, "aefx", "y", "$&-$1", "x-aef"), + TestVectors( `(?<=(abc|dabc))(x)`, "dabcx", "y", "$&-$1-$2", "x-abc-x"), + TestVectors( `(?<=(|abc))x`, "dabcx", "y", "$&-$1", "x-"), + TestVectors( `(?<=((ab|da)*))x`, "abdaabx", "y", "$&-$2-$1", "x-ab-abdaab"), + TestVectors( `a(?<=(ba(?<=(aba)(?<=aaba))))`, "aabaa", "y", "$&-$1-$2", "a-ba-aba"), + TestVectors( `.(?<!b).`, "bax", "y", "$&", "ax"), + TestVectors( `(?<=b(?<!ab)).`, "abbx", "y", "$&", "x"), + TestVectors( `(?<=\.|[!?]+)X`, "Hey?!X", "y", "$&", "X"), + TestVectors( `(?<=\.|[!?]+)a{3}`, ".Nope.aaaX", "y", "$&", "aaa"), +//mixed lookaround + TestVectors( `a(?<=a(?=b))b`, "ab", "y", "$&", "ab"), + TestVectors( `a(?<=a(?!b))c`, "ac", "y", "$&", "ac"), + TestVectors( `a(?i)bc`, "aBc", "y", "$&", "aBc"), + TestVectors( `a(?i)bc`, "Abc", "n", "$&", "-"), + TestVectors( `(?i)a(?-i)bc`, "aBcAbc", "y", "$&", "Abc"), + TestVectors( `(?s).(?-s).`, "\n\n\na", "y", "$&", "\na"), + TestVectors( `(?m)^a(?-m)$`, "\na", "y", "$&", "a") + ]; + string produceExpected(M,String)(auto ref M m, String fmt) + { + auto app = appender!(String)(); + replaceFmt(fmt, m.captures, app, true); + return app.data; + } + void run_tests(alias matchFn)() + { + int i; + foreach (Char; AliasSeq!( char, wchar, dchar)) + (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 + alias String = immutable(Char)[]; + String produceExpected(M,Range)(auto ref M m, Range fmt) + { + auto app = appender!(String)(); + replaceFmt(fmt, m.captures, app, true); + return app.data; + } + Regex!(Char) r; + foreach (a, tvd; tv) + { + uint c = tvd.result[0]; + debug(std_regex_test) writeln(" Test #", a, " pattern: ", tvd.pattern, " with Char = ", Char.stringof); + try + { + i = 1; + r = regex(to!(String)(tvd.pattern), tvd.flags); + } + catch (RegexException e) + { + i = 0; + debug(std_regex_test) writeln(e.msg); + } + + assert((c == 'c') ? !i : i, "failed to compile pattern "~tvd.pattern); + + if (c != 'c') + { + auto m = matchFn(to!(String)(tvd.input), r); + i = !m.empty; + assert( + (c == 'y') ? i : !i, + text(matchFn.stringof ~": failed to match pattern #", a ,": ", tvd.pattern) + ); + if (c == 'y') + { + auto result = produceExpected(m, to!(String)(tvd.format)); + assert(result == to!String(tvd.replace), + text(matchFn.stringof ~": mismatch pattern #", a, ": ", tvd.pattern," expected: ", + tvd.replace, " vs ", result)); + } + } + } + }(); + debug(std_regex_test) writeln("!!! FReD bulk test done "~matchFn.stringof~" !!!"); + } + + + void ct_tests() + { + import std.algorithm.comparison : equal; + version (std_regex_ct1) + { + pragma(msg, "Testing 1st part of ctRegex"); + alias Tests = Sequence!(0, 155); + } + else version (std_regex_ct2) + { + pragma(msg, "Testing 2nd part of ctRegex"); + alias Tests = Sequence!(155, 174); + } + //FIXME: #174-178 contains CTFE parser bug + else version (std_regex_ct3) + { + pragma(msg, "Testing 3rd part of ctRegex"); + alias Tests = Sequence!(178, 220); + } + else version (std_regex_ct4) + { + pragma(msg, "Testing 4th part of ctRegex"); + alias Tests = Sequence!(220, tv.length); + } + else + alias Tests = AliasSeq!(Sequence!(0, 30), Sequence!(235, tv.length-5)); + foreach (a, v; Tests) + (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 + enum tvd = tv[v]; + static if (tvd.result == "c") + { + static assert(!__traits(compiles, (){ + enum r = regex(tvd.pattern, tvd.flags); + }), "errornously compiles regex pattern: " ~ tvd.pattern); + } + else + { + //BUG: tv[v] is fine but tvd is not known at compile time?! + auto r = ctRegex!(tv[v].pattern, tv[v].flags); + auto nr = regex(tvd.pattern, tvd.flags); + assert(equal(r.ir, nr.ir), + text("!C-T regex! failed to compile pattern #", a ,": ", tvd.pattern)); + auto m = match(tvd.input, r); + auto c = tvd.result[0]; + bool ok = (c == 'y') ^ m.empty; + assert(ok, text("ctRegex: failed to match pattern #", + a ,": ", tvd.pattern)); + if (c == 'y') + { + import std.stdio; + auto result = produceExpected(m, tvd.format); + if (result != tvd.replace) + writeln("ctRegex mismatch pattern #", a, ": ", tvd.pattern," expected: ", + tvd.replace, " vs ", result); + } + } + }(); + debug(std_regex_test) writeln("!!! FReD C-T test done !!!"); + } + + ct_tests(); + run_tests!bmatch(); //backtracker + run_tests!match(); //thompson VM +} + +@safe unittest +{ + auto cr = ctRegex!("abc"); + assert(bmatch("abc",cr).hit == "abc"); + auto cr2 = ctRegex!("ab*c"); + assert(bmatch("abbbbc",cr2).hit == "abbbbc"); +} +@safe unittest +{ + auto cr3 = ctRegex!("^abc$"); + assert(bmatch("abc",cr3).hit == "abc"); + auto cr4 = ctRegex!(`\b(a\B[a-z]b)\b`); + assert(array(match("azb",cr4).captures) == ["azb", "azb"]); +} + +@safe unittest +{ + auto cr5 = ctRegex!("(?:a{2,4}b{1,3}){1,2}"); + assert(bmatch("aaabaaaabbb", cr5).hit == "aaabaaaabbb"); + auto cr6 = ctRegex!("(?:a{2,4}b{1,3}){1,2}?"w); + assert(bmatch("aaabaaaabbb"w, cr6).hit == "aaab"w); +} + +@safe unittest +{ + auto cr7 = ctRegex!(`\r.*?$`,"sm"); + assert(bmatch("abc\r\nxy", cr7).hit == "\r\nxy"); + auto greed = ctRegex!("<packet.*?/packet>"); + assert(bmatch("<packet>text</packet><packet>text</packet>", greed).hit + == "<packet>text</packet>"); +} + +@safe unittest +{ + import std.algorithm.comparison : equal; + auto cr8 = ctRegex!("^(a)(b)?(c*)"); + auto m8 = bmatch("abcc",cr8); + assert(m8); + assert(m8.captures[1] == "a"); + assert(m8.captures[2] == "b"); + assert(m8.captures[3] == "cc"); + auto cr9 = ctRegex!("q(a|b)*q"); + auto m9 = match("xxqababqyy",cr9); + assert(m9); + assert(equal(bmatch("xxqababqyy",cr9).captures, ["qababq", "b"])); +} + +@safe unittest +{ + import std.algorithm.comparison : equal; + auto rtr = regex("a|b|c"); + enum ctr = regex("a|b|c"); + assert(equal(rtr.ir,ctr.ir)); + //CTFE parser BUG is triggered by group + //in the middle of alternation (at least not first and not last) + enum testCT = regex(`abc|(edf)|xyz`); + auto testRT = regex(`abc|(edf)|xyz`); + assert(equal(testCT.ir,testRT.ir)); +} + +@safe unittest +{ + import std.algorithm.comparison : equal; + import std.algorithm.iteration : map; + enum cx = ctRegex!"(A|B|C)"; + auto mx = match("B",cx); + assert(mx); + assert(equal(mx.captures, [ "B", "B"])); + enum cx2 = ctRegex!"(A|B)*"; + assert(match("BAAA",cx2)); + + enum cx3 = ctRegex!("a{3,4}","i"); + auto mx3 = match("AaA",cx3); + assert(mx3); + assert(mx3.captures[0] == "AaA"); + enum cx4 = ctRegex!(`^a{3,4}?[a-zA-Z0-9~]{1,2}`,"i"); + auto mx4 = match("aaaabc", cx4); + assert(mx4); + assert(mx4.captures[0] == "aaaab"); + auto cr8 = ctRegex!("(a)(b)?(c*)"); + auto m8 = bmatch("abcc",cr8); + assert(m8); + assert(m8.captures[1] == "a"); + assert(m8.captures[2] == "b"); + assert(m8.captures[3] == "cc"); + auto cr9 = ctRegex!(".*$", "gm"); + auto m9 = match("First\rSecond", cr9); + assert(m9); + assert(equal(map!"a.hit"(m9), ["First", "", "Second"])); +} + +@safe unittest +{ + import std.algorithm.comparison : equal; + import std.algorithm.iteration : map; +//global matching + void test_body(alias matchFn)() + { + string s = "a quick brown fox jumps over a lazy dog"; + auto r1 = regex("\\b[a-z]+\\b","g"); + string[] test; + foreach (m; matchFn(s, r1)) + test ~= m.hit; + assert(equal(test, [ "a", "quick", "brown", "fox", "jumps", "over", "a", "lazy", "dog"])); + auto free_reg = regex(` + + abc + \s+ + " + ( + [^"]+ + | \\ " + )+ + " + z + `, "x"); + auto m = match(`abc "quoted string with \" inside"z`,free_reg); + assert(m); + string mails = " hey@you.com no@spam.net "; + auto rm = regex(`@(?<=\S+@)\S+`,"g"); + assert(equal(map!"a[0]"(matchFn(mails, rm)), ["@you.com", "@spam.net"])); + auto m2 = matchFn("First line\nSecond line",regex(".*$","gm")); + assert(equal(map!"a[0]"(m2), ["First line", "", "Second line"])); + auto m2a = matchFn("First line\nSecond line",regex(".+$","gm")); + assert(equal(map!"a[0]"(m2a), ["First line", "Second line"])); + auto m2b = matchFn("First line\nSecond line",regex(".+?$","gm")); + assert(equal(map!"a[0]"(m2b), ["First line", "Second line"])); + debug(std_regex_test) writeln("!!! FReD FLAGS test done "~matchFn.stringof~" !!!"); + } + test_body!bmatch(); + test_body!match(); +} + +//tests for accumulated std.regex issues and other regressions +@safe unittest +{ + import std.algorithm.comparison : equal; + import std.algorithm.iteration : map; + void test_body(alias matchFn)() + { + //issue 5857 + //matching goes out of control if ... in (...){x} has .*/.+ + auto c = matchFn("axxxzayyyyyzd",regex("(a.*z){2}d")).captures; + assert(c[0] == "axxxzayyyyyzd"); + assert(c[1] == "ayyyyyz"); + auto c2 = matchFn("axxxayyyyyd",regex("(a.*){2}d")).captures; + assert(c2[0] == "axxxayyyyyd"); + assert(c2[1] == "ayyyyy"); + //issue 2108 + //greedy vs non-greedy + auto nogreed = regex("<packet.*?/packet>"); + assert(matchFn("<packet>text</packet><packet>text</packet>", nogreed).hit + == "<packet>text</packet>"); + auto greed = regex("<packet.*/packet>"); + assert(matchFn("<packet>text</packet><packet>text</packet>", greed).hit + == "<packet>text</packet><packet>text</packet>"); + //issue 4574 + //empty successful match still advances the input + string[] pres, posts, hits; + foreach (m; matchFn("abcabc", regex("","g"))) + { + pres ~= m.pre; + posts ~= m.post; + assert(m.hit.empty); + + } + auto heads = [ + "abcabc", + "abcab", + "abca", + "abc", + "ab", + "a", + "" + ]; + auto tails = [ + "abcabc", + "bcabc", + "cabc", + "abc", + "bc", + "c", + "" + ]; + assert(pres == array(retro(heads))); + assert(posts == tails); + //issue 6076 + //regression on .* + auto re = regex("c.*|d"); + auto m = matchFn("mm", re); + assert(!m); + debug(std_regex_test) writeln("!!! FReD REGRESSION test done "~matchFn.stringof~" !!!"); + auto rprealloc = regex(`((.){5}.{1,10}){5}`); + auto arr = array(repeat('0',100)); + auto m2 = matchFn(arr, rprealloc); + assert(m2); + assert(collectException( + regex(r"^(import|file|binary|config)\s+([^\(]+)\(?([^\)]*)\)?\s*$") + ) is null); + foreach (ch; [Escapables]) + { + assert(match(to!string(ch),regex(`[\`~ch~`]`))); + assert(!match(to!string(ch),regex(`[^\`~ch~`]`))); + assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`))); + } + //bugzilla 7718 + string strcmd = "./myApp.rb -os OSX -path \"/GIT/Ruby Apps/sec\" -conf 'notimer'"; + auto reStrCmd = regex (`(".*")|('.*')`, "g"); + assert(equal(map!"a[0]"(matchFn(strcmd, reStrCmd)), + [`"/GIT/Ruby Apps/sec"`, `'notimer'`])); + } + test_body!bmatch(); + test_body!match(); +} + +// tests for replace +@safe unittest +{ + void test(alias matchFn)() + { + import std.uni : toUpper; + + foreach (i, v; AliasSeq!(string, wstring, dstring)) + { + auto baz(Cap)(Cap m) + if (is(Cap == Captures!(Cap.String))) + { + return toUpper(m.hit); + } + alias String = v; + assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r")), to!String("c")) + == to!String("ack rapacity")); + assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r"), "g"), to!String("c")) + == to!String("ack capacity")); + assert(std.regex.replace!(matchFn)(to!String("noon"), regex(to!String("^n")), to!String("[$&]")) + == to!String("[n]oon")); + assert(std.regex.replace!(matchFn)( + to!String("test1 test2"), regex(to!String(`\w+`),"g"), to!String("$`:$'") + ) == to!String(": test2 test1 :")); + auto s = std.regex.replace!(baz!(Captures!(String)))(to!String("Strap a rocket engine on a chicken."), + regex(to!String("[ar]"), "g")); + assert(s == "StRAp A Rocket engine on A chicken."); + } + debug(std_regex_test) writeln("!!! Replace test done "~matchFn.stringof~" !!!"); + } + test!(bmatch)(); + test!(match)(); +} + +// tests for splitter +@safe unittest +{ + import std.algorithm.comparison : equal; + auto s1 = ", abc, de, fg, hi, "; + auto sp1 = splitter(s1, regex(", *")); + auto w1 = ["", "abc", "de", "fg", "hi", ""]; + assert(equal(sp1, w1)); + + auto s2 = ", abc, de, fg, hi"; + auto sp2 = splitter(s2, regex(", *")); + auto w2 = ["", "abc", "de", "fg", "hi"]; + + uint cnt; + foreach (e; sp2) + { + assert(w2[cnt++] == e); + } + assert(equal(sp2, w2)); +} + +@safe unittest +{ + char[] s1 = ", abc, de, fg, hi, ".dup; + auto sp2 = splitter(s1, regex(", *")); +} + +@safe unittest +{ + import std.algorithm.comparison : equal; + auto s1 = ", abc, de, fg, hi, "; + auto w1 = ["", "abc", "de", "fg", "hi", ""]; + assert(equal(split(s1, regex(", *")), w1[])); +} + +@safe unittest +{ // bugzilla 7141 + string pattern = `[a\--b]`; + assert(match("-", pattern)); + assert(match("b", pattern)); + string pattern2 = `[&-z]`; + assert(match("b", pattern2)); +} +@safe unittest +{//bugzilla 7111 + assert(match("", regex("^"))); +} +@safe unittest +{//bugzilla 7300 + assert(!match("a"d, "aa"d)); +} + +// bugzilla 7551 +@safe unittest +{ + auto r = regex("[]abc]*"); + assert("]ab".matchFirst(r).hit == "]ab"); + assertThrown(regex("[]")); + auto r2 = regex("[]abc--ab]*"); + assert("]ac".matchFirst(r2).hit == "]"); +} + +@safe unittest +{//bugzilla 7674 + assert("1234".replace(regex("^"), "$$") == "$1234"); + assert("hello?".replace(regex(r"\?", "g"), r"\?") == r"hello\?"); + assert("hello?".replace(regex(r"\?", "g"), r"\\?") != r"hello\?"); +} +@safe unittest +{// bugzilla 7679 + import std.algorithm.comparison : equal; + foreach (S; AliasSeq!(string, wstring, dstring)) + (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 + enum re = ctRegex!(to!S(r"\.")); + auto str = to!S("a.b"); + assert(equal(std.regex.splitter(str, re), [to!S("a"), to!S("b")])); + assert(split(str, re) == [to!S("a"), to!S("b")]); + }(); +} +@safe unittest +{//bugzilla 8203 + string data = " + NAME = XPAW01_STA:STATION + NAME = XPAW01_STA + "; + auto uniFileOld = data; + auto r = regex( + r"^NAME = (?P<comp>[a-zA-Z0-9_]+):*(?P<blk>[a-zA-Z0-9_]*)","gm"); + auto uniCapturesNew = match(uniFileOld, r); + for (int i = 0; i < 20; i++) + foreach (matchNew; uniCapturesNew) {} + //a second issue with same symptoms + auto r2 = regex(`([а-яА-Я\-_]+\s*)+(?<=[\s\.,\^])`); + match("аллея Театральная", r2); +} +@safe unittest +{// bugzilla 8637 purity of enforce + auto m = match("hello world", regex("world")); + enforce(m); +} + +// bugzilla 8725 +@safe unittest +{ + static italic = regex( r"\* + (?!\s+) + (.*?) + (?!\s+) + \*", "gx" ); + string input = "this * is* interesting, *very* interesting"; + assert(replace(input, italic, "<i>$1</i>") == + "this * is* interesting, <i>very</i> interesting"); +} + +// bugzilla 8349 +@safe unittest +{ + enum peakRegexStr = r"\>(wgEncode.*Tfbs.*\.(?:narrow)|(?:broad)Peak.gz)</a>"; + enum peakRegex = ctRegex!(peakRegexStr); + //note that the regex pattern itself is probably bogus + assert(match(r"\>wgEncode-blah-Tfbs.narrow</a>", peakRegex)); +} + +// bugzilla 9211 +@safe unittest +{ + import std.algorithm.comparison : equal; + auto rx_1 = regex(r"^(\w)*(\d)"); + auto m = match("1234", rx_1); + assert(equal(m.front, ["1234", "3", "4"])); + auto rx_2 = regex(r"^([0-9])*(\d)"); + auto m2 = match("1234", rx_2); + assert(equal(m2.front, ["1234", "3", "4"])); +} + +// bugzilla 9280 +@safe unittest +{ + string tomatch = "a!b@c"; + static r = regex(r"^(?P<nick>.*?)!(?P<ident>.*?)@(?P<host>.*?)$"); + auto nm = match(tomatch, r); + assert(nm); + auto c = nm.captures; + assert(c[1] == "a"); + assert(c["nick"] == "a"); +} + + +// bugzilla 9579 +@safe unittest +{ + char[] input = ['a', 'b', 'c']; + string format = "($1)"; + // used to give a compile error: + auto re = regex(`(a)`, "g"); + auto r = replace(input, re, format); + assert(r == "(a)bc"); +} + +// bugzilla 9634 +@safe unittest +{ + auto re = ctRegex!"(?:a+)"; + assert(match("aaaa", re).hit == "aaaa"); +} + +//bugzilla 10798 +@safe unittest +{ + auto cr = ctRegex!("[abcd--c]*"); + auto m = "abc".match(cr); + assert(m); + assert(m.hit == "ab"); +} + +// bugzilla 10913 +@system unittest +{ + @system static string foo(const(char)[] s) + { + return s.dup; + } + @safe static string bar(const(char)[] s) + { + return s.dup; + } + () @system { + replace!((a) => foo(a.hit))("blah", regex(`a`)); + }(); + () @safe { + replace!((a) => bar(a.hit))("blah", regex(`a`)); + }(); +} + +// bugzilla 11262 +@safe unittest +{ + enum reg = ctRegex!(r",", "g"); + auto str = "This,List"; + str = str.replace(reg, "-"); + assert(str == "This-List"); +} + +// bugzilla 11775 +@safe unittest +{ + assert(collectException(regex("a{1,0}"))); +} + +// bugzilla 11839 +@safe unittest +{ + import std.algorithm.comparison : equal; + assert(regex(`(?P<var1>\w+)`).namedCaptures.equal(["var1"])); + assert(collectException(regex(`(?P<1>\w+)`))); + assert(regex(`(?P<v1>\w+)`).namedCaptures.equal(["v1"])); + assert(regex(`(?P<__>\w+)`).namedCaptures.equal(["__"])); + assert(regex(`(?P<я>\w+)`).namedCaptures.equal(["я"])); +} + +// bugzilla 12076 +@safe unittest +{ + auto RE = ctRegex!(r"(?<!x[a-z]+)\s([a-z]+)"); + string s = "one two"; + auto m = match(s, RE); +} + +// bugzilla 12105 +@safe unittest +{ + auto r = ctRegex!`.*?(?!a)`; + assert("aaab".matchFirst(r).hit == "aaa"); + auto r2 = ctRegex!`.*(?!a)`; + assert("aaab".matchFirst(r2).hit == "aaab"); +} + +//bugzilla 11784 +@safe unittest +{ + assert("abcdefghijklmnopqrstuvwxyz" + .matchFirst("[a-z&&[^aeiuo]]").hit == "b"); +} + +//bugzilla 12366 +@safe unittest +{ + auto re = ctRegex!(`^((?=(xx+?)\2+$)((?=\2+$)(?=(x+)(\4+$))\5){2})*x?$`); + assert("xxxxxxxx".match(re).empty); + assert(!"xxxx".match(re).empty); +} + +// bugzilla 12582 +@safe unittest +{ + auto r = regex(`(?P<a>abc)`); + assert(collectException("abc".matchFirst(r)["b"])); +} + +// bugzilla 12691 +@safe unittest +{ + assert(bmatch("e@", "^([a-z]|)*$").empty); + assert(bmatch("e@", ctRegex!`^([a-z]|)*$`).empty); +} + +//bugzilla 12713 +@safe unittest +{ + assertThrown(regex("[[a-z]([a-z]|(([[a-z])))")); +} + +//bugzilla 12747 +@safe unittest +{ + assertThrown(regex(`^x(\1)`)); + assertThrown(regex(`^(x(\1))`)); + assertThrown(regex(`^((x)(?=\1))`)); +} + +// bugzilla 14504 +@safe unittest +{ + auto p = ctRegex!("a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?" ~ + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); +} + +// bugzilla 14529 +@safe unittest +{ + auto ctPat2 = regex(r"^[CDF]$", "i"); + foreach (v; ["C", "c", "D", "d", "F", "f"]) + assert(matchAll(v, ctPat2).front.hit == v); +} + +// bugzilla 14615 +@safe unittest +{ + import std.array : appender; + import std.regex : replaceFirst, replaceFirstInto, regex; + import std.stdio : writeln; + + auto example = "Hello, world!"; + auto pattern = regex("^Hello, (bug)"); // won't find this one + auto result = replaceFirst(example, pattern, "$1 Sponge Bob"); + assert(result == "Hello, world!"); // Ok. + + auto sink = appender!string; + replaceFirstInto(sink, example, pattern, "$1 Sponge Bob"); + assert(sink.data == "Hello, world!"); + replaceAllInto(sink, example, pattern, "$1 Sponge Bob"); + assert(sink.data == "Hello, world!Hello, world!"); +} + +// bugzilla 15573 +@safe unittest +{ + auto rx = regex("[c d]", "x"); + assert("a b".matchFirst(rx)); +} + +// bugzilla 15864 +@safe unittest +{ + regex(`(<a (?:(?:\w+=\"[^"]*\")?\s*)*href="\.\.?)"`); +} + +@safe unittest +{ + auto r = regex("(?# comment)abc(?# comment2)"); + assert("abc".matchFirst(r)); + assertThrown(regex("(?#...")); +} + +// bugzilla 17075 +@safe unittest +{ + enum titlePattern = `<title>(.+)</title>`; + static titleRegex = ctRegex!titlePattern; + string input = "<title>" ~ "<".repeat(100_000).join; + assert(input.matchFirst(titleRegex).empty); +} + +// bugzilla 17212 +@safe unittest +{ + auto r = regex(" [a] ", "x"); + assert("a".matchFirst(r)); +} + +// bugzilla 17157 +@safe unittest +{ + import std.algorithm.comparison : equal; + auto ctr = ctRegex!"(a)|(b)|(c)|(d)"; + auto r = regex("(a)|(b)|(c)|(d)", "g"); + auto s = "--a--b--c--d--"; + auto outcomes = [ + ["a", "a", "", "", ""], + ["b", "", "b", "", ""], + ["c", "", "", "c", ""], + ["d", "", "", "", "d"] + ]; + assert(equal!equal(s.matchAll(ctr), outcomes)); + assert(equal!equal(s.bmatch(r), outcomes)); +} + +// bugzilla 17667 +@safe unittest +{ + import std.algorithm.searching : canFind; + void willThrow(T, size_t line = __LINE__)(T arg, string msg) + { + auto e = collectException(regex(arg)); + assert(e.msg.canFind(msg), to!string(line) ~ ": " ~ e.msg); + } + willThrow([r".", r"[\(\{[\]\}\)]"], "no matching ']' found while parsing character class"); + willThrow([r"[\", r"123"], "no matching ']' found while parsing character class"); + willThrow([r"[a-", r"123"], "no matching ']' found while parsing character class"); + willThrow([r"[a-\", r"123"], "invalid escape sequence"); + willThrow([r"\", r"123"], "invalid escape sequence"); +} + +// bugzilla 17668 +@safe unittest +{ + import std.algorithm.searching; + auto e = collectException!RegexException(regex(q"<[^]>")); + assert(e.msg.canFind("no operand for '^'")); +} + +// bugzilla 17673 +@safe unittest +{ + string str = `<">`; + string[] regexps = ["abc", "\"|x"]; + auto regexp = regex(regexps); + auto c = matchFirst(str, regexp); + assert(c); + assert(c.whichPattern == 2); +} + diff --git a/libphobos/src/std/regex/internal/thompson.d b/libphobos/src/std/regex/internal/thompson.d new file mode 100644 index 0000000..4d7deaa --- /dev/null +++ b/libphobos/src/std/regex/internal/thompson.d @@ -0,0 +1,1188 @@ +//Written in the D programming language +/* + Implementation of Thompson NFA std.regex engine. + Key point is evaluation of all possible threads (state) at each step + in a breadth-first manner, thereby geting some nice properties: + - looking at each character only once + - merging of equivalent threads, that gives matching process linear time complexity +*/ +module std.regex.internal.thompson; + +package(std.regex): + +import std.range.primitives; +import std.regex.internal.ir; + +//State of VM thread +struct Thread(DataIndex) +{ + Thread* next; //intrusive linked list + uint pc; + uint counter; //loop counter + uint uopCounter; //counts micro operations inside one macro instruction (e.g. BackRef) + Group!DataIndex[1] matches; +} + +//head-tail singly-linked list +struct ThreadList(DataIndex) +{ + Thread!DataIndex* tip = null, toe = null; + //add new thread to the start of list + void insertFront(Thread!DataIndex* t) + { + if (tip) + { + t.next = tip; + tip = t; + } + else + { + t.next = null; + tip = toe = t; + } + } + //add new thread to the end of list + void insertBack(Thread!DataIndex* t) + { + if (toe) + { + toe.next = t; + toe = t; + } + else + tip = toe = t; + toe.next = null; + } + //move head element out of list + Thread!DataIndex* fetch() + { + auto t = tip; + if (tip == toe) + tip = toe = null; + else + tip = tip.next; + return t; + } + //non-destructive iteration of ThreadList + struct ThreadRange + { + const(Thread!DataIndex)* ct; + this(ThreadList tlist){ ct = tlist.tip; } + @property bool empty(){ return ct is null; } + @property const(Thread!DataIndex)* front(){ return ct; } + @property popFront() + { + assert(ct); + ct = ct.next; + } + } + @property bool empty() + { + return tip == null; + } + ThreadRange opSlice() + { + return ThreadRange(this); + } +} + +template ThompsonOps(E, S, bool withInput:true) +{ +@trusted: + static bool op(IR code:IR.End)(E* e, S* state) + { + with(e) with(state) + { + finish(t, matches, re.ir[t.pc].data); + //fix endpoint of the whole match + matches[0].end = index; + recycle(t); + //cut off low priority threads + recycle(clist); + recycle(worklist); + debug(std_regex_matcher) writeln("Finished thread ", matches); + return false; // no more state to eval + } + } + + static bool op(IR code:IR.Wordboundary)(E* e, S* state) + { + with(e) with(state) + { + dchar back; + DataIndex bi; + //at start & end of input + if (atStart && wordMatcher[front]) + { + t.pc += IRL!(IR.Wordboundary); + return true; + } + else if (atEnd && s.loopBack(index).nextChar(back, bi) + && wordMatcher[back]) + { + t.pc += IRL!(IR.Wordboundary); + return true; + } + else if (s.loopBack(index).nextChar(back, bi)) + { + bool af = wordMatcher[front]; + bool ab = wordMatcher[back]; + if (af ^ ab) + { + t.pc += IRL!(IR.Wordboundary); + return true; + } + } + return popState(e); + } + } + + static bool op(IR code:IR.Notwordboundary)(E* e, S* state) + { + with(e) with(state) + { + dchar back; + DataIndex bi; + //at start & end of input + if (atStart && wordMatcher[front]) + { + return popState(e); + } + else if (atEnd && s.loopBack(index).nextChar(back, bi) + && wordMatcher[back]) + { + return popState(e); + } + else if (s.loopBack(index).nextChar(back, bi)) + { + bool af = wordMatcher[front]; + bool ab = wordMatcher[back] != 0; + if (af ^ ab) + { + return popState(e); + } + } + t.pc += IRL!(IR.Notwordboundary); + } + return true; + } + + static bool op(IR code:IR.Bof)(E* e, S* state) + { + with(e) with(state) + { + if (atStart) + { + t.pc += IRL!(IR.Bof); + return true; + } + else + { + return popState(e); + } + } + } + + static bool op(IR code:IR.Bol)(E* e, S* state) + { + with(e) with(state) + { + dchar back; + DataIndex bi; + if (atStart + ||(s.loopBack(index).nextChar(back,bi) + && startOfLine(back, front == '\n'))) + { + t.pc += IRL!(IR.Bol); + return true; + } + else + { + return popState(e); + } + } + } + + static bool op(IR code:IR.Eof)(E* e, S* state) + { + with(e) with(state) + { + if (atEnd) + { + t.pc += IRL!(IR.Eol); + return true; + } + else + { + return popState(e); + } + } + } + + static bool op(IR code:IR.Eol)(E* e, S* state) + { + with(e) with(state) + { + dchar back; + DataIndex bi; + //no matching inside \r\n + if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi) + && back == '\r'))) + { + t.pc += IRL!(IR.Eol); + return true; + } + else + { + return popState(e); + } + + } + } + + static bool op(IR code:IR.InfiniteStart)(E* e, S* state) + { + with(e) with(state) + t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); + return op!(IR.InfiniteEnd)(e,state); + } + + static bool op(IR code:IR.InfiniteBloomStart)(E* e, S* state) + { + with(e) with(state) + t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart); + return op!(IR.InfiniteBloomEnd)(e,state); + } + + static bool op(IR code:IR.InfiniteQStart)(E* e, S* state) + { + with(e) with(state) + t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart); + return op!(IR.InfiniteQEnd)(e,state); + } + + static bool op(IR code:IR.RepeatStart)(E* e, S* state) + { + with(e) with(state) + t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart); + return op!(IR.RepeatEnd)(e,state); + } + + static bool op(IR code:IR.RepeatQStart)(E* e, S* state) + { + with(e) with(state) + t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart); + return op!(IR.RepeatQEnd)(e,state); + } + + static bool op(IR code)(E* e, S* state) + if (code == IR.RepeatEnd || code == IR.RepeatQEnd) + { + with(e) with(state) + { + //len, step, min, max + uint len = re.ir[t.pc].data; + uint step = re.ir[t.pc+2].raw; + uint min = re.ir[t.pc+3].raw; + if (t.counter < min) + { + t.counter += step; + t.pc -= len; + return true; + } + if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) + { + debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", + t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); + merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; + } + else + { + debug(std_regex_matcher) + writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", + t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); + return popState(e); + } + uint max = re.ir[t.pc+4].raw; + if (t.counter < max) + { + if (re.ir[t.pc].code == IR.RepeatEnd) + { + //queue out-of-loop thread + worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd), t.counter % step)); + t.counter += step; + t.pc -= len; + } + else + { + //queue into-loop thread + worklist.insertFront(fork(t, t.pc - len, t.counter + step)); + t.counter %= step; + t.pc += IRL!(IR.RepeatEnd); + } + } + else + { + t.counter %= step; + t.pc += IRL!(IR.RepeatEnd); + } + return true; + } + } + + static bool op(IR code)(E* e, S* state) + if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd) + { + with(e) with(state) + { + if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) + { + debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", + t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); + merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; + } + else + { + debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", + t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); + return popState(e); + } + uint len = re.ir[t.pc].data; + uint pc1, pc2; //branches to take in priority order + if (re.ir[t.pc].code == IR.InfiniteEnd) + { + pc1 = t.pc - len; + pc2 = t.pc + IRL!(IR.InfiniteEnd); + } + else + { + pc1 = t.pc + IRL!(IR.InfiniteEnd); + pc2 = t.pc - len; + } + worklist.insertFront(fork(t, pc2, t.counter)); + t.pc = pc1; + return true; + } + } + + static bool op(IR code)(E* e, S* state) + if (code == IR.InfiniteBloomEnd) + { + with(e) with(state) + { + if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) + { + debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", + t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); + merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; + } + else + { + debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", + t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); + return popState(e); + } + uint len = re.ir[t.pc].data; + uint pc1, pc2; //branches to take in priority order + pc1 = t.pc - len; + pc2 = t.pc + IRL!(IR.InfiniteBloomEnd); + uint filterIndex = re.ir[t.pc + 2].raw; + if (re.filters[filterIndex][front]) + worklist.insertFront(fork(t, pc2, t.counter)); + t.pc = pc1; + return true; + } + } + + static bool op(IR code:IR.OrEnd)(E* e, S* state) + { + with(e) with(state) + { + if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) + { + debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", + t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); + merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; + t.pc += IRL!(IR.OrEnd); + } + else + { + debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", + t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); + return popState(e); + } + return true; + } + } + + static bool op(IR code:IR.OrStart)(E* e, S* state) + { + with(e) with(state) + { + t.pc += IRL!(IR.OrStart); + return op!(IR.Option)(e,state); + } + } + + static bool op(IR code:IR.Option)(E* e, S* state) + { + with(e) with(state) + { + uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option); + //queue next Option + if (re.ir[next].code == IR.Option) + { + worklist.insertFront(fork(t, next, t.counter)); + } + t.pc += IRL!(IR.Option); + return true; + } + } + + static bool op(IR code:IR.GotoEndOr)(E* e, S* state) + { + with(e) with(state) + { + t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr); + return op!(IR.OrEnd)(e, state); + } + } + + static bool op(IR code:IR.GroupStart)(E* e, S* state) + { + with(e) with(state) + { + uint n = re.ir[t.pc].data; + t.matches.ptr[n].begin = index; + t.pc += IRL!(IR.GroupStart); + return true; + } + } + static bool op(IR code:IR.GroupEnd)(E* e, S* state) + { + with(e) with(state) + { + uint n = re.ir[t.pc].data; + t.matches.ptr[n].end = index; + t.pc += IRL!(IR.GroupEnd); + return true; + } + } + + static bool op(IR code:IR.Backref)(E* e, S* state) + { + with(e) with(state) + { + uint n = re.ir[t.pc].data; + Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; + assert(source); + if (source[n].begin == source[n].end)//zero-width Backref! + { + t.pc += IRL!(IR.Backref); + return true; + } + else + { + size_t idx = source[n].begin + t.uopCounter; + size_t end = source[n].end; + if (s[idx .. end].front == front) + { + import std.utf : stride; + + t.uopCounter += stride(s[idx .. end], 0); + if (t.uopCounter + source[n].begin == source[n].end) + {//last codepoint + t.pc += IRL!(IR.Backref); + t.uopCounter = 0; + } + nlist.insertBack(t); + } + else + recycle(t); + t = worklist.fetch(); + return t != null; + } + } + } + + + static bool op(IR code)(E* e, S* state) + if (code == IR.LookbehindStart || code == IR.NeglookbehindStart) + { + with(e) with(state) + { + uint len = re.ir[t.pc].data; + uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw; + uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart); + bool positive = re.ir[t.pc].code == IR.LookbehindStart; + static if (Stream.isLoopback) + auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); + else + auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); + matcher.re.ngroup = me - ms; + matcher.backrefed = backrefed.empty ? t.matches : backrefed; + //backMatch + auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart)); + freelist = matcher.freelist; + subCounters[t.pc] = matcher.genCounter; + if ((mRes != 0 ) ^ positive) + { + return popState(e); + } + t.pc = end; + return true; + } + } + + static bool op(IR code)(E* e, S* state) + if (code == IR.LookaheadStart || code == IR.NeglookaheadStart) + { + with(e) with(state) + { + auto save = index; + uint len = re.ir[t.pc].data; + uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw; + uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart); + bool positive = re.ir[t.pc].code == IR.LookaheadStart; + static if (Stream.isLoopback) + auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); + else + auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); + matcher.re.ngroup = me - ms; + matcher.backrefed = backrefed.empty ? t.matches : backrefed; + auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart)); + freelist = matcher.freelist; + subCounters[t.pc] = matcher.genCounter; + s.reset(index); + next(); + if ((mRes != 0) ^ positive) + { + return popState(e); + } + t.pc = end; + return true; + } + } + + static bool op(IR code)(E* e, S* state) + if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd || + code == IR.LookbehindEnd || code == IR.NeglookbehindEnd) + { + with(e) with(state) + { + finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data); + recycle(t); + //cut off low priority threads + recycle(clist); + recycle(worklist); + return false; // no more state + } + } + + static bool op(IR code:IR.Nop)(E* e, S* state) + { + with(state) t.pc += IRL!(IR.Nop); + return true; + } + + static bool op(IR code:IR.OrChar)(E* e, S* state) + { + with(e) with(state) + { + uint len = re.ir[t.pc].sequence; + uint end = t.pc + len; + static assert(IRL!(IR.OrChar) == 1); + for (; t.pc < end; t.pc++) + if (re.ir[t.pc].data == front) + break; + if (t.pc != end) + { + t.pc = end; + nlist.insertBack(t); + } + else + recycle(t); + t = worklist.fetch(); + return t != null; + } + } + + static bool op(IR code:IR.Char)(E* e, S* state) + { + with(e) with(state) + { + if (front == re.ir[t.pc].data) + { + t.pc += IRL!(IR.Char); + nlist.insertBack(t); + } + else + recycle(t); + t = worklist.fetch(); + return t != null; + } + } + + static bool op(IR code:IR.Any)(E* e, S* state) + { + with(e) with(state) + { + t.pc += IRL!(IR.Any); + nlist.insertBack(t); + t = worklist.fetch(); + return t != null; + } + } + + static bool op(IR code:IR.CodepointSet)(E* e, S* state) + { + with(e) with(state) + { + if (re.charsets[re.ir[t.pc].data].scanFor(front)) + { + t.pc += IRL!(IR.CodepointSet); + nlist.insertBack(t); + } + else + { + recycle(t); + } + t = worklist.fetch(); + return t != null; + } + } + + static bool op(IR code:IR.Trie)(E* e, S* state) + { + with(e) with(state) + { + if (re.matchers[re.ir[t.pc].data][front]) + { + t.pc += IRL!(IR.Trie); + nlist.insertBack(t); + } + else + { + recycle(t); + } + t = worklist.fetch(); + return t != null; + } + } + +} + +template ThompsonOps(E,S, bool withInput:false) +{ +@trusted: + // can't match these without input + static bool op(IR code)(E* e, S* state) + if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet + || code == IR.Trie || code == IR.Char || code == IR.Any) + { + return state.popState(e); + } + + // special case of zero-width backref + static bool op(IR code:IR.Backref)(E* e, S* state) + { + with(e) with(state) + { + uint n = re.ir[t.pc].data; + Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; + assert(source); + if (source[n].begin == source[n].end)//zero-width Backref! + { + t.pc += IRL!(IR.Backref); + return true; + } + else + return popState(e); + } + } + + // forward all control flow to normal versions + static bool op(IR code)(E* e, S* state) + if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet + && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref) + { + return ThompsonOps!(E,S,true).op!code(e,state); + } +} + +/+ + Thomspon matcher does all matching in lockstep, + never looking at the same char twice ++/ +@trusted struct ThompsonMatcher(Char, StreamType = Input!Char) +if (is(Char : dchar)) +{ + alias DataIndex = Stream.DataIndex; + alias Stream = StreamType; + alias OpFunc = bool function(ThompsonMatcher*, State*); + alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream)); + alias OpBackFunc = bool function(BackMatcher*, BackMatcher.State*); + Thread!DataIndex* freelist; + ThreadList!DataIndex clist, nlist; + DataIndex[] merge; + Group!DataIndex[] backrefed; + Regex!Char re; //regex program + Stream s; + dchar front; + DataIndex index; + DataIndex genCounter; //merge trace counter, goes up on every dchar + size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter + OpFunc[] opCacheTrue; // pointers to Op!(IR.xyz) for each bytecode + OpFunc[] opCacheFalse; // ditto + OpBackFunc[] opCacheBackTrue; // ditto + OpBackFunc[] opCacheBackFalse; // ditto + size_t threadSize; + int matched; + bool exhausted; + + static struct State + { + Thread!DataIndex* t; + ThreadList!DataIndex worklist; + Group!DataIndex[] matches; + + bool popState(E)(E* e) + { + with(e) + { + recycle(t); + t = worklist.fetch(); + return t != null; + } + } + + } + + static if (__traits(hasMember,Stream, "search")) + { + enum kicked = true; + } + else + enum kicked = false; + + static size_t getThreadSize(const ref Regex!Char re) + { + return re.ngroup + ? (Thread!DataIndex).sizeof + (re.ngroup-1)*(Group!DataIndex).sizeof + : (Thread!DataIndex).sizeof - (Group!DataIndex).sizeof; + } + + static size_t initialMemory(const ref Regex!Char re) + { + return getThreadSize(re)*re.threadCount + re.hotspotTableSize*size_t.sizeof + +4*OpFunc.sizeof*re.ir.length; + } + + //true if it's start of input + @property bool atStart(){ return index == 0; } + + //true if it's end of input + @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } + + bool next() + { + if (!s.nextChar(front, index)) + { + index = s.lastIndex; + return false; + } + return true; + } + + static if (kicked) + { + bool search() + { + + if (!s.search(re.kickstart, front, index)) + { + index = s.lastIndex; + return false; + } + return true; + } + } + + void initExternalMemory(void[] memory) + { + threadSize = getThreadSize(re); + prepareFreeList(re.threadCount, memory); + if (re.hotspotTableSize) + { + merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory); + merge[] = 0; + } + opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory); + opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory); + opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory); + opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory); + + for (uint pc = 0; pc<re.ir.length; pc += re.ir[pc].length) + { + L_dispatch: + switch (re.ir[pc].code) + { + foreach (e; __traits(allMembers, IR)) + { + mixin(`case IR.`~e~`: + opCacheTrue[pc] = &Ops!(true).op!(IR.`~e~`); + opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e~`); + opCacheFalse[pc] = &Ops!(false).op!(IR.`~e~`); + opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e~`); + break L_dispatch; + `); + } + default: + assert(0, "Unrecognized instruction "~re.ir[pc].mnemonic); + } + } + } + + this()(Regex!Char program, Stream stream, void[] memory) + { + re = program; + s = stream; + initExternalMemory(memory); + genCounter = 0; + } + + this(ref ThompsonMatcher matcher, size_t lo, size_t hi, Stream stream) + { + s = stream; + re = matcher.re; + re.ir = re.ir[lo .. hi]; + threadSize = matcher.threadSize; + merge = matcher.merge; + freelist = matcher.freelist; + opCacheTrue = matcher.opCacheTrue[lo .. hi]; + opCacheBackTrue = matcher.opCacheBackTrue[lo .. hi]; + opCacheFalse = matcher.opCacheFalse[lo .. hi]; + opCacheBackFalse = matcher.opCacheBackFalse[lo .. hi]; + front = matcher.front; + index = matcher.index; + } + + this(ref BackMatcher matcher, size_t lo, size_t hi, Stream stream) + { + s = stream; + re = matcher.re; + re.ir = re.ir[lo .. hi]; + threadSize = matcher.threadSize; + merge = matcher.merge; + freelist = matcher.freelist; + opCacheTrue = matcher.opCacheBackTrue[lo .. hi]; + opCacheBackTrue = matcher.opCacheTrue[lo .. hi]; + opCacheFalse = matcher.opCacheBackFalse[lo .. hi]; + opCacheBackFalse = matcher.opCacheFalse[lo .. hi]; + front = matcher.front; + index = matcher.index; + } + + auto fwdMatcher()(size_t lo, size_t hi, size_t counter) + { + auto m = ThompsonMatcher!(Char, Stream)(this, lo, hi, s); + m.genCounter = counter; + return m; + } + + auto bwdMatcher()(size_t lo, size_t hi, size_t counter) + { + alias BackLooper = typeof(s.loopBack(index)); + auto m = ThompsonMatcher!(Char, BackLooper)(this, lo, hi, s.loopBack(index)); + m.genCounter = counter; + m.next(); + return m; + } + + auto dupTo(void[] memory) + { + typeof(this) tmp = this;//bitblit + tmp.initExternalMemory(memory); + tmp.genCounter = 0; + return tmp; + } + + int match(Group!DataIndex[] matches) + { + debug(std_regex_matcher) + writeln("------------------------------------------"); + if (exhausted) + { + return false; + } + if (re.flags & RegexInfo.oneShot) + { + next(); + exhausted = true; + return matchOneShot(matches); + } + static if (kicked) + if (!re.kickstart.empty) + return matchImpl!(true)(matches); + return matchImpl!(false)(matches); + } + + //match the input and fill matches + int matchImpl(bool withSearch)(Group!DataIndex[] matches) + { + if (!matched && clist.empty) + { + static if (withSearch) + search(); + else + next(); + } + else//char in question is fetched in prev call to match + { + matched = 0; + } + State state; + state.matches = matches; + + if (!atEnd)//if no char + for (;;) + { + genCounter++; + debug(std_regex_matcher) + { + writefln("Threaded matching threads at %s", s[index .. s.lastIndex]); + foreach (t; clist[]) + { + assert(t); + writef("pc=%s ",t.pc); + write(t.matches); + writeln(); + } + } + for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) + { + eval!true(&state); + } + //if we already have match no need to push the engine + if (!matched) + { + state.t = createStart(index); + eval!true(&state);//new thread staring at this position + } + else if (nlist.empty) + { + debug(std_regex_matcher) writeln("Stopped matching before consuming full input"); + break;//not a partial match for sure + } + clist = nlist; + nlist = (ThreadList!DataIndex).init; + if (clist.tip is null) + { + static if (withSearch) + { + if (!search()) + break; + } + else + { + if (!next()) + break; + } + } + else if (!next()) + { + if (!atEnd) return false; + exhausted = true; + break; + } + } + + genCounter++; //increment also on each end + debug(std_regex_matcher) writefln("Threaded matching threads at end"); + //try out all zero-width posibilities + for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) + { + eval!false(&state); + } + if (!matched) + { + state.t = createStart(index); + eval!false(&state);//new thread starting at end of input + } + if (matched) + {//in case NFA found match along the way + //and last possible longer alternative ultimately failed + s.reset(matches[0].end);//reset to last successful match + next();//and reload front character + //--- here the exact state of stream was restored --- + exhausted = atEnd || !(re.flags & RegexOption.global); + //+ empty match advances the input + if (!exhausted && matches[0].begin == matches[0].end) + next(); + } + return matched; + } + + /+ + handle succesful threads + +/ + void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches, int code) + { + matches.ptr[0 .. re.ngroup] = t.matches.ptr[0 .. re.ngroup]; + debug(std_regex_matcher) + { + writef("FOUND pc=%s prog_len=%s", + t.pc, re.ir.length); + if (!matches.empty) + writefln(": %s..%s", matches[0].begin, matches[0].end); + foreach (v; matches) + writefln("%d .. %d", v.begin, v.end); + } + matched = code; + } + + alias Ops(bool withInput) = ThompsonOps!(ThompsonMatcher, State, withInput); + alias BackOps(bool withInput) = ThompsonOps!(BackMatcher, BackMatcher.State, withInput); + + /+ + match thread against codepoint, cutting trough all 0-width instructions + and taking care of control flow, then add it to nlist + +/ + void eval(bool withInput)(State* state) + { + debug(std_regex_matcher) writeln("---- Evaluating thread"); + static if (withInput) + while (opCacheTrue.ptr[state.t.pc](&this, state)){} + else + while (opCacheFalse.ptr[state.t.pc](&this, state)){} + } + enum uint RestartPc = uint.max; + //match the input, evaluating IR without searching + int matchOneShot(Group!DataIndex[] matches, uint startPc = 0) + { + debug(std_regex_matcher) + { + writefln("---------------single shot match ----------------- "); + } + alias evalFn = eval; + assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match + assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc); + State state; + state.matches = matches; + if (!atEnd)//if no char + { + debug(std_regex_matcher) + { + writefln("-- Threaded matching threads at %s", s[index .. s.lastIndex]); + } + if (startPc != RestartPc) + { + state.t = createStart(index, startPc); + genCounter++; + evalFn!true(&state); + } + for (;;) + { + debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle"); + genCounter++; + debug(std_regex_matcher) + { + foreach (t; clist[]) + { + assert(t); + } + } + for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) + { + evalFn!true(&state); + } + if (nlist.empty) + { + debug(std_regex_matcher) writeln("Stopped matching before consuming full input"); + break;//not a partial match for sure + } + clist = nlist; + nlist = (ThreadList!DataIndex).init; + if (!next()) + break; + debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n"); + } + } + genCounter++; //increment also on each end + debug(std_regex_matcher) writefln("-- Matching threads at end"); + //try out all zero-width posibilities + for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) + { + evalFn!false(&state); + } + if (!matched) + { + state.t = createStart(index, startPc); + evalFn!false(&state); + } + return matched; + } + + //get a dirty recycled Thread + Thread!DataIndex* allocate() + { + assert(freelist, "not enough preallocated memory"); + Thread!DataIndex* t = freelist; + freelist = freelist.next; + return t; + } + + //link memory into a free list of Threads + void prepareFreeList(size_t size, ref void[] memory) + { + void[] mem = memory[0 .. threadSize*size]; + memory = memory[threadSize * size .. $]; + freelist = cast(Thread!DataIndex*)&mem[0]; + size_t i; + for (i = threadSize; i < threadSize*size; i += threadSize) + (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i]; + (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null; + } + + //dispose a thread + void recycle(Thread!DataIndex* t) + { + t.next = freelist; + freelist = t; + } + + //dispose list of threads + void recycle(ref ThreadList!DataIndex list) + { + if (list.tip) + { + // just put this head-tail list in front of freelist + list.toe.next = freelist; + freelist = list.tip; + list = list.init; + } + } + + //creates a copy of master thread with given pc + Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter) + { + auto t = allocate(); + t.matches.ptr[0 .. re.ngroup] = master.matches.ptr[0 .. re.ngroup]; + t.pc = pc; + t.counter = counter; + t.uopCounter = 0; + return t; + } + + //creates a start thread + Thread!DataIndex* createStart(DataIndex index, uint pc = 0) + { + auto t = allocate(); + t.matches.ptr[0 .. re.ngroup] = (Group!DataIndex).init; + t.matches[0].begin = index; + t.pc = pc; + t.counter = 0; + t.uopCounter = 0; + return t; + } +} |