aboutsummaryrefslogtreecommitdiff
path: root/libphobos/src/std/regex/internal/thompson.d
diff options
context:
space:
mode:
Diffstat (limited to 'libphobos/src/std/regex/internal/thompson.d')
-rw-r--r--libphobos/src/std/regex/internal/thompson.d1188
1 files changed, 1188 insertions, 0 deletions
diff --git a/libphobos/src/std/regex/internal/thompson.d b/libphobos/src/std/regex/internal/thompson.d
new file mode 100644
index 0000000..4d7deaa
--- /dev/null
+++ b/libphobos/src/std/regex/internal/thompson.d
@@ -0,0 +1,1188 @@
+//Written in the D programming language
+/*
+ Implementation of Thompson NFA std.regex engine.
+ Key point is evaluation of all possible threads (state) at each step
+ in a breadth-first manner, thereby geting some nice properties:
+ - looking at each character only once
+ - merging of equivalent threads, that gives matching process linear time complexity
+*/
+module std.regex.internal.thompson;
+
+package(std.regex):
+
+import std.range.primitives;
+import std.regex.internal.ir;
+
+//State of VM thread
+struct Thread(DataIndex)
+{
+ Thread* next; //intrusive linked list
+ uint pc;
+ uint counter; //loop counter
+ uint uopCounter; //counts micro operations inside one macro instruction (e.g. BackRef)
+ Group!DataIndex[1] matches;
+}
+
+//head-tail singly-linked list
+struct ThreadList(DataIndex)
+{
+ Thread!DataIndex* tip = null, toe = null;
+ //add new thread to the start of list
+ void insertFront(Thread!DataIndex* t)
+ {
+ if (tip)
+ {
+ t.next = tip;
+ tip = t;
+ }
+ else
+ {
+ t.next = null;
+ tip = toe = t;
+ }
+ }
+ //add new thread to the end of list
+ void insertBack(Thread!DataIndex* t)
+ {
+ if (toe)
+ {
+ toe.next = t;
+ toe = t;
+ }
+ else
+ tip = toe = t;
+ toe.next = null;
+ }
+ //move head element out of list
+ Thread!DataIndex* fetch()
+ {
+ auto t = tip;
+ if (tip == toe)
+ tip = toe = null;
+ else
+ tip = tip.next;
+ return t;
+ }
+ //non-destructive iteration of ThreadList
+ struct ThreadRange
+ {
+ const(Thread!DataIndex)* ct;
+ this(ThreadList tlist){ ct = tlist.tip; }
+ @property bool empty(){ return ct is null; }
+ @property const(Thread!DataIndex)* front(){ return ct; }
+ @property popFront()
+ {
+ assert(ct);
+ ct = ct.next;
+ }
+ }
+ @property bool empty()
+ {
+ return tip == null;
+ }
+ ThreadRange opSlice()
+ {
+ return ThreadRange(this);
+ }
+}
+
+template ThompsonOps(E, S, bool withInput:true)
+{
+@trusted:
+ static bool op(IR code:IR.End)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ finish(t, matches, re.ir[t.pc].data);
+ //fix endpoint of the whole match
+ matches[0].end = index;
+ recycle(t);
+ //cut off low priority threads
+ recycle(clist);
+ recycle(worklist);
+ debug(std_regex_matcher) writeln("Finished thread ", matches);
+ return false; // no more state to eval
+ }
+ }
+
+ static bool op(IR code:IR.Wordboundary)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ dchar back;
+ DataIndex bi;
+ //at start & end of input
+ if (atStart && wordMatcher[front])
+ {
+ t.pc += IRL!(IR.Wordboundary);
+ return true;
+ }
+ else if (atEnd && s.loopBack(index).nextChar(back, bi)
+ && wordMatcher[back])
+ {
+ t.pc += IRL!(IR.Wordboundary);
+ return true;
+ }
+ else if (s.loopBack(index).nextChar(back, bi))
+ {
+ bool af = wordMatcher[front];
+ bool ab = wordMatcher[back];
+ if (af ^ ab)
+ {
+ t.pc += IRL!(IR.Wordboundary);
+ return true;
+ }
+ }
+ return popState(e);
+ }
+ }
+
+ static bool op(IR code:IR.Notwordboundary)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ dchar back;
+ DataIndex bi;
+ //at start & end of input
+ if (atStart && wordMatcher[front])
+ {
+ return popState(e);
+ }
+ else if (atEnd && s.loopBack(index).nextChar(back, bi)
+ && wordMatcher[back])
+ {
+ return popState(e);
+ }
+ else if (s.loopBack(index).nextChar(back, bi))
+ {
+ bool af = wordMatcher[front];
+ bool ab = wordMatcher[back] != 0;
+ if (af ^ ab)
+ {
+ return popState(e);
+ }
+ }
+ t.pc += IRL!(IR.Notwordboundary);
+ }
+ return true;
+ }
+
+ static bool op(IR code:IR.Bof)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ if (atStart)
+ {
+ t.pc += IRL!(IR.Bof);
+ return true;
+ }
+ else
+ {
+ return popState(e);
+ }
+ }
+ }
+
+ static bool op(IR code:IR.Bol)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ dchar back;
+ DataIndex bi;
+ if (atStart
+ ||(s.loopBack(index).nextChar(back,bi)
+ && startOfLine(back, front == '\n')))
+ {
+ t.pc += IRL!(IR.Bol);
+ return true;
+ }
+ else
+ {
+ return popState(e);
+ }
+ }
+ }
+
+ static bool op(IR code:IR.Eof)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ if (atEnd)
+ {
+ t.pc += IRL!(IR.Eol);
+ return true;
+ }
+ else
+ {
+ return popState(e);
+ }
+ }
+ }
+
+ static bool op(IR code:IR.Eol)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ dchar back;
+ DataIndex bi;
+ //no matching inside \r\n
+ if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi)
+ && back == '\r')))
+ {
+ t.pc += IRL!(IR.Eol);
+ return true;
+ }
+ else
+ {
+ return popState(e);
+ }
+
+ }
+ }
+
+ static bool op(IR code:IR.InfiniteStart)(E* e, S* state)
+ {
+ with(e) with(state)
+ t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
+ return op!(IR.InfiniteEnd)(e,state);
+ }
+
+ static bool op(IR code:IR.InfiniteBloomStart)(E* e, S* state)
+ {
+ with(e) with(state)
+ t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart);
+ return op!(IR.InfiniteBloomEnd)(e,state);
+ }
+
+ static bool op(IR code:IR.InfiniteQStart)(E* e, S* state)
+ {
+ with(e) with(state)
+ t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart);
+ return op!(IR.InfiniteQEnd)(e,state);
+ }
+
+ static bool op(IR code:IR.RepeatStart)(E* e, S* state)
+ {
+ with(e) with(state)
+ t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart);
+ return op!(IR.RepeatEnd)(e,state);
+ }
+
+ static bool op(IR code:IR.RepeatQStart)(E* e, S* state)
+ {
+ with(e) with(state)
+ t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart);
+ return op!(IR.RepeatQEnd)(e,state);
+ }
+
+ static bool op(IR code)(E* e, S* state)
+ if (code == IR.RepeatEnd || code == IR.RepeatQEnd)
+ {
+ with(e) with(state)
+ {
+ //len, step, min, max
+ uint len = re.ir[t.pc].data;
+ uint step = re.ir[t.pc+2].raw;
+ uint min = re.ir[t.pc+3].raw;
+ if (t.counter < min)
+ {
+ t.counter += step;
+ t.pc -= len;
+ return true;
+ }
+ if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
+ {
+ debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
+ t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
+ merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
+ }
+ else
+ {
+ debug(std_regex_matcher)
+ writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
+ t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
+ return popState(e);
+ }
+ uint max = re.ir[t.pc+4].raw;
+ if (t.counter < max)
+ {
+ if (re.ir[t.pc].code == IR.RepeatEnd)
+ {
+ //queue out-of-loop thread
+ worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd), t.counter % step));
+ t.counter += step;
+ t.pc -= len;
+ }
+ else
+ {
+ //queue into-loop thread
+ worklist.insertFront(fork(t, t.pc - len, t.counter + step));
+ t.counter %= step;
+ t.pc += IRL!(IR.RepeatEnd);
+ }
+ }
+ else
+ {
+ t.counter %= step;
+ t.pc += IRL!(IR.RepeatEnd);
+ }
+ return true;
+ }
+ }
+
+ static bool op(IR code)(E* e, S* state)
+ if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd)
+ {
+ with(e) with(state)
+ {
+ if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
+ {
+ debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
+ t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
+ merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
+ }
+ else
+ {
+ debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
+ t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
+ return popState(e);
+ }
+ uint len = re.ir[t.pc].data;
+ uint pc1, pc2; //branches to take in priority order
+ if (re.ir[t.pc].code == IR.InfiniteEnd)
+ {
+ pc1 = t.pc - len;
+ pc2 = t.pc + IRL!(IR.InfiniteEnd);
+ }
+ else
+ {
+ pc1 = t.pc + IRL!(IR.InfiniteEnd);
+ pc2 = t.pc - len;
+ }
+ worklist.insertFront(fork(t, pc2, t.counter));
+ t.pc = pc1;
+ return true;
+ }
+ }
+
+ static bool op(IR code)(E* e, S* state)
+ if (code == IR.InfiniteBloomEnd)
+ {
+ with(e) with(state)
+ {
+ if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
+ {
+ debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
+ t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
+ merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
+ }
+ else
+ {
+ debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
+ t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
+ return popState(e);
+ }
+ uint len = re.ir[t.pc].data;
+ uint pc1, pc2; //branches to take in priority order
+ pc1 = t.pc - len;
+ pc2 = t.pc + IRL!(IR.InfiniteBloomEnd);
+ uint filterIndex = re.ir[t.pc + 2].raw;
+ if (re.filters[filterIndex][front])
+ worklist.insertFront(fork(t, pc2, t.counter));
+ t.pc = pc1;
+ return true;
+ }
+ }
+
+ static bool op(IR code:IR.OrEnd)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
+ {
+ debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
+ t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
+ merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
+ t.pc += IRL!(IR.OrEnd);
+ }
+ else
+ {
+ debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
+ t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
+ return popState(e);
+ }
+ return true;
+ }
+ }
+
+ static bool op(IR code:IR.OrStart)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ t.pc += IRL!(IR.OrStart);
+ return op!(IR.Option)(e,state);
+ }
+ }
+
+ static bool op(IR code:IR.Option)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option);
+ //queue next Option
+ if (re.ir[next].code == IR.Option)
+ {
+ worklist.insertFront(fork(t, next, t.counter));
+ }
+ t.pc += IRL!(IR.Option);
+ return true;
+ }
+ }
+
+ static bool op(IR code:IR.GotoEndOr)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr);
+ return op!(IR.OrEnd)(e, state);
+ }
+ }
+
+ static bool op(IR code:IR.GroupStart)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ uint n = re.ir[t.pc].data;
+ t.matches.ptr[n].begin = index;
+ t.pc += IRL!(IR.GroupStart);
+ return true;
+ }
+ }
+ static bool op(IR code:IR.GroupEnd)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ uint n = re.ir[t.pc].data;
+ t.matches.ptr[n].end = index;
+ t.pc += IRL!(IR.GroupEnd);
+ return true;
+ }
+ }
+
+ static bool op(IR code:IR.Backref)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ uint n = re.ir[t.pc].data;
+ Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
+ assert(source);
+ if (source[n].begin == source[n].end)//zero-width Backref!
+ {
+ t.pc += IRL!(IR.Backref);
+ return true;
+ }
+ else
+ {
+ size_t idx = source[n].begin + t.uopCounter;
+ size_t end = source[n].end;
+ if (s[idx .. end].front == front)
+ {
+ import std.utf : stride;
+
+ t.uopCounter += stride(s[idx .. end], 0);
+ if (t.uopCounter + source[n].begin == source[n].end)
+ {//last codepoint
+ t.pc += IRL!(IR.Backref);
+ t.uopCounter = 0;
+ }
+ nlist.insertBack(t);
+ }
+ else
+ recycle(t);
+ t = worklist.fetch();
+ return t != null;
+ }
+ }
+ }
+
+
+ static bool op(IR code)(E* e, S* state)
+ if (code == IR.LookbehindStart || code == IR.NeglookbehindStart)
+ {
+ with(e) with(state)
+ {
+ uint len = re.ir[t.pc].data;
+ uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw;
+ uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart);
+ bool positive = re.ir[t.pc].code == IR.LookbehindStart;
+ static if (Stream.isLoopback)
+ auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
+ else
+ auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
+ matcher.re.ngroup = me - ms;
+ matcher.backrefed = backrefed.empty ? t.matches : backrefed;
+ //backMatch
+ auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart));
+ freelist = matcher.freelist;
+ subCounters[t.pc] = matcher.genCounter;
+ if ((mRes != 0 ) ^ positive)
+ {
+ return popState(e);
+ }
+ t.pc = end;
+ return true;
+ }
+ }
+
+ static bool op(IR code)(E* e, S* state)
+ if (code == IR.LookaheadStart || code == IR.NeglookaheadStart)
+ {
+ with(e) with(state)
+ {
+ auto save = index;
+ uint len = re.ir[t.pc].data;
+ uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw;
+ uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart);
+ bool positive = re.ir[t.pc].code == IR.LookaheadStart;
+ static if (Stream.isLoopback)
+ auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
+ else
+ auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
+ matcher.re.ngroup = me - ms;
+ matcher.backrefed = backrefed.empty ? t.matches : backrefed;
+ auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart));
+ freelist = matcher.freelist;
+ subCounters[t.pc] = matcher.genCounter;
+ s.reset(index);
+ next();
+ if ((mRes != 0) ^ positive)
+ {
+ return popState(e);
+ }
+ t.pc = end;
+ return true;
+ }
+ }
+
+ static bool op(IR code)(E* e, S* state)
+ if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd ||
+ code == IR.LookbehindEnd || code == IR.NeglookbehindEnd)
+ {
+ with(e) with(state)
+ {
+ finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data);
+ recycle(t);
+ //cut off low priority threads
+ recycle(clist);
+ recycle(worklist);
+ return false; // no more state
+ }
+ }
+
+ static bool op(IR code:IR.Nop)(E* e, S* state)
+ {
+ with(state) t.pc += IRL!(IR.Nop);
+ return true;
+ }
+
+ static bool op(IR code:IR.OrChar)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ uint len = re.ir[t.pc].sequence;
+ uint end = t.pc + len;
+ static assert(IRL!(IR.OrChar) == 1);
+ for (; t.pc < end; t.pc++)
+ if (re.ir[t.pc].data == front)
+ break;
+ if (t.pc != end)
+ {
+ t.pc = end;
+ nlist.insertBack(t);
+ }
+ else
+ recycle(t);
+ t = worklist.fetch();
+ return t != null;
+ }
+ }
+
+ static bool op(IR code:IR.Char)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ if (front == re.ir[t.pc].data)
+ {
+ t.pc += IRL!(IR.Char);
+ nlist.insertBack(t);
+ }
+ else
+ recycle(t);
+ t = worklist.fetch();
+ return t != null;
+ }
+ }
+
+ static bool op(IR code:IR.Any)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ t.pc += IRL!(IR.Any);
+ nlist.insertBack(t);
+ t = worklist.fetch();
+ return t != null;
+ }
+ }
+
+ static bool op(IR code:IR.CodepointSet)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ if (re.charsets[re.ir[t.pc].data].scanFor(front))
+ {
+ t.pc += IRL!(IR.CodepointSet);
+ nlist.insertBack(t);
+ }
+ else
+ {
+ recycle(t);
+ }
+ t = worklist.fetch();
+ return t != null;
+ }
+ }
+
+ static bool op(IR code:IR.Trie)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ if (re.matchers[re.ir[t.pc].data][front])
+ {
+ t.pc += IRL!(IR.Trie);
+ nlist.insertBack(t);
+ }
+ else
+ {
+ recycle(t);
+ }
+ t = worklist.fetch();
+ return t != null;
+ }
+ }
+
+}
+
+template ThompsonOps(E,S, bool withInput:false)
+{
+@trusted:
+ // can't match these without input
+ static bool op(IR code)(E* e, S* state)
+ if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet
+ || code == IR.Trie || code == IR.Char || code == IR.Any)
+ {
+ return state.popState(e);
+ }
+
+ // special case of zero-width backref
+ static bool op(IR code:IR.Backref)(E* e, S* state)
+ {
+ with(e) with(state)
+ {
+ uint n = re.ir[t.pc].data;
+ Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
+ assert(source);
+ if (source[n].begin == source[n].end)//zero-width Backref!
+ {
+ t.pc += IRL!(IR.Backref);
+ return true;
+ }
+ else
+ return popState(e);
+ }
+ }
+
+ // forward all control flow to normal versions
+ static bool op(IR code)(E* e, S* state)
+ if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet
+ && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref)
+ {
+ return ThompsonOps!(E,S,true).op!code(e,state);
+ }
+}
+
+/+
+ Thomspon matcher does all matching in lockstep,
+ never looking at the same char twice
++/
+@trusted struct ThompsonMatcher(Char, StreamType = Input!Char)
+if (is(Char : dchar))
+{
+ alias DataIndex = Stream.DataIndex;
+ alias Stream = StreamType;
+ alias OpFunc = bool function(ThompsonMatcher*, State*);
+ alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream));
+ alias OpBackFunc = bool function(BackMatcher*, BackMatcher.State*);
+ Thread!DataIndex* freelist;
+ ThreadList!DataIndex clist, nlist;
+ DataIndex[] merge;
+ Group!DataIndex[] backrefed;
+ Regex!Char re; //regex program
+ Stream s;
+ dchar front;
+ DataIndex index;
+ DataIndex genCounter; //merge trace counter, goes up on every dchar
+ size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter
+ OpFunc[] opCacheTrue; // pointers to Op!(IR.xyz) for each bytecode
+ OpFunc[] opCacheFalse; // ditto
+ OpBackFunc[] opCacheBackTrue; // ditto
+ OpBackFunc[] opCacheBackFalse; // ditto
+ size_t threadSize;
+ int matched;
+ bool exhausted;
+
+ static struct State
+ {
+ Thread!DataIndex* t;
+ ThreadList!DataIndex worklist;
+ Group!DataIndex[] matches;
+
+ bool popState(E)(E* e)
+ {
+ with(e)
+ {
+ recycle(t);
+ t = worklist.fetch();
+ return t != null;
+ }
+ }
+
+ }
+
+ static if (__traits(hasMember,Stream, "search"))
+ {
+ enum kicked = true;
+ }
+ else
+ enum kicked = false;
+
+ static size_t getThreadSize(const ref Regex!Char re)
+ {
+ return re.ngroup
+ ? (Thread!DataIndex).sizeof + (re.ngroup-1)*(Group!DataIndex).sizeof
+ : (Thread!DataIndex).sizeof - (Group!DataIndex).sizeof;
+ }
+
+ static size_t initialMemory(const ref Regex!Char re)
+ {
+ return getThreadSize(re)*re.threadCount + re.hotspotTableSize*size_t.sizeof
+ +4*OpFunc.sizeof*re.ir.length;
+ }
+
+ //true if it's start of input
+ @property bool atStart(){ return index == 0; }
+
+ //true if it's end of input
+ @property bool atEnd(){ return index == s.lastIndex && s.atEnd; }
+
+ bool next()
+ {
+ if (!s.nextChar(front, index))
+ {
+ index = s.lastIndex;
+ return false;
+ }
+ return true;
+ }
+
+ static if (kicked)
+ {
+ bool search()
+ {
+
+ if (!s.search(re.kickstart, front, index))
+ {
+ index = s.lastIndex;
+ return false;
+ }
+ return true;
+ }
+ }
+
+ void initExternalMemory(void[] memory)
+ {
+ threadSize = getThreadSize(re);
+ prepareFreeList(re.threadCount, memory);
+ if (re.hotspotTableSize)
+ {
+ merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory);
+ merge[] = 0;
+ }
+ opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory);
+ opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory);
+ opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
+ opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
+
+ for (uint pc = 0; pc<re.ir.length; pc += re.ir[pc].length)
+ {
+ L_dispatch:
+ switch (re.ir[pc].code)
+ {
+ foreach (e; __traits(allMembers, IR))
+ {
+ mixin(`case IR.`~e~`:
+ opCacheTrue[pc] = &Ops!(true).op!(IR.`~e~`);
+ opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e~`);
+ opCacheFalse[pc] = &Ops!(false).op!(IR.`~e~`);
+ opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e~`);
+ break L_dispatch;
+ `);
+ }
+ default:
+ assert(0, "Unrecognized instruction "~re.ir[pc].mnemonic);
+ }
+ }
+ }
+
+ this()(Regex!Char program, Stream stream, void[] memory)
+ {
+ re = program;
+ s = stream;
+ initExternalMemory(memory);
+ genCounter = 0;
+ }
+
+ this(ref ThompsonMatcher matcher, size_t lo, size_t hi, Stream stream)
+ {
+ s = stream;
+ re = matcher.re;
+ re.ir = re.ir[lo .. hi];
+ threadSize = matcher.threadSize;
+ merge = matcher.merge;
+ freelist = matcher.freelist;
+ opCacheTrue = matcher.opCacheTrue[lo .. hi];
+ opCacheBackTrue = matcher.opCacheBackTrue[lo .. hi];
+ opCacheFalse = matcher.opCacheFalse[lo .. hi];
+ opCacheBackFalse = matcher.opCacheBackFalse[lo .. hi];
+ front = matcher.front;
+ index = matcher.index;
+ }
+
+ this(ref BackMatcher matcher, size_t lo, size_t hi, Stream stream)
+ {
+ s = stream;
+ re = matcher.re;
+ re.ir = re.ir[lo .. hi];
+ threadSize = matcher.threadSize;
+ merge = matcher.merge;
+ freelist = matcher.freelist;
+ opCacheTrue = matcher.opCacheBackTrue[lo .. hi];
+ opCacheBackTrue = matcher.opCacheTrue[lo .. hi];
+ opCacheFalse = matcher.opCacheBackFalse[lo .. hi];
+ opCacheBackFalse = matcher.opCacheFalse[lo .. hi];
+ front = matcher.front;
+ index = matcher.index;
+ }
+
+ auto fwdMatcher()(size_t lo, size_t hi, size_t counter)
+ {
+ auto m = ThompsonMatcher!(Char, Stream)(this, lo, hi, s);
+ m.genCounter = counter;
+ return m;
+ }
+
+ auto bwdMatcher()(size_t lo, size_t hi, size_t counter)
+ {
+ alias BackLooper = typeof(s.loopBack(index));
+ auto m = ThompsonMatcher!(Char, BackLooper)(this, lo, hi, s.loopBack(index));
+ m.genCounter = counter;
+ m.next();
+ return m;
+ }
+
+ auto dupTo(void[] memory)
+ {
+ typeof(this) tmp = this;//bitblit
+ tmp.initExternalMemory(memory);
+ tmp.genCounter = 0;
+ return tmp;
+ }
+
+ int match(Group!DataIndex[] matches)
+ {
+ debug(std_regex_matcher)
+ writeln("------------------------------------------");
+ if (exhausted)
+ {
+ return false;
+ }
+ if (re.flags & RegexInfo.oneShot)
+ {
+ next();
+ exhausted = true;
+ return matchOneShot(matches);
+ }
+ static if (kicked)
+ if (!re.kickstart.empty)
+ return matchImpl!(true)(matches);
+ return matchImpl!(false)(matches);
+ }
+
+ //match the input and fill matches
+ int matchImpl(bool withSearch)(Group!DataIndex[] matches)
+ {
+ if (!matched && clist.empty)
+ {
+ static if (withSearch)
+ search();
+ else
+ next();
+ }
+ else//char in question is fetched in prev call to match
+ {
+ matched = 0;
+ }
+ State state;
+ state.matches = matches;
+
+ if (!atEnd)//if no char
+ for (;;)
+ {
+ genCounter++;
+ debug(std_regex_matcher)
+ {
+ writefln("Threaded matching threads at %s", s[index .. s.lastIndex]);
+ foreach (t; clist[])
+ {
+ assert(t);
+ writef("pc=%s ",t.pc);
+ write(t.matches);
+ writeln();
+ }
+ }
+ for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
+ {
+ eval!true(&state);
+ }
+ //if we already have match no need to push the engine
+ if (!matched)
+ {
+ state.t = createStart(index);
+ eval!true(&state);//new thread staring at this position
+ }
+ else if (nlist.empty)
+ {
+ debug(std_regex_matcher) writeln("Stopped matching before consuming full input");
+ break;//not a partial match for sure
+ }
+ clist = nlist;
+ nlist = (ThreadList!DataIndex).init;
+ if (clist.tip is null)
+ {
+ static if (withSearch)
+ {
+ if (!search())
+ break;
+ }
+ else
+ {
+ if (!next())
+ break;
+ }
+ }
+ else if (!next())
+ {
+ if (!atEnd) return false;
+ exhausted = true;
+ break;
+ }
+ }
+
+ genCounter++; //increment also on each end
+ debug(std_regex_matcher) writefln("Threaded matching threads at end");
+ //try out all zero-width posibilities
+ for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
+ {
+ eval!false(&state);
+ }
+ if (!matched)
+ {
+ state.t = createStart(index);
+ eval!false(&state);//new thread starting at end of input
+ }
+ if (matched)
+ {//in case NFA found match along the way
+ //and last possible longer alternative ultimately failed
+ s.reset(matches[0].end);//reset to last successful match
+ next();//and reload front character
+ //--- here the exact state of stream was restored ---
+ exhausted = atEnd || !(re.flags & RegexOption.global);
+ //+ empty match advances the input
+ if (!exhausted && matches[0].begin == matches[0].end)
+ next();
+ }
+ return matched;
+ }
+
+ /+
+ handle succesful threads
+ +/
+ void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches, int code)
+ {
+ matches.ptr[0 .. re.ngroup] = t.matches.ptr[0 .. re.ngroup];
+ debug(std_regex_matcher)
+ {
+ writef("FOUND pc=%s prog_len=%s",
+ t.pc, re.ir.length);
+ if (!matches.empty)
+ writefln(": %s..%s", matches[0].begin, matches[0].end);
+ foreach (v; matches)
+ writefln("%d .. %d", v.begin, v.end);
+ }
+ matched = code;
+ }
+
+ alias Ops(bool withInput) = ThompsonOps!(ThompsonMatcher, State, withInput);
+ alias BackOps(bool withInput) = ThompsonOps!(BackMatcher, BackMatcher.State, withInput);
+
+ /+
+ match thread against codepoint, cutting trough all 0-width instructions
+ and taking care of control flow, then add it to nlist
+ +/
+ void eval(bool withInput)(State* state)
+ {
+ debug(std_regex_matcher) writeln("---- Evaluating thread");
+ static if (withInput)
+ while (opCacheTrue.ptr[state.t.pc](&this, state)){}
+ else
+ while (opCacheFalse.ptr[state.t.pc](&this, state)){}
+ }
+ enum uint RestartPc = uint.max;
+ //match the input, evaluating IR without searching
+ int matchOneShot(Group!DataIndex[] matches, uint startPc = 0)
+ {
+ debug(std_regex_matcher)
+ {
+ writefln("---------------single shot match ----------------- ");
+ }
+ alias evalFn = eval;
+ assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match
+ assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc);
+ State state;
+ state.matches = matches;
+ if (!atEnd)//if no char
+ {
+ debug(std_regex_matcher)
+ {
+ writefln("-- Threaded matching threads at %s", s[index .. s.lastIndex]);
+ }
+ if (startPc != RestartPc)
+ {
+ state.t = createStart(index, startPc);
+ genCounter++;
+ evalFn!true(&state);
+ }
+ for (;;)
+ {
+ debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle");
+ genCounter++;
+ debug(std_regex_matcher)
+ {
+ foreach (t; clist[])
+ {
+ assert(t);
+ }
+ }
+ for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
+ {
+ evalFn!true(&state);
+ }
+ if (nlist.empty)
+ {
+ debug(std_regex_matcher) writeln("Stopped matching before consuming full input");
+ break;//not a partial match for sure
+ }
+ clist = nlist;
+ nlist = (ThreadList!DataIndex).init;
+ if (!next())
+ break;
+ debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n");
+ }
+ }
+ genCounter++; //increment also on each end
+ debug(std_regex_matcher) writefln("-- Matching threads at end");
+ //try out all zero-width posibilities
+ for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
+ {
+ evalFn!false(&state);
+ }
+ if (!matched)
+ {
+ state.t = createStart(index, startPc);
+ evalFn!false(&state);
+ }
+ return matched;
+ }
+
+ //get a dirty recycled Thread
+ Thread!DataIndex* allocate()
+ {
+ assert(freelist, "not enough preallocated memory");
+ Thread!DataIndex* t = freelist;
+ freelist = freelist.next;
+ return t;
+ }
+
+ //link memory into a free list of Threads
+ void prepareFreeList(size_t size, ref void[] memory)
+ {
+ void[] mem = memory[0 .. threadSize*size];
+ memory = memory[threadSize * size .. $];
+ freelist = cast(Thread!DataIndex*)&mem[0];
+ size_t i;
+ for (i = threadSize; i < threadSize*size; i += threadSize)
+ (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i];
+ (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null;
+ }
+
+ //dispose a thread
+ void recycle(Thread!DataIndex* t)
+ {
+ t.next = freelist;
+ freelist = t;
+ }
+
+ //dispose list of threads
+ void recycle(ref ThreadList!DataIndex list)
+ {
+ if (list.tip)
+ {
+ // just put this head-tail list in front of freelist
+ list.toe.next = freelist;
+ freelist = list.tip;
+ list = list.init;
+ }
+ }
+
+ //creates a copy of master thread with given pc
+ Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter)
+ {
+ auto t = allocate();
+ t.matches.ptr[0 .. re.ngroup] = master.matches.ptr[0 .. re.ngroup];
+ t.pc = pc;
+ t.counter = counter;
+ t.uopCounter = 0;
+ return t;
+ }
+
+ //creates a start thread
+ Thread!DataIndex* createStart(DataIndex index, uint pc = 0)
+ {
+ auto t = allocate();
+ t.matches.ptr[0 .. re.ngroup] = (Group!DataIndex).init;
+ t.matches[0].begin = index;
+ t.pc = pc;
+ t.counter = 0;
+ t.uopCounter = 0;
+ return t;
+ }
+}