diff options
author | Ian Lance Taylor <ian@gcc.gnu.org> | 2013-01-29 20:52:43 +0000 |
---|---|---|
committer | Ian Lance Taylor <ian@gcc.gnu.org> | 2013-01-29 20:52:43 +0000 |
commit | d6f2922e91928b5191a5c5f1b3a6b320712b5ce3 (patch) | |
tree | 4f2fad1f4b778519bdd5941185c7e1d032af055b /libgo/go/exp/norm/iter.go | |
parent | 91bfca59095b1cca9d4364996866848eaaf76c26 (diff) | |
download | gcc-d6f2922e91928b5191a5c5f1b3a6b320712b5ce3.zip gcc-d6f2922e91928b5191a5c5f1b3a6b320712b5ce3.tar.gz gcc-d6f2922e91928b5191a5c5f1b3a6b320712b5ce3.tar.bz2 |
libgo: Update Go library to master revision 15489/921e53d4863c.
From-SVN: r195560
Diffstat (limited to 'libgo/go/exp/norm/iter.go')
-rw-r--r-- | libgo/go/exp/norm/iter.go | 423 |
1 files changed, 269 insertions, 154 deletions
diff --git a/libgo/go/exp/norm/iter.go b/libgo/go/exp/norm/iter.go index def822d..a954624 100644 --- a/libgo/go/exp/norm/iter.go +++ b/libgo/go/exp/norm/iter.go @@ -4,53 +4,96 @@ package norm +import ( + "fmt" + "unicode/utf8" +) + const MaxSegmentSize = maxByteBufferSize // An Iter iterates over a string or byte slice, while normalizing it // to a given Form. type Iter struct { - rb reorderBuffer - info Properties // first character saved from previous iteration - next iterFunc // implementation of next depends on form - - p int // current position in input source - outStart int // start of current segment in output buffer - inStart int // start of current segment in input source - maxp int // position in output buffer after which not to start a new segment - maxseg int // for tracking an excess of combining characters + rb reorderBuffer + buf [maxByteBufferSize]byte + info Properties // first character saved from previous iteration + next iterFunc // implementation of next depends on form + asciiF iterFunc - tccc uint8 - done bool + p int // current position in input source + multiSeg []byte // remainder of multi-segment decomposition } -type iterFunc func(*Iter, []byte) int +type iterFunc func(*Iter) []byte -// SetInput initializes i to iterate over src after normalizing it to Form f. -func (i *Iter) SetInput(f Form, src []byte) { - i.rb.init(f, src) - if i.rb.f.composing { - i.next = nextComposed - } else { - i.next = nextDecomposed - } +// Init initializes i to iterate over src after normalizing it to Form f. +func (i *Iter) Init(f Form, src []byte) { i.p = 0 - if i.done = len(src) == 0; !i.done { - i.info = i.rb.f.info(i.rb.src, i.p) + if len(src) == 0 { + i.setDone() + i.rb.nsrc = 0 + return } + i.multiSeg = nil + i.rb.init(f, src) + i.next = i.rb.f.nextMain + i.asciiF = nextASCIIBytes + i.info = i.rb.f.info(i.rb.src, i.p) } -// SetInputString initializes i to iterate over src after normalizing it to Form f. -func (i *Iter) SetInputString(f Form, src string) { +// InitString initializes i to iterate over src after normalizing it to Form f. +func (i *Iter) InitString(f Form, src string) { + i.p = 0 + if len(src) == 0 { + i.setDone() + i.rb.nsrc = 0 + return + } + i.multiSeg = nil i.rb.initString(f, src) - if i.rb.f.composing { - i.next = nextComposed - } else { - i.next = nextDecomposed + i.next = i.rb.f.nextMain + i.asciiF = nextASCIIString + i.info = i.rb.f.info(i.rb.src, i.p) +} + +// Seek sets the segment to be returned by the next call to Next to start +// at position p. It is the responsibility of the caller to set p to the +// start of a UTF8 rune. +func (i *Iter) Seek(offset int64, whence int) (int64, error) { + var abs int64 + switch whence { + case 0: + abs = offset + case 1: + abs = int64(i.p) + offset + case 2: + abs = int64(i.rb.nsrc) + offset + default: + return 0, fmt.Errorf("norm: invalid whence") } - i.p = 0 - if i.done = len(src) == 0; !i.done { - i.info = i.rb.f.info(i.rb.src, i.p) + if abs < 0 { + return 0, fmt.Errorf("norm: negative position") + } + if int(abs) >= i.rb.nsrc { + i.setDone() + return int64(i.p), nil + } + i.p = int(abs) + i.multiSeg = nil + i.next = i.rb.f.nextMain + i.info = i.rb.f.info(i.rb.src, i.p) + return abs, nil +} + +// returnSlice returns a slice of the underlying input type as a byte slice. +// If the underlying is of type []byte, it will simply return a slice. +// If the underlying is of type string, it will copy the slice to the buffer +// and return that. +func (i *Iter) returnSlice(a, b int) []byte { + if i.rb.src.bytes == nil { + return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] } + return i.rb.src.bytes[a:b] } // Pos returns the byte position at which the next call to Next will commence processing. @@ -58,140 +101,232 @@ func (i *Iter) Pos() int { return i.p } +func (i *Iter) setDone() { + i.next = nextDone + i.p = i.rb.nsrc +} + // Done returns true if there is no more input to process. func (i *Iter) Done() bool { - return i.done + return i.p >= i.rb.nsrc } -// Next writes f(i.input[i.Pos():n]...) to buffer buf, where n is the -// largest boundary of i.input such that the result fits in buf. -// It returns the number of bytes written to buf. -// len(buf) should be at least MaxSegmentSize. -// Done must be false before calling Next. -func (i *Iter) Next(buf []byte) int { - return i.next(i, buf) +// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. +// For any input a and b for which f(a) == f(b), subsequent calls +// to Next will return the same segments. +// Modifying runes are grouped together with the preceding starter, if such a starter exists. +// Although not guaranteed, n will typically be the smallest possible n. +func (i *Iter) Next() []byte { + return i.next(i) } -func (i *Iter) initNext(outn, inStart int) { - i.outStart = 0 - i.inStart = inStart - i.maxp = outn - MaxSegmentSize - i.maxseg = MaxSegmentSize +func nextASCIIBytes(i *Iter) []byte { + p := i.p + 1 + if p >= i.rb.nsrc { + i.setDone() + return i.rb.src.bytes[i.p:p] + } + if i.rb.src.bytes[p] < utf8.RuneSelf { + p0 := i.p + i.p = p + return i.rb.src.bytes[p0:p] + } + i.info = i.rb.f.info(i.rb.src, i.p) + i.next = i.rb.f.nextMain + return i.next(i) } -// setStart resets the start of the new segment to the given position. -// It returns true if there is not enough room for the new segment. -func (i *Iter) setStart(outp, inp int) bool { - if outp > i.maxp { - return true +func nextASCIIString(i *Iter) []byte { + p := i.p + 1 + if p >= i.rb.nsrc { + i.buf[0] = i.rb.src.str[i.p] + i.setDone() + return i.buf[:1] } - i.outStart = outp - i.inStart = inp - i.maxseg = outp + MaxSegmentSize - return false + if i.rb.src.str[p] < utf8.RuneSelf { + i.buf[0] = i.rb.src.str[i.p] + i.p = p + return i.buf[:1] + } + i.info = i.rb.f.info(i.rb.src, i.p) + i.next = i.rb.f.nextMain + return i.next(i) } -func min(a, b int) int { - if a < b { - return a +func nextHangul(i *Iter) []byte { + if r := i.rb.src.hangul(i.p); r != 0 { + i.p += hangulUTF8Size + if i.p >= i.rb.nsrc { + i.setDone() + } + return i.buf[:decomposeHangul(i.buf[:], r)] } - return b + i.info = i.rb.f.info(i.rb.src, i.p) + i.next = i.rb.f.nextMain + return i.next(i) +} + +func nextDone(i *Iter) []byte { + return nil +} + +// nextMulti is used for iterating over multi-segment decompositions +// for decomposing normal forms. +func nextMulti(i *Iter) []byte { + j := 0 + d := i.multiSeg + // skip first rune + for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { + } + for j < len(d) { + info := i.rb.f.info(input{bytes: d}, j) + if info.ccc == 0 { + i.multiSeg = d[j:] + return d[:j] + } + j += int(info.size) + } + // treat last segment as normal decomposition + i.next = i.rb.f.nextMain + return i.next(i) +} + +// nextMultiNorm is used for iterating over multi-segment decompositions +// for composing normal forms. +func nextMultiNorm(i *Iter) []byte { + j := 0 + d := i.multiSeg + // skip first rune + for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { + } + for j < len(d) { + info := i.rb.f.info(input{bytes: d}, j) + if info.ccc == 0 { + i.multiSeg = d[j:] + return d[:j] + } + j += int(info.size) + } + i.multiSeg = nil + i.next = nextComposed + i.p++ // restore old valud of i.p. See nextComposed. + if i.p >= i.rb.nsrc { + i.setDone() + } + return d } // nextDecomposed is the implementation of Next for forms NFD and NFKD. -func nextDecomposed(i *Iter, out []byte) int { - var outp int - i.initNext(len(out), i.p) -doFast: - inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart +func nextDecomposed(i *Iter) (next []byte) { + startp, outp := i.p, 0 + inCopyStart, outCopyStart := i.p, 0 for { if sz := int(i.info.size); sz <= 1 { - // ASCII or illegal byte. Either way, advance by 1. - i.p++ + p := i.p + i.p++ // ASCII or illegal byte. Either way, advance by 1. + if i.p >= i.rb.nsrc { + i.setDone() + return i.returnSlice(p, i.p) + } else if i.rb.src._byte(i.p) < utf8.RuneSelf { + i.next = i.asciiF + return i.returnSlice(p, i.p) + } outp++ - max := min(i.rb.nsrc, len(out)-outp+i.p) - if np := i.rb.src.skipASCII(i.p, max); np > i.p { - outp += np - i.p - i.p = np - if i.p >= i.rb.nsrc { - break + } else if d := i.info.Decomposition(); d != nil { + // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. + // Case 1: there is a leftover to copy. In this case the decomposition + // must begin with a modifier and should always be appended. + // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. + p := outp + len(d) + if outp > 0 { + i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) + if p > len(i.buf) { + return i.buf[:outp] } - // ASCII may combine with consecutive runes. - if i.setStart(outp-1, i.p-1) { - i.p-- - outp-- - i.info.size = 1 - break + } else if i.info.multiSegment() { + // outp must be 0 as multi-segment decompositions always + // start a new segment. + if i.multiSeg == nil { + i.multiSeg = d + i.next = nextMulti + return nextMulti(i) } + // We are in the last segment. Treat as normal decomposition. + d = i.multiSeg + i.multiSeg = nil + p = len(d) } - } else if d := i.info.Decomposition(); d != nil { - i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) - p := outp + len(d) - if p > i.maxseg && i.setStart(outp, i.p) { - return outp + prevCC := i.info.tccc + if i.p += sz; i.p >= i.rb.nsrc { + i.setDone() + i.info = Properties{} // Force BoundaryBefore to succeed. + } else { + i.info = i.rb.f.info(i.rb.src, i.p) } - copy(out[outp:], d) + if i.info.BoundaryBefore() { + if outp > 0 { + copy(i.buf[outp:], d) + return i.buf[:p] + } + return d + } + copy(i.buf[outp:], d) outp = p - i.p += sz inCopyStart, outCopyStart = i.p, outp + if i.info.ccc < prevCC { + goto doNorm + } + continue } else if r := i.rb.src.hangul(i.p); r != 0 { - i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) - for { - outp += decomposeHangul(out[outp:], r) - i.p += hangulUTF8Size - if r = i.rb.src.hangul(i.p); r == 0 { - break - } - if i.setStart(outp, i.p) { - return outp - } + i.next = nextHangul + i.p += hangulUTF8Size + if i.p >= i.rb.nsrc { + i.setDone() } - inCopyStart, outCopyStart = i.p, outp + return i.buf[:decomposeHangul(i.buf[:], r)] } else { p := outp + sz - if p > i.maxseg && i.setStart(outp, i.p) { + if p > len(i.buf) { break } outp = p i.p += sz } if i.p >= i.rb.nsrc { + i.setDone() break } prevCC := i.info.tccc i.info = i.rb.f.info(i.rb.src, i.p) - if cc := i.info.ccc; cc == 0 { - if i.setStart(outp, i.p) { - break - } - } else if cc < prevCC { + if i.info.BoundaryBefore() { + break + } else if i.info.ccc < prevCC { goto doNorm } } - if inCopyStart != i.p { - i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) + if outCopyStart == 0 { + return i.returnSlice(inCopyStart, i.p) + } else if inCopyStart < i.p { + i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) } - i.done = i.p >= i.rb.nsrc - return outp + return i.buf[:outp] doNorm: // Insert what we have decomposed so far in the reorderBuffer. // As we will only reorder, there will always be enough room. - i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) - if !i.rb.insertDecomposed(out[i.outStart:outp]) { + i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) + if !i.rb.insertDecomposed(i.buf[0:outp]) { // Start over to prevent decompositions from crossing segment boundaries. // This is a rare occurrence. - i.p = i.inStart + i.p = startp i.info = i.rb.f.info(i.rb.src, i.p) } - outp = i.outStart for { if !i.rb.insert(i.rb.src, i.p, i.info) { break } if i.p += int(i.info.size); i.p >= i.rb.nsrc { - outp += i.rb.flushCopy(out[outp:]) - i.done = true - return outp + i.setDone() + break } i.info = i.rb.f.info(i.rb.src, i.p) if i.info.ccc == 0 { @@ -199,27 +334,19 @@ doNorm: } } // new segment or too many combining characters: exit normalization - if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) { - return outp - } - goto doFast + return i.buf[:i.rb.flushCopy(i.buf[:])] } // nextComposed is the implementation of Next for forms NFC and NFKC. -func nextComposed(i *Iter, out []byte) int { - var outp int - i.initNext(len(out), i.p) -doFast: - inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart +func nextComposed(i *Iter) []byte { + outp, startp := 0, i.p var prevCC uint8 for { if !i.info.isYesC() { goto doNorm } - if cc := i.info.ccc; cc == 0 { - if i.setStart(outp, i.p) { - break - } + if cc := i.info.ccc; cc == 0 && outp > 0 { + break } else if cc < prevCC { goto doNorm } @@ -229,49 +356,33 @@ doFast: sz = 1 // illegal rune: copy byte-by-byte } p := outp + sz - if p > i.maxseg && i.setStart(outp, i.p) { + if p > len(i.buf) { break } outp = p i.p += sz - max := min(i.rb.nsrc, len(out)-outp+i.p) - if np := i.rb.src.skipASCII(i.p, max); np > i.p { - outp += np - i.p - i.p = np - if i.p >= i.rb.nsrc { - break - } - // ASCII may combine with consecutive runes. - if i.setStart(outp-1, i.p-1) { - i.p-- - outp-- - i.info = Properties{size: 1} - break - } - } if i.p >= i.rb.nsrc { + i.setDone() + break + } else if i.rb.src._byte(i.p) < utf8.RuneSelf { + i.next = i.asciiF break } i.info = i.rb.f.info(i.rb.src, i.p) } - if inCopyStart != i.p { - i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) - } - i.done = i.p >= i.rb.nsrc - return outp + return i.returnSlice(startp, i.p) doNorm: - i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.inStart) - outp, i.p = i.outStart, i.inStart + multi := false + i.p = startp i.info = i.rb.f.info(i.rb.src, i.p) for { if !i.rb.insert(i.rb.src, i.p, i.info) { break } + multi = multi || i.info.multiSegment() if i.p += int(i.info.size); i.p >= i.rb.nsrc { - i.rb.compose() - outp += i.rb.flushCopy(out[outp:]) - i.done = true - return outp + i.setDone() + break } i.info = i.rb.f.info(i.rb.src, i.p) if i.info.BoundaryBefore() { @@ -279,8 +390,12 @@ doNorm: } } i.rb.compose() - if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) { - return outp + seg := i.buf[:i.rb.flushCopy(i.buf[:])] + if multi { + i.p-- // fake not being done yet + i.multiSeg = seg + i.next = nextMultiNorm + return nextMultiNorm(i) } - goto doFast + return seg } |