diff options
Diffstat (limited to 'libgo/go/exp/locale/collate/collate.go')
-rw-r--r-- | libgo/go/exp/locale/collate/collate.go | 310 |
1 files changed, 228 insertions, 82 deletions
diff --git a/libgo/go/exp/locale/collate/collate.go b/libgo/go/exp/locale/collate/collate.go index 5853b71..a08dcae 100644 --- a/libgo/go/exp/locale/collate/collate.go +++ b/libgo/go/exp/locale/collate/collate.go @@ -30,7 +30,7 @@ const ( // AlternateHandling identifies the various ways in which variables are handled. // A rune with a primary weight lower than the variable top is considered a -// variable. +// variable. // See http://www.unicode.org/reports/tr10/#Variable_Weighting for details. type AlternateHandling int @@ -83,9 +83,17 @@ type Collator struct { f norm.Form t *table + + _iter [2]iter +} + +func (c *Collator) iter(i int) *iter { + // TODO: evaluate performance for making the second iterator optional. + return &c._iter[i] } // Locales returns the list of locales for which collating differs from its parent locale. +// The returned value should not be modified. func Locales() []string { return availableLocales } @@ -99,11 +107,18 @@ func New(loc string) *Collator { t = mainTable.indexedTable(idx) } } - return &Collator{ + return newCollator(t) +} + +func newCollator(t *table) *Collator { + c := &Collator{ Strength: Quaternary, f: norm.NFD, t: t, } + c._iter[0].init(c) + c._iter[1].init(c) + return c } // SetVariableTop sets all runes with primary strength less than the primary @@ -112,63 +127,114 @@ func (c *Collator) SetVariableTop(r rune) { // TODO: implement } -// Buffer holds reusable buffers that can be used during collation. -// Reusing a Buffer for the various calls that accept it may avoid -// unnecessary memory allocations. +// Buffer holds keys generated by Key and KeyString. type Buffer struct { - // TODO: try various parameters and techniques, such as using - // a chan of buffers for a pool. - ba [4096]byte - wa [512]weights + buf [4096]byte key []byte - ce []weights } func (b *Buffer) init() { - if b.ce == nil { - b.ce = b.wa[:0] - b.key = b.ba[:0] - } else { - b.ce = b.ce[:0] + if b.key == nil { + b.key = b.buf[:0] } } -// ResetKeys clears the buffer used for generated keys. Calling ResetKeys -// invalidates keys previously obtained from Key or KeyFromString. -func (b *Buffer) ResetKeys() { - b.ce = b.ce[:0] +// Reset clears the buffer from previous results generated by Key and KeyString. +func (b *Buffer) Reset() { b.key = b.key[:0] } // Compare returns an integer comparing the two byte slices. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. -// Compare calls ResetKeys, thereby invalidating keys -// previously generated using Key or KeyFromString using buf. -func (c *Collator) Compare(buf *Buffer, a, b []byte) int { - // TODO: for now we simply compute keys and compare. Once we - // have good benchmarks, move to an implementation that works - // incrementally for the majority of cases. - // - Benchmark with long strings that only vary in modifiers. - buf.ResetKeys() - ka := c.Key(buf, a) - kb := c.Key(buf, b) - defer buf.ResetKeys() - return bytes.Compare(ka, kb) +func (c *Collator) Compare(a, b []byte) int { + // TODO: skip identical prefixes once we have a fast way to detect if a rune is + // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. + c.iter(0).setInput(c, a) + c.iter(1).setInput(c, b) + if res := c.compare(); res != 0 { + return res + } + if Identity == c.Strength { + return bytes.Compare(a, b) + } + return 0 } // CompareString returns an integer comparing the two strings. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. -// CompareString calls ResetKeys, thereby invalidating keys -// previously generated using Key or KeyFromString using buf. -func (c *Collator) CompareString(buf *Buffer, a, b string) int { - buf.ResetKeys() - ka := c.KeyFromString(buf, a) - kb := c.KeyFromString(buf, b) - defer buf.ResetKeys() - return bytes.Compare(ka, kb) +func (c *Collator) CompareString(a, b string) int { + // TODO: skip identical prefixes once we have a fast way to detect if a rune is + // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. + c.iter(0).setInputString(c, a) + c.iter(1).setInputString(c, b) + if res := c.compare(); res != 0 { + return res + } + if Identity == c.Strength { + if a < b { + return -1 + } else if a > b { + return 1 + } + } + return 0 +} + +func compareLevel(f func(i *iter) int, a, b *iter) int { + a.pce = 0 + b.pce = 0 + for { + va := f(a) + vb := f(b) + if va != vb { + if va < vb { + return -1 + } + return 1 + } else if va == 0 { + break + } + } + return 0 } -func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int { +func (c *Collator) compare() int { + ia, ib := c.iter(0), c.iter(1) + // Process primary level + if c.Alternate != AltShifted { + // TODO: implement script reordering + // TODO: special hiragana handling + if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 { + return res + } + } else { + // TODO: handle shifted + } + if Secondary <= c.Strength { + f := (*iter).nextSecondary + if c.Backwards { + f = (*iter).prevSecondary + } + if res := compareLevel(f, ia, ib); res != 0 { + return res + } + } + // TODO: special case handling (Danish?) + if Tertiary <= c.Strength || c.CaseLevel { + if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 { + return res + } + // TODO: Not needed for the default value of AltNonIgnorable? + if Quaternary <= c.Strength { + if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 { + return res + } + } + } + return 0 +} + +func (c *Collator) Prefix(s, prefix []byte) int { // iterate over s, track bytes consumed. return 0 } @@ -176,12 +242,11 @@ func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int { // Key returns the collation key for str. // Passing the buffer buf may avoid memory allocations. // The returned slice will point to an allocation in Buffer and will remain -// valid until the next call to buf.ResetKeys(). +// valid until the next call to buf.Reset(). func (c *Collator) Key(buf *Buffer, str []byte) []byte { // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. buf.init() - c.getColElems(buf, str) - return c.key(buf, buf.ce) + return c.key(buf, c.getColElems(str)) } // KeyFromString returns the collation key for str. @@ -191,46 +256,73 @@ func (c *Collator) Key(buf *Buffer, str []byte) []byte { func (c *Collator) KeyFromString(buf *Buffer, str string) []byte { // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. buf.init() - c.getColElemsString(buf, str) - return c.key(buf, buf.ce) + return c.key(buf, c.getColElemsString(str)) } -func (c *Collator) key(buf *Buffer, w []weights) []byte { +func (c *Collator) key(buf *Buffer, w []colElem) []byte { processWeights(c.Alternate, c.t.variableTop, w) kn := len(buf.key) c.keyFromElems(buf, w) return buf.key[kn:] } -func (c *Collator) getColElems(buf *Buffer, str []byte) { - i := c.iter() - i.src.SetInput(c.f, str) +func (c *Collator) getColElems(str []byte) []colElem { + i := c.iter(0) + i.setInput(c, str) for !i.done() { - buf.ce = i.next(buf.ce) + i.next() } + return i.ce } -func (c *Collator) getColElemsString(buf *Buffer, str string) { - i := c.iter() - i.src.SetInputString(c.f, str) +func (c *Collator) getColElemsString(str string) []colElem { + i := c.iter(0) + i.setInputString(c, str) for !i.done() { - buf.ce = i.next(buf.ce) + i.next() } + return i.ce } type iter struct { src norm.Iter - ba [1024]byte + norm [1024]byte buf []byte - t *table p int minBufSize int + + wa [512]colElem + ce []colElem + pce int + + t *table _done, eof bool } -func (c *Collator) iter() iter { - i := iter{t: c.t, minBufSize: c.t.maxContractLen} - i.buf = i.ba[:0] +func (i *iter) init(c *Collator) { + i.t = c.t + i.minBufSize = c.t.maxContractLen + i.ce = i.wa[:0] + i.buf = i.norm[:0] +} + +func (i *iter) reset() { + i.ce = i.ce[:0] + i.buf = i.buf[:0] + i.p = 0 + i.eof = i.src.Done() + i._done = i.eof +} + +func (i *iter) setInput(c *Collator, s []byte) *iter { + i.src.SetInput(c.f, s) + i.reset() + return i +} + +func (i *iter) setInputString(c *Collator, s string) *iter { + i.src.SetInputString(c.f, s) + i.reset() return i } @@ -238,7 +330,7 @@ func (i *iter) done() bool { return i._done } -func (i *iter) next(ce []weights) []weights { +func (i *iter) next() { if !i.eof && len(i.buf)-i.p < i.minBufSize { // replenish buffer n := copy(i.buf, i.buf[i.p:]) @@ -249,14 +341,70 @@ func (i *iter) next(ce []weights) []weights { } if i.p == len(i.buf) { i._done = true - return ce + return } - ce, sz := i.t.appendNext(ce, i.buf[i.p:]) + sz := 0 + i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:]) i.p += sz - return ce } -func appendPrimary(key []byte, p uint32) []byte { +func (i *iter) nextPrimary() int { + for { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[i.pce].primary(); v != 0 { + i.pce++ + return v + } + } + if i.done() { + return 0 + } + i.next() + } + panic("should not reach here") +} + +func (i *iter) nextSecondary() int { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[i.pce].secondary(); v != 0 { + i.pce++ + return v + } + } + return 0 +} + +func (i *iter) prevSecondary() int { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[len(i.ce)-i.pce-1].secondary(); v != 0 { + i.pce++ + return v + } + } + return 0 +} + +func (i *iter) nextTertiary() int { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[i.pce].tertiary(); v != 0 { + i.pce++ + return int(v) + } + } + return 0 +} + +func (i *iter) nextQuaternary() int { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[i.pce].quaternary(); v != 0 { + i.pce++ + return v + } + } + return 0 +} + +func appendPrimary(key []byte, p int) []byte { // Convert to variable length encoding; supports up to 23 bits. if p <= 0x7FFF { key = append(key, uint8(p>>8), uint8(p)) @@ -268,9 +416,9 @@ func appendPrimary(key []byte, p uint32) []byte { // keyFromElems converts the weights ws to a compact sequence of bytes. // The result will be appended to the byte buffer in buf. -func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { +func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) { for _, v := range ws { - if w := v.primary; w > 0 { + if w := v.primary(); w > 0 { buf.key = appendPrimary(buf.key, w) } } @@ -279,13 +427,13 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF. if !c.Backwards { for _, v := range ws { - if w := v.secondary; w > 0 { + if w := v.secondary(); w > 0 { buf.key = append(buf.key, uint8(w>>8), uint8(w)) } } } else { for i := len(ws) - 1; i >= 0; i-- { - if w := ws[i].secondary; w > 0 { + if w := ws[i].secondary(); w > 0 { buf.key = append(buf.key, uint8(w>>8), uint8(w)) } } @@ -296,20 +444,20 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { if Tertiary <= c.Strength || c.CaseLevel { buf.key = append(buf.key, 0, 0) for _, v := range ws { - if w := v.tertiary; w > 0 { - buf.key = append(buf.key, w) + if w := v.tertiary(); w > 0 { + buf.key = append(buf.key, uint8(w)) } } // Derive the quaternary weights from the options and other levels. // Note that we represent maxQuaternary as 0xFF. The first byte of the // representation of a a primary weight is always smaller than 0xFF, // so using this single byte value will compare correctly. - if Quaternary <= c.Strength { + if Quaternary <= c.Strength && c.Alternate >= AltShifted { if c.Alternate == AltShiftTrimmed { lastNonFFFF := len(buf.key) buf.key = append(buf.key, 0) for _, v := range ws { - if w := v.quaternary; w == maxQuaternary { + if w := v.quaternary(); w == maxQuaternary { buf.key = append(buf.key, 0xFF) } else if w > 0 { buf.key = appendPrimary(buf.key, w) @@ -320,7 +468,7 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { } else { buf.key = append(buf.key, 0) for _, v := range ws { - if w := v.quaternary; w == maxQuaternary { + if w := v.quaternary(); w == maxQuaternary { buf.key = append(buf.key, 0xFF) } else if w > 0 { buf.key = appendPrimary(buf.key, w) @@ -331,29 +479,27 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []weights) { } } -func processWeights(vw AlternateHandling, top uint32, wa []weights) { +func processWeights(vw AlternateHandling, top uint32, wa []colElem) { ignore := false + vtop := int(top) switch vw { case AltShifted, AltShiftTrimmed: for i := range wa { - if p := wa[i].primary; p <= top && p != 0 { - wa[i] = weights{quaternary: p} + if p := wa[i].primary(); p <= vtop && p != 0 { + wa[i] = makeQuaternary(p) ignore = true } else if p == 0 { if ignore { - wa[i] = weights{} - } else if wa[i].tertiary != 0 { - wa[i].quaternary = maxQuaternary + wa[i] = ceIgnore } } else { - wa[i].quaternary = maxQuaternary ignore = false } } case AltBlanked: for i := range wa { - if p := wa[i].primary; p <= top && (ignore || p != 0) { - wa[i] = weights{} + if p := wa[i].primary(); p <= vtop && (ignore || p != 0) { + wa[i] = ceIgnore ignore = true } else { ignore = false |