Skip to content

Commit

Permalink
exp/locale/collate: preparation for adding Search API. Also changed t…
Browse files Browse the repository at this point in the history
…he collate API

further to how (I believe) it will end up being.
It is nicer to separate search from sorting functionality. Collation needs tables that
are not needed by search and vice-versa.  The common functionality is separated out
in the Weigher interface.  As this interface is very low-level, it will be moved to
a sub package (colltab) in a next CL.
The types that will move to this package are Weigher, Elem, and Level.  The addition
of Elem allows for removing some of the duplicate code between collate and collate/build.
This CL also introduces some stubs for a higher-level API for options. The default
proposed options are quite complex and require the user to have a decent understanding
of Unicode collation.  The new options hide a lot of the complexity.

R=rsc
CC=golang-dev
https://golang.org/cl/7058051
  • Loading branch information
mpvl committed Jan 23, 2013
1 parent 7f0d165 commit f86ae99
Show file tree
Hide file tree
Showing 11 changed files with 395 additions and 264 deletions.
6 changes: 3 additions & 3 deletions src/pkg/exp/locale/collate/build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,11 +467,11 @@ func (b *Builder) Build() (*collate.Collator, error) {
if err != nil {
return nil, err
}
c := collate.Init(t)
if c == nil {
table := collate.Init(t)
if table == nil {
panic("generated table of incompatible type")
}
return c, nil
return collate.NewFromTable(table), nil
}

// Build builds a Collator for Tailoring t.
Expand Down
20 changes: 2 additions & 18 deletions src/pkg/exp/locale/collate/build/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,30 +69,14 @@ func (t *table) fprint(w io.Writer, name string) (n, size int, err error) {
}
size += sz
}
p := func(f string, a ...interface{}) {
nn, e := fmt.Fprintf(w, f, a...)
update(nn, 0, e)
}
// Write main table.
size += int(reflect.TypeOf(*t).Size())
p("var %sTable = table{\n", name)
update(t.index.printStruct(w, t.root, name))
p(",\n")
p("%sExpandElem[:],\n", name)
update(t.contractTries.printStruct(w, name))
p(",\n")
p("%sContractElem[:],\n", name)
p("%d,\n", t.maxContractLen)
p("0x%X,\n", t.variableTop)
p("}\n\n")

// Write arrays needed for the structure.
update(printColElems(w, t.expandElem, name+"ExpandElem"))
update(printColElems(w, t.contractElem, name+"ContractElem"))
update(t.index.printArrays(w, name))
update(t.contractTries.printArray(w, name))

p("// Total size of %sTable is %d bytes\n", name, size)
nn, e := fmt.Fprintf(w, "// Total size of %sTable is %d bytes\n", name, size)
update(nn, 0, e)
return
}

Expand Down
125 changes: 88 additions & 37 deletions src/pkg/exp/locale/collate/colelem.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,43 @@ import (
"unicode"
)

// Level identifies the collation comparison level.
// The primary level corresponds to the basic sorting of text.
// The secondary level corresponds to accents and related linguistic elements.
// The tertiary level corresponds to casing and related concepts.
// The quaternary level is derived from the other levels by the
// various algorithms for handling variable elements.
type Level int

const (
Primary Level = iota
Secondary
Tertiary
Quaternary
Identity
)

const (
defaultSecondary = 0x20
defaultTertiary = 0x2
maxTertiary = 0x1F
maxQuaternary = 0x1FFFFF // 21 bits.
MaxQuaternary = 0x1FFFFF // 21 bits.
)

// colElem is a representation of a collation element.
// In the typical case, a rune maps to a single collation element. If a rune
// can be the start of a contraction or expands into multiple collation elements,
// then the colElem that is associated with a rune will have a special form to represent
// such m to n mappings. Such special colElems have a value >= 0x80000000.
type colElem uint32
// Elem is a representation of a collation element. This API provides ways to encode
// and decode Elems. Implementations of collation tables may use values greater
// or equal to PrivateUse for their own purposes. However, these should never be
// returned by AppendNext.
type Elem uint32

const (
maxCE colElem = 0xAFFFFFFF
minContract = 0xC0000000
maxContract = 0xDFFFFFFF
minExpand = 0xE0000000
maxExpand = 0xEFFFFFFF
minDecomp = 0xF0000000
maxCE Elem = 0xAFFFFFFF
PrivateUse = minContract
minContract = 0xC0000000
maxContract = 0xDFFFFFFF
minExpand = 0xE0000000
maxExpand = 0xEFFFFFFF
minDecomp = 0xF0000000
)

type ceType int
Expand All @@ -40,7 +56,7 @@ const (
ceDecompose // rune expands using NFKC decomposition
)

func (ce colElem) ctype() ceType {
func (ce Elem) ctype() ceType {
if ce <= maxCE {
return ceNormal
}
Expand Down Expand Up @@ -97,15 +113,32 @@ const (
minCompactSecondary = defaultSecondary - 4
)

func makeImplicitCE(primary int) colElem {
return ceType1 | colElem(primary<<primaryShift) | defaultSecondary
func makeImplicitCE(primary int) Elem {
return ceType1 | Elem(primary<<primaryShift) | defaultSecondary
}

// MakeElem returns an Elem for the given values. It will return an error
// if the given combination of values is invalid.
func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) {
// TODO: implement
return 0, nil
}

func makeQuaternary(primary int) colElem {
return ceTypeQ | colElem(primary<<primaryShift)
// MakeQuaternary returns an Elem with the given quaternary value.
func MakeQuaternary(v int) Elem {
return ceTypeQ | Elem(v<<primaryShift)
}

func (ce colElem) ccc() uint8 {
// Mask sets weights for any level smaller than l to 0.
// The resulting Elem can be used to test for equality with
// other Elems to which the same mask has been applied.
func (ce Elem) Mask(l Level) uint32 {
return 0
}

// CCC returns the canoncial combining class associated with the underlying character,
// if applicable, or 0 otherwise.
func (ce Elem) CCC() uint8 {
if ce&ceType3or4 != 0 {
if ce&ceType4 == ceType3or4 {
return uint8(ce >> 16)
Expand All @@ -115,7 +148,8 @@ func (ce colElem) ccc() uint8 {
return 0
}

func (ce colElem) primary() int {
// Primary returns the primary collation weight for ce.
func (ce Elem) Primary() int {
if ce >= firstNonPrimary {
if ce > lastSpecialPrimary {
return 0
Expand All @@ -125,7 +159,8 @@ func (ce colElem) primary() int {
return int(ce&primaryValueMask) >> primaryShift
}

func (ce colElem) secondary() int {
// Secondary returns the secondary collation weight for ce.
func (ce Elem) Secondary() int {
switch ce & ceTypeMask {
case ceType1:
return int(uint8(ce))
Expand All @@ -142,7 +177,8 @@ func (ce colElem) secondary() int {
panic("should not reach here")
}

func (ce colElem) tertiary() uint8 {
// Tertiary returns the tertiary collation weight for ce.
func (ce Elem) Tertiary() uint8 {
if ce&hasTertiaryMask == 0 {
if ce&ceType3or4 == 0 {
return uint8(ce & 0x1F)
Expand All @@ -158,32 +194,47 @@ func (ce colElem) tertiary() uint8 {
return 0
}

func (ce colElem) updateTertiary(t uint8) colElem {
func (ce Elem) updateTertiary(t uint8) Elem {
if ce&ceTypeMask == ceType1 {
// convert to type 4
nce := ce & primaryValueMask
nce |= colElem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
ce = nce
} else if ce&ceTypeMaskExt == ceType3or4 {
ce &= ^colElem(maxTertiary << 24)
return ce | (colElem(t) << 24)
ce &= ^Elem(maxTertiary << 24)
return ce | (Elem(t) << 24)
} else {
// type 2 or 4
ce &= ^colElem(maxTertiary)
ce &= ^Elem(maxTertiary)
}
return ce | colElem(t)
return ce | Elem(t)
}

// quaternary returns the quaternary value if explicitly specified,
// 0 if ce == ceIgnore, or maxQuaternary otherwise.
// Quaternary returns the quaternary value if explicitly specified,
// 0 if ce == ceIgnore, or MaxQuaternary otherwise.
// Quaternary values are used only for shifted variants.
func (ce colElem) quaternary() int {
func (ce Elem) Quaternary() int {
if ce&ceTypeMask == ceTypeQ {
return int(ce&primaryValueMask) >> primaryShift
} else if ce == ceIgnore {
return 0
}
return maxQuaternary
return MaxQuaternary
}

// Weight returns the collation weight for the given level.
func (ce Elem) Weight(l Level) int {
switch l {
case Primary:
return ce.Primary()
case Secondary:
return ce.Secondary()
case Tertiary:
return int(ce.Tertiary())
case Quaternary:
return ce.Quaternary()
}
return 0 // return 0 (ignore) for undefined levels.
}

// For contractions, collation elements are of the form
Expand All @@ -198,7 +249,7 @@ const (
maxContractOffsetBits = 13
)

func splitContractIndex(ce colElem) (index, n, offset int) {
func splitContractIndex(ce Elem) (index, n, offset int) {
n = int(ce & (1<<maxNBits - 1))
ce >>= maxNBits
index = int(ce & (1<<maxTrieIndexBits - 1))
Expand All @@ -207,23 +258,23 @@ func splitContractIndex(ce colElem) (index, n, offset int) {
return
}

// For expansions, colElems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
// For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
// where b* is the index into the expansion sequence table.
const maxExpandIndexBits = 16

func splitExpandIndex(ce colElem) (index int) {
func splitExpandIndex(ce Elem) (index int) {
return int(uint16(ce))
}

// Some runes can be expanded using NFKD decomposition. Instead of storing the full
// sequence of collation elements, we decompose the rune and lookup the collation
// elements for each rune in the decomposition and modify the tertiary weights.
// The colElem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
// The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
// - v* is the replacement tertiary weight for the first rune,
// - w* is the replacement tertiary weight for the second rune,
// Tertiary weights of subsequent runes should be replaced with maxTertiary.
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
func splitDecompose(ce colElem) (t1, t2 uint8) {
func splitDecompose(ce Elem) (t1, t2 uint8) {
return uint8(ce), uint8(ce >> 8)
}

Expand Down
Loading

0 comments on commit f86ae99

Please sign in to comment.