Skip to content

Commit

Permalink
分词细化应包含子分词的相对位置
Browse files Browse the repository at this point in the history
  • Loading branch information
huichen committed Jul 21, 2013
1 parent d15852a commit df4a810
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 38 deletions.
28 changes: 28 additions & 0 deletions segment.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package sego

// Segment is a single segmented word (token occurrence) within a text.
type Segment struct {
	// Byte offset in the text at which the segment starts (inclusive).
	start int

	// Byte offset in the text at which the segment ends (exclusive —
	// the byte at this position is not part of the segment).
	end int

	// Dictionary token information for this segment.
	token *Token
}

// Start returns the byte offset in the text at which this segment
// begins (inclusive).
func (seg *Segment) Start() int {
	return seg.start
}

// End returns the byte offset in the text at which this segment
// ends; the byte at this offset is not included in the segment
// (exclusive upper bound).
func (seg *Segment) End() int {
	return seg.end
}

// Token returns the dictionary token information associated with
// this segment.
func (seg *Segment) Token() *Token {
	return seg.token
}
32 changes: 10 additions & 22 deletions segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,6 @@ type Segmenter struct {
dict *Dictionary
}

// 文本中的一个分词
type Segment struct {
// 分词在文本中的起始字节位置
Start int

// 分词在文本中的起始字节位置(不包括该位置)
End int

// 分词信息
Token *Token
}

// 该结构体用于记录Viterbi算法中某字元处的向前分词跳转信息
type jumper struct {
minDistance float32
Expand Down Expand Up @@ -99,20 +87,20 @@ func (seg *Segmenter) LoadDictionary(files string) {
// 计算需要添加的子分词数目
numTokensToAdd := 0
for iToken := 0; iToken < len(segments); iToken++ {
if len(segments[iToken].Token.text) > 1 {
if len(segments[iToken].token.text) > 1 {
// 略去字元长度为一的分词
// TODO: 这值得进一步推敲,特别是当字典中有英文复合词的时候
numTokensToAdd++
}
}
token.tokens = make([]*Token, numTokensToAdd)
token.segments = make([]*Segment, numTokensToAdd)

// 添加子分词
iTokenToAdd := 0
iSegmentsToAdd := 0
for iToken := 0; iToken < len(segments); iToken++ {
if len(segments[iToken].Token.text) > 1 {
token.tokens[iTokenToAdd] = segments[iTokenToAdd].Token
iTokenToAdd++
if len(segments[iToken].token.text) > 1 {
token.segments[iSegmentsToAdd] = &segments[iSegmentsToAdd]
iSegmentsToAdd++
}
}
}
Expand Down Expand Up @@ -196,16 +184,16 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
for index := len(text) - 1; index >= 0; {
location := index - len(jumpers[index].token.text) + 1
numSeg--
outputSegments[numSeg].Token = jumpers[index].token
outputSegments[numSeg].token = jumpers[index].token
index = location - 1
}

// 计算各个分词的字节位置
bytePosition := 0
for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
outputSegments[iSeg].Start = bytePosition
bytePosition += textSliceByteLength(outputSegments[iSeg].Token.text)
outputSegments[iSeg].End = bytePosition
outputSegments[iSeg].start = bytePosition
bytePosition += textSliceByteLength(outputSegments[iSeg].token.text)
outputSegments[iSeg].end = bytePosition
}
return outputSegments
}
Expand Down
16 changes: 8 additions & 8 deletions segmenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@ func TestSegment(t *testing.T) {
segments := seg.Segment([]byte("中国有十三亿人口"))
expect(t, "中国/p8 有/p3 十三亿/p11 人口/p12 ", SegmentsToString(segments, false))
expect(t, "4", len(segments))
expect(t, "0", segments[0].Start)
expect(t, "6", segments[0].End)
expect(t, "6", segments[1].Start)
expect(t, "9", segments[1].End)
expect(t, "9", segments[2].Start)
expect(t, "18", segments[2].End)
expect(t, "18", segments[3].Start)
expect(t, "24", segments[3].End)
expect(t, "0", segments[0].start)
expect(t, "6", segments[0].end)
expect(t, "6", segments[1].start)
expect(t, "9", segments[1].end)
expect(t, "9", segments[2].start)
expect(t, "18", segments[2].end)
expect(t, "18", segments[3].start)
expect(t, "24", segments[3].end)
}

func TestLargeDictionary(t *testing.T) {
Expand Down
8 changes: 4 additions & 4 deletions token.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ type Token struct {
// 词性标注
pos string

// 该分词文本的进一步分词划分,见Tokens函数注释
tokens []*Token
// 该分词文本的进一步分词划分,见Segments函数注释
segments []*Segment
}

// 返回分词文本
Expand All @@ -45,6 +45,6 @@ func (token *Token) Pos() string {
// 有两个子分词"中华人民共和国"和"中央人民政府"。子分词也可以进一步有子分词
// 形成一个树结构,遍历这个树就可以得到该分词的所有细致分词划分,这主要
// 用于搜索引擎对一段文本进行全文搜索。
func (token *Token) Tokens() []*Token {
return token.tokens
func (token *Token) Segments() []*Segment {
return token.segments
}
8 changes: 4 additions & 4 deletions utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,20 @@ import (
func SegmentsToString(segs []Segment, searchMode bool) (output string) {
if searchMode {
for _, seg := range segs {
output += tokenToString(seg.Token)
output += tokenToString(seg.token)
}
} else {
for _, seg := range segs {
output += fmt.Sprintf(
"%s/%s ", textSliceToString(seg.Token.text), seg.Token.pos)
"%s/%s ", textSliceToString(seg.token.text), seg.token.pos)
}
}
return
}

func tokenToString(token *Token) (output string) {
for _, t := range token.tokens {
output += tokenToString(t)
for _, s := range token.segments {
output += tokenToString(s.token)
}
output += fmt.Sprintf("%s/%s ", textSliceToString(token.text), token.pos)
return
Expand Down

0 comments on commit df4a810

Please sign in to comment.