Skip to content

Commit

Permalink
修改splitTextToWords使其能处理标点符号
Browse files (browse the repository at this point in the history)
  • Loading branch information
huichen committed Nov 15, 2013
1 parent 091269c commit 7156a10
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 24 deletions.
3 changes: 2 additions & 1 deletion segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"math"
"os"
"strings"
"unicode"
"unicode/utf8"
)

Expand Down Expand Up @@ -236,7 +237,7 @@ func splitTextToWords(text Text) []Text {
alphanumericStart := 0
for current < len(text) {
r, size := utf8.DecodeRune(text[current:])
if size <= 2 && r != ' ' {
if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
// The current rune is a letter or digit encoded in at most two bytes (e.g. Latin or Cyrillic), i.e. not a CJK character
if !inAlphanumeric {
alphanumericStart = current
Expand Down
34 changes: 12 additions & 22 deletions segmenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,26 @@ func TestSplit(t *testing.T) {
bytesToString(splitTextToWords([]byte(
"中国有十三亿人口"))))

expect(t, "github/ /is/ /a/ /web-based/ /hosting/ /service/ /for/ /software/ /development/ /projects/",
expect(t, "github/ /is/ /a/ /web/-/based/ /hosting/ /service/,/ /for/ /software/ /development/ /projects/./",
bytesToString(splitTextToWords([]byte(
"GitHub is a web-based hosting service for software development projects"))))
"GitHub is a web-based hosting service, for software development projects."))))

expect(t, "中/国/雅/虎/yahoo!/ /china/致/力/于/领/先/的/公/益/民/生/门/户/网/站/",
expect(t, "中/国/雅/虎/yahoo/!/ /china/致/力/于/,/领/先/的/公/益/民/生/门/户/网/站/。/",
bytesToString(splitTextToWords([]byte(
"中国雅虎Yahoo! China致力于领先的公益民生门户网站"))))
"中国雅虎Yahoo! China致力于,领先的公益民生门户网站。"))))

expect(t, "こ/ん/に/ち/は/",
bytesToString(splitTextToWords([]byte(
"こんにちは"))))
expect(t, "こ/ん/に/ち/は/", bytesToString(splitTextToWords([]byte("こんにちは"))))

expect(t, "안/녕/하/세/요/",
bytesToString(splitTextToWords([]byte(
"안녕하세요"))))
expect(t, "안/녕/하/세/요/", bytesToString(splitTextToWords([]byte("안녕하세요"))))

expect(t, "Я/ /тоже/ /рада/ /Вас/ /видеть/",
bytesToString(splitTextToWords([]byte(
"Я тоже рада Вас видеть"))))
expect(t, "Я/ /тоже/ /рада/ /Вас/ /видеть/", bytesToString(splitTextToWords([]byte("Я тоже рада Вас видеть"))))

expect(t, "je/ /suis/ /enchanté/ /de/ /cette/ /pièce/",
bytesToString(splitTextToWords([]byte(
"Je suis enchanté de cette pièce"))))
expect(t, "¿cómo/ /van/ /las/ /cosas/",
bytesToString(splitTextToWords([]byte(
"¿Cómo van las cosas"))))
expect(t, "¿/cómo/ /van/ /las/ /cosas/", bytesToString(splitTextToWords([]byte("¿Cómo van las cosas"))))

expect(t, "wie/ /geht/ /es/ /ihnen/",
bytesToString(splitTextToWords([]byte(
"Wie geht es Ihnen"))))
expect(t, "wie/ /geht/ /es/ /ihnen/", bytesToString(splitTextToWords([]byte("Wie geht es Ihnen"))))

expect(t, "je/ /suis/ /enchanté/ /de/ /cette/ /pièce/",
bytesToString(splitTextToWords([]byte("Je suis enchanté de cette pièce"))))
}

func TestSegment(t *testing.T) {
Expand Down
3 changes: 2 additions & 1 deletion server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ var (
host = flag.String("host", "", "HTTP服务器主机名")
port = flag.Int("port", 8080, "HTTP服务器端口")
dict = flag.String("dict", "../data/dictionary.txt", "词典文件")
staticFolder = flag.String("static_folder", "static", "静态页面存放的目录")
segmenter = sego.Segmenter{}
)

Expand Down Expand Up @@ -80,7 +81,7 @@ func main() {
segmenter.LoadDictionary(*dict)

http.HandleFunc("/json", JsonRpcServer)
http.Handle("/", http.FileServer(http.Dir("static")))
http.Handle("/", http.FileServer(http.Dir(*staticFolder)))
log.Print("服务器启动")
http.ListenAndServe(fmt.Sprintf("%s:%d", *host, *port), nil)
}

0 comments on commit 7156a10

Please sign in to comment.