Skip to content

Commit

Permalink
ParseVersion 0.0.8;小幅优化句子的拆分,例如句子末尾3个字母'day. '
Browse files Browse the repository at this point in the history
  • Loading branch information
vito-go committed Mar 8, 2024
1 parent 2e047bd commit 367ed05
Showing 1 changed file with 12 additions and 8 deletions.
20 changes: 12 additions & 8 deletions mywords-go/artical/article.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,10 @@ func ParseSourceUrl(sourceUrl string, expr string, proxyUrl *url.URL) (*Article,
}

// ParseVersion: when an article file's version differs from this value,
// entering the article page parses the article again, but the parse time is
// not updated.
const ParseVersion = "0.0.7"
const ParseVersion = "0.0.8"

// Earlier pattern: four non-space characters, then ". ", then an uppercase
// ASCII letter or a left double quotation mark (“).
var regSentenceSplit = regexp.MustCompile(`[^ ][^ ][^ ][^ ]\. [A-Z“]`)
// regSentenceSplit locates sentence boundaries: three characters that are
// neither a space nor an uppercase ASCII letter, then ". ", then an uppercase
// ASCII letter or a left double quotation mark (“). The earlier pattern was:
// var regSentenceSplit = regexp.MustCompile(`[^ ][^ ][^ ][^ ]\. [A-Z“]`)
var regSentenceSplit = regexp.MustCompile(`[^A-Z ][^A-Z ][^A-Z ]\. [A-Z“]`)

// quote is the right double quotation mark (”); the sentence-splitting code
// indexes its individual bytes (quote[0], quote[1], quote[2]).
const quote = "”"
// minLen is the minimum number of characters in a word; it is formatted into
// the word-matching pattern "[’A-Za-z-]{%d,}".
const minLen = 3
Expand Down Expand Up @@ -117,22 +118,24 @@ func parseContent(sourceUrl, expr string, respBody []byte, lastModified int64) (
// \. [A-Z“]
//sentences = append(sentences, content[start:s[0]+1])
//start = s[0] + 2
sen := []byte(content[start : s[0]+5])
sen := []byte(content[start : s[0]+4])
start = s[0] + 5
if len(sen) > 2 {
if sen[len(sen)-1] == quote[1] && sen[len(sen)-2] == quote[0] {
sen = append(sen, quote[2], '.')
start += 2
}
sentences = append(sentences, string(sen))
}
start = s[0] + 6
}
sentences = append(sentences, content[start:])

var totalCount int
var wordsMap = make(map[string]int64, 1024)
var wordsSentences = make(map[string][]*string, 1024)
loopSentences:
for _, sentence := range sentences {
for idx := range sentences {
sentence := sentences[idx]
if strings.HasPrefix(sentence, "<div ") {
continue
}
Expand All @@ -151,7 +154,7 @@ loopSentences:
sentence = sentence[1:]
}
//sentence = regexp.MustCompile(`\s+`).ReplaceAllString(sentence, " ")
s := sentence
senPointer := &sentences[idx]
sentenceWords := regexp.MustCompile(fmt.Sprintf("[’A-Za-z-]{%d,}", minLen)).FindAllString(sentence, -1)
if len(sentenceWords) == 0 {
continue
Expand Down Expand Up @@ -181,16 +184,17 @@ loopSentences:
continue
}
wordsMap[word]++
// 最多保留3个例句
if len(wordsSentences[word]) < 3 {
var exist bool
for _, pointer := range wordsSentences[word] {
if *pointer == s {
if *pointer == *senPointer {
exist = true
break
}
}
if !exist {
wordsSentences[word] = append(wordsSentences[word], &s)
wordsSentences[word] = append(wordsSentences[word], senPointer)
}

}
Expand Down

0 comments on commit 367ed05

Please sign in to comment.