Skip to content

Commit

Permalink
* WebTitle
Browse files: browse the repository at this point in the history
  • Loading branch information
niuchaoqun committed Aug 19, 2022
1 parent 850b285 commit c0e6dea
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 6 deletions.
5 changes: 3 additions & 2 deletions charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ func CharsetFromHeaderHtml(body []byte, headers *http.Header) CharsetRes {
return res
}

// Header 和 Html 不一致下面情况以 Html 为准
// Header 和 Html 不一致, 以下情况以 Html 为准
if strings.HasPrefix(cHeader, "ISO") || strings.HasPrefix(cHeader, "WINDOWS") {
res.Charset = cHtml
res.CharsetPos = CharsetPosHtml
Expand Down Expand Up @@ -140,6 +140,7 @@ func CharsetFromHtml(body []byte) string {
charset5 = matches[1]
}

// 只有其中一个
if charset4 != "" && charset5 == "" {
charset = charset4
}
Expand Down Expand Up @@ -179,7 +180,7 @@ func CharsetGuess(body []byte) string {
return "UTF-8"
}

// 如果没有
// 如果没有则 guess
detector := chardet.NewHtmlDetector()
guess, err := detector.DetectBest(body)
if err == nil {
Expand Down
4 changes: 2 additions & 2 deletions detect.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ type DomainRes struct {
}

var (
regexMetaRefreshPatern = regexp.MustCompile(`(?i)url=(.+)`)
regexMetaRefreshPattern = regexp.MustCompile(`(?i)url=(.+)`)
)

// DetectDomain 域名探测
Expand Down Expand Up @@ -121,7 +121,7 @@ func DetectDomainDo(domain string, timeout int) (*DomainRes, error) {

// 具有 HTML 跳转属性, HTTP 无法自动处理永远返回错误, 判断跳转后是否是同一个主域名, 记录并返回
if refresh, exists := doc.Find("meta[http-equiv='refresh' i]").Attr("content"); exists {
refreshMatch := regexMetaRefreshPatern.FindStringSubmatch(refresh)
refreshMatch := regexMetaRefreshPattern.FindStringSubmatch(refresh)
if len(refreshMatch) > 1 {
refreshUrl := refreshMatch[1]
if r, err := fun.UrlParse(refreshUrl); err == nil {
Expand Down
2 changes: 1 addition & 1 deletion detect_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ func TestDomainDetect(t *testing.T) {
// "china-nengyuan.com",
// "suosi.com.cn",
// "wanjiaxian.com",
"thestandard.com.hk",
"wengan.gov.cn",
}

for _, domain := range domains {
Expand Down
13 changes: 12 additions & 1 deletion extract/web.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,16 @@ var (

// WebTitle 返回网页标题, 最大 128 个字符
func WebTitle(doc *goquery.Document, maxLength int) string {
title := doc.Find("title").Text()
var title string
titleNode := doc.Find("title")
if titleNode.Size() > 1 {
// 竟然有多个 title, 只取第一个
title = titleNode.First().Text()
} else {
title = titleNode.Text()
}

title = fun.RemoveLines(title)
title = strings.TrimSpace(title)

if maxLength > 0 && maxLength < 128 {
Expand Down Expand Up @@ -81,6 +90,7 @@ func WebTitleClean(title string, lang string) string {
// WebKeywords 返回网页 Keyword
func WebKeywords(doc *goquery.Document) string {
keywords := doc.Find("meta[name='keywords' i]").AttrOr("content", "")
keywords = fun.RemoveLines(keywords)
keywords = strings.TrimSpace(keywords)

return keywords
Expand All @@ -89,6 +99,7 @@ func WebKeywords(doc *goquery.Document) string {
// WebDescription 返回网页描述, 最大 384 个字符
func WebDescription(doc *goquery.Document, maxLength int) string {
description := doc.Find("meta[name='description' i]").AttrOr("content", "")
description = fun.RemoveLines(description)
description = strings.TrimSpace(description)

if maxLength > 0 && maxLength < 384 {
Expand Down

0 comments on commit c0e6dea

Please sign in to comment.