forked from CodyGuo/spider_lib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgooglesearch.go
147 lines (135 loc) · 3.67 KB
/
googlesearch.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
package pholcus_lib
// 基础包
import (
"github.com/henrylee2cn/pholcus/app/downloader/request" //必需
. "github.com/henrylee2cn/pholcus/app/spider" //必需
"github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
// . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
"github.com/henrylee2cn/pholcus/logs" //信息输出
// net包
// "net/http" //设置http.Header
// "net/url"
// 编码包
// "encoding/xml"
// "encoding/json"
// 字符串处理包
"regexp"
"strconv"
"strings"
// 其他包
// "fmt"
"math"
// "time"
)
// init calls GoogleSearch.Register when the package is initialised, so the
// spider is made known to the framework without any explicit set-up call.
func init() {
	GoogleSearch.Register()
}
// googleIp holds the hard-coded candidate addresses of Google mirrors.
// The order matters: the Root rule of GoogleSearch walks the list from the
// top and keeps the first address that can be fetched without an error.
var googleIp = []string{
	"210.242.125.100",
	"210.242.125.96",
	"210.242.125.91",
	"210.242.125.95",
	"64.233.189.163",
	"58.123.102.5",
	"210.242.125.97",
	"210.242.125.115",
	"58.123.102.28",
	"210.242.125.70",
	"220.255.2.153",
}
// googleResultCountRe matches the first run of decimal digits in the
// "#resultStats" text. It is compiled once at package scope; the pattern is a
// constant, so MustCompile cannot fail at run time.
var googleResultCountRe = regexp.MustCompile(`[\d]+`)

// GoogleSearch is the spider definition for "谷歌搜索" (Google search).
//
// Flow, as implemented by the rules below:
//  1. Root probes the addresses in googleIp and queues the first one that can
//     be fetched under the rule "获取总页数" (get total page count).
//  2. "获取总页数" reads the result count from "#resultStats", derives the
//     number of pages (10 results per page, capped by ctx.GetLimit()), queues
//     the follow-up pages and parses the current response as a result page.
//  3. "搜索结果" (search results) outputs title, snippet and link for every
//     "#ires li.g" entry.
var GoogleSearch = &Spider{
	Name:        "谷歌搜索",
	Description: "谷歌搜索结果 [www.google.com镜像]",
	// Pausetime: 300,
	Keyin:        KEYIN,
	Limit:        LIMIT,
	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root looks for a reachable mirror and queues the first request.
		Root: func(ctx *Context) {
			var url string
			var success bool
			logs.Log.Critical("正在查找可用的Google镜像,该过程可能需要几分钟……")
			for _, ip := range googleIp {
				// url = "http://" + ip + "/search?q=" + ctx.GetKeyin() + "&newwindow=1&biw=1600&bih=398&start="
				url = "http://" + ip + "/?gws_rd=ssl#q=" + ctx.GetKeyin()
				logs.Log.Informational("测试 " + ip)
				// The probe document is discarded; only reachability matters here.
				if _, err := goquery.NewDocument(url); err == nil {
					success = true
					break
				}
			}
			if !success {
				logs.Log.Critical("没有可用的Google镜像IP!!")
				return
			}
			logs.Log.Critical("开始Google搜索……")
			ctx.AddQueue(&request.Request{
				Url:  url,
				Rule: "获取总页数",
				// baseUrl is read back in the "获取总页数" rule to build the
				// URLs of the follow-up pages.
				Temp: map[string]interface{}{
					"baseUrl": url,
				},
			})
		},

		Trunk: map[string]*Rule{

			"获取总页数": {
				// AidFunc queues one request per page index in the half-open
				// range aid["loop"] = [first, last). Each URL is
				// aid["urlBase"] followed by 10*index, and each request is
				// handled by the rule named in aid["Rule"]. It always returns
				// nil. The type assertions panic if a key is missing or has
				// an unexpected type.
				//
				// NOTE(review): urlBase currently ends with the keyword
				// ("...#q=<keyin>"), so the offset is appended directly to the
				// keyword. The commented-out URL in Root ended with "&start=";
				// confirm the intended paging URL.
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&request.Request{
							Url:  aid["urlBase"].(string) + strconv.Itoa(10*loop[0]),
							Rule: aid["Rule"].(string),
						})
					}
					return nil
				},
				// ParseFunc derives the page count from the result statistics
				// and fans out the remaining pages.
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					txt := query.Find("#resultStats").Text()
					// Drop thousands separators so the count is one digit run.
					txt = strings.Replace(txt, ",", "", -1)
					txt = googleResultCountRe.FindString(txt)
					// The Atoi error is deliberately ignored: a missing or
					// unparsable count yields num == 0, which is reported as
					// "no data" below.
					num, _ := strconv.Atoi(txt)
					total := int(math.Ceil(float64(num) / 10))
					if total > ctx.GetLimit() {
						total = ctx.GetLimit()
					} else if total == 0 {
						logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName())
						return
					}
					// Invoke the rule's helper function to queue pages
					// 1..total-1; the current response is the first page.
					ctx.Aid(map[string]interface{}{
						"loop":    [2]int{1, total},
						"urlBase": ctx.GetTemp("baseUrl", ""),
						"Rule":    "搜索结果",
					})
					// Parse the current response with the result-page rule.
					ctx.Parse("搜索结果")
				},
			},

			"搜索结果": {
				// Note: the fields declared here must stay consistent with the
				// data that ParseFunc outputs (index 0 = 标题/title,
				// 1 = 内容/content, 2 = 链接/link).
				ItemFields: []string{
					"标题",
					"内容",
					"链接",
				},
				// ParseFunc emits one output record per result entry.
				ParseFunc: func(ctx *Context) {
					query := ctx.GetDom()
					query.Find("#ires li.g").Each(func(i int, s *goquery.Selection) {
						t := s.Find(".r > a")
						// A missing href yields "", which is output as-is.
						href, _ := t.Attr("href")
						// Strip exactly the "/url?q=" redirect prefix.
						// strings.TrimLeft treats its argument as a character
						// set and would also remove leading '/', 'u', 'r',
						// 'l', '?', 'q', '=' characters of the target itself.
						href = strings.TrimPrefix(href, "/url?q=")
						title := t.Text()
						content := s.Find(".st").Text()
						ctx.Output(map[int]interface{}{
							0: title,
							1: content,
							2: href,
						})
					})
				},
			},
		},
	},
}