forked from CodyGuo/spider_lib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkaola.go
115 lines (103 loc) · 2.81 KB
/
kaola.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package pholcus_lib
// 基础包
import (
"github.com/henrylee2cn/pholcus/app/downloader/request" //必需
"github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
// "github.com/henrylee2cn/pholcus/logs" //信息输出
. "github.com/henrylee2cn/pholcus/app/spider" //必需
// . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
// net包
// "net/http" //设置http.Header
// "net/url"
// 编码包
// "encoding/xml"
// "encoding/json"
// 字符串处理包
// "regexp"
// "strconv"
// "strings"
// 其他包
// "fmt"
// "math"
// "time"
)
// init runs at package load and calls Register on the Kaola spider.
// NOTE(review): Register is defined outside this file; presumably it adds the
// spider to the framework's list of available spiders — confirm.
func init() {
Kaola.Register()
}
// Kaola is the spider definition for Kaola (考拉海淘, www.kaola.com), a
// cross-border shopping site (site tagline: direct overseas sourcing, 7-day
// no-reason returns, worry-free after-sales).
//
// The crawl has three stages, each a named rule:
//  1. "获取版块URL" (section URLs): read the section tabs on the home page.
//  2. "商品列表" (product list): collect product links from a section page.
//  3. "商品详情" (product detail): extract one output record per product.
//
// The tab text of the section is carried through the request Temp map under
// the key "goodsType" and ends up as the last output field.
var Kaola = &Spider{
	Name:        "考拉海淘",
	Description: "考拉海淘商品数据 [Auto Page] [www.kaola.com]",
	// Optional settings, currently disabled:
	// Pausetime: 300,
	// Keyin: KEYIN,
	// Limit: LIMIT,
	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the crawl with the home page.
		Root: func(ctx *Context) {
			home := &request.Request{Url: "http://www.kaola.com", Rule: "获取版块URL"}
			ctx.AddQueue(home)
		},

		Trunk: map[string]*Rule{
			// Stage 1: queue one product-list request per section tab.
			"获取版块URL": {
				ParseFunc: func(ctx *Context) {
					tabs := ctx.GetDom().Find("#funcTab li a")
					tabs.Each(func(idx int, tab *goquery.Selection) {
						// The first tab is always skipped
						// (presumably the home-page link — confirm).
						if idx == 0 {
							return
						}
						href, found := tab.Attr("href")
						if !found {
							return
						}
						// The tab's href is used as-is; its text becomes the category label.
						ctx.AddQueue(&request.Request{
							Url:  href,
							Rule: "商品列表",
							Temp: map[string]interface{}{"goodsType": tab.Text()},
						})
					})
				},
			},

			// Stage 2: queue one detail request per product entry on the list page.
			"商品列表": {
				ParseFunc: func(ctx *Context) {
					products := ctx.GetDom().Find(".proinfo")
					products.Each(func(_ int, item *goquery.Selection) {
						path, found := item.Find("a").Attr("href")
						if !found {
							return
						}
						// Forward the category label received from stage 1.
						goodsType := ctx.GetTemp("goodsType", "").(string)
						ctx.AddQueue(&request.Request{
							// The product href is appended to the site origin.
							Url:  "http://www.kaola.com" + path,
							Rule: "商品详情",
							Temp: map[string]interface{}{"goodsType": goodsType},
						})
					})
				},
			},

			// Stage 3: scrape the product page and emit one record.
			"商品详情": {
				// Note: the fields declared here must stay consistent with the
				// data actually output below (same count, same order).
				ItemFields: []string{
					"标题",  // title
					"价格",  // price
					"品牌",  // brand
					"采购地", // place of purchase
					"评论数", // number of comments
					"类别",  // category
				},
				ParseFunc: func(ctx *Context) {
					doc := ctx.GetDom()
					// Brand and place of purchase are the first two entries
					// of the product parameter list.
					params := doc.Find(".goods_parameter li")

					// Keys are positions matching ItemFields above.
					record := map[int]interface{}{
						0: doc.Find(".product-title").Text(),
						1: doc.Find("#js_currentPrice span").Text(),
						2: params.Eq(0).Text(),
						3: params.Eq(1).Text(),
						4: doc.Find("#commentCounts").Text(),
						5: ctx.GetTemp("goodsType", ""),
					}
					ctx.Output(record)
				},
			},
		},
	},
}