Skip to content

Commit 3239e8f

Browse files
committed
[mod] split colly.go to smaller parts
1 parent 63c4b7f commit 3239e8f

File tree

5 files changed

+279
-250
lines changed

5 files changed

+279
-250
lines changed

colly.go

-250
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ import (
88
"fmt"
99
"hash/fnv"
1010
"io"
11-
"io/ioutil"
12-
"mime"
1311
"net/http"
1412
"net/http/cookiejar"
1513
"net/url"
@@ -25,9 +23,6 @@ import (
2523
"google.golang.org/appengine"
2624
"google.golang.org/appengine/urlfetch"
2725

28-
"golang.org/x/net/html"
29-
"golang.org/x/net/html/charset"
30-
3126
"github.com/PuerkitoBio/goquery"
3227
"github.com/kennygrant/sanitize"
3328
"github.com/temoto/robotstxt"
@@ -80,72 +75,6 @@ type Collector struct {
8075
lock *sync.RWMutex
8176
}
8277

83-
// Request is the representation of a HTTP request made by a Collector
84-
type Request struct {
85-
// URL is the parsed URL of the HTTP request
86-
URL *url.URL
87-
// Headers contains the Request's HTTP headers
88-
Headers *http.Header
89-
// Ctx is a context between a Request and a Response
90-
Ctx *Context
91-
// Depth is the number of the parents of the request
92-
Depth int
93-
// Method is the HTTP method of the request
94-
Method string
95-
// Body is the request body which is used on POST/PUT requests
96-
Body io.Reader
97-
// Unique identifier of the request
98-
Id uint32
99-
collector *Collector
100-
}
101-
102-
// Response is the representation of a HTTP response made by a Collector
103-
type Response struct {
104-
// StatusCode is the status code of the Response
105-
StatusCode int
106-
// Body is the content of the Response
107-
Body []byte
108-
// Ctx is a context between a Request and a Response
109-
Ctx *Context
110-
// Request is the Request object of the response
111-
Request *Request
112-
// Headers contains the Response's HTTP headers
113-
Headers *http.Header
114-
}
115-
116-
// HTMLElement is the representation of a HTML tag.
117-
type HTMLElement struct {
118-
// Name is the name of the tag
119-
Name string
120-
Text string
121-
attributes []html.Attribute
122-
// Request is the request object of the element's HTML document
123-
Request *Request
124-
// Response is the Response object of the element's HTML document
125-
Response *Response
126-
// DOM is the goquery parsed DOM object of the page. DOM is relative
127-
// to the current HTMLElement
128-
DOM *goquery.Selection
129-
}
130-
131-
// NewHTMLElementFromSelectionNode creates a HTMLElement from a goquery.Selection Node.
132-
func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node) *HTMLElement {
133-
return &HTMLElement{
134-
Name: n.Data,
135-
Request: resp.Request,
136-
Response: resp,
137-
Text: goquery.NewDocumentFromNode(n).Text(),
138-
DOM: s,
139-
attributes: n.Attr,
140-
}
141-
}
142-
143-
// Context provides a tiny layer for passing data between callbacks
144-
type Context struct {
145-
contextMap map[string]interface{}
146-
lock *sync.RWMutex
147-
}
148-
14978
// RequestCallback is a type alias for OnRequest callback functions
15079
type RequestCallback func(*Request)
15180

@@ -199,14 +128,6 @@ func NewCollector() *Collector {
199128
return c
200129
}
201130

202-
// NewContext initializes a new Context instance
203-
func NewContext() *Context {
204-
return &Context{
205-
contextMap: make(map[string]interface{}),
206-
lock: &sync.RWMutex{},
207-
}
208-
}
209-
210131
// Init initializes the Collector's private variables and sets default
211132
// configuration for the Collector
212133
func (c *Collector) Init() {
@@ -779,158 +700,6 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ
779700
}
780701
}
781702

782-
// Attr returns the selected attribute of a HTMLElement or empty string
783-
// if no attribute found
784-
func (h *HTMLElement) Attr(k string) string {
785-
for _, a := range h.attributes {
786-
if a.Key == k {
787-
return a.Val
788-
}
789-
}
790-
return ""
791-
}
792-
793-
// ChildText returns the concatenated and stripped text content of the matching
794-
// elements.
795-
func (h *HTMLElement) ChildText(goquerySelector string) string {
796-
return strings.TrimSpace(h.DOM.Find(goquerySelector).Text())
797-
}
798-
799-
// ChildAttr returns the stripped text content of the first matching
800-
// element's attribute.
801-
func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string {
802-
if attr, ok := h.DOM.Find(goquerySelector).Attr(attrName); ok {
803-
return strings.TrimSpace(attr)
804-
}
805-
return ""
806-
}
807-
808-
// ChildAttrs returns the stripped text content of all the matching
809-
// element's attributes.
810-
func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string {
811-
res := make([]string, 0)
812-
h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
813-
if attr, ok := s.Attr(attrName); ok {
814-
res = append(res, strings.TrimSpace(attr))
815-
}
816-
})
817-
return res
818-
}
819-
820-
// AbsoluteURL returns with the resolved absolute URL of an URL chunk.
821-
// AbsoluteURL returns empty string if the URL chunk is a fragment or
822-
// could not be parsed
823-
func (r *Request) AbsoluteURL(u string) string {
824-
if strings.HasPrefix(u, "#") {
825-
return ""
826-
}
827-
absURL, err := r.URL.Parse(u)
828-
if err != nil {
829-
return ""
830-
}
831-
absURL.Fragment = ""
832-
if absURL.Scheme == "//" {
833-
absURL.Scheme = r.URL.Scheme
834-
}
835-
return absURL.String()
836-
}
837-
838-
// Visit continues Collector's collecting job by creating a
839-
// request and preserves the Context of the previous request.
840-
// Visit also calls the previously provided callbacks
841-
func (r *Request) Visit(URL string) error {
842-
return r.collector.scrape(r.AbsoluteURL(URL), "GET", r.Depth+1, nil, r.Ctx, nil, true)
843-
}
844-
845-
// Post continues a collector job by creating a POST request and preserves the Context
846-
// of the previous request.
847-
// Post also calls the previously provided callbacks
848-
func (r *Request) Post(URL string, requestData map[string]string) error {
849-
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createFormReader(requestData), r.Ctx, nil, true)
850-
}
851-
852-
// PostRaw starts a collector job by creating a POST request with raw binary data.
853-
// PostRaw preserves the Context of the previous request
854-
// and calls the previously provided callbacks
855-
func (r *Request) PostRaw(URL string, requestData []byte) error {
856-
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, bytes.NewReader(requestData), r.Ctx, nil, true)
857-
}
858-
859-
// PostMultipart starts a collector job by creating a Multipart POST request
860-
// with raw binary data. PostMultipart also calls the previously provided.
861-
// callbacks
862-
func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error {
863-
boundary := randomBoundary()
864-
hdr := http.Header{}
865-
hdr.Set("Content-Type", "multipart/form-data; boundary="+boundary)
866-
hdr.Set("User-Agent", r.collector.UserAgent)
867-
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createMultipartReader(boundary, requestData), r.Ctx, hdr, true)
868-
}
869-
870-
// Retry submits HTTP request again with the same parameters
871-
func (r *Request) Retry() error {
872-
return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, false)
873-
}
874-
875-
// UnmarshalBinary decodes Context value to nil
876-
// This function is used by request caching
877-
func (c *Context) UnmarshalBinary(_ []byte) error {
878-
return nil
879-
}
880-
881-
// MarshalBinary encodes Context value
882-
// This function is used by request caching
883-
func (c *Context) MarshalBinary() (_ []byte, _ error) {
884-
return nil, nil
885-
}
886-
887-
// Put stores a value of any type in Context
888-
func (c *Context) Put(key string, value interface{}) {
889-
c.lock.Lock()
890-
c.contextMap[key] = value
891-
c.lock.Unlock()
892-
}
893-
894-
// Get retrieves a string value from Context.
895-
// Get returns an empty string if key not found
896-
func (c *Context) Get(key string) string {
897-
c.lock.RLock()
898-
defer c.lock.RUnlock()
899-
if v, ok := c.contextMap[key]; ok {
900-
return v.(string)
901-
}
902-
return ""
903-
}
904-
905-
// GetAny retrieves a value from Context.
906-
// GetAny returns nil if key not found
907-
func (c *Context) GetAny(key string) interface{} {
908-
c.lock.RLock()
909-
defer c.lock.RUnlock()
910-
if v, ok := c.contextMap[key]; ok {
911-
return v
912-
}
913-
return nil
914-
}
915-
916-
// Save writes response body to disk
917-
func (r *Response) Save(fileName string) error {
918-
return ioutil.WriteFile(fileName, r.Body, 0644)
919-
}
920-
921-
// FileName returns the sanitized file name parsed from "Content-Disposition"
922-
// header or from URL
923-
func (r *Response) FileName() string {
924-
_, params, err := mime.ParseMediaType(r.Headers.Get("Content-Disposition"))
925-
if fName, ok := params["filename"]; ok && err == nil {
926-
return SanitizeFileName(fName)
927-
}
928-
if r.Request.URL.RawQuery != "" {
929-
return SanitizeFileName(fmt.Sprintf("%s_%s", r.Request.URL.Path, r.Request.URL.RawQuery))
930-
}
931-
return SanitizeFileName(r.Request.URL.Path[1:])
932-
}
933-
934703
// SanitizeFileName replaces dangerous characters in a string
935704
// so the return value can be used as a safe file name.
936705
func SanitizeFileName(fileName string) string {
@@ -982,22 +751,3 @@ func randomBoundary() string {
982751
}
983752
return fmt.Sprintf("%x", buf[:])
984753
}
985-
986-
func (r *Response) fixCharset() {
987-
contentType := strings.ToLower(r.Headers.Get("Content-Type"))
988-
if !strings.Contains(contentType, "charset") {
989-
return
990-
}
991-
if strings.Contains(contentType, "utf-8") || strings.Contains(contentType, "utf8") {
992-
return
993-
}
994-
encodedBodyReader, err := charset.NewReader(bytes.NewReader(r.Body), contentType)
995-
if err != nil {
996-
return
997-
}
998-
tmpBody, err := ioutil.ReadAll(encodedBodyReader)
999-
if err != nil {
1000-
return
1001-
}
1002-
r.Body = tmpBody
1003-
}

context.go

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package colly
2+
3+
import (
4+
"sync"
5+
)
6+
7+
// Context provides a tiny layer for passing data between callbacks.
// Values are kept in a map keyed by string; every access to the map goes
// through lock, so the accessor methods below may be called concurrently.
type Context struct {
	contextMap map[string]interface{}
	lock       *sync.RWMutex
}

// NewContext initializes a new Context instance with an empty value map
// and its own lock.
func NewContext() *Context {
	return &Context{
		contextMap: make(map[string]interface{}),
		lock:       &sync.RWMutex{},
	}
}

// UnmarshalBinary decodes Context value to nil.
// The input is ignored and the Context is left untouched.
// This function is used by request caching.
func (c *Context) UnmarshalBinary(_ []byte) error {
	return nil
}

// MarshalBinary encodes Context value.
// It always yields a nil slice and a nil error, i.e. stored values are
// never serialized.
// This function is used by request caching.
func (c *Context) MarshalBinary() ([]byte, error) {
	return nil, nil
}

// Put stores a value of any type in Context, replacing any value
// previously stored under the same key.
func (c *Context) Put(key string, value interface{}) {
	c.lock.Lock()
	defer c.lock.Unlock()
	c.contextMap[key] = value
}

// Get retrieves a string value from Context.
// Get returns an empty string if key is not found or if the stored value
// is not a string; use GetAny to read values of other types.
func (c *Context) Get(key string) string {
	c.lock.RLock()
	defer c.lock.RUnlock()
	// The comma-ok assertion covers both the missing key (nil interface)
	// and a value of another type, so Get never panics on data that Put
	// legitimately accepted.
	if s, ok := c.contextMap[key].(string); ok {
		return s
	}
	return ""
}

// GetAny retrieves a value from Context.
// GetAny returns nil if key is not found.
func (c *Context) GetAny(key string) interface{} {
	c.lock.RLock()
	defer c.lock.RUnlock()
	// Indexing a map with a missing key yields the zero value, which is a
	// nil interface here.
	return c.contextMap[key]
}

0 commit comments

Comments
 (0)