8
8
"fmt"
9
9
"hash/fnv"
10
10
"io"
11
- "io/ioutil"
12
- "mime"
13
11
"net/http"
14
12
"net/http/cookiejar"
15
13
"net/url"
@@ -25,9 +23,6 @@ import (
25
23
"google.golang.org/appengine"
26
24
"google.golang.org/appengine/urlfetch"
27
25
28
- "golang.org/x/net/html"
29
- "golang.org/x/net/html/charset"
30
-
31
26
"github.com/PuerkitoBio/goquery"
32
27
"github.com/kennygrant/sanitize"
33
28
"github.com/temoto/robotstxt"
@@ -80,72 +75,6 @@ type Collector struct {
80
75
lock * sync.RWMutex
81
76
}
82
77
83
- // Request is the representation of a HTTP request made by a Collector
84
- type Request struct {
85
- // URL is the parsed URL of the HTTP request
86
- URL * url.URL
87
- // Headers contains the Request's HTTP headers
88
- Headers * http.Header
89
- // Ctx is a context between a Request and a Response
90
- Ctx * Context
91
- // Depth is the number of the parents of the request
92
- Depth int
93
- // Method is the HTTP method of the request
94
- Method string
95
- // Body is the request body which is used on POST/PUT requests
96
- Body io.Reader
97
- // Unique identifier of the request
98
- Id uint32
99
- collector * Collector
100
- }
101
-
102
- // Response is the representation of a HTTP response made by a Collector
103
- type Response struct {
104
- // StatusCode is the status code of the Response
105
- StatusCode int
106
- // Body is the content of the Response
107
- Body []byte
108
- // Ctx is a context between a Request and a Response
109
- Ctx * Context
110
- // Request is the Request object of the response
111
- Request * Request
112
- // Headers contains the Response's HTTP headers
113
- Headers * http.Header
114
- }
115
-
116
- // HTMLElement is the representation of a HTML tag.
117
- type HTMLElement struct {
118
- // Name is the name of the tag
119
- Name string
120
- Text string
121
- attributes []html.Attribute
122
- // Request is the request object of the element's HTML document
123
- Request * Request
124
- // Response is the Response object of the element's HTML document
125
- Response * Response
126
- // DOM is the goquery parsed DOM object of the page. DOM is relative
127
- // to the current HTMLElement
128
- DOM * goquery.Selection
129
- }
130
-
131
- // NewHTMLElementFromSelectionNode creates a HTMLElement from a goquery.Selection Node.
132
- func NewHTMLElementFromSelectionNode (resp * Response , s * goquery.Selection , n * html.Node ) * HTMLElement {
133
- return & HTMLElement {
134
- Name : n .Data ,
135
- Request : resp .Request ,
136
- Response : resp ,
137
- Text : goquery .NewDocumentFromNode (n ).Text (),
138
- DOM : s ,
139
- attributes : n .Attr ,
140
- }
141
- }
142
-
143
// Context is a small key/value store used to pass data between callbacks.
type Context struct {
	// contextMap holds the stored values, keyed by name.
	contextMap map[string]interface{}
	// lock guards access to contextMap.
	lock *sync.RWMutex
}
148
-
149
78
// RequestCallback is the function type of OnRequest callback functions
150
79
type RequestCallback func (* Request )
151
80
@@ -199,14 +128,6 @@ func NewCollector() *Collector {
199
128
return c
200
129
}
201
130
202
- // NewContext initializes a new Context instance
203
- func NewContext () * Context {
204
- return & Context {
205
- contextMap : make (map [string ]interface {}),
206
- lock : & sync.RWMutex {},
207
- }
208
- }
209
-
210
131
// Init initializes the Collector's private variables and sets default
211
132
// configuration for the Collector
212
133
func (c * Collector ) Init () {
@@ -779,158 +700,6 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ
779
700
}
780
701
}
781
702
782
- // Attr returns the selected attribute of a HTMLElement or empty string
783
- // if no attribute found
784
- func (h * HTMLElement ) Attr (k string ) string {
785
- for _ , a := range h .attributes {
786
- if a .Key == k {
787
- return a .Val
788
- }
789
- }
790
- return ""
791
- }
792
-
793
- // ChildText returns the concatenated and stripped text content of the matching
794
- // elements.
795
- func (h * HTMLElement ) ChildText (goquerySelector string ) string {
796
- return strings .TrimSpace (h .DOM .Find (goquerySelector ).Text ())
797
- }
798
-
799
- // ChildAttr returns the stripped text content of the first matching
800
- // element's attribute.
801
- func (h * HTMLElement ) ChildAttr (goquerySelector , attrName string ) string {
802
- if attr , ok := h .DOM .Find (goquerySelector ).Attr (attrName ); ok {
803
- return strings .TrimSpace (attr )
804
- }
805
- return ""
806
- }
807
-
808
- // ChildAttrs returns the stripped text content of all the matching
809
- // element's attributes.
810
- func (h * HTMLElement ) ChildAttrs (goquerySelector , attrName string ) []string {
811
- res := make ([]string , 0 )
812
- h .DOM .Find (goquerySelector ).Each (func (_ int , s * goquery.Selection ) {
813
- if attr , ok := s .Attr (attrName ); ok {
814
- res = append (res , strings .TrimSpace (attr ))
815
- }
816
- })
817
- return res
818
- }
819
-
820
- // AbsoluteURL returns with the resolved absolute URL of an URL chunk.
821
- // AbsoluteURL returns empty string if the URL chunk is a fragment or
822
- // could not be parsed
823
- func (r * Request ) AbsoluteURL (u string ) string {
824
- if strings .HasPrefix (u , "#" ) {
825
- return ""
826
- }
827
- absURL , err := r .URL .Parse (u )
828
- if err != nil {
829
- return ""
830
- }
831
- absURL .Fragment = ""
832
- if absURL .Scheme == "//" {
833
- absURL .Scheme = r .URL .Scheme
834
- }
835
- return absURL .String ()
836
- }
837
-
838
- // Visit continues Collector's collecting job by creating a
839
- // request and preserves the Context of the previous request.
840
- // Visit also calls the previously provided callbacks
841
- func (r * Request ) Visit (URL string ) error {
842
- return r .collector .scrape (r .AbsoluteURL (URL ), "GET" , r .Depth + 1 , nil , r .Ctx , nil , true )
843
- }
844
-
845
- // Post continues a collector job by creating a POST request and preserves the Context
846
- // of the previous request.
847
- // Post also calls the previously provided callbacks
848
- func (r * Request ) Post (URL string , requestData map [string ]string ) error {
849
- return r .collector .scrape (r .AbsoluteURL (URL ), "POST" , r .Depth + 1 , createFormReader (requestData ), r .Ctx , nil , true )
850
- }
851
-
852
- // PostRaw starts a collector job by creating a POST request with raw binary data.
853
- // PostRaw preserves the Context of the previous request
854
- // and calls the previously provided callbacks
855
- func (r * Request ) PostRaw (URL string , requestData []byte ) error {
856
- return r .collector .scrape (r .AbsoluteURL (URL ), "POST" , r .Depth + 1 , bytes .NewReader (requestData ), r .Ctx , nil , true )
857
- }
858
-
859
- // PostMultipart starts a collector job by creating a Multipart POST request
860
- // with raw binary data. PostMultipart also calls the previously provided.
861
- // callbacks
862
- func (r * Request ) PostMultipart (URL string , requestData map [string ][]byte ) error {
863
- boundary := randomBoundary ()
864
- hdr := http.Header {}
865
- hdr .Set ("Content-Type" , "multipart/form-data; boundary=" + boundary )
866
- hdr .Set ("User-Agent" , r .collector .UserAgent )
867
- return r .collector .scrape (r .AbsoluteURL (URL ), "POST" , r .Depth + 1 , createMultipartReader (boundary , requestData ), r .Ctx , hdr , true )
868
- }
869
-
870
- // Retry submits HTTP request again with the same parameters
871
- func (r * Request ) Retry () error {
872
- return r .collector .scrape (r .URL .String (), r .Method , r .Depth , r .Body , r .Ctx , * r .Headers , false )
873
- }
874
-
875
- // UnmarshalBinary decodes Context value to nil
876
- // This function is used by request caching
877
- func (c * Context ) UnmarshalBinary (_ []byte ) error {
878
- return nil
879
- }
880
-
881
- // MarshalBinary encodes Context value
882
- // This function is used by request caching
883
- func (c * Context ) MarshalBinary () (_ []byte , _ error ) {
884
- return nil , nil
885
- }
886
-
887
- // Put stores a value of any type in Context
888
- func (c * Context ) Put (key string , value interface {}) {
889
- c .lock .Lock ()
890
- c .contextMap [key ] = value
891
- c .lock .Unlock ()
892
- }
893
-
894
- // Get retrieves a string value from Context.
895
- // Get returns an empty string if key not found
896
- func (c * Context ) Get (key string ) string {
897
- c .lock .RLock ()
898
- defer c .lock .RUnlock ()
899
- if v , ok := c .contextMap [key ]; ok {
900
- return v .(string )
901
- }
902
- return ""
903
- }
904
-
905
- // GetAny retrieves a value from Context.
906
- // GetAny returns nil if key not found
907
- func (c * Context ) GetAny (key string ) interface {} {
908
- c .lock .RLock ()
909
- defer c .lock .RUnlock ()
910
- if v , ok := c .contextMap [key ]; ok {
911
- return v
912
- }
913
- return nil
914
- }
915
-
916
- // Save writes response body to disk
917
- func (r * Response ) Save (fileName string ) error {
918
- return ioutil .WriteFile (fileName , r .Body , 0644 )
919
- }
920
-
921
- // FileName returns the sanitized file name parsed from "Content-Disposition"
922
- // header or from URL
923
- func (r * Response ) FileName () string {
924
- _ , params , err := mime .ParseMediaType (r .Headers .Get ("Content-Disposition" ))
925
- if fName , ok := params ["filename" ]; ok && err == nil {
926
- return SanitizeFileName (fName )
927
- }
928
- if r .Request .URL .RawQuery != "" {
929
- return SanitizeFileName (fmt .Sprintf ("%s_%s" , r .Request .URL .Path , r .Request .URL .RawQuery ))
930
- }
931
- return SanitizeFileName (r .Request .URL .Path [1 :])
932
- }
933
-
934
703
// SanitizeFileName replaces dangerous characters in a string
935
704
// so the return value can be used as a safe file name.
936
705
func SanitizeFileName (fileName string ) string {
@@ -982,22 +751,3 @@ func randomBoundary() string {
982
751
}
983
752
return fmt .Sprintf ("%x" , buf [:])
984
753
}
985
-
986
- func (r * Response ) fixCharset () {
987
- contentType := strings .ToLower (r .Headers .Get ("Content-Type" ))
988
- if ! strings .Contains (contentType , "charset" ) {
989
- return
990
- }
991
- if strings .Contains (contentType , "utf-8" ) || strings .Contains (contentType , "utf8" ) {
992
- return
993
- }
994
- encodedBodyReader , err := charset .NewReader (bytes .NewReader (r .Body ), contentType )
995
- if err != nil {
996
- return
997
- }
998
- tmpBody , err := ioutil .ReadAll (encodedBodyReader )
999
- if err != nil {
1000
- return
1001
- }
1002
- r .Body = tmpBody
1003
- }
0 commit comments