forked from wjdp/htmltest
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocument.go
115 lines (102 loc) · 3.63 KB
/
document.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package htmldoc
import (
"fmt"
"github.com/wjdp/htmltest/output"
"golang.org/x/net/html"
"os"
"path"
"sync"
)
// Document is the representation of a single document within the tested
// site. Call Init before use: Parse locks htmlMutex and parseNode writes to
// hashMap, both of which are only allocated by Init.
type Document struct {
	// Path of the HTML file opened by Parse; relative to the shell session.
	FilePath string
	// Path of the document relative to the site root.
	SitePath string
	// Base for relative links; parseNode joins the href of a <base> tag onto it.
	BasePath string
	// Controls access to htmlNode; held for the whole duration of Parse.
	htmlMutex *sync.Mutex
	// Parsed output (root node from html.Parse); Parse treats nil as "not parsed yet".
	htmlNode *html.Node
	// Map of valid ids/names of nodes, keyed by the value GetID returns for an element.
	hashMap map[string]*html.Node
	// Slice of nodes to run checks on, collected by parseNode.
	NodesOfInterest []*html.Node
	// Per-document state (a DocumentState value) used by checks that depend on parsing.
	State DocumentState
	// Pointer to the doctype node, set by parseNode when one is encountered.
	DoctypeNode *html.Node
	// Name of the attribute that, when present on a node, makes parseNode skip
	// that node and its children. An empty string disables the check.
	ignoreTagAttribute string
}
// DocumentState holds facts gathered about a document, used by checks that
// depend on the document being parsed.
type DocumentState struct {
	// Have we found a favicon in the document?
	FaviconPresent bool
}
// Init : Prepare the Document for use by allocating its mutex, an empty
// NodesOfInterest slice and an empty hashMap. An Init method is used rather
// than the usual NewXYZ() constructor because Document has many optional
// fields and Go has no parameter overloading.
func (doc *Document) Init() {
	// Guards htmlNode while Parse runs.
	doc.htmlMutex = new(sync.Mutex)
	// Start with empty, non-nil collections for parseNode to fill in.
	doc.NodesOfInterest = []*html.Node{}
	doc.hashMap = map[string]*html.Node{}
}
// Parse : Read and parse the HTML file at doc.FilePath, then walk the
// resulting tree to collect nodes of interest and fragment identifiers.
// Returns immediately if the document already has a parsed tree. The
// document's mutex is held for the whole call, so concurrent callers are
// serialised. Called either when the document itself is tested or when
// another document needs data from this one.
func (doc *Document) Parse() {
	// Serialise parsing: only one goroutine at a time gets past this point.
	doc.htmlMutex.Lock()
	defer doc.htmlMutex.Unlock()

	// A non-nil tree means an earlier call already did the work.
	if doc.htmlNode != nil {
		return
	}

	// Open the source file; an open error is handed to output.CheckErrorPanic.
	file, openErr := os.Open(doc.FilePath)
	output.CheckErrorPanic(openErr)
	defer file.Close()

	// Parse the HTML; a parse error is handed to output.CheckErrorGeneric.
	root, parseErr := html.Parse(file)
	output.CheckErrorGeneric(parseErr)

	// Remember the tree and index its contents.
	doc.htmlNode = root
	doc.parseNode(root)
}
// parseNode recursively walks the tree rooted at n, recording the doctype
// node, the id/name of every element (into hashMap) and every element whose
// tag is one the checks care about (into NodesOfInterest). A subtree is
// skipped entirely when its root carries the configured ignore attribute,
// and the children of <pre> and <code> elements are never visited.
func (doc *Document) parseNode(n *html.Node) {
	// Skip this node and its whole subtree when the ignore attribute is
	// configured and present on the node.
	if skipAttr := doc.ignoreTagAttribute; skipAttr != "" && AttrPresent(n.Attr, skipAttr) {
		return
	}

	switch n.Type {
	case html.ErrorNode:
		// The parser produced an error node: dump it and ask for a bug report.
		fmt.Printf("%+v\n", n)
		fmt.Println("Oops, in parsing your HTML we fell over.\n",
			"Please let the developer know about this.\n",
			"https://github.com/wjdp/htmltest/issues/new")

	case html.DoctypeNode:
		doc.DoctypeNode = n

	case html.ElementNode:
		// Record the fragment identifier, if the element has one. This happens
		// before the tag switch, so <pre>/<code> ids are still recorded.
		if id := GetID(n.Attr); id != "" {
			doc.hashMap[id] = n
		}

		switch n.Data {
		case "pre", "code":
			// Everything within these elements is not to be interpreted.
			return
		case "base":
			// Extend BasePath with the href of the <base> tag.
			// NOTE(review): path.Join cleans its result, so an absolute URL
			// in href would have its "//" collapsed — confirm only path-style
			// hrefs are expected here.
			doc.BasePath = path.Join(doc.BasePath, GetAttr(n.Attr, "href"))
		case "a", "area", "audio", "blockquote", "del", "embed", "iframe", "img",
			"input", "ins", "link", "meta", "object", "q", "script", "source",
			"track", "video":
			// Tags the checks want to look at.
			doc.NodesOfInterest = append(doc.NodesOfInterest, n)
		}
	}

	// Descend into each child in document order.
	child := n.FirstChild
	for child != nil {
		doc.parseNode(child)
		child = child.NextSibling
	}
}
// IsHashValid : Report whether hash is a fragment identifier recorded for
// this Document. Triggers Parse first, so the lookup always runs against a
// parsed document.
func (doc *Document) IsHashValid(hash string) bool {
	// Parse is a cheap no-op when the document has already been parsed.
	doc.Parse()
	_, found := doc.hashMap[hash]
	return found
}