
Commit 0d1f78a
Committed Oct 1, 2023
Draft simple loop stuff
1 parent c4a6c60 · commit 0d1f78a

6 files changed: +812 −0 lines changed
 

crawl/crawl_archive.py (+71)

import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import requests
import time

DELAY = 5
SITE = '2021.huntinality.com'
# SITE = 'dan-simon.github.io'
FOLDER = 'output'  # Not yet used in this draft.

def save_to_wayback_machine(url):
    archive_url = "https://web.archive.org/save/" + url
    data = {
        'url': url,
        'capture_all': 'on'
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    response = requests.post(archive_url, data=data, headers=headers)

    if response.status_code == 200:
        print(f"Successfully archived {url}")
    else:
        print(f"Failed to archive {url} with status code {response.status_code}")
        print(response.text)  # Print the response content, useful for debugging.
    return response

def handle_url(url, s):
    # Recursively fetch url, archive it, and follow same-site links.
    if url in s:
        return
    time.sleep(DELAY)
    print('getting page', url)
    response = requests.get(url)
    content_type = response.headers.get('Content-Type', '')
    is_html = 'text/html' in content_type
    s.add(url)
    save_to_wayback_machine(url)
    if is_html:
        soup = BeautifulSoup(response.text, 'html.parser')
        all_urls = [i for i in extract_full_urls(soup, url) if SITE in i]
        for i in all_urls:
            handle_url(i, s)

s = set()  # URLs already visited.

def extract_full_urls(soup, base_url):
    tags = ['a', 'link', 'script', 'img', 'image', 'audio', 'source']
    all_links = [i[j] for t in tags for j in ('href', 'src') for i in soup.find_all(t, **{j: True})]
    # print(all_links)
    # Convert relative links to full URLs.
    if '.' not in base_url.split('/')[-1]:
        real_base_url = base_url + '/'
    else:
        real_base_url = base_url
    full_urls = [urljoin(real_base_url, link) for link in all_links]
    return full_urls

handle_url('https://2021.huntinality.com/', s)
# handle_url('https://dan-simon.github.io/puzzles', s)

'''
Next steps:
- List files rather than getting them all, for internet archive
- Add command-line arguments
- Get all href/src tags rather than just those on some subset of elements (maybe let user control this)
- Get fonts from css files
'''
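
Two of these next steps are concrete enough to sketch. For command-line arguments, an argparse stub like the one below could replace the hard-coded DELAY and SITE constants; the flag names and defaults are illustrative assumptions, not part of this commit.

import argparse

def parse_args():
    # Sketch only: flag names and defaults are assumptions.
    parser = argparse.ArgumentParser(
        description='Crawl a site and save each page to the Wayback Machine.')
    parser.add_argument('--site', default='2021.huntinality.com',
                        help='Host substring a URL must contain to be crawled.')
    parser.add_argument('--delay', type=int, default=5,
                        help='Seconds to sleep between requests.')
    parser.add_argument('--start-url', default='https://2021.huntinality.com/',
                        help='URL to start crawling from.')
    return parser.parse_args()

# Possible usage, wiring the parsed flags into the existing globals:
# args = parse_args()
# DELAY, SITE = args.delay, args.site
# handle_url(args.start_url, set())

For getting fonts out of CSS files, one possible approach is a regex over url(...) references in any stylesheet the crawler downloads, resolved against the stylesheet's own URL. The helper name and regex below are assumptions, not code from this commit.

import re
from urllib.parse import urljoin

CSS_URL_RE = re.compile(r'url\(\s*[\'"]?([^\'")]+)[\'"]?\s*\)')

def extract_css_urls(css_text, base_url):
    # Find url(...) references (fonts, background images) in a stylesheet
    # and resolve each against the stylesheet's URL.
    return [urljoin(base_url, m) for m in CSS_URL_RE.findall(css_text)]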

crawl/crawl_translated.py (+33)

from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import requests
import time

# DELAY and SITE are used below but never defined in this draft;
# the values here mirror crawl_archive.py so the script runs on its own.
DELAY = 5
SITE = '2021.huntinality.com'

s = set()

def handle_url(url, s):
    if url in s:
        return
    time.sleep(DELAY)
    print('getting page', url)
    response = requests.get(url)
    content_type = response.headers.get('Content-Type', '')
    is_html = 'text/html' in content_type
    s.add(url)
    if is_html:
        print(url, response.content)
        soup = BeautifulSoup(response.text, 'html.parser')
        all_urls = [i for i in extract_full_urls(soup, url) if SITE in i]
        for i in all_urls:
            handle_url(i, s)

def extract_full_urls(soup, base_url):
    all_links = [i['href'] for i in soup.find_all('a', href=True)]
    # print(all_links)
    # Convert relative links to full URLs.
    if '.' not in base_url.split('/')[-1]:
        real_base_url = base_url + '/'
    else:
        real_base_url = base_url
    full_urls = [urljoin(real_base_url, link) for link in all_links]
    return full_urls
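
Unlike crawl_archive.py, this draft never invokes the crawler. A minimal driver call mirroring the one there would be (the start URL is an assumption carried over from crawl_archive.py):

handle_url('https://2021.huntinality.com/', s)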

crawl/script.js (+60)

let DELAY = 7000;
let LONG_DELAY = 70000;
// SITE is used below but never defined in this draft; the value here
// mirrors the Python crawler so the filter works.
let SITE = '2021.huntinality.com';

let visitedUrls = new Set();

async function saveToWaybackMachine(url) {
  const archiveUrl = `https://web.archive.org/save/${url}`;
  try {
    await fetch(archiveUrl, {
      method: 'POST',
      mode: 'no-cors',
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
      },
      body: new URLSearchParams({
        'url': url,
        'capture_all': 'on'
      })
    });
    console.log(`Saved ${url}`);
  } catch (e) {
    console.log('Something went wrong (rate limit?)');
    await new Promise(resolve => setTimeout(resolve, LONG_DELAY));
  }
}

async function handleUrl(url) {
  if (visitedUrls.has(url)) {
    return;
  }

  await new Promise(resolve => setTimeout(resolve, DELAY)); // Mimic time.sleep()

  try {
    let response = await fetch(url);
    let contentType = response.headers.get('Content-Type') || '';
    let isHtml = contentType.includes('text/html');
    visitedUrls.add(url);

    if (isHtml) {
      let text = await response.text();
      await saveToWaybackMachine(url);
      let parser = new DOMParser();
      let doc = parser.parseFromString(text, 'text/html');
      let fullUrls = extractFullUrls(doc, url).filter(i => i.includes(SITE));
      for (let i of fullUrls) {
        await handleUrl(i);
      }
    }
  } catch (e) {
    console.error("Failed to fetch the URL", url, e);
  }
}

function extractFullUrls(doc, baseUrl) {
  return Array.from(doc.querySelectorAll('a[href]')).map(a => {
    let href = a.getAttribute('href'); // Get the raw href value
    return new URL(href, baseUrl).toString(); // Resolve it against the base URL
  });
}
simple_loop_parity_explanation/index.html (+53)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Visual Grid</title>
    <style>
        table {
            border-collapse: collapse;
        }
        td {
            width: 30px;
            height: 30px;
            border: 1px solid black;
            position: relative;
        }
        .black {
            background-color: black;
        }
        .highlighted {
            background-color: lime;
        }
        .edge {
            display: flex;
            align-items: center;
            justify-content: center;
            position: absolute;
            pointer-events: none;
            z-index: 1;
            text-align: center;
            vertical-align: middle;
        }
        .edge.used {
            background-color: black;
        }
        .horizontal {
            width: 39px;
            height: 6px;
            left: 13px;
            top: 13px;
        }
        .vertical {
            width: 6px;
            height: 39px;
            left: 13px;
            top: 13px;
        }
    </style>
    <script src="script.js"></script>
</head>
<body>
    <table id="gridTable"></table>
</body>
</html>

simple_loop_parity_explanation/script.js (+595)

Large diffs are not rendered by default.

Binary file not shown.
