
Commit 0d1f78a
Committed Oct 1, 2023
Draft simple loop stuff
1 parent c4a6c60 · commit 0d1f78a

6 files changed: +812 −0 lines changed
 

crawl/crawl_archive.py (+71)

import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import requests
import time

DELAY = 5
SITE = '2021.huntinality.com'
# SITE = 'dan-simon.github.io'
FOLDER = 'output'  # Not yet used in this draft.

def save_to_wayback_machine(url):
    archive_url = "https://web.archive.org/save/" + url
    data = {
        'url': url,
        'capture_all': 'on'
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    response = requests.post(archive_url, data=data, headers=headers)

    if response.status_code == 200:
        print(f"Successfully archived {url}")
    else:
        print(f"Failed to archive {url} with status code {response.status_code}")
        print(response.text)  # Print the response content, useful for debugging.
    return response

def handle_url(url, s):
    # Recursively fetch url, archive it, and follow same-site links.
    if url in s:
        return
    time.sleep(DELAY)
    print('getting page', url)
    response = requests.get(url)
    content_type = response.headers.get('Content-Type', '')
    is_html = 'text/html' in content_type
    s.add(url)
    save_to_wayback_machine(url)
    if is_html:
        soup = BeautifulSoup(response.text, 'html.parser')
        all_urls = [i for i in extract_full_urls(soup, url) if SITE in i]
        for i in all_urls:
            handle_url(i, s)

s = set()  # URLs already visited.

def extract_full_urls(soup, base_url):
    tags = ['a', 'link', 'script', 'img', 'image', 'audio', 'source']
    all_links = [i[j] for t in tags for j in ('href', 'src') for i in soup.find_all(t, **{j: True})]
    # print(all_links)
    # Convert relative links to full URLs.
    if '.' not in base_url.split('/')[-1]:
        real_base_url = base_url + '/'
    else:
        real_base_url = base_url
    full_urls = [urljoin(real_base_url, link) for link in all_links]
    return full_urls

handle_url('https://2021.huntinality.com/', s)
# handle_url('https://dan-simon.github.io/puzzles', s)

'''
Next steps:
- List files rather than getting them all, for internet archive
- Add command-line arguments
- Get all href/src tags rather than just those on some subset of elements (maybe let user control this)
- Get fonts from css files
'''
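
Two of these next steps are concrete enough to sketch. For command-line arguments, an argparse stub like the one below could replace the hard-coded DELAY and SITE constants; the flag names and defaults are illustrative assumptions, not part of this commit.

import argparse

def parse_args():
    # Sketch only: flag names and defaults are assumptions.
    parser = argparse.ArgumentParser(
        description='Crawl a site and save each page to the Wayback Machine.')
    parser.add_argument('--site', default='2021.huntinality.com',
                        help='Host substring a URL must contain to be crawled.')
    parser.add_argument('--delay', type=int, default=5,
                        help='Seconds to sleep between requests.')
    parser.add_argument('--start-url', default='https://2021.huntinality.com/',
                        help='URL to start crawling from.')
    return parser.parse_args()

# Possible usage, wiring the parsed flags into the existing globals:
# args = parse_args()
# DELAY, SITE = args.delay, args.site
# handle_url(args.start_url, set())

For getting fonts out of CSS files, one possible approach is a regex over url(...) references in any stylesheet the crawler downloads, resolved against the stylesheet's own URL. The helper name and regex below are assumptions, not code from this commit.

import re
from urllib.parse import urljoin

CSS_URL_RE = re.compile(r'url\(\s*[\'"]?([^\'")]+)[\'"]?\s*\)')

def extract_css_urls(css_text, base_url):
    # Find url(...) references (fonts, background images) in a stylesheet
    # and resolve each against the stylesheet's URL.
    return [urljoin(base_url, m) for m in CSS_URL_RE.findall(css_text)]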

crawl/crawl_translated.py (+33)

from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import requests
import time

# DELAY and SITE are used below but never defined in this draft;
# the values here mirror crawl_archive.py so the script runs on its own.
DELAY = 5
SITE = '2021.huntinality.com'

s = set()

def handle_url(url, s):
    if url in s:
        return
    time.sleep(DELAY)
    print('getting page', url)
    response = requests.get(url)
    content_type = response.headers.get('Content-Type', '')
    is_html = 'text/html' in content_type
    s.add(url)
    if is_html:
        print(url, response.content)
        soup = BeautifulSoup(response.text, 'html.parser')
        all_urls = [i for i in extract_full_urls(soup, url) if SITE in i]
        for i in all_urls:
            handle_url(i, s)

def extract_full_urls(soup, base_url):
    all_links = [i['href'] for i in soup.find_all('a', href=True)]
    # print(all_links)
    # Convert relative links to full URLs.
    if '.' not in base_url.split('/')[-1]:
        real_base_url = base_url + '/'
    else:
        real_base_url = base_url
    full_urls = [urljoin(real_base_url, link) for link in all_links]
    return full_urls
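
Unlike crawl_archive.py, this draft never invokes the crawler. A minimal driver call mirroring the one there would be (the start URL is an assumption carried over from crawl_archive.py):

handle_url('https://2021.huntinality.com/', s)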

crawl/script.js (+60)

let DELAY = 7000;
let LONG_DELAY = 70000;
// SITE is used below but never defined in this draft; the value here
// mirrors the Python crawler so the filter works.
let SITE = '2021.huntinality.com';

let visitedUrls = new Set();

async function saveToWaybackMachine(url) {
  const archiveUrl = `https://web.archive.org/save/${url}`;
  try {
    await fetch(archiveUrl, {
      method: 'POST',
      mode: 'no-cors',
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
      },
      body: new URLSearchParams({
        'url': url,
        'capture_all': 'on'
      })
    });
    console.log(`Saved ${url}`);
  } catch (e) {
    console.log('Something went wrong (rate limit?)');
    await new Promise(resolve => setTimeout(resolve, LONG_DELAY));
  }
}

async function handleUrl(url) {
  if (visitedUrls.has(url)) {
    return;
  }

  await new Promise(resolve => setTimeout(resolve, DELAY)); // Mimic time.sleep()

  try {
    let response = await fetch(url);
    let contentType = response.headers.get('Content-Type') || '';
    let isHtml = contentType.includes('text/html');
    visitedUrls.add(url);

    if (isHtml) {
      let text = await response.text();
      await saveToWaybackMachine(url);
      let parser = new DOMParser();
      let doc = parser.parseFromString(text, 'text/html');
      let fullUrls = extractFullUrls(doc, url).filter(i => i.includes(SITE));
      for (let i of fullUrls) {
        await handleUrl(i);
      }
    }
  } catch (e) {
    console.error("Failed to fetch the URL", url, e);
  }
}

function extractFullUrls(doc, baseUrl) {
  return Array.from(doc.querySelectorAll('a[href]')).map(a => {
    let href = a.getAttribute('href'); // Get the raw href value
    return new URL(href, baseUrl).toString(); // Resolve it against the base URL
  });
}
simple_loop_parity_explanation/index.html (+53)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Visual Grid</title>
    <style>
        table {
            border-collapse: collapse;
        }
        td {
            width: 30px;
            height: 30px;
            border: 1px solid black;
            position: relative;
        }
        .black {
            background-color: black;
        }
        .highlighted {
            background-color: lime;
        }
        .edge {
            display: flex;
            align-items: center;
            justify-content: center;
            position: absolute;
            pointer-events: none;
            z-index: 1;
            text-align: center;
            vertical-align: middle;
        }
        .edge.used {
            background-color: black;
        }
        .horizontal {
            width: 39px;
            height: 6px;
            left: 13px;
            top: 13px;
        }
        .vertical {
            width: 6px;
            height: 39px;
            left: 13px;
            top: 13px;
        }
    </style>
    <script src="script.js"></script>
</head>
<body>
    <table id="gridTable"></table>
</body>
</html>

simple_loop_parity_explanation/script.js (+595)

Large diffs are not rendered by default.

Binary file not shown.
