This repository has been archived by the owner on Apr 15, 2023. It is now read-only.

Comparing changes

base repository: markcda/prtscr-photo-scrapper
base: master
head repository: linkmast3r/prtscr-photo-scrapper
compare: master
These branches can be automatically merged.
  • 9 commits
  • 5 files changed
  • 1 contributor

Commits on Sep 6, 2021

  1. Update README.md (linkmast3r, 46869da)
  2. c9e2be6
  3. added proxies support (linkmast3r, 0822968)
  4. added proxy support uwu (linkmast3r, 357d080)
  5. Update README.md (linkmast3r, d6e28f1)
  6. added proxy support (linkmast3r, d5adb70)
  7. Update README.md (linkmast3r, c8e56fb)
  8. Create requirements.txt (linkmast3r, 8d350cd)
  9. fcfde91
Showing with 234 additions and 58 deletions.
  1. +17 −9 README.md
  2. +108 −0 main.py
  3. +0 −49 prtscr-photo-scrapper.py
  4. +104 −0 prtscr_photo_scrapper.ipynb
  5. +5 −0 requirements.txt
26 changes: 17 additions & 9 deletions README.md
@@ -1,13 +1,21 @@
# prtscr-photo-scrapper
#### !! I am not responsible for sensitive information that may be leaked with this script.
With this script you can scrape screen captures from Lightshot.

A script for downloading random Lightshot screenshots from prnt.sc.

## WARNING!
## Usage:

THIS SOFTWARE IS TO BE USED FOR EDUCATIONAL PURPOSES ONLY. ANY USE FOR ILLEGAL PURPOSES IS UNACCEPTABLE. THE AUTHOR IS NOT RESPONSIBLE FOR ANY ACTIONS RELATED TO THE USE OF THIS SOFTWARE.

## Usage

The script requires the `beautifulsoup4` module: install it with `pip install beautifulsoup4`.

Run the script from a console: `./prtscr-photo-scrapper.py <number-of-downloader-threads{minimum=1}>`. Random Lightshot screenshots will be downloaded into the folder the script was launched from.
### Install the requirements from the file. You will also need Python 3.
```
pip install -r requirements.txt
```
### How to use the script
- Without proxies
```
python3 main.py <number of captures you want to scrape>
```
- With proxies (specifying the proxy file as an argument is optional)
  !! Only socks4 proxies are valid.
```
python3 main.py <number of captures you want to scrape> -proxy <csv file>
```
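The proxy-file layout is not documented; judging from how `scrape_proxy` in `main.py` reads it (first CSV column, prefixed with `socks4://`), it is presumably one `host:port` entry per line. A hypothetical example with placeholder addresses:
```
203.0.113.10:1080
203.0.113.11:4145
```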
108 changes: 108 additions & 0 deletions main.py
@@ -0,0 +1,108 @@
#!/usr/bin/python3

import random
import string
import requests
import os
import csv
import sys
from bs4 import BeautifulSoup


# Create the output directory if it does not already exist.
os.makedirs("output", exist_ok=True)

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}


def scrape_proxy(cap_num, proxy_file):

    # Load the proxy list: one socks4 host:port entry per CSV row (first column).
    proxylist = []
    with open(str(proxy_file), 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            proxylist.append(row[0])

    scraped_num = 0
    while cap_num > scraped_num:
        # Try a random 6-character prnt.sc slug through a random proxy.
        slug = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6))
        url = "https://prnt.sc/" + slug
        proxy = "socks4://" + random.choice(proxylist)
        try:
            # The short timeout keeps dead proxies from stalling the loop.
            response = requests.get(url, proxies={'https': proxy, 'http': proxy}, headers=headers, timeout=0.5)
        except requests.exceptions.RequestException:
            continue  # dead proxy or timeout: retry with a new slug and proxy
        try:
            soup = BeautifulSoup(response.content.decode(), features='lxml')
            ufr = requests.get(soup.img['src'], headers=headers)
            with open(f'output/{slug}.png', 'wb') as f:
                f.write(ufr.content)
            print(f'[+] Received file {slug}.png')
            scraped_num += 1
        except (requests.exceptions.RequestException, TypeError, KeyError):
            pass  # no image on the page or download failed: skip this slug

def scrape(cap_num):

    scraped_num = 0
    while cap_num > scraped_num:
        # Try a random 6-character prnt.sc slug.
        slug = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6))
        url = "https://prnt.sc/" + slug
        try:
            response = requests.get(url, headers=headers)
        except requests.exceptions.RequestException:
            continue  # network error: retry with a new slug
        try:
            soup = BeautifulSoup(response.content.decode(), features='lxml')
            ufr = requests.get(soup.img['src'], headers=headers)
            with open(f'output/{slug}.png', 'wb') as f:
                f.write(ufr.content)
            print(f'[+] Received file {slug}.png')
            scraped_num += 1
        except (requests.exceptions.RequestException, TypeError, KeyError):
            pass  # no image on the page or download failed: skip this slug



def main():

    if len(sys.argv) == 2:
        print("[*] Downloading captures from " + sys.argv[1] + " links without proxy...")
        scrape(int(sys.argv[1]))

    elif len(sys.argv) in (3, 4):
        if sys.argv[2] == "-proxy":
            n_of_links = int(sys.argv[1])
            if len(sys.argv) == 4:
                proxy_file = str(sys.argv[3])
            else:
                proxy_file = input("[*] Proxy file (only csv) >> ")
            if not proxy_file.endswith(".csv"):
                print("[!] Only .csv files for proxies and only socks4 are valid.")
                sys.exit(1)
            print("[*] Downloading captures from " + sys.argv[1] + " links using proxies from " + proxy_file + "...")
            scrape_proxy(n_of_links, proxy_file)
        else:
            print("[*] Downloading captures from " + sys.argv[1] + " links without proxy...")
            scrape(int(sys.argv[1]))

    else:
        print("[!] Usage: python3 " + sys.argv[0] + " <number of captures you want to download>")
        sys.exit(1)

if __name__ == "__main__":
main()
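Downloads here run sequentially even though each fetch is independent and I/O-bound, so a thread pool is a natural extension. A minimal sketch of how one could drive the `scrape` function above; the `scrape_parallel` helper and its worker split are an illustration, not part of this PR:
```
import concurrent.futures

def scrape_parallel(cap_num, workers=4):
    # Split the requested capture count across workers; each worker
    # runs the sequential scrape() loop on its own share.
    share, extra = divmod(cap_num, workers)
    counts = [share + (1 if i < extra else 0) for i in range(workers)]
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        pool.map(scrape, [c for c in counts if c > 0])
```
With `workers=4`, calling `scrape_parallel(20)` would have each thread download five captures into the shared `output` directory.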
49 changes: 0 additions & 49 deletions prtscr-photo-scrapper.py

This file was deleted.

104 changes: 104 additions & 0 deletions prtscr_photo_scrapper.ipynb
@@ -0,0 +1,104 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "prtscr-photo-scrapper.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "A7RlHt-805P8"
},
"source": [
"!pip install requests\n",
"!pip install python-csv\n",
"!pip install pysocks\n",
"!pip install beautifulsoup4\n",
"!pip install lxml"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "nwDva3WY1Jy8"
},
"source": [
"#!/usr/bin/python3\n",
"\n",
"from google.colab import drive\n",
"drive.mount('/content/drive')\n",
"\n",
"import random\n",
"import concurrent.futures\n",
"import string\n",
"import requests\n",
"import os\n",
"import csv\n",
"import sys\n",
"from bs4 import BeautifulSoup\n",
"\n",
"\n",
"try:\n",
" os.mkdir(\"/content/drive/MyDrive/output_prtscr\")\n",
"except:\n",
" pass\n",
"\n",
"headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}\n",
"\n",
"def scrape(cap_num):\n",
"\n",
" print(\"Scrapping...\")\n",
" \n",
" scraped_num = 0\n",
" while cap_num > scraped_num:\n",
" try:\n",
" slug = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6))\n",
" url = \"https://prnt.sc/\" + slug\n",
" try:\n",
" response = requests.get(url, headers=headers, timeout=2)\n",
" except:\n",
" pass\n",
" content = response.content.decode()\n",
" soup = BeautifulSoup(content, features='lxml')\n",
" ufr = requests.get(soup.img['src'], headers=headers)\n",
" f = open(f'/content/drive/MyDrive/output_prtscr/{slug}.png', 'wb')\n",
" f.write(ufr.content)\n",
" f.close()\n",
" print(f'[+] Received file {slug}.png')\n",
" scraped_num += 1\n",
" except requests.exceptions.MissingSchema:\n",
" pass\n",
" except:\n",
" pass\n",
"\n",
"\n",
"\n",
"def main():\n",
" \n",
"\n",
" n_of_links = int(input(\"Number of links for scrapping: \"))\n",
" scrape(n_of_links)\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()"
],
"execution_count": null,
"outputs": []
}
]
}
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
requests
python-csv  # note: the csv module main.py uses is in the standard library
pysocks
beautifulsoup4
lxml