Commit

adding a python replacement of the 4chan image getting script. I got sick of the bugs in the super old bash script
gmn committed Oct 20, 2022
1 parent adc7bbe commit e3137b1
Showing 1 changed file: 4chget.py (98 additions, 135 deletions)
@@ -1,37 +1,46 @@

"""
- default: takes a single argument, a 4chan url, and downloads all the images to a directory; also backs up the html. Default action for subsequent downloads is to skip files we already have, but not to check their size
- checks for existence of dir, takes optional dir argument
- should check the local directory and separate out the urls of files we don't already have, so it can report e.g. "getting 11 new files."
- optional argument forces 'Content-Length' checks against byte-size of each file to make sure
all of them are complete
X start by working up a printf progress bar like apt-get
- takes an argument list of URLS
"""

import json
import os
from os.path import isdir, isfile, join, dirname
import requests
import sys
import time

FEED_URL = "https://rss.samharris.org/feed/bcf60cfa-3ee2-406e-ae8e-f5f439a0a993"
FILES_DIR = './FILES'


# tiny print helpers: p() -> stdout, perr() -> stderr
p = lambda s: print(s)
perr = lambda s: print(s, file=sys.stderr)


def download_file( url, filename ):
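    # simple, non-streaming download: requests.get buffers the whole response body
    # in memory before it is written to disk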
X = requests.get( url )
with open( filename, 'wb' ) as f:
f.write(X.content)
if isfile( filename ):
p('success: saved "{}"'.format(filename))
else:
p('{} download failed [{}]'.format(filename, url))
return isfile( filename )


def progress_download( url, filename ):
def print_progress_bar( fraction ):
""" fraction is in range 0.0 to 1.0 """
barlen = 44
barlen = 36
filename_truncated = 44
frac = int(fraction * barlen)
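        # e.g. fraction=0.5 with barlen=36 gives frac=18, i.e. half the bar filled with '='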

ftruncate = 36
flen = ftruncate if len(filename) > ftruncate else len(filename)
flen = filename_truncated if len(filename) > filename_truncated else len(filename)
print('\r' * (barlen + 2 + flen + 1), end='')
print(filename[:ftruncate] + " ["+frac*'='+(barlen-frac)*' '+"]", end='')
print(filename[:filename_truncated] + " ["+frac*'='+(barlen-frac)*' '+"]", end='')

try:
fd = open( filename, 'wb' )
@@ -63,26 +72,40 @@ def print_progress_bar( fraction ):
return isfile( filename )


def check_length_download( url, filename, dl_func=download_file ):
def check_file_download( url, filename, dl_func=download_file, chkLen=False ):
""" check the byte length of every file;
download it if it doesn't exist,
OR if the byte lengths don't match """
    OR if the byte lengths don't match.
    With chkLen=False (the default) only the existence of the file is checked;
    pass chkLen=True to also verify the local byte length against the server's Content-Length.
    Returns False if the file did not exist or had the wrong length (i.e. it was (re)downloaded).
"""

if not os.path.isfile( filename ):
p( 'getting "{}"'.format(url) )
dl_func( url, filename )
time.sleep( 1 )
return False # didn't exist
else:
p('!EXISTS "{}" Checking Length....'.format(filename))
time.sleep( (2 ** 0.5) / 8 / 2 )
print('!EXISTS "{}"'.format(filename))

if not chkLen:
return True # did exist

print('!EXISTS "{}" **Checking Length** '.format(filename), end='')
# get headers and compare against byte size of file
H = requests.head( url )
if H.headers.get('Content-Length', None):
if H.headers['Content-Length'] != str(os.stat(filename).st_size):
perr( f'Size doesnt match server. Getting again: "{filename}"' )
a = str(H.headers['Content-Length'])
b = str(os.stat(filename).st_size)
            print( f"Size doesn't match [{a} != {b}]. Re-getting" )
dl_func(url, filename)
time.sleep( 1 )
return False # didn't checklen correctly
else:
print(' ok')
return True # correct length


def str_indexes( needle, haystack ):
@@ -132,92 +155,7 @@ def numpad(n, leading_zeros=3):
return s


def main():
if not isdir(FILES_DIR):
perr('creating {}'.format(FILES_DIR))
os.mkdir(FILES_DIR, mode=(7*8**2 + 5*8**1 + 5*8**0)) #755
else:
perr('{} exists'.format(FILES_DIR))

fetch_remote = True

if fetch_remote:
p('downloading Sam Harris RSS feed')
feed, feed_name = download_rss_xml(FEED_URL)
p('updating RSS feed --> "{}"'.format(feed_name))
with open(feed_name,'w') as f:
f.write(feed)
else:
p('loading Sam Harris RSS feed from file')
FEED_NAME = FEED_URL[FEED_URL.rindex('/')+1:]
feed, feed_name = rss_xml_from_file(FEED_NAME)

time.sleep( (2 ** 0.5) )

#
# TITLES
#
sentinel = '<title>'
title_indexes = str_indexes( sentinel, feed )
TITLES = []
for index, start in enumerate(title_indexes):
start += len(sentinel)
end = feed.index('<', start)
assert(feed[end] == '<')
tstring = feed[start:end].rstrip().lstrip()
TITLES.append(tstring)
TITLES = TITLES[2:] # clip off first two non-item titles
TITLES.reverse()
TITLES = [x.removeprefix('#').replace('\u2014','-').replace('#','').replace(':','').replace('?','').encode('ascii','ignore').decode('utf8') for x in TITLES]

#
# URLS
#
sentinel = '<enclosure length="0" type="audio/mpeg" url="'
mp3_url_indexes = str_indexes( sentinel, feed )
URLS = []
for index, start in enumerate(mp3_url_indexes):
start += len(sentinel)
end = feed.index('"', start)
assert(feed[start] == 'h')
assert(feed[end] == '"')
url = feed[start:end].rstrip().lstrip()
URLS.append(url)
URLS.reverse()

assert(len(URLS) == len(TITLES))

#
# INDEXES
#
INDEXES = [numpad(i+1) for i,_ in enumerate(TITLES)]

assert(len(URLS) == len(INDEXES))
print('{} files to check'.format(len(mp3_url_indexes)))

#
# create a fully formatted filename with all the best bits in it
#
FINAL_NAMES = []
for (index, title, url) in zip(INDEXES, TITLES, URLS):
filename = url[url.rindex('/')+1:]
final = f"{index} - {filename.removesuffix('.mp3')} - {title}.mp3"
FINAL_NAMES.append(final)

#
# retrieve the files we dont have
#
for (final_name, url) in zip(FINAL_NAMES, URLS):
final_filename = join(FILES_DIR, final_name)
check_length_download( url, final_filename )


def fetch_url( url, saveto ):
retval = progress_download(url, saveto)
return retval


def quoted_strings( sentinel, haystack, payload ):
def get_quoted_strings( sentinel, haystack, payload ):
count = 0
indexes = str_indexes( sentinel, haystack )
for index, start in enumerate(indexes):
@@ -229,22 +167,41 @@ def quoted_strings( sentinel, haystack, payload ):
count = count + 1
return count

if __name__ == '__main__':

def fetch_url( url, saveto ):
retval = progress_download(url, saveto)
return retval


def usage(sextra=None):
perr('usage: {} [options] <URL>'.format(sys.argv[0]))
perr(' options:')
perr(' --full will cause 4chget to recheck the length of each image file')
if sextra:
perr(sextra)


def main():
if len(sys.argv) < 2:
perr('usage: {} <URL>'.format(sys.argv[0]))
usage()
sys.exit(0)
url = sys.argv[1]

"""
- default: takes single argument, a 4chan url, and download all the images to a directory, also backs up the html. Default action for subsequent downloads is to skip downloading files we already have, but not check their size
if '--help' in sys.argv:
usage()
sys.exit(0)

- checks for existence of dir, takes optional dir argument
Args = sys.argv.copy()
full_download = False

- optional argument forces 'Content-Length' checks against byte-size of each file to make sure
all of them are complete

- start by working up a printf progress bar like apt-get
"""
#TODO FULL DOWNLOAD - check the length of each file
if '--full' in Args:
Args = Args[:Args.index('--full')] + Args[Args.index('--full')+1:]
full_download = True
perr('**doing a full download, checking the length of each file')
time.sleep(1.5)

url = Args[1]

    # create the download directory (named after the last path segment of the url)
dirname = url[url.rindex('/')+1:]
perr('directory {} exists'.format(dirname))

# fetch and save html
perr('saving "{}"'.format(feed_file))
ret = fetch_url(url, feed_file)
if not ret:
perr('failed to download "{}"'.format(url))
sys.exit(1)
else:
perr('saving "{}"'.format(feed_file))
with open(feed_file, "r") as f:
html_string = f.read()

# collect target image paths
TITLES1 = []
sentinel = 'i.4cdn.org'
quoted_strings(sentinel, html_string, TITLES1 )
TITLES1 = [f'https://{sentinel}{match}' for match in TITLES1]

TITLES2 = []
sentinel = 'is2.4chan.org'
quoted_strings(sentinel, html_string, TITLES2 )
TITLES2 = [f'https://{sentinel}{match}' for match in TITLES2]

TITLES = []
for T_META in (TITLES1, TITLES2):
for t in T_META:
if t[t.rindex('.')-1] != 's' and t not in TITLES:
TITLES.append(t)

for index, imgurl in enumerate(TITLES):
print( f"{index+1}/{len(TITLES)} ", end='' )
IMAGE_URLS = []
sentinels = [ 'i.4cdn.org', 'is2.4chan.org' ]
for sentinel in sentinels:
sentinel_matches = []
get_quoted_strings(sentinel, html_string, sentinel_matches)
IMAGE_URLS.extend( [f'https://{sentinel}{sent}' for sent in sentinel_matches] )
# clean out duplicates
IMAGE_URLS = list(set(IMAGE_URLS))
# clean out thumbs
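    # (4chan serves thumbnails with an 's' appended before the extension, e.g. 1234567890s.jpg,
    #  so drop any URL whose character before the final '.' is 's')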
IMAGE_URLS_FILTERED = [x for x in IMAGE_URLS if x[x.rindex('.')-1] != 's']

img_downloaded = 0

for index, imgurl in enumerate(IMAGE_URLS_FILTERED):
print( f"{index+1}/{len(IMAGE_URLS_FILTERED)} ", end='' )
basename = imgurl[imgurl.rindex('/')+1:]
check_length_download( imgurl, os.path.join(dirname, basename), dl_func=progress_download )
if not check_file_download( imgurl, os.path.join(dirname, basename), dl_func=progress_download, chkLen=full_download ):
img_downloaded = img_downloaded + 1
time.sleep( 1 ) # sleeping after getting files, not for confirming them

if img_downloaded:
perr(f'got {img_downloaded} new files')
else:
perr('no new files')


if __name__ == '__main__':
main()
