Commit

adding a python replacement of the 4chan image getting script. I got sick of the bugs in the super old bash script
gmn committed Oct 20, 2022
1 parent adc7bbe commit e3137b1
Showing 1 changed file: 4chget.py (98 additions, 135 deletions)
@@ -1,37 +1,46 @@

"""
- default: takes a single argument, a 4chan url, and downloads all the images to a directory; also backs up the html. Default action for subsequent downloads is to skip files we already have, but not to check their size
- checks for existence of dir, takes optional dir argument
- should check the local directory and separate out the urls of files we don't already have, so it can report e.g. "getting 11 new files."
- optional argument forces 'Content-Length' checks against byte-size of each file to make sure
all of them are complete
X start by working up a printf progress bar like apt-get
- takes an argument list of URLS
"""

import json
import os
from os.path import isdir, isfile, join, dirname
import requests
import sys
import time

FEED_URL = "https://rss.samharris.org/feed/bcf60cfa-3ee2-406e-ae8e-f5f439a0a993"
FILES_DIR = './FILES'


# tiny print helpers: p() -> stdout, perr() -> stderr
p = lambda s: print(s)
perr = lambda s: print(s, file=sys.stderr)


def download_file( url, filename ):
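    # simple, non-streaming download: requests.get buffers the whole response body
    # in memory before it is written to disk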
X = requests.get( url )
with open( filename, 'wb' ) as f:
f.write(X.content)
if isfile( filename ):
p('success: saved "{}"'.format(filename))
else:
p('{} download failed [{}]'.format(filename, url))
return isfile( filename )


def progress_download( url, filename ):
def print_progress_bar( fraction ):
""" fraction is in range 0.0 to 1.0 """
barlen = 44
barlen = 36
filename_truncated = 44
frac = int(fraction * barlen)
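        # e.g. fraction=0.5 with barlen=36 gives frac=18, i.e. half the bar filled with '='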

ftruncate = 36
flen = ftruncate if len(filename) > ftruncate else len(filename)
flen = filename_truncated if len(filename) > filename_truncated else len(filename)
print('\r' * (barlen + 2 + flen + 1), end='')
print(filename[:ftruncate] + " ["+frac*'='+(barlen-frac)*' '+"]", end='')
print(filename[:filename_truncated] + " ["+frac*'='+(barlen-frac)*' '+"]", end='')

try:
fd = open( filename, 'wb' )
@@ -63,26 +72,40 @@ def print_progress_bar( fraction ):
return isfile( filename )


def check_length_download( url, filename, dl_func=download_file ):
def check_file_download( url, filename, dl_func=download_file, chkLen=False ):
""" check the byte length of every file;
download it if it doesn't exist,
OR if the byte lengths don't match """
    OR if the byte lengths don't match.
    With chkLen=False (the default) only the existence of the file is checked;
    pass chkLen=True to also verify the local byte length against the server's Content-Length.
    Returns False if the file did not exist or had the wrong length (i.e. it was (re)downloaded).
"""

if not os.path.isfile( filename ):
p( 'getting "{}"'.format(url) )
dl_func( url, filename )
time.sleep( 1 )
return False # didn't exist
else:
p('!EXISTS "{}" Checking Length....'.format(filename))
time.sleep( (2 ** 0.5) / 8 / 2 )
print('!EXISTS "{}"'.format(filename))

if not chkLen:
return True # did exist

print('!EXISTS "{}" **Checking Length** '.format(filename), end='')
# get headers and compare against byte size of file
H = requests.head( url )
if H.headers.get('Content-Length', None):
if H.headers['Content-Length'] != str(os.stat(filename).st_size):
perr( f'Size doesnt match server. Getting again: "{filename}"' )
a = str(H.headers['Content-Length'])
b = str(os.stat(filename).st_size)
            print( f"Size doesn't match [{a} != {b}]. Re-getting" )
dl_func(url, filename)
time.sleep( 1 )
return False # didn't checklen correctly
else:
print(' ok')
return True # correct length


def str_indexes( needle, haystack ):
@@ -132,92 +155,7 @@ def numpad(n, leading_zeros=3):
return s


def main():
if not isdir(FILES_DIR):
perr('creating {}'.format(FILES_DIR))
os.mkdir(FILES_DIR, mode=(7*8**2 + 5*8**1 + 5*8**0)) #755
else:
perr('{} exists'.format(FILES_DIR))

fetch_remote = True

if fetch_remote:
p('downloading Sam Harris RSS feed')
feed, feed_name = download_rss_xml(FEED_URL)
p('updating RSS feed --> "{}"'.format(feed_name))
with open(feed_name,'w') as f:
f.write(feed)
else:
p('loading Sam Harris RSS feed from file')
FEED_NAME = FEED_URL[FEED_URL.rindex('/')+1:]
feed, feed_name = rss_xml_from_file(FEED_NAME)

time.sleep( (2 ** 0.5) )

#
# TITLES
#
sentinel = '<title>'
title_indexes = str_indexes( sentinel, feed )
TITLES = []
for index, start in enumerate(title_indexes):
start += len(sentinel)
end = feed.index('<', start)
assert(feed[end] == '<')
tstring = feed[start:end].rstrip().lstrip()
TITLES.append(tstring)
TITLES = TITLES[2:] # clip off first two non-item titles
TITLES.reverse()
TITLES = [x.removeprefix('#').replace('\u2014','-').replace('#','').replace(':','').replace('?','').encode('ascii','ignore').decode('utf8') for x in TITLES]

#
# URLS
#
sentinel = '<enclosure length="0" type="audio/mpeg" url="'
mp3_url_indexes = str_indexes( sentinel, feed )
URLS = []
for index, start in enumerate(mp3_url_indexes):
start += len(sentinel)
end = feed.index('"', start)
assert(feed[start] == 'h')
assert(feed[end] == '"')
url = feed[start:end].rstrip().lstrip()
URLS.append(url)
URLS.reverse()

assert(len(URLS) == len(TITLES))

#
# INDEXES
#
INDEXES = [numpad(i+1) for i,_ in enumerate(TITLES)]

assert(len(URLS) == len(INDEXES))
print('{} files to check'.format(len(mp3_url_indexes)))

#
# create a fully formatted filename with all the best bits in it
#
FINAL_NAMES = []
for (index, title, url) in zip(INDEXES, TITLES, URLS):
filename = url[url.rindex('/')+1:]
final = f"{index} - {filename.removesuffix('.mp3')} - {title}.mp3"
FINAL_NAMES.append(final)

#
# retrieve the files we dont have
#
for (final_name, url) in zip(FINAL_NAMES, URLS):
final_filename = join(FILES_DIR, final_name)
check_length_download( url, final_filename )


def fetch_url( url, saveto ):
retval = progress_download(url, saveto)
return retval


def quoted_strings( sentinel, haystack, payload ):
def get_quoted_strings( sentinel, haystack, payload ):
count = 0
indexes = str_indexes( sentinel, haystack )
for index, start in enumerate(indexes):
@@ -229,22 +167,41 @@ def quoted_strings( sentinel, haystack, payload ):
count = count + 1
return count

if __name__ == '__main__':

def fetch_url( url, saveto ):
retval = progress_download(url, saveto)
return retval


def usage(sextra=None):
perr('usage: {} [options] <URL>'.format(sys.argv[0]))
perr(' options:')
perr(' --full will cause 4chget to recheck the length of each image file')
if sextra:
perr(sextra)


def main():
if len(sys.argv) < 2:
perr('usage: {} <URL>'.format(sys.argv[0]))
usage()
sys.exit(0)
url = sys.argv[1]

"""
- default: takes single argument, a 4chan url, and download all the images to a directory, also backs up the html. Default action for subsequent downloads is to skip downloading files we already have, but not check their size
if '--help' in sys.argv:
usage()
sys.exit(0)

- checks for existence of dir, takes optional dir argument
Args = sys.argv.copy()
full_download = False

- optional argument forces 'Content-Length' checks against byte-size of each file to make sure
all of them are complete

- start by working up a printf progress bar like apt-get
"""
#TODO FULL DOWNLOAD - check the length of each file
if '--full' in Args:
Args = Args[:Args.index('--full')] + Args[Args.index('--full')+1:]
full_download = True
perr('**doing a full download, checking the length of each file')
time.sleep(1.5)

url = Args[1]

    # create the download directory (named after the last path segment of the url)
dirname = url[url.rindex('/')+1:]
perr('directory {} exists'.format(dirname))

# fetch and save html
perr('saving "{}"'.format(feed_file))
ret = fetch_url(url, feed_file)
if not ret:
perr('failed to download "{}"'.format(url))
sys.exit(1)
else:
perr('saving "{}"'.format(feed_file))
with open(feed_file, "r") as f:
html_string = f.read()

# collect target image paths
TITLES1 = []
sentinel = 'i.4cdn.org'
quoted_strings(sentinel, html_string, TITLES1 )
TITLES1 = [f'https://{sentinel}{match}' for match in TITLES1]

TITLES2 = []
sentinel = 'is2.4chan.org'
quoted_strings(sentinel, html_string, TITLES2 )
TITLES2 = [f'https://{sentinel}{match}' for match in TITLES2]

TITLES = []
for T_META in (TITLES1, TITLES2):
for t in T_META:
if t[t.rindex('.')-1] != 's' and t not in TITLES:
TITLES.append(t)

for index, imgurl in enumerate(TITLES):
print( f"{index+1}/{len(TITLES)} ", end='' )
IMAGE_URLS = []
sentinels = [ 'i.4cdn.org', 'is2.4chan.org' ]
for sentinel in sentinels:
sentinel_matches = []
get_quoted_strings(sentinel, html_string, sentinel_matches)
IMAGE_URLS.extend( [f'https://{sentinel}{sent}' for sent in sentinel_matches] )
# clean out duplicates
IMAGE_URLS = list(set(IMAGE_URLS))
# clean out thumbs
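    # (4chan serves thumbnails with an 's' appended before the extension, e.g. 1234567890s.jpg,
    #  so drop any URL whose character before the final '.' is 's')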
IMAGE_URLS_FILTERED = [x for x in IMAGE_URLS if x[x.rindex('.')-1] != 's']

img_downloaded = 0

for index, imgurl in enumerate(IMAGE_URLS_FILTERED):
print( f"{index+1}/{len(IMAGE_URLS_FILTERED)} ", end='' )
basename = imgurl[imgurl.rindex('/')+1:]
check_length_download( imgurl, os.path.join(dirname, basename), dl_func=progress_download )
if not check_file_download( imgurl, os.path.join(dirname, basename), dl_func=progress_download, chkLen=full_download ):
img_downloaded = img_downloaded + 1
time.sleep( 1 ) # sleeping after getting files, not for confirming them

if img_downloaded:
perr(f'got {img_downloaded} new files')
else:
perr('no new files')


if __name__ == '__main__':
main()
