rename archive toggles from FETCH_ to SAVE_ for clarity
pirate committed Apr 24, 2019
1 parent 0f2497a commit 5ef5415
Showing 7 changed files with 551 additions and 392 deletions.
README.md (2 changes: 1 addition & 1 deletion)

@@ -174,7 +174,7 @@ Unlike crawler software that starts from a seed URL and works outwards, or publi

#### Storage Requirements

-Because ArchiveBox is designed to ingest a firehose of browser history and bookmark feeds to a local disk, it can be much more disk-space intensive than a centralized service like the Internet Archive or Archive.today. However, as storage space gets cheaper and compression improves, you should be able to use it continuously over the years without having to delete anything. In my experience, ArchiveBox uses about 5GB per 1000 articles, but your mileage may vary depending on which options you have enabled and what types of sites you're archiving. By default, it archives everything in as many formats as possible, meaning it takes more space than using a single method, but more content is accurately replayable over extended periods of time. Storage requirements can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by setting `FETCH_MEDIA=False` to skip audio & video files.
+Because ArchiveBox is designed to ingest a firehose of browser history and bookmark feeds to a local disk, it can be much more disk-space intensive than a centralized service like the Internet Archive or Archive.today. However, as storage space gets cheaper and compression improves, you should be able to use it continuously over the years without having to delete anything. In my experience, ArchiveBox uses about 5GB per 1000 articles, but your mileage may vary depending on which options you have enabled and what types of sites you're archiving. By default, it archives everything in as many formats as possible, meaning it takes more space than using a single method, but more content is accurately replayable over extended periods of time. Storage requirements can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by setting `SAVE_MEDIA=False` to skip audio & video files.

## Learn more

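Since the renamed `SAVE_*` toggles are ordinary config values read by `archivebox/legacy/config.py`, media capture can be skipped per-run with something like `SAVE_MEDIA=False archivebox update` (an illustrative invocation, not part of this commit). A minimal sketch of env-var-style boolean parsing, assuming a simple truthy-string convention rather than the exact logic in config.py:

```python
# Sketch: reading a SAVE_-style boolean toggle from the environment.
# The toggle name mirrors this commit; the parsing convention is assumed,
# not copied from archivebox/legacy/config.py.
import os

def env_bool(name: str, default: bool) -> bool:
    """Interpret values like SAVE_MEDIA=False / 0 / no as booleans."""
    val = os.getenv(name)
    if val is None:
        return default
    return val.strip().lower() not in ('false', '0', 'no', '')

SAVE_MEDIA = env_bool('SAVE_MEDIA', default=True)  # True: youtube-dl capture enabled
```
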
archivebox/cli/archivebox_version.py (19 changes: 16 additions & 3 deletions)

@@ -13,7 +13,9 @@
from ..legacy.config import (
    ANSI,
    VERSION,
-   FOLDERS,
+   CODE_LOCATIONS,
+   CONFIG_LOCATIONS,
+   DATA_LOCATIONS,
    DEPENDENCIES,
    check_dependencies,
)
@@ -44,9 +46,20 @@ def main(args=None):
    print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
    for name, dependency in DEPENDENCIES.items():
        print_dependency_version(name, dependency)

    print()
-   print('{white}[i] Folder locations:{reset}'.format(**ANSI))
-   for name, folder in FOLDERS.items():
+   print('{white}[i] Code locations:{reset}'.format(**ANSI))
+   for name, folder in CODE_LOCATIONS.items():
        print_folder_status(name, folder)

+   print()
+   print('{white}[i] Config locations:{reset}'.format(**ANSI))
+   for name, folder in CONFIG_LOCATIONS.items():
+       print_folder_status(name, folder)
+
+   print()
+   print('{white}[i] Data locations:{reset}'.format(**ANSI))
+   for name, folder in DATA_LOCATIONS.items():
+       print_folder_status(name, folder)
+
    print()
archivebox/legacy/archive_methods.py (114 changes: 57 additions & 57 deletions)

@@ -15,17 +15,17 @@
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
-   FETCH_FAVICON,
-   FETCH_TITLE,
-   FETCH_WGET,
-   FETCH_WGET_REQUISITES,
-   FETCH_PDF,
-   FETCH_SCREENSHOT,
-   FETCH_DOM,
-   FETCH_WARC,
-   FETCH_GIT,
-   FETCH_MEDIA,
-   SUBMIT_ARCHIVE_DOT_ORG,
+   SAVE_FAVICON,
+   SAVE_TITLE,
+   SAVE_WGET,
+   SAVE_WGET_REQUISITES,
+   SAVE_PDF,
+   SAVE_SCREENSHOT,
+   SAVE_DOM,
+   SAVE_WARC,
+   SAVE_GIT,
+   SAVE_MEDIA,
+   SAVE_ARCHIVE_DOT_ORG,
    TIMEOUT,
    MEDIA_TIMEOUT,
    GIT_DOMAINS,
@@ -73,15 +73,15 @@ def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

ARCHIVE_METHODS = (
('title', should_fetch_title, fetch_title),
('favicon', should_fetch_favicon, fetch_favicon),
('wget', should_fetch_wget, fetch_wget),
('pdf', should_fetch_pdf, fetch_pdf),
('screenshot', should_fetch_screenshot, fetch_screenshot),
('dom', should_fetch_dom, fetch_dom),
('git', should_fetch_git, fetch_git),
('media', should_fetch_media, fetch_media),
('archive_org', should_fetch_archive_dot_org, archive_dot_org),
('title', should_save_title, save_title),
('favicon', should_save_favicon, save_favicon),
('wget', should_save_wget, save_wget),
('pdf', should_save_pdf, save_pdf),
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
)

out_dir = out_dir or link.link_dir
@@ -112,7 +112,7 @@ def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
            else:
                stats['skipped'] += 1
        except Exception as e:
-           raise Exception('Exception in archive_methods.fetch_{}(Link(url={}))'.format(
+           raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                method_name,
                link.url,
            )) from e
@@ -146,18 +146,18 @@ def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
### Archive Method Functions

@enforce_types
-def should_fetch_title(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
    # if link already has valid title, skip it
    if link.title and not link.title.lower().startswith('http'):
        return False

    if is_static_file(link.url):
        return False

-   return FETCH_TITLE
+   return SAVE_TITLE

@enforce_types
-def fetch_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output: ArchiveOutput = None
@@ -191,15 +191,15 @@ def fetch_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:


@enforce_types
-def should_fetch_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
        return False

-   return FETCH_FAVICON
+   return SAVE_FAVICON

@enforce_types
-def fetch_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download site favicon from google's favicon api"""

    out_dir = out_dir or link.link_dir
@@ -233,21 +233,21 @@ def fetch_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    )

@enforce_types
-def should_fetch_wget(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
    output_path = wget_output_path(link)
    out_dir = out_dir or link.link_dir
    if output_path and os.path.exists(os.path.join(out_dir, output_path)):
        return False

-   return FETCH_WGET
+   return SAVE_WGET


@enforce_types
-def fetch_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = out_dir or link.link_dir
-   if FETCH_WARC:
+   if SAVE_WARC:
        warc_dir = os.path.join(out_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
@@ -267,9 +267,9 @@ def fetch_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
        '-e', 'robots=off',
        '--restrict-file-names=windows',
        '--timeout={}'.format(timeout),
-       *([] if FETCH_WARC else ['--timestamping']),
-       *(['--warc-file={}'.format(warc_path)] if FETCH_WARC else []),
-       *(['--page-requisites'] if FETCH_WGET_REQUISITES else []),
+       *([] if SAVE_WARC else ['--timestamping']),
+       *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
+       *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
@@ -324,19 +324,19 @@ def fetch_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    )

@enforce_types
-def should_fetch_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'output.pdf')):
        return False

-   return FETCH_PDF
+   return SAVE_PDF


@enforce_types
-def fetch_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print PDF of site to file using chrome --headless"""

    out_dir = out_dir or link.link_dir
@@ -353,7 +353,7 @@ def fetch_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
-           raise ArchiveError('Failed to print PDF', hints)
+           raise ArchiveError('Failed to save PDF', hints)

        chmod_file('output.pdf', cwd=out_dir)
    except Exception as err:
@@ -372,18 +372,18 @@ def fetch_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    )

@enforce_types
-def should_fetch_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
        return False

-   return FETCH_SCREENSHOT
+   return SAVE_SCREENSHOT

@enforce_types
-def fetch_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """take screenshot of site using chrome --headless"""

    out_dir = out_dir or link.link_dir
@@ -400,7 +400,7 @@ def fetch_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
-           raise ArchiveError('Failed to take screenshot', hints)
+           raise ArchiveError('Failed to save screenshot', hints)

        chmod_file(output, cwd=out_dir)
    except Exception as err:
@@ -419,18 +419,18 @@ def fetch_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    )

@enforce_types
-def should_fetch_dom(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'output.html')):
        return False

-   return FETCH_DOM
+   return SAVE_DOM

@enforce_types
-def fetch_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print HTML of site to file using chrome --dump-html"""

    out_dir = out_dir or link.link_dir
@@ -449,7 +449,7 @@ def fetch_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:

        if result.returncode:
            hints = result.stderr.decode()
-           raise ArchiveError('Failed to fetch DOM', hints)
+           raise ArchiveError('Failed to save DOM', hints)

        chmod_file(output, cwd=out_dir)
    except Exception as err:
@@ -468,7 +468,7 @@ def fetch_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    )

@enforce_types
-def should_fetch_git(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False
@@ -483,11 +483,11 @@ def should_fetch_git(link: Link, out_dir: Optional[str]=None) -> bool:
    if not is_clonable_url:
        return False

-   return FETCH_GIT
+   return SAVE_GIT


@enforce_types
-def fetch_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    out_dir = out_dir or link.link_dir
@@ -512,7 +512,7 @@ def fetch_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
            pass
        elif result.returncode > 0:
            hints = 'Got git response code: {}.'.format(result.returncode)
-           raise ArchiveError('Failed git download', hints)
+           raise ArchiveError('Failed to save git clone', hints)

    except Exception as err:
        status = 'failed'
@@ -531,7 +531,7 @@ def fetch_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:


@enforce_types
-def should_fetch_media(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir

    if is_static_file(link.url):
@@ -540,10 +540,10 @@ def should_fetch_media(link: Link, out_dir: Optional[str]=None) -> bool:
    if os.path.exists(os.path.join(out_dir, 'media')):
        return False

-   return FETCH_MEDIA
+   return SAVE_MEDIA

@enforce_types
-def fetch_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
+def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    out_dir = out_dir or link.link_dir
@@ -590,7 +590,7 @@ def fetch_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
                'Got youtube-dl response code: {}.'.format(result.returncode),
                *result.stderr.decode().split('\n'),
            )
-           raise ArchiveError('Failed to download media', hints)
+           raise ArchiveError('Failed to save media', hints)
    except Exception as err:
        status = 'failed'
        output = err
@@ -608,7 +608,7 @@ def fetch_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:


@enforce_types
-def should_fetch_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False
@@ -617,10 +617,10 @@ def should_fetch_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
        # if open(path, 'r').read().strip() != 'None':
        return False

-   return SUBMIT_ARCHIVE_DOT_ORG
+   return SAVE_ARCHIVE_DOT_ORG

@enforce_types
-def archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """submit site to archive.org for archiving via their service, save returned archive url"""

    out_dir = out_dir or link.link_dir
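Each method in this file follows the same shape: a cheap `should_save_*` predicate that checks for existing output and returns its `SAVE_*` config toggle, and a `save_*` worker that produces an `ArchiveResult`; `archive_link()` dispatches over the `(name, should_save, save)` triples in `ARCHIVE_METHODS`. A minimal sketch of a hypothetical extra method in the renamed convention (`SAVE_README`, `should_save_readme`, and `save_readme` are invented for illustration, and `Link`/`ArchiveResult` are simplified stand-ins for the real classes in `archivebox.legacy.schema`):

```python
# Hypothetical archive method following the should_save_*/save_* convention.
import os
from dataclasses import dataclass
from typing import List, Optional

SAVE_README = True  # invented config toggle, named in the new SAVE_ style

@dataclass
class Link:               # simplified stand-in for archivebox's Link
    url: str
    link_dir: str

@dataclass
class ArchiveResult:      # simplified stand-in for archivebox's ArchiveResult
    cmd: List[str]
    pwd: str
    output: str
    status: str

def should_save_readme(link: Link, out_dir: Optional[str]=None) -> bool:
    # skip if the output already exists, mirroring the guards above
    out_dir = out_dir or link.link_dir
    if os.path.exists(os.path.join(out_dir, 'readme.md')):
        return False
    return SAVE_README

def save_readme(link: Link, out_dir: Optional[str]=None) -> ArchiveResult:
    """write a stub readme for the snapshot (illustrative only)"""
    out_dir = out_dir or link.link_dir
    output = 'readme.md'
    with open(os.path.join(out_dir, output), 'w') as f:
        f.write('# Snapshot of {}\n'.format(link.url))
    return ArchiveResult(cmd=['save_readme'], pwd=out_dir, output=output, status='succeeded')
```

Registering such a method would mean appending a `('readme', should_save_readme, save_readme)` triple to `ARCHIVE_METHODS` in `archive_link()`.
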
(diffs for the remaining 4 changed files are not rendered here)

0 comments on commit 5ef5415
