# download_artefacts.py

#!/usr/bin/python3

import json
import logging
import shutil
import datetime

from concurrent.futures import ProcessPoolExecutor as Pool, as_completed
from pathlib import Path
from urllib.request import urlopen, Request
from urllib.parse import urljoin

# Root logger (no name given); the script entry point configures it through
# logging.basicConfig().
logger = logging.getLogger()

# Default number of pool workers that download() starts.
PARALLEL_DOWNLOADS = 6
# Base URL under which find_github_files() looks up "/releases/tags/<version>".
GITHUB_API_URL = "https://api.github.com/repos/scoder/fastrlock"
# Base URL under which find_appveyor_files() reads "/history" and "/builds/<id>".
APPVEYOR_PACKAGE_URL = "https://ci.appveyor.com/api/projects/scoder/fastrlock"
# Base URL under which find_appveyor_files() reads "<jobId>/artifacts/".
APPVEYOR_BUILDJOBS_URL = "https://ci.appveyor.com/api/buildjobs"


def find_github_files(version, api_url=GITHUB_API_URL):
    """Yield the browser download URL of each asset of a GitHub release.

    The release is looked up by tag name: ``{api_url}/releases/tags/{version}``.
    Nothing is yielded when the release data has no ``assets`` entry.
    """
    release_url = f"{api_url}/releases/tags/{version}"
    # read_url() returns (data, content_type); only the parsed JSON is used.
    release_info, _content_type = read_url(
        release_url,
        accept="application/vnd.github+json",
        as_json=True,
    )

    yield from (
        asset['browser_download_url']
        for asset in release_info.get('assets', ())
    )


def find_appveyor_files(version, base_package_url=APPVEYOR_PACKAGE_URL, base_job_url=APPVEYOR_BUILDJOBS_URL):
    """Yield the URL of each artifact produced by the AppVeyor build of a tag.

    The last 20 history records below *base_package_url* are searched for the
    first build that is a tag build whose tag equals *version*.  If there is
    none, a warning is logged and nothing is yielded.  Otherwise, the artifact
    listing of every job of that build is read from *base_job_url* and each
    listed file name is joined onto the listing URL.
    """
    history_url = f"{base_package_url}/history?recordsNumber=20"
    with urlopen(history_url) as response:
        recent_builds = json.load(response)["builds"]

    tag = f"{version}"
    # First matching build in history order, or None if the tag was not built.
    tagged_build = next(
        (build for build in recent_builds if build['isTag'] and build['tag'] == tag),
        None,
    )
    if tagged_build is None:
        logger.warning(f"No appveyor build found for tag '{tag}'")
        return

    build_url = f"{base_package_url}/builds/{tagged_build['buildId']}"
    with urlopen(build_url) as response:
        build_jobs = json.load(response)["build"]["jobs"]

    for build_job in build_jobs:
        # The trailing slash matters: urljoin() appends the file name to it.
        artifacts_url = f"{base_job_url}/{build_job['jobId']}/artifacts/"

        with urlopen(artifacts_url) as response:
            yield from (
                urljoin(artifacts_url, artifact['fileName'])
                for artifact in json.load(response)
            )


def read_url(url, decode=True, accept=None, as_json=False):
    """Fetch *url* and return a ``(data, content_type)`` tuple.

    accept:  if truthy, sent as the ``Accept`` request header.
    decode:  if true, the body is decoded with the charset determined by
             ``_find_content_encoding()``; otherwise it stays ``bytes``.
    as_json: if true, the (possibly decoded) body is parsed with
             ``json.loads()``.

    ``content_type`` is the value of the response's ``Content-Type`` header,
    or ``None`` when the header is absent.
    """
    request_headers = {'Accept': accept} if accept else {}
    request = Request(url, headers=request_headers)

    with urlopen(request) as response:
        content_type = response.headers.get('Content-Type')
        charset = _find_content_encoding(response)
        body = response.read()

    content = body.decode(charset) if decode else body
    result = json.loads(content) if as_json else content
    return result, content_type


def _find_content_encoding(response, default='iso8859-1'):
    from email.message import Message
    content_type = response.headers.get('Content-Type')
    if content_type:
        msg = Message()
        msg.add_header('Content-Type', content_type)
        charset = msg.get_content_charset(default)
    else:
        charset = default
    return charset


def download1(wheel_url, dest_dir):
    """Download one file into *dest_dir* and return its file name.

    The file name is the part of *wheel_url* after the last ``/``.  If a file
    of that name already exists and its size equals the ``Content-Length``
    reported by the server, the download is skipped.

    The data is first written to ``<file name>.tmp`` and only moved to its
    final name once it was received completely, so an aborted download never
    leaves a truncated file under the final name.  Any exception during the
    transfer removes the temporary file and is re-raised.
    """
    wheel_name = wheel_url.rsplit("/", 1)[1]
    logger.info(f"Downloading {wheel_url} ...")
    with urlopen(wheel_url) as w:
        file_path = dest_dir / wheel_name
        if (file_path.exists()
                and "Content-Length" in w.headers
                and file_path.stat().st_size == int(w.headers["Content-Length"])):
            logger.info(f"Already have {wheel_name}")
        else:
            # Append ".tmp" to the complete name rather than replacing the last
            # suffix: files that differ only in their extension (".tar.gz" vs.
            # ".tar.bz2") would otherwise share one temporary file while being
            # downloaded in parallel.
            temp_file_path = dest_dir / (wheel_name + ".tmp")
            try:
                with open(temp_file_path, "wb") as f:
                    shutil.copyfileobj(w, f)
            except BaseException:
                # Also clean up on KeyboardInterrupt/SystemExit, then re-raise.
                if temp_file_path.exists():
                    temp_file_path.unlink()
                raise
            else:
                temp_file_path.replace(file_path)
                logger.info(f"Finished downloading {wheel_name}")
    return wheel_name


def download(urls, dest_dir, jobs=PARALLEL_DOWNLOADS):
    """Download all *urls* into *dest_dir* using a pool of *jobs* workers.

    Every URL is submitted to the pool as one ``download1()`` call.  The
    returned file names are yielded in completion order, not in submission
    order.  On ``KeyboardInterrupt``, all submitted downloads are asked to
    cancel before the interrupt is re-raised.
    """
    with Pool(max_workers=jobs) as executor:
        pending = [executor.submit(download1, url, dest_dir) for url in urls]
        try:
            for finished in as_completed(pending):
                # result() re-raises whatever the download raised.
                yield finished.result()
        except KeyboardInterrupt:
            for task in pending:
                task.cancel()
            raise


def dedup(it):
    """Lazily yield the items of *it*, skipping any item already yielded.

    The order of first occurrence is kept.  Items must be hashable.
    """
    already_yielded = set()
    for item in it:
        if item in already_yielded:
            continue
        already_yielded.add(item)
        yield item


def roundrobin(*iterables):
    """Yield one item from each iterable in turn until all are exhausted.

    roundrobin('ABC', 'D', 'EF') --> A D E B F C
    """
    # Algorithm credited to George Sakkis
    from itertools import cycle, islice
    iterators = map(iter, iterables)
    for num_active in range(len(iterables), 0, -1):
        # Rebuild the cycle from the iterators that are still active; the one
        # that just ran dry was the last one taken from the previous cycle and
        # is therefore left out here.
        iterators = cycle(islice(iterators, num_active))
        # Ends as soon as one of the iterators raises StopIteration.
        yield from map(next, iterators)


def main(*args):
    """Download all release artefacts of a version into ``dist/<version>``.

    The first positional argument is the version (the release tag name); any
    further arguments are ignored.  Without arguments, a usage hint is printed
    and nothing is downloaded.

    URLs found on GitHub and on AppVeyor are de-duplicated per source and then
    interleaved, so that downloads from both hosts run side by side.
    """
    if not args:
        print("Please pass the version to download")
        return

    version = args[0]
    dest_dir = Path("dist") / version
    # parents=True: also create "dist" itself on a fresh checkout.
    # exist_ok=True: reuse an existing directory (still fails for a plain file).
    dest_dir.mkdir(parents=True, exist_ok=True)

    start_time = datetime.datetime.now().replace(microsecond=0)
    urls = roundrobin(*map(dedup, [
        find_github_files(version),
        find_appveyor_files(version),
    ]))
    # download() is a generator: consuming it is what drives the downloads.
    count = sum(1 for _ in download(urls, dest_dir))
    duration = datetime.datetime.now().replace(microsecond=0) - start_time
    logger.info(f"Downloaded {count} files in {duration}.")


if __name__ == "__main__":
    import sys

    # Everything after the script name is handed to main(); the first of these
    # arguments is the version to download.
    cli_args = sys.argv[1:]

    # Send timestamped INFO-level progress messages to stderr.
    logging.basicConfig(
        format="%(asctime)-15s  %(message)s",
        level=logging.INFO,
        stream=sys.stderr,
    )
    main(*cli_args)