
Commit

Merge pull request ShopRunner#151 from ShopRunner/DS02454/download-image-class

DS-2454/download image class
gsganden authored Aug 21, 2020
2 parents c988db8 + 159c483 commit cef1903
Showing 11 changed files with 79 additions and 52 deletions.
22 changes: 13 additions & 9 deletions CHANGELOG.md
@@ -3,42 +3,46 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

-#[4.1.0] - 2020-8-19
+# [5.0.0] - 2020-8-19
+### Changed
+- Provide `DownloadImagePipeline` class in place of `download_image_pipeline` object.
+
+# [4.1.0] - 2020-8-19
### Added
- Log uncaught exceptions

-#[4.0.0] - 2020-7-31
+# [4.0.0] - 2020-7-31
### Changed
- Store `repr(e)` rather than `e` itself in the run report, where `e` is an exception.

-#[3.0.3] - 2020-7-30
+# [3.0.3] - 2020-7-30
### Added
- Increased test coverage

-#[3.0.2] - 2020-7-29
+# [3.0.2] - 2020-7-29
### Fixed
- Errors from trying to write to the same outpath in multiple threads

-#[3.0.1] - 2020-7-28
+# [3.0.1] - 2020-7-28
### Fixed
- Use image URL in README that is accessible on PyPI.

-#[3.0.0] - 2020-7-28
+# [3.0.0] - 2020-7-28
### Changed
- Rename library "wildebeest"
- Use GitHub Actions for CI/CD.

==== Below is Creevey development ====

-#[3.0.0] - 2020-7-28
+# [3.0.0] - 2020-7-28
### Added
- Warning about upcoming name change

-#[2.3.0] - 2020-7-24
+# [2.3.0] - 2020-7-24
### Added
- Use `pip-tools`.

-#[2.2.0] - 2020-6-15
+# [2.2.0] - 2020-6-15
### Added
- Function rotate() that takes an image and an angle and outputs the rotated image.
- warning about upcoming rename of library
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -23,7 +23,7 @@
copyright = '2019 Shoprunner'
author = 'gsganden, sugi-chan'

-release = version = '4.1.0'
+release = version = '5.0.0'

# -- General configuration ---------------------------------------------------

Binary file modified docs/images/custom_reporting_pipeline_run_report.png
Binary file modified docs/images/image_download_pipeline_run_report.png
Binary file removed docs/images/trim_resize_error.png
Binary file modified docs/images/trim_resize_pipeline_run_report.png
71 changes: 38 additions & 33 deletions docs/quickstart.rst
@@ -16,9 +16,9 @@ The following code uses a fairly minimal Wildebeest pipeline to download a list
from wildebeest.path_funcs import join_outdir_filename_extension
from wildebeest.write_funcs.image import write_image
image_urls = [
-f"https://bit.ly/{filename}" for filename in ["2RsJ8EQ", "2TqoToT", "2VocS58"]
+f'https://raw.githubusercontent.com/ShopRunner/wildebeest/master/tests/sample_data/wildebeest_big{num}.jpg'
+for num in range(1, 7)
]
# Create a pipeline object, specifying how to load a file and how to
@@ -31,7 +31,7 @@ The following code uses a fairly minimal Wildebeest pipeline to download a list
# from each input path, and how many threads to use
image_download_pipeline(
inpaths=image_urls,
-path_func=partial(join_outdir_filename_extension, outdir=".", extension=".png"),
+path_func=partial(join_outdir_filename_extension, outdir='.', extension='.png'),
n_jobs=10,
)
@@ -47,23 +47,24 @@ The trailing underscore in ``run_report_`` indicates that the attribute exists o

If ``n_jobs`` is greater than 1, then the order of the input files in the run report typically will not match the order in ``inpaths``\ ; a command like ``run_report.loc[inpaths, :]`` can be used to restore the original order if desired.

-Because downloading images is a common use case for Wildebeest, we have provided a ``download_image_pipeline`` pipeline that you can simply import, resulting in the following simplified version of the code above:
+Because downloading images is a common use case for Wildebeest, we have provided a ``DownloadImagePipeline`` subclass that allows you to write the following simplified version of the code above:

.. code-block:: python
from functools import partial
from wildebeest.path_funcs import join_outdir_filename_extension
-from wildebeest.pipelines.image import download_image_pipeline
+from wildebeest.pipelines.image import DownloadImagePipeline
image_urls = [
-f"https://bit.ly/{filename}" for filename in ["2RsJ8EQ", "2TqoToT", "2VocS58"]
+f'https://raw.githubusercontent.com/ShopRunner/wildebeest/master/tests/sample_data/wildebeest_big{num}.jpg'
+for num in range(1, 7)
]
-download_image_pipeline(
+DownloadImagePipeline()(
inpaths=image_urls,
-path_func=partial(join_outdir_filename_extension, outdir=".", extension=".png"),
+path_func=partial(join_outdir_filename_extension, outdir='.', extension='.png'),
n_jobs=10,
)
@@ -88,10 +89,10 @@ The following example processes a few more files with three additional wrinkles:
from wildebeest.path_funcs import join_outdir_filename_extension
-image_urls += [
-f"https://bit.ly/{filename}"
-for filename in ["2scKPIp", "2TsO6Pc", "2SCv0q7", "xyz"]
-]
+image_urls = [
+f'https://raw.githubusercontent.com/ShopRunner/wildebeest/master/tests/sample_data/wildebeest_big{num}.jpg'
+for num in range(1, 7)
+] + ['https://raw.githubusercontent.com/ShopRunner/wildebeest/master/tests/sample_data/fake.jpg']
trim_resize_pipeline = Pipeline(
load_func=load_image_from_url,
@@ -102,7 +103,7 @@ The following example processes a few more files with three additional wrinkles:
trim_resize_pipeline(
inpaths=image_urls,
-path_func=partial(join_outdir_filename_extension, outdir=".", extension=".png"),
+path_func=partial(join_outdir_filename_extension, outdir='.', extension='.png'),
n_jobs=10,
# skip files that have already been downloaded
skip_func=lambda inpath, outpath: Path(outpath).is_file(),
@@ -119,24 +120,22 @@ Here is the resulting run report:

We can see that the first three files were skipped because they had already been downloaded; note that as a result, they have NOT been trimmed and resized. If we had not provided a ``skip_func``, then the existing local copies would have been overwritten with trimmed and resized versions.

-In addition, the last file had a bad URL, resulting in a ``ValueError``. The value in the table for the "error" column in that row is the resulting ``ValueError`` exception itself:
+In addition, the last file had a bad URL, resulting in a ``ValueError``.

-.. image:: ./images/trim_resize_error.png
-:target: ./images/trim_resize_error.png
-:alt:

-We could simplify the code above by using the provided ``download_image_pipeline`` and simply adding our ``ops``.
+We could simplify the code above by using the provided ``DownloadImagePipeline`` and simply adding our ``ops``.

.. code-block:: python
-from wildebeest.pipelines.image import download_image_pipeline
from functools import partial
-trim_resize_pipeline = download_image_pipeline
-trim_resize_pipeline.ops = [
-lambda image: image[:-100, :],
-partial(resize, shape=(224, 224)),
-]
+from wildebeest.ops.image import resize
+from wildebeest.pipelines.image import DownloadImagePipeline
+trim_resize_pipeline = DownloadImagePipeline()
+trim_resize_pipeline.ops = [
+lambda image: image[:-100, :],
+partial(resize, shape=(224, 224)),
+]
More generally, you can modify attributes of an existing ``Pipeline`` object.
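For example, here is a minimal sketch (not part of this diff) of reconfiguring a pipeline after construction; it uses only names that appear elsewhere in this changeset, and the specific op and load function chosen are arbitrary:

.. code-block:: python

    from functools import partial

    from wildebeest import Pipeline
    from wildebeest.load_funcs.image import load_image_from_disk, load_image_from_url
    from wildebeest.ops.image import resize
    from wildebeest.write_funcs.image import write_image

    # Start from a bare download pipeline...
    pipeline = Pipeline(load_func=load_image_from_url, ops=[], write_func=write_image)

    # ...then reconfigure it in place: add a resize op and read from disk instead.
    pipeline.ops = [partial(resize, shape=(224, 224))]
    pipeline.load_func = load_image_from_disk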

@@ -169,7 +168,13 @@ The ``CustomReportingPipeline`` class allows you to add additional information t
from wildebeest.write_funcs.image import write_image
-@get_report_output_decorator(key="is_grayscale")
+image_urls = [
+f'https://raw.githubusercontent.com/ShopRunner/wildebeest/master/tests/sample_data/wildebeest_big{num}.jpg'
+for num in range(1, 7)
+]
+@get_report_output_decorator(key='is_grayscale')
def report_is_grayscale(image):
return image.ndim == 2
@@ -182,7 +187,7 @@ The ``CustomReportingPipeline`` class allows you to add additional information t
custom_reporting_pipeline(
inpaths=image_urls,
-path_func=partial(join_outdir_filename_extension, outdir=".", extension=".png"),
+path_func=partial(join_outdir_filename_extension, outdir='.', extension='.png'),
n_jobs=1,
)
@@ -208,23 +213,23 @@ Wildebeest is not limited to images! It applies anywhere you want to process dat
from wildebeest.ops import get_report_output_decorator
URLS = [
-"http://gandenberger.org/2019/10/29/evaluating-classification-models-part-1-weighing-false-positives-against-false-negatives/",
-"http://gandenberger.org/2019/11/20/evaluating-classification-models-part-2-the-sufficiency-of-precision-and-recall/",
-"http://gandenberger.org/2019/11/22/evaluating-classification-models-part-3-f_beta-and-other-weighted-pythagorean-means-of-precision-and-recall/",
-"http://gandenberger.org/2019/12/03/evaluating-classification-models-part-4/",
+'http://gandenberger.org/2019/10/29/evaluating-classification-models-part-1-weighing-false-positives-against-false-negatives/',
+'http://gandenberger.org/2019/11/20/evaluating-classification-models-part-2-the-sufficiency-of-precision-and-recall/',
+'http://gandenberger.org/2019/11/22/evaluating-classification-models-part-3-f_beta-and-other-weighted-pythagorean-means-of-precision-and-recall/',
+'http://gandenberger.org/2019/12/03/evaluating-classification-models-part-4/',
]
def read_from_url(url, *args, **kwargs):
return str(urllib.request.urlopen(url).read())
-@get_report_output_decorator(key="title")
+@get_report_output_decorator(key='title')
def record_title(html):
return re.search(r'<meta property="og:title" content="(.*?)" />', html).group(1)
-@get_report_output_decorator(key="word_count")
+@get_report_output_decorator(key='word_count')
def count_words(html):
return len(html.split())
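The hunk above ends before the docs show how these pieces are wired into a pipeline. As a rough sketch only — assuming ``CustomReportingPipeline`` is importable from the top-level package like ``Pipeline`` and takes the same ``load_func``/``ops``/``write_func`` arguments, and using a hypothetical ``write_text`` helper as the write function — the wiring might look like this, reusing ``read_from_url``, ``record_title``, and ``count_words`` from above:

.. code-block:: python

    from pathlib import Path

    from wildebeest import CustomReportingPipeline

    def write_text(text, outpath, *args, **kwargs):
        # Hypothetical write function: save the downloaded HTML to the output path.
        Path(outpath).write_text(text)

    blog_post_pipeline = CustomReportingPipeline(
        load_func=read_from_url,
        ops=[record_title, count_words],
        write_func=write_text,
    )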
7 changes: 5 additions & 2 deletions tests/conftest.py
@@ -7,8 +7,11 @@
TEST_DIR = Path(__file__).parent
SAMPLE_DATA_DIR = Path(TEST_DIR) / 'sample_data'
TEMP_DATA_DIR = SAMPLE_DATA_DIR / 'tmp'
-IMAGE_FILENAMES = ['2RsJ8EQ', '2TqoToT', '2VocS58', '2scKPIp', '2TsO6Pc', '2SCv0q7']
-IMAGE_URLS = [f'https://bit.ly/{filename}' for filename in IMAGE_FILENAMES]
+IMAGE_FILENAMES = [f'wildebeest_big{num}.jpg' for num in range(1, 7)]
+IMAGE_URLS = [
+    f'https://raw.githubusercontent.com/ShopRunner/wildebeest/master/tests/sample_data/{filename}'
+    for filename in IMAGE_FILENAMES
+]

from tests.fixtures import *

4 changes: 2 additions & 2 deletions tests/test_pipelines/test_pipelines_image.py
@@ -19,7 +19,7 @@
from wildebeest import Pipeline
from wildebeest.load_funcs.image import load_image_from_disk, load_image_from_url
from wildebeest.ops.image import resize
-from wildebeest.pipelines.image import download_image_pipeline
+from wildebeest.pipelines.image import DownloadImagePipeline
from wildebeest.write_funcs.image import write_image

IMAGE_RESIZE_SHAPE = (224, 224)
@@ -34,7 +34,7 @@ def trim_resize_pipeline():
trim_bottom_100 = lambda image: image[:-100, :] # noqa: 29
resize_224 = partial(resize, shape=IMAGE_RESIZE_SHAPE)

-trim_resize_pipeline = download_image_pipeline
+trim_resize_pipeline = DownloadImagePipeline()
trim_resize_pipeline.ops = [trim_bottom_100, resize_224]
yield trim_resize_pipeline
for url in IMAGE_URLS:
2 changes: 1 addition & 1 deletion wildebeest/_version.py
@@ -1 +1 @@
-__version__ = '4.1.0'
+__version__ = '5.0.0'
23 changes: 19 additions & 4 deletions wildebeest/pipelines/image.py
@@ -1,10 +1,25 @@
"""Image-processing pipelines"""
+from typing import Any, Callable, Iterable, Optional, Union

from wildebeest import Pipeline
from wildebeest.load_funcs.image import load_image_from_url
from wildebeest.write_funcs.image import write_image


-download_image_pipeline = Pipeline(
-    load_func=load_image_from_url, ops=[], write_func=write_image
-)
-"""Basic pipeline for downloading images"""
+class DownloadImagePipeline(Pipeline):
+    """
+    Class for defining a pipeline that downloads images.
+
+    Attributes
+    ----------
+    ops
+        See `wildebeest.pipelines.Pipeline`.
+    """
+
+    def __init__(
+        self,
+        ops: Optional[
+            Union[Callable[[Any], Any], Iterable[Callable[[Any], Any]]]
+        ] = None,
+    ):
+        super().__init__(load_func=load_image_from_url, ops=ops, write_func=write_image)

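For context, a minimal before-and-after sketch of how this change is used, mirroring the quickstart examples earlier in this diff (the URLs, output directory, and thread count are simply the values those examples use):

# Before (4.1.0): import the shared, module-level pipeline object
#     from wildebeest.pipelines.image import download_image_pipeline
#     download_image_pipeline(inpaths=image_urls, path_func=..., n_jobs=10)

# After (5.0.0): construct a pipeline instance and call it
from functools import partial

from wildebeest.path_funcs import join_outdir_filename_extension
from wildebeest.pipelines.image import DownloadImagePipeline

image_urls = [
    f'https://raw.githubusercontent.com/ShopRunner/wildebeest/master/tests/sample_data/wildebeest_big{num}.jpg'
    for num in range(1, 7)
]

pipeline = DownloadImagePipeline()
pipeline(
    inpaths=image_urls,
    path_func=partial(join_outdir_filename_extension, outdir='.', extension='.png'),
    n_jobs=10,
)

One apparent benefit of the class (not stated in this diff) is that each caller now gets its own instance, so customizing ``ops`` on one pipeline no longer mutates an object shared by every importer.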
