ARROW-16018: [Doc][Python] Run doctests on Python docstring examples (--doctest-modules)

A series of 3 PRs adds `doctest` functionality to ensure that docstring examples are actually correct (and stay correct).

- [x] Add `--doctest-modules`
- [x] Add `--doctest-cython` apache#13204
- [x] Create a CI job apache#13216

This PR can be tested with `pytest --doctest-modules python/pyarrow`.
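For orientation, here is a minimal sketch of what `--doctest-modules` checks (a hypothetical standalone module, not part of this PR): pytest imports each collected module, extracts the interactive sessions from its docstrings, runs them, and compares the actual output against the expected text.

```python
# example_module.py -- hypothetical file, only to illustrate the mechanism.

def double(values):
    """Double each value in a list.

    Examples
    --------
    >>> double([1, 2, 3])
    [2, 4, 6]
    """
    return [v * 2 for v in values]
```

Running `pytest --doctest-modules example_module.py` fails as soon as the real output drifts from the docstring, which is what keeps the examples below honest.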

Closes apache#13199 from AlenkaF/ARROW-16018

Lead-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
2 people authored and jorisvandenbossche committed May 25, 2022
1 parent fe2ce20 commit 3b92f02
Showing 8 changed files with 365 additions and 226 deletions.
8 changes: 4 additions & 4 deletions python/pyarrow/_compute_docstrings.py
@@ -28,13 +28,13 @@
 >>> arr = pa.array(["a", "b", "c", None, "e"])
 >>> mask = pa.array([True, False, None, False, True])
 >>> arr.filter(mask)
-<pyarrow.lib.StringArray object at 0x7fa826df9200>
+<pyarrow.lib.StringArray object at ...>
 [
   "a",
   "e"
 ]
 >>> arr.filter(mask, null_selection_behavior='emit_null')
-<pyarrow.lib.StringArray object at 0x7fa826df9200>
+<pyarrow.lib.StringArray object at ...>
 [
   "a",
   null,
@@ -50,7 +50,7 @@
 >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
 >>> modes = pc.mode(arr, 2)
 >>> modes[0]
-<pyarrow.StructScalar: {'mode': 2, 'count': 5}>
+<pyarrow.StructScalar: [('mode', 2), ('count', 5)]>
 >>> modes[1]
-<pyarrow.StructScalar: {'mode': 1, 'count': 2}>
+<pyarrow.StructScalar: [('mode', 1), ('count', 2)]>
 """
16 changes: 8 additions & 8 deletions python/pyarrow/compute.py
@@ -345,7 +345,7 @@ def cast(arr, target_type, safe=True):
     You can use ``pyarrow.DataType`` objects to specify the target type:

     >>> cast(arr, pa.timestamp('ms'))
-    <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
+    <pyarrow.lib.TimestampArray object at ...>
     [
       2010-01-01 00:00:00.000,
       2015-01-01 00:00:00.000
@@ -358,10 +358,10 @@ def cast(arr, target_type, safe=True):
     types:

     >>> arr.cast('timestamp[ms]')
-    <pyarrow.lib.TimestampArray object at 0x10420eb88>
+    <pyarrow.lib.TimestampArray object at ...>
     [
-      1262304000000,
-      1420070400000
+      2010-01-01 00:00:00.000,
+      2015-01-01 00:00:00.000
     ]
     >>> arr.cast('timestamp[ms]').type
     TimestampType(timestamp[ms])
@@ -448,7 +448,7 @@ def take(data, indices, *, boundscheck=True, memory_pool=None):
     >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
     >>> indices = pa.array([0, None, 4, 3])
     >>> arr.take(indices)
-    <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
+    <pyarrow.lib.StringArray object at ...>
     [
       "a",
       null,
@@ -486,7 +486,7 @@ def fill_null(values, fill_value):
     >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
     >>> fill_value = pa.scalar(5, type=pa.int8())
     >>> arr.fill_null(fill_value)
-    pyarrow.lib.Int8Array object at 0x7f95437f01a0>
+    <pyarrow.lib.Int8Array object at ...>
     [
       1,
       2,
@@ -531,7 +531,7 @@ def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
     >>> import pyarrow.compute as pc
     >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
     >>> pc.top_k_unstable(arr, k=3)
-    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7f30>
+    <pyarrow.lib.UInt64Array object at ...>
     [
       5,
       4,
@@ -577,7 +577,7 @@ def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
     >>> import pyarrow.compute as pc
     >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
     >>> pc.bottom_k_unstable(arr, k=3)
-    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7fa0>
+    <pyarrow.lib.UInt64Array object at ...>
     [
       0,
       1,
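The corrected `cast` outputs above reflect that a `TimestampArray` repr prints formatted datetimes, not raw epoch integers. A quick reproduction (a sketch, assuming pyarrow is installed):

```python
from datetime import datetime

import pyarrow as pa

arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])

# repr() shows the object header plus formatted timestamps -- the fixed
# docstrings expect the datetimes, not raw epoch milliseconds.
print(repr(arr.cast('timestamp[ms]')))
# <pyarrow.lib.TimestampArray object at 0x...>
# [
#   2010-01-01 00:00:00.000,
#   2015-01-01 00:00:00.000
# ]
```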
226 changes: 226 additions & 0 deletions python/pyarrow/conftest.py
@@ -0,0 +1,226 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest
from pyarrow import Codec

groups = [
    'brotli',
    'bz2',
    'cython',
    'dataset',
    'hypothesis',
    'fastparquet',
    'gandiva',
    'gdb',
    'gzip',
    'hdfs',
    'large_memory',
    'lz4',
    'memory_leak',
    'nopandas',
    'orc',
    'pandas',
    'parquet',
    'parquet_encryption',
    'plasma',
    's3',
    'snappy',
    'substrait',
    'tensorflow',
    'flight',
    'slow',
    'requires_testing_data',
    'zstd',
]

defaults = {
    'brotli': Codec.is_available('brotli'),
    'bz2': Codec.is_available('bz2'),
    'cython': False,
    'dataset': False,
    'fastparquet': False,
    'flight': False,
    'gandiva': False,
    'gdb': True,
    'gzip': Codec.is_available('gzip'),
    'hdfs': False,
    'hypothesis': False,
    'large_memory': False,
    'lz4': Codec.is_available('lz4'),
    'memory_leak': False,
    'nopandas': False,
    'orc': False,
    'pandas': False,
    'parquet': False,
    'parquet_encryption': False,
    'plasma': False,
    'requires_testing_data': True,
    's3': False,
    'slow': False,
    'snappy': Codec.is_available('snappy'),
    'substrait': False,
    'tensorflow': False,
    'zstd': Codec.is_available('zstd'),
}
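The codec entries above key off `pyarrow.Codec.is_available`, so whether those marks are enabled tracks the build configuration automatically. A quick check (a sketch; output depends on how pyarrow was built):

```python
import pyarrow as pa

# Each prints True or False depending on which codecs this build includes.
for name in ("brotli", "bz2", "gzip", "lz4", "snappy", "zstd"):
    print(name, pa.Codec.is_available(name))
```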

try:
    import cython  # noqa
    defaults['cython'] = True
except ImportError:
    pass

try:
    import fastparquet  # noqa
    defaults['fastparquet'] = True
except ImportError:
    pass

try:
    import pyarrow.gandiva  # noqa
    defaults['gandiva'] = True
except ImportError:
    pass

try:
    import pyarrow.dataset  # noqa
    defaults['dataset'] = True
except ImportError:
    pass

try:
    import pyarrow.orc  # noqa
    defaults['orc'] = True
except ImportError:
    pass

try:
    import pandas  # noqa
    defaults['pandas'] = True
except ImportError:
    defaults['nopandas'] = True

try:
    import pyarrow.parquet  # noqa
    defaults['parquet'] = True
except ImportError:
    pass

try:
    import pyarrow.parquet.encryption  # noqa
    defaults['parquet_encryption'] = True
except ImportError:
    pass

try:
    import pyarrow.plasma  # noqa
    defaults['plasma'] = True
except ImportError:
    pass

try:
    import tensorflow  # noqa
    defaults['tensorflow'] = True
except ImportError:
    pass

try:
    import pyarrow.flight  # noqa
    defaults['flight'] = True
except ImportError:
    pass

try:
    from pyarrow.fs import S3FileSystem  # noqa
    defaults['s3'] = True
except ImportError:
    pass

try:
    from pyarrow.fs import HadoopFileSystem  # noqa
    defaults['hdfs'] = True
except ImportError:
    pass

try:
    import pyarrow.substrait  # noqa
    defaults['substrait'] = True
except ImportError:
    pass


# Doctest should ignore files for the modules that are not built
def pytest_ignore_collect(path, config):
    if config.option.doctestmodules:
        # don't try to run doctests on the /tests directory
        if "/pyarrow/tests/" in str(path):
            return True

        doctest_groups = [
            'dataset',
            'orc',
            'parquet',
            'plasma',
            'flight',
            'substrait',
        ]

        # handle cuda, flight, etc
        for group in doctest_groups:
            if 'pyarrow/{}'.format(group) in str(path):
                if not defaults[group]:
                    return True

        if 'pyarrow/parquet/encryption' in str(path):
            if not defaults['parquet_encryption']:
                return True

        if 'pyarrow/cuda' in str(path):
            try:
                import pyarrow.cuda  # noqa
                return False
            except ImportError:
                return True

        if 'pyarrow/fs' in str(path):
            try:
                from pyarrow.fs import S3FileSystem  # noqa
                return False
            except ImportError:
                return True

    return False
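To make the skip decision concrete, a small walk-through of the logic above (a hypothetical standalone sketch; the `available` mapping stands in for `defaults`): with only parquet built, `pyarrow/orc.py` is excluded from doctest collection while the parquet modules are kept.

```python
# Hypothetical values standing in for the `defaults` dict above.
available = {'dataset': False, 'orc': False, 'parquet': True,
             'plasma': False, 'flight': False, 'substrait': False}


def would_skip(path):
    # Mirrors the loop in pytest_ignore_collect: skip any module whose
    # optional component was not built into this pyarrow installation.
    return any('pyarrow/{}'.format(group) in path and not built
               for group, built in available.items())


print(would_skip('python/pyarrow/orc.py'))               # True: skipped
print(would_skip('python/pyarrow/parquet/__init__.py'))  # False: collected
```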


# Save output files from doctest examples into temp dir
@pytest.fixture(autouse=True)
def _docdir(request):

    # Trigger ONLY for the doctests.
    if request.config.option.doctestmodules:

        # Get the fixture dynamically by its name.
        tmpdir = request.getfixturevalue('tmpdir')

        # Chdir only for the duration of the test.
        with tmpdir.as_cwd():
            yield

    else:
        # For normal tests, we have to yield, since this is a yield-fixture.
        yield
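Why the fixture matters: some docstring examples write files, and with the autouse `_docdir` fixture any relative path they use resolves inside a per-test `tmpdir` instead of the source tree. An illustrative docstring of that kind (hypothetical, assuming parquet is built; `example.parquet` is an invented name):

```python
def roundtrip_example():
    """
    >>> import pyarrow as pa
    >>> import pyarrow.parquet as pq
    >>> table = pa.table({"col": [1, 2, 3]})
    >>> pq.write_table(table, "example.parquet")  # lands in tmpdir
    >>> pq.read_table("example.parquet").num_rows
    3
    """
```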