ARROW-16018: [Doc][Python] Run doctests on Python docstring examples (--doctest-modules)

A series of 3 PRs adds `doctest` functionality to ensure that docstring examples are actually correct (and stay correct).

- [x] Add `--doctest-modules`
- [x] Add `--doctest-cython` apache#13204
- [x] Create a CI job apache#13216

This PR can be tested with `pytest --doctest-modules python/pyarrow`.
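For orientation, here is a minimal sketch of what `--doctest-modules` checks (a hypothetical standalone module, not part of this PR): pytest imports each collected module, extracts the interactive sessions from its docstrings, runs them, and compares the actual output against the expected text.

```python
# example_module.py -- hypothetical file, only to illustrate the mechanism.

def double(values):
    """Double each value in a list.

    Examples
    --------
    >>> double([1, 2, 3])
    [2, 4, 6]
    """
    return [v * 2 for v in values]
```

Running `pytest --doctest-modules example_module.py` fails as soon as the real output drifts from the docstring, which is what keeps the examples below honest.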

Closes apache#13199 from AlenkaF/ARROW-16018

Lead-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
2 people authored and jorisvandenbossche committed May 25, 2022
1 parent fe2ce20 commit 3b92f02
Showing 8 changed files with 365 additions and 226 deletions.
8 changes: 4 additions & 4 deletions python/pyarrow/_compute_docstrings.py
@@ -28,13 +28,13 @@
 >>> arr = pa.array(["a", "b", "c", None, "e"])
 >>> mask = pa.array([True, False, None, False, True])
 >>> arr.filter(mask)
-<pyarrow.lib.StringArray object at 0x7fa826df9200>
+<pyarrow.lib.StringArray object at ...>
 [
   "a",
   "e"
 ]
 >>> arr.filter(mask, null_selection_behavior='emit_null')
-<pyarrow.lib.StringArray object at 0x7fa826df9200>
+<pyarrow.lib.StringArray object at ...>
 [
   "a",
   null,
@@ -50,7 +50,7 @@
 >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
 >>> modes = pc.mode(arr, 2)
 >>> modes[0]
-<pyarrow.StructScalar: {'mode': 2, 'count': 5}>
+<pyarrow.StructScalar: [('mode', 2), ('count', 5)]>
 >>> modes[1]
-<pyarrow.StructScalar: {'mode': 1, 'count': 2}>
+<pyarrow.StructScalar: [('mode', 1), ('count', 2)]>
 """
16 changes: 8 additions & 8 deletions python/pyarrow/compute.py
@@ -345,7 +345,7 @@ def cast(arr, target_type, safe=True):
     You can use ``pyarrow.DataType`` objects to specify the target type:

     >>> cast(arr, pa.timestamp('ms'))
-    <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
+    <pyarrow.lib.TimestampArray object at ...>
     [
       2010-01-01 00:00:00.000,
       2015-01-01 00:00:00.000
@@ -358,10 +358,10 @@ def cast(arr, target_type, safe=True):
     types:

     >>> arr.cast('timestamp[ms]')
-    <pyarrow.lib.TimestampArray object at 0x10420eb88>
+    <pyarrow.lib.TimestampArray object at ...>
     [
-      1262304000000,
-      1420070400000
+      2010-01-01 00:00:00.000,
+      2015-01-01 00:00:00.000
     ]
     >>> arr.cast('timestamp[ms]').type
     TimestampType(timestamp[ms])
@@ -448,7 +448,7 @@ def take(data, indices, *, boundscheck=True, memory_pool=None):
     >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
     >>> indices = pa.array([0, None, 4, 3])
     >>> arr.take(indices)
-    <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
+    <pyarrow.lib.StringArray object at ...>
     [
       "a",
       null,
@@ -486,7 +486,7 @@ def fill_null(values, fill_value):
     >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
     >>> fill_value = pa.scalar(5, type=pa.int8())
     >>> arr.fill_null(fill_value)
-    pyarrow.lib.Int8Array object at 0x7f95437f01a0>
+    <pyarrow.lib.Int8Array object at ...>
     [
       1,
       2,
@@ -531,7 +531,7 @@ def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
     >>> import pyarrow.compute as pc
     >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
     >>> pc.top_k_unstable(arr, k=3)
-    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7f30>
+    <pyarrow.lib.UInt64Array object at ...>
     [
       5,
       4,
@@ -577,7 +577,7 @@ def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
     >>> import pyarrow.compute as pc
     >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
     >>> pc.bottom_k_unstable(arr, k=3)
-    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7fa0>
+    <pyarrow.lib.UInt64Array object at ...>
     [
       0,
       1,
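The corrected `cast` outputs above reflect that a `TimestampArray` repr prints formatted datetimes, not raw epoch integers. A quick reproduction (a sketch, assuming pyarrow is installed):

```python
from datetime import datetime

import pyarrow as pa

arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])

# repr() shows the object header plus formatted timestamps -- the fixed
# docstrings expect the datetimes, not raw epoch milliseconds.
print(repr(arr.cast('timestamp[ms]')))
# <pyarrow.lib.TimestampArray object at 0x...>
# [
#   2010-01-01 00:00:00.000,
#   2015-01-01 00:00:00.000
# ]
```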
226 changes: 226 additions & 0 deletions python/pyarrow/conftest.py
@@ -0,0 +1,226 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest
from pyarrow import Codec

groups = [
    'brotli',
    'bz2',
    'cython',
    'dataset',
    'hypothesis',
    'fastparquet',
    'gandiva',
    'gdb',
    'gzip',
    'hdfs',
    'large_memory',
    'lz4',
    'memory_leak',
    'nopandas',
    'orc',
    'pandas',
    'parquet',
    'parquet_encryption',
    'plasma',
    's3',
    'snappy',
    'substrait',
    'tensorflow',
    'flight',
    'slow',
    'requires_testing_data',
    'zstd',
]

defaults = {
    'brotli': Codec.is_available('brotli'),
    'bz2': Codec.is_available('bz2'),
    'cython': False,
    'dataset': False,
    'fastparquet': False,
    'flight': False,
    'gandiva': False,
    'gdb': True,
    'gzip': Codec.is_available('gzip'),
    'hdfs': False,
    'hypothesis': False,
    'large_memory': False,
    'lz4': Codec.is_available('lz4'),
    'memory_leak': False,
    'nopandas': False,
    'orc': False,
    'pandas': False,
    'parquet': False,
    'parquet_encryption': False,
    'plasma': False,
    'requires_testing_data': True,
    's3': False,
    'slow': False,
    'snappy': Codec.is_available('snappy'),
    'substrait': False,
    'tensorflow': False,
    'zstd': Codec.is_available('zstd'),
}
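The codec entries above key off `pyarrow.Codec.is_available`, so whether those marks are enabled tracks the build configuration automatically. A quick check (a sketch; output depends on how pyarrow was built):

```python
import pyarrow as pa

# Each prints True or False depending on which codecs this build includes.
for name in ("brotli", "bz2", "gzip", "lz4", "snappy", "zstd"):
    print(name, pa.Codec.is_available(name))
```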

try:
    import cython  # noqa
    defaults['cython'] = True
except ImportError:
    pass

try:
    import fastparquet  # noqa
    defaults['fastparquet'] = True
except ImportError:
    pass

try:
    import pyarrow.gandiva  # noqa
    defaults['gandiva'] = True
except ImportError:
    pass

try:
    import pyarrow.dataset  # noqa
    defaults['dataset'] = True
except ImportError:
    pass

try:
    import pyarrow.orc  # noqa
    defaults['orc'] = True
except ImportError:
    pass

try:
    import pandas  # noqa
    defaults['pandas'] = True
except ImportError:
    defaults['nopandas'] = True

try:
    import pyarrow.parquet  # noqa
    defaults['parquet'] = True
except ImportError:
    pass

try:
    import pyarrow.parquet.encryption  # noqa
    defaults['parquet_encryption'] = True
except ImportError:
    pass

try:
    import pyarrow.plasma  # noqa
    defaults['plasma'] = True
except ImportError:
    pass

try:
    import tensorflow  # noqa
    defaults['tensorflow'] = True
except ImportError:
    pass

try:
    import pyarrow.flight  # noqa
    defaults['flight'] = True
except ImportError:
    pass

try:
    from pyarrow.fs import S3FileSystem  # noqa
    defaults['s3'] = True
except ImportError:
    pass

try:
    from pyarrow.fs import HadoopFileSystem  # noqa
    defaults['hdfs'] = True
except ImportError:
    pass

try:
    import pyarrow.substrait  # noqa
    defaults['substrait'] = True
except ImportError:
    pass


# Doctest should ignore files for the modules that are not built
def pytest_ignore_collect(path, config):
    if config.option.doctestmodules:
        # don't try to run doctests on the /tests directory
        if "/pyarrow/tests/" in str(path):
            return True

        doctest_groups = [
            'dataset',
            'orc',
            'parquet',
            'plasma',
            'flight',
            'substrait',
        ]

        # handle cuda, flight, etc
        for group in doctest_groups:
            if 'pyarrow/{}'.format(group) in str(path):
                if not defaults[group]:
                    return True

        if 'pyarrow/parquet/encryption' in str(path):
            if not defaults['parquet_encryption']:
                return True

        if 'pyarrow/cuda' in str(path):
            try:
                import pyarrow.cuda  # noqa
                return False
            except ImportError:
                return True

        if 'pyarrow/fs' in str(path):
            try:
                from pyarrow.fs import S3FileSystem  # noqa
                return False
            except ImportError:
                return True

    return False
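To make the skip decision concrete, a small walk-through of the logic above (a hypothetical standalone sketch; the `available` mapping stands in for `defaults`): with only parquet built, `pyarrow/orc.py` is excluded from doctest collection while the parquet modules are kept.

```python
# Hypothetical values standing in for the `defaults` dict above.
available = {'dataset': False, 'orc': False, 'parquet': True,
             'plasma': False, 'flight': False, 'substrait': False}


def would_skip(path):
    # Mirrors the loop in pytest_ignore_collect: skip any module whose
    # optional component was not built into this pyarrow installation.
    return any('pyarrow/{}'.format(group) in path and not built
               for group, built in available.items())


print(would_skip('python/pyarrow/orc.py'))               # True: skipped
print(would_skip('python/pyarrow/parquet/__init__.py'))  # False: collected
```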


# Save output files from doctest examples into temp dir
@pytest.fixture(autouse=True)
def _docdir(request):

    # Trigger ONLY for the doctests.
    if request.config.option.doctestmodules:

        # Get the fixture dynamically by its name.
        tmpdir = request.getfixturevalue('tmpdir')

        # Chdir only for the duration of the test.
        with tmpdir.as_cwd():
            yield

    else:
        # For normal tests, we have to yield, since this is a yield-fixture.
        yield
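Why the fixture matters: some docstring examples write files, and with the autouse `_docdir` fixture any relative path they use resolves inside a per-test `tmpdir` instead of the source tree. An illustrative docstring of that kind (hypothetical, assuming parquet is built; `example.parquet` is an invented name):

```python
def roundtrip_example():
    """
    >>> import pyarrow as pa
    >>> import pyarrow.parquet as pq
    >>> table = pa.table({"col": [1, 2, 3]})
    >>> pq.write_table(table, "example.parquet")  # lands in tmpdir
    >>> pq.read_table("example.parquet").num_rows
    3
    """
```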