ARROW-15006: [Python][CI][Doc] Enable numpydoc check PR03 (apache#13983)
Adds an additional numpydoc check to CI (PR03) and fixes all corresponding violations.

Note this does not fully resolve [ARROW-15006](https://issues.apache.org/jira/browse/ARROW-15006).

Authored-by: Bryce Mecum <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
amoeba authored Oct 20, 2022
1 parent f49f8ed commit 0f91e68
Showing 7 changed files with 136 additions and 137 deletions.
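For context, numpydoc's PR03 rule flags a docstring whose Parameters section lists arguments in a different order than the function signature. A minimal, hypothetical sketch of the kind of mismatch being fixed here (the function below is illustrative and not taken from this diff):

def read_table(path, columns=None, use_threads=True):
    """
    Read a table from a file.

    Parameters
    ----------
    path : str
        Path to the file.
    use_threads : bool, default True
        Whether to read in parallel.
    columns : list, optional
        Names of columns to read.
    """
    # PR03 fires because ``use_threads`` is documented before ``columns``,
    # while the signature has ``columns`` first; reordering the docstring
    # entries to match the signature resolves it.
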
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -1068,7 +1068,7 @@ services:
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
pip install -e /arrow/dev/archery[numpydoc] &&
archery numpydoc --allow-rule PR01,PR10 &&
archery numpydoc --allow-rule PR01,PR03,PR10 &&
/arrow/ci/scripts/python_test.sh /arrow"]

conda-python-dask:
76 changes: 38 additions & 38 deletions python/pyarrow/_csv.pyx
@@ -189,22 +189,22 @@ cdef class ReadOptions(_Weakrefable):
self.options.reset(new CCSVReadOptions(CCSVReadOptions.Defaults()))

def __init__(self, *, use_threads=None, block_size=None, skip_rows=None,
column_names=None, autogenerate_column_names=None,
encoding='utf8', skip_rows_after_names=None):
skip_rows_after_names=None, column_names=None,
autogenerate_column_names=None, encoding='utf8'):
if use_threads is not None:
self.use_threads = use_threads
if block_size is not None:
self.block_size = block_size
if skip_rows is not None:
self.skip_rows = skip_rows
if skip_rows_after_names is not None:
self.skip_rows_after_names = skip_rows_after_names
if column_names is not None:
self.column_names = column_names
if autogenerate_column_names is not None:
self.autogenerate_column_names = autogenerate_column_names
# Python-specific option
self.encoding = encoding
if skip_rows_after_names is not None:
self.skip_rows_after_names = skip_rows_after_names

@property
def use_threads(self):
@@ -243,6 +243,23 @@ cdef class ReadOptions(_Weakrefable):
def skip_rows(self, value):
deref(self.options).skip_rows = value

@property
def skip_rows_after_names(self):
"""
The number of rows to skip after the column names.
This number can be larger than the number of rows in one
block, and empty rows are counted.
The order of application is as follows:
- `skip_rows` is applied (if non-zero);
- column names are read (unless `column_names` is set);
- `skip_rows_after_names` is applied (if non-zero).
"""
return deref(self.options).skip_rows_after_names

@skip_rows_after_names.setter
def skip_rows_after_names(self, value):
deref(self.options).skip_rows_after_names = value
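
A rough usage sketch of how these options interact when reading a CSV (the data and values below are illustrative):

import io
import pyarrow.csv as csv

data = io.BytesIO(b"# comment\na,b\n1,2\n3,4\n5,6\n")
opts = csv.ReadOptions(skip_rows=1,              # drop the leading comment line
                       skip_rows_after_names=1)  # then drop the first data row
table = csv.read_csv(data, read_options=opts)
# column names "a" and "b" are read in between, so the resulting table
# contains the rows (3, 4) and (5, 6)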

@property
def column_names(self):
"""
Expand Down Expand Up @@ -271,23 +288,6 @@ cdef class ReadOptions(_Weakrefable):
def autogenerate_column_names(self, value):
deref(self.options).autogenerate_column_names = value

@property
def skip_rows_after_names(self):
"""
The number of rows to skip after the column names.
This number can be larger than the number of rows in one
block, and empty rows are counted.
The order of application is as follows:
- `skip_rows` is applied (if non-zero);
- column names are read (unless `column_names` is set);
- `skip_rows_after_names` is applied (if non-zero).
"""
return deref(self.options).skip_rows_after_names

@skip_rows_after_names.setter
def skip_rows_after_names(self, value):
deref(self.options).skip_rows_after_names = value

def validate(self):
check_status(deref(self.options).Validate())

@@ -296,11 +296,11 @@
self.use_threads == other.use_threads and
self.block_size == other.block_size and
self.skip_rows == other.skip_rows and
self.skip_rows_after_names == other.skip_rows_after_names and
self.column_names == other.column_names and
self.autogenerate_column_names ==
other.autogenerate_column_names and
self.encoding == other.encoding and
self.skip_rows_after_names == other.skip_rows_after_names
self.encoding == other.encoding
)

@staticmethod
@@ -605,11 +605,6 @@ cdef class ConvertOptions(_Weakrefable):
decimal_point : 1-character string, optional (default '.')
The character used as decimal point in floating-point and decimal
data.
timestamp_parsers : list, optional
A sequence of strptime()-compatible format strings, tried in order
when attempting to infer or convert timestamp values (the special
value ISO8601() can also be given). By default, a fast built-in
ISO-8601 parser is used.
strings_can_be_null : bool, optional (default False)
Whether string / binary columns can have null values.
If true, then strings in null_values are considered null for
@@ -620,16 +615,6 @@
If true, then strings in "null_values" are also considered null
when they appear quoted in the CSV file. Otherwise, quoted values
are never considered null.
auto_dict_encode : bool, optional (default False)
Whether to try to automatically dict-encode string / binary data.
If true, then when type inference detects a string or binary column,
it is dict-encoded up to `auto_dict_max_cardinality` distinct values
(per chunk), after which it switches to regular encoding.
This setting is ignored for non-inferred columns (those in
`column_types`).
auto_dict_max_cardinality : int, optional
The maximum dictionary cardinality for `auto_dict_encode`.
This value is per chunk.
include_columns : list, optional
The names of columns to include in the Table.
If empty, the Table will include all columns from the CSV file.
@@ -641,6 +626,21 @@
produce a column of nulls (whose type is selected using
`column_types`, or null by default).
This option is ignored if `include_columns` is empty.
auto_dict_encode : bool, optional (default False)
Whether to try to automatically dict-encode string / binary data.
If true, then when type inference detects a string or binary column,
it is dict-encoded up to `auto_dict_max_cardinality` distinct values
(per chunk), after which it switches to regular encoding.
This setting is ignored for non-inferred columns (those in
`column_types`).
auto_dict_max_cardinality : int, optional
The maximum dictionary cardinality for `auto_dict_encode`.
This value is per chunk.
timestamp_parsers : list, optional
A sequence of strptime()-compatible format strings, tried in order
when attempting to infer or convert timestamp values (the special
value ISO8601() can also be given). By default, a fast built-in
ISO-8601 parser is used.
Examples
--------
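The docstring's own examples are collapsed in this diff view; as a rough sketch combining several of the conversion options described above (the column names, values, and formats are illustrative):

import io
import pyarrow as pa
import pyarrow.csv as csv

data = io.BytesIO(b"id,category,when\n"
                  b"1,spam,2022-01-01 12:30:00\n"
                  b"2,,2022-01-02 08:00:00\n")
opts = csv.ConvertOptions(
    column_types={"id": pa.int32()},             # pin a type instead of inferring it
    strings_can_be_null=True,                    # the empty "category" field becomes null
    auto_dict_encode=True,                       # dict-encode inferred string columns
    timestamp_parsers=["%Y-%m-%d %H:%M:%S"],     # custom format tried for timestamp values
    include_columns=["id", "category", "when"])
table = csv.read_csv(data, convert_options=opts)
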
78 changes: 39 additions & 39 deletions python/pyarrow/_dataset.pyx
@@ -257,7 +257,7 @@ cdef class Dataset(_Weakrefable):
... 'n_legs': [2, 2, 4, 4, 5, 100],
... 'animal': ["Flamingo", "Parrot", "Dog", "Horse",
... "Brittle stars", "Centipede"]})
>>>
>>>
>>> import pyarrow.parquet as pq
>>> pq.write_table(table, "dataset_scanner.parquet")
@@ -1221,12 +1221,12 @@ cdef class CsvFileFormat(FileFormat):
----------
parse_options : pyarrow.csv.ParseOptions
Options regarding CSV parsing.
default_fragment_scan_options : CsvFragmentScanOptions
Default options for fragments scan.
convert_options : pyarrow.csv.ConvertOptions
Options regarding value conversion.
read_options : pyarrow.csv.ReadOptions
General read options.
default_fragment_scan_options : CsvFragmentScanOptions
Default options for fragments scan.
"""
cdef:
CCsvFileFormat* csv_format
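
A rough sketch of wiring these option objects into a CSV dataset (the path and option values are illustrative):

import pyarrow.csv as csv
import pyarrow.dataset as ds

csv_format = ds.CsvFileFormat(
    parse_options=csv.ParseOptions(delimiter=";"),
    read_options=csv.ReadOptions(skip_rows=1),
    convert_options=csv.ConvertOptions(strings_can_be_null=True))
dataset = ds.dataset("data/csvs/", format=csv_format)  # hypothetical directory
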
@@ -2315,17 +2315,17 @@ cdef class Scanner(_Weakrefable):
projections.
The list of columns or expressions may use the special fields
`__batch_index` (the index of the batch within the fragment),
`__fragment_index` (the index of the fragment within the dataset),
`__batch_index` (the index of the batch within the fragment),
`__fragment_index` (the index of the fragment within the dataset),
`__last_in_fragment` (whether the batch is last in fragment), and
`__filename` (the name of the source file or a description of the
`__filename` (the name of the source file or a description of the
source fragment).
The columns will be passed down to Datasets and corresponding data
fragments to avoid loading, copying, and deserializing columns
that will not be required further down the compute chain.
By default all of the available columns are projected.
Raises an exception if any of the referenced column names does
By default all of the available columns are projected.
Raises an exception if any of the referenced column names does
not exist in the dataset's Schema.
filter : Expression, default None
Scan will return only the rows matching the filter.
Expand All @@ -2338,8 +2338,9 @@ cdef class Scanner(_Weakrefable):
record batches are overflowing memory then this method can be
called to reduce their size.
batch_readahead : int, default 16
The number of batches to read ahead in a file. Increasing this number
will increase RAM usage but could also improve IO utilization.
The number of batches to read ahead in a file. This might not work
for all file formats. Increasing this number will increase
RAM usage but could also improve IO utilization.
fragment_readahead : int, default 4
The number of files to read ahead. Increasing this number will increase
RAM usage but could also improve IO utilization.
@@ -2375,14 +2376,13 @@
return self.wrapped

@staticmethod
def from_dataset(Dataset dataset not None,
bint use_threads=True, object use_async=None,
MemoryPool memory_pool=None,
object columns=None, Expression filter=None,
int batch_size=_DEFAULT_BATCH_SIZE,
def from_dataset(Dataset dataset not None, *, object columns=None,
Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
int batch_readahead=_DEFAULT_BATCH_READAHEAD,
int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
FragmentScanOptions fragment_scan_options=None):
FragmentScanOptions fragment_scan_options=None,
bint use_threads=True, object use_async=None,
MemoryPool memory_pool=None):
"""
Create Scanner from Dataset,
@@ -2397,10 +2397,10 @@
projections.
The list of columns or expressions may use the special fields
`__batch_index` (the index of the batch within the fragment),
`__fragment_index` (the index of the fragment within the dataset),
`__batch_index` (the index of the batch within the fragment),
`__fragment_index` (the index of the fragment within the dataset),
`__last_in_fragment` (whether the batch is last in fragment), and
`__filename` (the name of the source file or a description of the
`__filename` (the name of the source file or a description of the
source fragment).
The columns will be passed down to Datasets and corresponding data
@@ -2426,6 +2426,9 @@
fragment_readahead : int, default 4
The number of files to read ahead. Increasing this number will increase
RAM usage but could also improve IO utilization.
fragment_scan_options : FragmentScanOptions, default None
Options specific to a particular scan and fragment type, which
can change between different scans of the same dataset.
use_threads : bool, default True
If enabled, then maximum parallelism will be used determined by
the number of available CPU cores.
@@ -2436,9 +2439,6 @@
memory_pool : MemoryPool, default None
For memory allocations, if required. If not specified, uses the
default pool.
fragment_scan_options : FragmentScanOptions, default None
Options specific to a particular scan and fragment type, which
can change between different scans of the same dataset.
"""
cdef:
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2461,13 +2461,13 @@
return Scanner.wrap(scanner)

@staticmethod
def from_fragment(Fragment fragment not None, Schema schema=None,
bint use_threads=True, object use_async=None,
MemoryPool memory_pool=None,
def from_fragment(Fragment fragment not None, *, Schema schema=None,
object columns=None, Expression filter=None,
int batch_size=_DEFAULT_BATCH_SIZE,
int batch_readahead=_DEFAULT_BATCH_READAHEAD,
FragmentScanOptions fragment_scan_options=None):
FragmentScanOptions fragment_scan_options=None,
bint use_threads=True, object use_async=None,
MemoryPool memory_pool=None):
"""
Create Scanner from Fragment,
@@ -2484,10 +2484,10 @@
projections.
The list of columns or expressions may use the special fields
`__batch_index` (the index of the batch within the fragment),
`__fragment_index` (the index of the fragment within the dataset),
`__batch_index` (the index of the batch within the fragment),
`__fragment_index` (the index of the fragment within the dataset),
`__last_in_fragment` (whether the batch is last in fragment), and
`__filename` (the name of the source file or a description of the
`__filename` (the name of the source file or a description of the
source fragment).
The columns will be passed down to Datasets and corresponding data
@@ -2510,6 +2510,9 @@
The number of batches to read ahead in a file. This might not work
for all file formats. Increasing this number will increase
RAM usage but could also improve IO utilization.
fragment_scan_options : FragmentScanOptions, default None
Options specific to a particular scan and fragment type, which
can change between different scans of the same dataset.
use_threads : bool, default True
If enabled, then maximum parallelism will be used determined by
the number of available CPU cores.
@@ -2520,9 +2523,6 @@
memory_pool : MemoryPool, default None
For memory allocations, if required. If not specified, uses the
default pool.
fragment_scan_options : FragmentScanOptions, default None
Options specific to a particular scan and fragment type, which
can change between different scans of the same dataset.
"""
cdef:
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2549,11 +2549,11 @@
return Scanner.wrap(scanner)

@staticmethod
def from_batches(source, Schema schema=None, bint use_threads=True,
object use_async=None, MemoryPool memory_pool=None,
object columns=None, Expression filter=None,
int batch_size=_DEFAULT_BATCH_SIZE,
FragmentScanOptions fragment_scan_options=None):
def from_batches(source, *, Schema schema=None, object columns=None,
Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
FragmentScanOptions fragment_scan_options=None,
bint use_threads=True, object use_async=None,
MemoryPool memory_pool=None):
"""
Create a Scanner from an iterator of batches.
@@ -2574,6 +2574,8 @@
Scan will return only the rows matching the filter.
batch_size : int, default 128Ki
The maximum row count for scanned record batches.
fragment_scan_options : FragmentScanOptions
The fragment scan options.
use_threads : bool, default True
If enabled, then maximum parallelism will be used determined by
the number of available CPU cores.
@@ -2584,8 +2586,6 @@
memory_pool : MemoryPool, default None
For memory allocations, if required. If not specified, uses the
default pool.
fragment_scan_options : FragmentScanOptions
The fragment scan options.
"""
cdef:
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
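A rough end-to-end sketch of how a scan is typically set up with the options documented above (the path, columns, and filter are illustrative; from_fragment and from_batches accept the same scan options as keyword arguments):

import pyarrow.dataset as ds

dataset = ds.dataset("data/parquet/", format="parquet")  # hypothetical path
scanner = ds.Scanner.from_dataset(
    dataset,
    columns=["n_legs", "animal"],      # project only the needed columns
    filter=ds.field("n_legs") > 2,     # pushed down to the fragments
    batch_size=64 * 1024,              # cap the size of scanned record batches
    batch_readahead=16,
    fragment_readahead=4)
table = scanner.to_table()
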
8 changes: 4 additions & 4 deletions python/pyarrow/array.pxi
@@ -702,11 +702,11 @@ cdef class _PandasConvertible(_Weakrefable):
memory_pool : MemoryPool, default None
Arrow MemoryPool to use for allocations. Uses the default memory
pool if not passed.
strings_to_categorical : bool, default False
Encode string (UTF8) and binary types to pandas.Categorical.
categories : list, default empty
List of fields that should be returned as pandas.Categorical. Only
applies to table-like data structures.
strings_to_categorical : bool, default False
Encode string (UTF8) and binary types to pandas.Categorical.
zero_copy_only : bool, default False
Raise an ArrowException if this function call would require copying
the underlying data.
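
A quick sketch of the two categorical-conversion options described above (the column names are illustrative):

import pyarrow as pa

table = pa.table({"animal": ["Flamingo", "Parrot", "Flamingo"],
                  "n_legs": [2, 2, 2]})
df_all = table.to_pandas(strings_to_categorical=True)  # every string column -> Categorical
df_one = table.to_pandas(categories=["animal"])        # only the listed column -> Categorical
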
@@ -2549,11 +2549,11 @@ cdef class DictionaryArray(Array):
The array of values referenced by the indices.
mask : ndarray or pandas.Series, bool type
True values indicate that indices are actually null.
ordered : bool, default False
Set to True if the category values are ordered.
from_pandas : bool, default False
If True, the indices should be treated as though they originated in
a pandas.Categorical (null encoded as -1).
ordered : bool, default False
Set to True if the category values are ordered.
safe : bool, default True
If True, check that the dictionary indices are in range.
memory_pool : MemoryPool, default None
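A small sketch of DictionaryArray.from_arrays using the parameters documented above (the values are illustrative, and keyword arguments are used so the order does not matter):

import pyarrow as pa

indices = pa.array([0, 1, 2, 0, 1])
dictionary = pa.array(["foo", "bar", "baz"])
arr = pa.DictionaryArray.from_arrays(indices, dictionary,
                                     ordered=False,  # category values are unordered
                                     safe=True)      # verify indices are in range
# arr decodes to ["foo", "bar", "baz", "foo", "bar"]
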
6 changes: 3 additions & 3 deletions python/pyarrow/ipc.pxi
@@ -106,12 +106,12 @@ cdef class IpcReadOptions(_Weakrefable):
Parameters
----------
use_threads : bool
Whether to use the global CPU thread pool to parallelize any
computational tasks like decompression.
ensure_native_endian : bool
Whether to convert incoming data to platform-native endianness.
Default is true.
use_threads : bool
Whether to use the global CPU thread pool to parallelize any
computational tasks like decompression.
included_fields : list
If empty (the default), return all deserialized fields.
If non-empty, the values are the indices of fields to read on
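A sketch of constructing these read options (the field indices and file name are illustrative; whether a given reader call accepts an options argument depends on the pyarrow version):

import pyarrow as pa

opts = pa.ipc.IpcReadOptions(ensure_native_endian=True,
                             use_threads=True,
                             included_fields=[0, 2])  # deserialize only fields 0 and 2
# e.g. reader = pa.ipc.open_file("data.arrow", options=opts)  # hypothetical file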