diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index dc26daba90597..aa2352c894411 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import os
 import json
 
 import six
@@ -414,6 +415,9 @@ def _visit_level(self, level, base_path, part_keys):
             elif fs.isdir(path):
                 directories.append(path)
 
+        # ARROW-1079: Filter out "private" directories starting with underscore
+        directories = [x for x in directories if not _is_private_directory(x)]
+
         if len(files) > 0 and len(directories) > 0:
             raise ValueError('Found files in an intermediate '
                              'directory: {0}'.format(base_path))
@@ -456,6 +460,11 @@ def _parse_hive_partition(value):
     return value.split('=', 1)
 
 
+def _is_private_directory(x):
+    _, tail = os.path.split(x)
+    return tail.startswith('_') and '=' not in tail
+
+
 def _path_split(path, sep):
     i = path.rfind(sep) + 1
     head, tail = path[:i], path[i:]
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index f606a7fa0c670..0f44d164ca612 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -36,9 +36,14 @@
 parquet = pytest.mark.parquet
 
 
-def _write_table(*args, **kwargs):
+def _write_table(table, path, **kwargs):
     import pyarrow.parquet as pq
-    return pq.write_table(*args, **kwargs)
+
+    if isinstance(table, pd.DataFrame):
+        table = pa.Table.from_pandas(table)
+
+    pq.write_table(table, path, **kwargs)
+    return table
 
 
 def _read_table(*args, **kwargs):
@@ -851,6 +856,32 @@ def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
         read_multiple_files(mixed_paths)
 
 
+@parquet
+def test_ignore_private_directories(tmpdir):
+    import pyarrow.parquet as pq
+
+    nfiles = 10
+    size = 5
+
+    dirpath = tmpdir.join(guid()).strpath
+    os.mkdir(dirpath)
+
+    test_data = []
+    paths = []
+    for i in range(nfiles):
+        df = _test_dataframe(size, seed=i)
+        path = pjoin(dirpath, '{0}.parquet'.format(i))
+
+        test_data.append(_write_table(df, path))
+        paths.append(path)
+
+    # private directory
+    os.mkdir(pjoin(dirpath, '_impala_staging'))
+
+    dataset = pq.ParquetDataset(dirpath)
+    assert set(paths) == set(x.path for x in dataset.pieces)
+
+
 @parquet
 def test_multiindex_duplicate_values(tmpdir):
     num_rows = 3