Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GCSFSMap implementation #20

Merged
merged 5 commits into from
Jan 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ script:

deploy:
provider: pypi
user: "@token"
user: "__token__"
password:
secure: "Td/L9h5/atxclvZvcQqkCjZirJCX2EugBHtD4eeDqf/yRT9CFomCUaQ1042fx8PI3WSkslguTJZeVgsmNor20w0GOMH+Q67/W740VDsxnoynELBRkndcOLsBhTr+y5TaeqjqWSA3gn1b1ZliXieWBiRntHww5mF7IlqyVghiTJuluuIR1kU07kpKe9dTclBqPRLBdyQRw8J0trjozZwkpdyJwKN4P3ezRkQ81soipDAwfZd99MAkLQ8HIkC0rqdru4u/sZ3vatO+dzM6lWTUrFX3tJdvYuKMI2WZZmHMEb52FA8eSuL+yrlyAbpZQV1ULuO3xsmw3c9rWD9xhg7FxbkeODFzKoZs0rmjlmwY9YpEJZbFSpu80ksih0nM5DwYqGXxtyt/DXttdwJ+LWa5Se2hRyogkLhYW8cyb+2fU5IuxAgMKfGc7V2ry1IT/AH2WTiudW5GR2eOpRLAXF0A9N1Jv0brPB2qHb4TMTWcJgEmsHCfrYHg76uehH2SgOWAOyTPOP+RgsrG1pzuh629GtJbT1fVR0ycn30bB/Kx7D8+6ZjCVY0z1PKSSGAA0kSXqdAPaUtWpHU5xDT2EIJhCcM4A/vl5Bt1PeM1bYcBTLNgxD2BLGLy5nRwAklAEZpyq6oxMW4n8mW0f1/tALqY8Z9aMyuIFHMAxMI4EFomGP4="
on:
Expand Down
9 changes: 9 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,20 @@ For a full reference of all available methods of GCSFS visit the documentation o
`fs.base.FS <https://pyfilesystem2.readthedocs.io/en/latest/reference/base.html>`__!


GCSFS
-----

.. autoclass:: fs_gcsfs.GCSFS

.. automethod:: fs_gcsfs.GCSFS.fix_storage


GCSMap
------

.. automethod:: fs_gcsfs.GCSFS.get_mapper


Powered By
==========

Expand Down
62 changes: 57 additions & 5 deletions fs_gcsfs/_gcsfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import os
import tempfile
import mimetypes
from typing import Optional, List, Union, Tuple, Iterator
from typing import Optional, List, Union, Tuple, Iterator, MutableMapping

import google
from fs import ResourceType, errors, tools
Expand All @@ -27,13 +27,13 @@


class GCSFS(FS):
"""A GCS filesystem for `PyFilesystem <https://pyfilesystem.org>`_
"""A Google Cloud Storage filesystem for `PyFilesystem <https://pyfilesystem.org>`_.

This implementation is based on `S3FS <https://github.com/PyFilesystem/s3fs>`_
This implementation is based on `S3FS <https://github.com/PyFilesystem/s3fs>`_.

Args:
bucket_name: The GCS bucket name.
root_path: The root directory within the GCS Bucket
root_path: The root directory within the GCS Bucket.
create: Whether to create ``root_path`` on initialization or not. If ``root_path`` does not yet exist and ``create=False`` a ``CreateFailed``
exception will be raised. To disable ``root_path`` validation entirely set ``strict=False``.
client: A :class:`google.storage.Client` exposing the google storage API.
Expand Down Expand Up @@ -450,6 +450,14 @@ def opendir(self, path: str, factory=None) -> SubFS[FS]:

return _factory(self, path)

def get_mapper(self) -> "GCSMap":
    """Expose this filesystem through a ``MutableMapping`` interface.

    Every key of the returned mapping corresponds to a file and every value (which must be bytes) to the content of that file.
    Such a mapping is particularly handy in combination with libraries like `xarray <http://xarray.pydata.org/>`_ or `zarr <https://zarr.readthedocs.io/>`_.

    Returns:
        A ``GCSMap`` wrapping this filesystem.
    """
    mapper = GCSMap(self)
    return mapper

def fix_storage(self) -> None: # TODO test
"""Utility function that walks the entire `root_path` and makes sure that all intermediate directories are correctly marked with empty blobs.

Expand Down Expand Up @@ -494,7 +502,9 @@ def fix_storage(self) -> None: # TODO test


class GCSFile(io.IOBase):
"""Proxy for a GCS blob. Identical to S3File from https://github.com/PyFilesystem/s3fs
"""Proxy for a GCS blob.

Identical to S3File from https://github.com/PyFilesystem/s3fs.

Note:
Instead of performing all operations directly on the cloud (which is in some cases not even possible)
Expand Down Expand Up @@ -609,6 +619,48 @@ def truncate(self, size=None):
return size


class GCSMap(MutableMapping):
    """Wraps a :class:`GCSFS` as a ``MutableMapping``.

    The keys of the mapping are file paths relative to the root of the wrapped filesystem and the values are the
    contents of those files as bytes. Keys are always converted with ``str()``, so non-string keys (e.g. tuples)
    are stored under their string representation.
    This is particularly useful in combination with libraries such as `xarray <http://xarray.pydata.org/>`_ or `zarr <https://zarr.readthedocs.io/>`_.

    Args:
        gcsfs: The :class:`GCSFS` to wrap.
    """

    def __init__(self, gcsfs: "GCSFS"):
        self.gcsfs = gcsfs

    def __getitem__(self, key: str) -> bytes:
        """Return the content of the file stored under ``key``.

        Raises:
            KeyError: If no resource exists at ``key``.
        """
        try:
            return self.gcsfs.getbytes(str(key))
        except errors.ResourceNotFound as error:
            raise KeyError(key) from error

    def __setitem__(self, key: str, value: bytes) -> None:
        """Write ``value`` (anything accepted by ``bytes()``) to the file ``key``, creating parent directories as needed."""
        path = str(key)
        self.gcsfs.makedirs(dirname(path), recreate=True)
        self.gcsfs.setbytes(path, bytes(value))

    def __delitem__(self, key: str) -> None:
        """Remove the file stored under ``key``.

        Raises:
            KeyError: If no resource exists at ``key``.
        """
        try:
            self.gcsfs.remove(str(key))
        except errors.ResourceNotFound as error:
            # The mapping protocol expects KeyError for missing keys, not a filesystem error
            raise KeyError(key) from error

    def __iter__(self) -> Iterator[str]:
        return self.keys()

    def __len__(self) -> int:
        # Requires a full walk of the filesystem since no file count is tracked
        return sum(1 for _ in self.keys())

    def __contains__(self, key: str) -> bool:
        # Only files can be read back via ``__getitem__``, hence directories do not count as keys
        return self.gcsfs.isfile(str(key))

    def keys(self) -> Iterator[str]:
        """Yield the path of every file in the filesystem, relative to its root and without a leading slash."""
        for path, _dirs, files in self.gcsfs.walk("."):
            # Strip the slashes of the walked directory path so that keys have the same form as the ones
            # passed to ``__setitem__`` (e.g. "foo/bar" instead of "/foo/bar")
            prefix = path.strip("/")
            for file in files:
                if file.name == "/":  # Skip directory markers
                    continue
                yield prefix + "/" + file.name if prefix else file.name


def _make_repr(class_name, *args, **kwargs):
"""Generate a repr string. Identical to S3FS implementation

Expand Down
26 changes: 26 additions & 0 deletions fs_gcsfs/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
import uuid

import pytest
from google.cloud.storage import Client

from fs_gcsfs import GCSFS


@pytest.fixture(scope="module")
def client():
    """Create the storage ``Client`` that is shared by all tests of a module."""
    storage_client = Client()
    return storage_client


@pytest.fixture(scope="module")
def bucket(client):
    """Fetch the bucket named by the ``TEST_BUCKET`` environment variable via ``client``."""
    bucket_name = os.environ['TEST_BUCKET']
    return client.get_bucket(bucket_name)


@pytest.fixture(scope="function")
def gcsfs(bucket, client):
    """Provide a throwaway `GCSFS` rooted at a unique path below ``gcsfs/`` within the test bucket.

    Once the test is done, every blob below that root path is deleted again.
    """
    root_path = "gcsfs/" + str(uuid.uuid4())
    filesystem = GCSFS(bucket_name=bucket.name, root_path=root_path, client=client, create=True)
    yield filesystem

    # Teardown: remove everything that was written below the unique root path
    for leftover in bucket.list_blobs(prefix=root_path):
        leftover.delete()
37 changes: 9 additions & 28 deletions fs_gcsfs/tests/test_gcsfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from fs_gcsfs import GCSFS

TEST_BUCKET = os.environ['TEST_BUCKET']
TEST_BUCKET = os.environ["TEST_BUCKET"]


class TestGCSFS(FSTestCases, unittest.TestCase):
Expand Down Expand Up @@ -43,25 +43,6 @@ def get_bucket(self, _):
return ClientMock()


@pytest.fixture(scope="module")
def client():
    """Create the storage ``Client`` that is shared by all tests of a module."""
    storage_client = Client()
    return storage_client


@pytest.fixture(scope="module")
def bucket(client):
    """Fetch the bucket named by the module-level ``TEST_BUCKET`` constant via ``client``."""
    test_bucket = client.get_bucket(TEST_BUCKET)
    return test_bucket


@pytest.fixture(scope="function")
def tmp_gcsfs(bucket, client):
    """Provide a throwaway `GCSFS` rooted at a unique path below ``gcsfs/`` within the test bucket.

    Once the test is done, every blob below that root path is deleted again.
    """
    root_path = "gcsfs/" + str(uuid.uuid4())
    filesystem = GCSFS(bucket_name=bucket.name, root_path=root_path, client=client, create=True)
    yield filesystem

    # Teardown: remove everything that was written below the unique root path
    for leftover in bucket.list_blobs(prefix=root_path):
        leftover.delete()


@pytest.mark.parametrize("path,root_path,expected", [
("", None, ""),
(".", None, ""),
Expand Down Expand Up @@ -125,31 +106,31 @@ def test_create_property_does_not_create_file_if_emptyish_root_path(root_path, c
assert gcs_fs.bucket.get_blob(root_path + GCSFS.DELIMITER) is None


def test_fix_storage_adds_binary_blobs_with_empty_string_as_directory_marker(bucket, tmp_gcsfs):
def test_fix_storage_adds_binary_blobs_with_empty_string_as_directory_marker(bucket, gcsfs):
# Creating a 'nested' hierarchy of blobs without directory marker
for path in ["foo/test", "foo/bar/test", "foo/baz/test", "foo/bar/egg/test"]:
key = tmp_gcsfs._path_to_key(path)
key = gcsfs._path_to_key(path)
blob = bucket.blob(key)
blob.upload_from_string(b"Is this a test? It has to be. Otherwise I can't go on.")
tmp_gcsfs.fix_storage()
gcsfs.fix_storage()

for path in ["", "foo", "foo/bar", "foo/baz", "foo/bar/egg"]:
assert tmp_gcsfs.isdir(path)
assert gcsfs.isdir(path)


def test_fix_storage_does_not_overwrite_existing_directory_markers_with_custom_content(bucket, tmp_gcsfs):
def test_fix_storage_does_not_overwrite_existing_directory_markers_with_custom_content(bucket, gcsfs):
for path in ["foo/test"]:
key = tmp_gcsfs._path_to_key(path)
key = gcsfs._path_to_key(path)
blob = bucket.blob(key)
blob.upload_from_string(b"Is this a test? It has to be. Otherwise I can't go on.")

# Manual creation of 'directory marker' with custom content
key = tmp_gcsfs._path_to_dir_key("foo/")
key = gcsfs._path_to_dir_key("foo/")
blob = bucket.blob(key)
content = b"CUSTOM_DIRECTORY_MARKER_CONTENT"
blob.upload_from_string(content)

tmp_gcsfs.fix_storage()
gcsfs.fix_storage()

assert blob.download_as_string() == content

Expand Down
54 changes: 54 additions & 0 deletions fs_gcsfs/tests/test_gcsmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# These tests have been partially copied and adapted from the S3Map implementation of https://github.com/dask/s3fs


def test_empty_mapping(gcsfs):
    """A mapper obtained from the ``gcsfs`` fixture is falsy and exposes no keys, values or items."""
    mapping = gcsfs.get_mapper()

    assert not mapping
    assert list(mapping) == list(mapping.keys()) == []
    assert list(mapping.values()) == []
    assert list(mapping.items()) == []


def test_reading_and_writing_to_mapping(gcsfs):
    """Values can be written, read back, overwritten and listed through the mapper."""
    mapping = gcsfs.get_mapper()

    # A single entry is visible through every view of the mapping
    mapping["x"] = b"123"
    assert mapping["x"] == b"123"
    assert list(mapping) == list(mapping.keys()) == ["x"]
    assert list(mapping.values()) == [b"123"]
    assert list(mapping.items()) == [("x", b"123")]
    assert bool(mapping)

    # Writing to an existing key replaces its value
    mapping["x"] = b"000"
    assert mapping["x"] == b"000"

    # A second key is listed alongside the first one
    mapping["y"] = b"456"
    assert mapping["y"] == b"456"
    assert set(mapping) == {"x", "y"}


def test_reading_and_writing_complex_keys(gcsfs):
    """Non-string keys such as integers and tuples can be written, read, deleted and tested for membership."""
    mapping = gcsfs.get_mapper()

    # Integer key
    mapping[1] = b"hello"
    assert mapping[1] == b"hello"
    del mapping[1]

    # Tuple of integers
    mapping[(1, 2)] = b"world"
    assert mapping[(1, 2)] == b"world"
    del mapping[(1, 2)]

    # Tuple mixing a string and integers
    mapping[("x", 1, 2)] = b"hello world"
    assert mapping[("x", 1, 2)] == b"hello world"
    assert ("x", 1, 2) in mapping


def test_writing_array(gcsfs):
    """An ``array`` of unsigned bytes written through the mapper is read back as plain bytes."""
    from array import array

    mapping = gcsfs.get_mapper()
    payload = array("B", [65] * 1000)
    mapping["x"] = payload
    assert mapping["x"] == b"A" * 1000


def test_writing_bytearray(gcsfs):
    """A ``bytearray`` written through the mapper is read back as plain bytes."""
    mapping = gcsfs.get_mapper()
    payload = bytearray(b"123")
    mapping["x"] = payload
    assert mapping["x"] == b"123"