Skip to content

Commit

Permalink
[V7-1204] Sanitize filenames (#174)
Browse files Browse the repository at this point in the history
* Sanitize filenames

* Run tests on Windows

* Should fail on Windows?

* Name conflict

* Mock platform

* only replace : on Windows

* Get rid of useless line
  • Loading branch information
andreaazzini authored Jul 28, 2021
1 parent 81baf18 commit 86ad1a2
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 24 deletions.
23 changes: 22 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Python package
on: [push]

jobs:
build:
ubuntu:
runs-on: ubuntu-latest
strategy:
matrix:
Expand All @@ -24,3 +24,24 @@ jobs:
pip install --editable .
- name: Run tests
run: pytest
windows:
runs-on: windows-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest
pip install pytest-describe
pip install --upgrade setuptools
pip install --editable .
- name: Run tests
run: pytest
13 changes: 7 additions & 6 deletions darwin/dataset/download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path

import requests
from darwin.dataset.utils import sanitize_filename
from darwin.utils import is_image_extension_allowed


Expand Down Expand Up @@ -62,11 +63,11 @@ def download_all_images_from_annotations(
annotation = json.load(file)
if not force_replace:
# Check collisions on image filename, original_filename and json filename on the system
if Path(annotation["image"]["filename"]).stem in existing_images:
if sanitize_filename(Path(annotation["image"]["filename"]).stem) in existing_images:
continue
if Path(annotation["image"]["original_filename"]).stem in existing_images:
if sanitize_filename(Path(annotation["image"]["original_filename"]).stem) in existing_images:
continue
if annotation_path.stem in existing_images:
if sanitize_filename(annotation_path.stem) in existing_images:
continue
annotations_to_download_path.append(annotation_path)

Expand Down Expand Up @@ -99,7 +100,7 @@ def download_image_from_annotation(
api_key: str,
api_url: str,
annotation_path: Path,
images_path: str,
images_path: Path,
annotation_format: str,
use_folders: bool,
video_frames: bool,
Expand Down Expand Up @@ -132,7 +133,7 @@ def download_image_from_annotation(


def download_image_from_json_annotation(
api_key: str, api_url: str, annotation_path: Path, image_path: str, use_folders: bool, video_frames: bool
api_key: str, api_url: str, annotation_path: Path, image_path: Path, use_folders: bool, video_frames: bool
):
"""
Helper function: downloads an image given a .json annotation path
Expand Down Expand Up @@ -169,7 +170,7 @@ def download_image_from_json_annotation(
download_image(frame_url, path, api_key)
else:
image_url = annotation["image"]["url"]
image_path = parent_path / annotation["image"]["filename"]
image_path = parent_path / sanitize_filename(annotation["image"]["filename"])
download_image(image_url, image_path, api_key)


Expand Down
11 changes: 3 additions & 8 deletions darwin/dataset/remote_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,13 @@
get_classes,
is_relative_to,
make_class_lists,
sanitize_filename,
)
from darwin.exceptions import NotFound, UnsupportedExportFormat
from darwin.exporter.formats.darwin import build_image_annotation
from darwin.item import DatasetItem, parse_dataset_item
from darwin.item_sorter import ItemSorter
from darwin.utils import (
find_files,
parse_darwin_json,
secure_continue_request,
split_video_annotation,
urljoin,
)
from darwin.utils import find_files, parse_darwin_json, split_video_annotation, urljoin
from darwin.validators import name_taken, validation_error

if TYPE_CHECKING:
Expand Down Expand Up @@ -271,7 +266,7 @@ def pull(
for annotation_path in tmp_dir.glob("*.json"):
with annotation_path.open() as file:
annotation = json.load(file)
filename = Path(annotation["image"]["filename"]).stem
filename = sanitize_filename(Path(annotation["image"]["filename"]).stem)
destination_name = annotations_dir / f"{filename}{annotation_path.suffix}"
shutil.move(str(annotation_path), str(destination_name))

Expand Down
28 changes: 22 additions & 6 deletions darwin/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
import multiprocessing as mp
from collections import Counter, defaultdict
from pathlib import Path
from typing import Dict, Generator, List, Optional, Set, Tuple, Union
from typing import Any, Dict, Generator, List, Optional, Set, Tuple, Union

import numpy as np
from darwin.exceptions import NotFound
from darwin.importer.formats.darwin import parse_file
from darwin.utils import SUPPORTED_EXTENSIONS, SUPPORTED_VIDEO_EXTENSIONS
from darwin.utils import (
SUPPORTED_EXTENSIONS,
SUPPORTED_VIDEO_EXTENSIONS,
is_unix_like_os,
)
from PIL import Image
from rich.live import Live
from rich.progress import ProgressBar, track
Expand Down Expand Up @@ -214,7 +218,7 @@ def get_coco_format_record(
height, width = data["image"]["height"], data["image"]["width"]
annotations = data["annotations"]

record = {}
record: Dict[str, Any] = {}
if image_path is not None:
record["file_name"] = str(image_path)
if image_id is not None:
Expand Down Expand Up @@ -330,7 +334,7 @@ def get_annotations(
)
else:
# If the partition is not specified, get all the annotations
stems = [e.stem for e in annotations_dir.glob("**/*.json")]
stems = (e.stem for e in annotations_dir.glob("**/*.json"))

images_paths = []
annotations_paths = []
Expand Down Expand Up @@ -468,7 +472,7 @@ def compute_max_density(annotations_dir: Path):


# E.g.: {"partition" => {"class_name" => 123}}
AnnotationDistribution = Dict[str, Dict[str, int]]
AnnotationDistribution = Dict[str, Counter]


def compute_distributions(
Expand Down Expand Up @@ -512,11 +516,23 @@ def compute_distributions(

# https://github.com/python/cpython/blob/main/Lib/pathlib.py#L812
# TODO implemented here because it's not supported in Pythton < 3.9
def is_relative_to(path: Path, *other):
def is_relative_to(path: Path, *other) -> bool:
"""Return True if the path is relative to another path or False.
"""
try:
path.relative_to(*other)
return True
except ValueError:
return False


def sanitize_filename(filename: str) -> str:
chars = ["<", ">", '"', "/", "\\", "|", "?", "*"]

if not is_unix_like_os():
chars.append(":")

for char in chars:
filename = filename.replace(char, "_")

return filename
5 changes: 5 additions & 0 deletions darwin/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import platform
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Union

Expand Down Expand Up @@ -466,3 +467,7 @@ def convert_polygons_to_mask(polygons: List, height: int, width: int, value: Opt
def chunk(items, size):
for i in range(0, len(items), size):
yield items[i : i + size]


def is_unix_like_os() -> bool:
return platform.system() != "Windows"
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import json
import shutil
from collections import defaultdict
from pathlib import Path
from typing import Dict
from unittest.mock import patch
from unittest.mock import MagicMock, patch

import pytest
from darwin.dataset.utils import compute_distributions, extract_classes
from darwin.dataset.utils import (
compute_distributions,
extract_classes,
sanitize_filename,
)


def open_resource_file():
Expand Down Expand Up @@ -96,6 +99,31 @@ def builds_correct_mapping_dictionaries(annotations_path: Path):
assert dict(index_dict) == {0: {"class_4"}, 1: {"class_4"}}


def describe_sanitize_filename():
def normal_filenames_stay_untouched():
assert sanitize_filename("test.jpg") == "test.jpg"

def special_characters_are_replaced_with_underscores():
assert sanitize_filename("2020-06-18T08<50<13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json"
assert sanitize_filename("2020-06-18T08>50>13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json"
assert sanitize_filename('2020-06-18T08"50"13.14815Z.json') == "2020-06-18T08_50_13.14815Z.json"
assert sanitize_filename("2020-06-18T08/50/13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json"
assert sanitize_filename("2020-06-18T08\\50\\13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json"
assert sanitize_filename("2020-06-18T08|50|13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json"
assert sanitize_filename("2020-06-18T08?50?13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json"
assert sanitize_filename("2020-06-18T08*50*13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json"

@patch("platform.system", return_value="Windows")
def replace_columns_on_windows(mock: MagicMock):
assert sanitize_filename("2020-06-18T08:50:13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json"
mock.assert_called_once()

@patch("platform.system", return_value="Linux")
def avoid_replacing_columns_on_non_windows(mock: MagicMock):
assert sanitize_filename("2020-06-18T08:50:13.14815Z.json") == "2020-06-18T08:50:13.14815Z.json"
mock.assert_called_once()


def _create_annotation_file(annotation_path: Path, filename: str, payload: Dict):
with open(annotation_path / filename, "w") as f:
json.dump(payload, f)
20 changes: 20 additions & 0 deletions tests/darwin/utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from unittest.mock import MagicMock, patch

from darwin.utils import is_unix_like_os


def describe_is_unix_like_os():
@patch("platform.system", return_value="Linux")
def it_returns_true_on_linux(mock: MagicMock):
assert is_unix_like_os()
mock.assert_called_once()

@patch("platform.system", return_value="Windows")
def it_returns_false_on_windows(mock: MagicMock):
assert not is_unix_like_os()
mock.assert_called_once()

@patch("platform.system", return_value="Darwin")
def it_returns_true_on_mac_os(mock: MagicMock):
assert is_unix_like_os()
mock.assert_called_once()

0 comments on commit 86ad1a2

Please sign in to comment.