Skip to content

Commit

Permalink
Do not crash on unicode filenames
Browse files Browse the repository at this point in the history
Introduce a new function, safe_walk, which wraps os.path.walk, but
ensures that the returned values are unicode objects.  This isn't
strictly safe, in that it is possible that some paths will not be
decodeable, but that case is rare, and the only alternative is to
somehow avoid all interaction between paths and unicode objects, which
seems especially tough in the presence of unicode_literals. See e.g.
https://mail.python.org/pipermail/python-dev/2008-December/083856.html

Testing Done:
Ran new test.

Reviewed at https://rbcommons.com/s/twitter/r/1193/
  • Loading branch information
dturner-tw committed Oct 23, 2014
1 parent 9c5e616 commit eaa2fc5
Show file tree
Hide file tree
Showing 20 changed files with 71 additions and 31 deletions.
4 changes: 2 additions & 2 deletions src/python/pants/backend/codegen/tasks/apache_thrift_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pants.base.exceptions import TaskError
from pants.base.target import Target
from pants.thrift_util import calculate_compile_roots, select_thrift_binary
from pants.util.dirutil import safe_mkdir
from pants.util.dirutil import safe_mkdir, safe_walk


def _copytree(from_base, to_base):
Expand All @@ -40,7 +40,7 @@ def safe_link(src, dst):
if e.errno != errno.EEXIST:
raise e

for dirpath, dirnames, filenames in os.walk(from_base, topdown=True, onerror=abort):
for dirpath, dirnames, filenames in safe_walk(from_base, topdown=True, onerror=abort):
to_path = os.path.join(to_base, os.path.relpath(dirpath, from_base))
for dirname in dirnames:
safe_mkdir(os.path.join(to_path, dirname))
Expand Down
4 changes: 2 additions & 2 deletions src/python/pants/backend/jvm/tasks/ide_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from pants.base.source_root import SourceRoot
from pants.base.target import Target
from pants.goal.goal import Goal
from pants.util.dirutil import safe_mkdir
from pants.util.dirutil import safe_mkdir, safe_walk

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -652,7 +652,7 @@ def dedup_sources(source_set_list):
for source_set in self.sources:
paths = set()
source_base = os.path.join(self.root_dir, source_set.source_base)
for root, dirs, _ in os.walk(os.path.join(source_base, source_set.path)):
for root, dirs, _ in safe_walk(os.path.join(source_base, source_set.path)):
if dirs:
paths.update([os.path.join(root, directory) for directory in dirs])
unused_children = paths - targeted
Expand Down
6 changes: 3 additions & 3 deletions src/python/pants/backend/jvm/tasks/idea_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pants.base.config import ConfigOption
from pants.base.generator import Generator, TemplateData
from pants.base.source_root import SourceRoot
from pants.util.dirutil import safe_mkdir
from pants.util.dirutil import safe_mkdir, safe_walk


_TEMPLATE_BASEDIR = 'templates/idea'
Expand Down Expand Up @@ -143,7 +143,7 @@ def __init__(self, *args, **kwargs):
@staticmethod
def _maven_targets_excludes(repo_root):
excludes = []
for (dirpath, dirnames, filenames) in os.walk(repo_root):
for (dirpath, dirnames, filenames) in safe_walk(repo_root):
if "pom.xml" in filenames:
excludes.append(os.path.join(os.path.relpath(dirpath, start=repo_root), "target"))
return excludes
Expand Down Expand Up @@ -313,7 +313,7 @@ def _get_resource_extensions(self, project):

# TODO(John Sirois): make test resources 1st class in ant build and punch this through to pants
# model
for _, _, files in os.walk(os.path.join(get_buildroot(), 'tests', 'resources')):
for _, _, files in safe_walk(os.path.join(get_buildroot(), 'tests', 'resources')):
resource_extensions.update(Project.extract_resource_extensions(files))

return resource_extensions
Expand Down
4 changes: 2 additions & 2 deletions src/python/pants/backend/jvm/tasks/jvm_compile/jvm_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from pants.goal.products import MultipleRootedProducts
from pants.reporting.reporting_utils import items_to_report_element
from pants.util.contextutil import open_zip, temporary_dir
from pants.util.dirutil import safe_mkdir, safe_rmtree
from pants.util.dirutil import safe_mkdir, safe_rmtree, safe_walk


class JvmCompile(NailgunTaskBase, GroupMember, JvmToolTaskMixin):
Expand Down Expand Up @@ -711,7 +711,7 @@ def non_product(path):
if cls.endswith(b'.class') and not cls in self._upstream_class_to_path:
self._upstream_class_to_path[cls] = cp_entry
elif os.path.isdir(cp_entry):
for dirpath, _, filenames in os.walk(cp_entry, followlinks=True):
for dirpath, _, filenames in safe_walk(cp_entry, followlinks=True):
for f in filter(lambda x: x.endswith('.class'), filenames):
cls = os.path.relpath(os.path.join(dirpath, f), cp_entry)
if not cls in self._upstream_class_to_path:
Expand Down
4 changes: 2 additions & 2 deletions src/python/pants/backend/jvm/tasks/jvmdoc_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from pants import binary_util
from pants.backend.jvm.tasks.jvm_task import JvmTask
from pants.base.exceptions import TaskError
from pants.util.dirutil import safe_mkdir
from pants.util.dirutil import safe_mkdir, safe_walk


Jvmdoc = collections.namedtuple('Jvmdoc', ['tool_name', 'product_type'])
Expand Down Expand Up @@ -135,7 +135,7 @@ def find_jvmdoc_targets():
for target in targets:
gendir = self._gendir(target)
jvmdocs = []
for root, dirs, files in os.walk(gendir):
for root, dirs, files in safe_walk(gendir):
jvmdocs.extend(os.path.relpath(os.path.join(root, f), gendir) for f in files)
self.context.products.get(self.jvmdoc().product_type).add(target, gendir, jvmdocs)

Expand Down
6 changes: 3 additions & 3 deletions src/python/pants/backend/python/commands/setup_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from pants.base.config import Config
from pants.base.exceptions import TargetDefinitionException
from pants.commands.command import Command
from pants.util.dirutil import safe_rmtree
from pants.util.dirutil import safe_rmtree, safe_walk


SETUP_BOILERPLATE = """
Expand Down Expand Up @@ -176,7 +176,7 @@ def iter_generated_sources(cls, target, root, config=None):

builder = builder_cls(target, root, config)
builder.generate()
for root, _, files in os.walk(builder.package_root):
for root, _, files in safe_walk(builder.package_root):
for fn in files:
target_file = os.path.join(root, fn)
yield os.path.relpath(target_file, builder.package_root), target_file
Expand Down Expand Up @@ -205,7 +205,7 @@ def find_packages(cls, chroot):
resources = defaultdict(set)

def iter_files():
for root, _, files in os.walk(base):
for root, _, files in safe_walk(base):
module = os.path.relpath(root, base).replace(os.path.sep, '.')
for filename in files:
yield module, filename, os.path.join(root, filename)
Expand Down
4 changes: 2 additions & 2 deletions src/python/pants/backend/python/thrift_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pants.backend.codegen.targets.python_thrift_library import PythonThriftLibrary
from pants.base.build_environment import get_buildroot
from pants.thrift_util import select_thrift_binary
from pants.util.dirutil import safe_mkdir
from pants.util.dirutil import safe_mkdir, safe_walk


class PythonThriftBuilder(CodeGenerator):
Expand Down Expand Up @@ -125,7 +125,7 @@ def generate(self):
# Thrift generates code with all parent namespaces with empty __init__.py's. Generally
# speaking we want to drop anything w/o an __init__.py, and for anything with an __init__.py,
# we want to explicitly make it a namespace package, hence the hoops here.
for root, _, files in os.walk(os.path.normpath(self.package_root)):
for root, _, files in safe_walk(os.path.normpath(self.package_root)):
reldir = os.path.relpath(root, self.package_root)
if reldir == '.': # skip root
continue
Expand Down
5 changes: 3 additions & 2 deletions src/python/pants/base/build_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import re

from pex.interpreter import PythonIdentity
from pants.util.dirutil import safe_walk
from twitter.common.collections import OrderedSet


Expand Down Expand Up @@ -74,7 +75,7 @@ def scan_buildfiles(root_dir, base_path=None, spec_excludes=None):

def calc_exclude_roots(root_dir, excludes):
"""Return a map of root directories to subdirectory names suitable for a quick evaluation
inside os.walk()
inside safe_walk()
"""
result = defaultdict(set)
for exclude in excludes:
Expand All @@ -100,7 +101,7 @@ def find_excluded(root, dirs, exclude_roots):
else:
exclude_roots = calc_exclude_roots(root_dir, spec_excludes)

for root, dirs, files in os.walk(os.path.join(root_dir, base_path or ''), topdown=True):
for root, dirs, files in safe_walk(os.path.join(root_dir, base_path or ''), topdown=True):
to_remove = find_excluded(root, dirs, exclude_roots)
for subdir in to_remove:
dirs.remove(subdir)
Expand Down
4 changes: 2 additions & 2 deletions src/python/pants/cache/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import tarfile

from pants.util.contextutil import open_tar
from pants.util.dirutil import safe_mkdir, safe_mkdir_for
from pants.util.dirutil import safe_mkdir, safe_mkdir_for, safe_walk


class ArtifactError(Exception):
Expand Down Expand Up @@ -61,7 +61,7 @@ def collect(self, paths):
self._relpaths.add(relpath)

def extract(self):
for dir_name, _, filenames in os.walk(self._directory):
for dir_name, _, filenames in safe_walk(self._directory):
for filename in filenames:
filename = os.path.join(dir_name, filename)
relpath = os.path.relpath(filename, self._directory)
Expand Down
2 changes: 1 addition & 1 deletion src/python/pants/cache/restful_artifact_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self, log, artifact_root, url_base, compress=True):
raise ValueError('RESTfulArtifactCache only supports HTTP and HTTPS')
self._timeout_secs = 4.0
self._netloc = parsed_url.netloc
self._path_prefix = parsed_url.path.rstrip('/')
self._path_prefix = parsed_url.path.rstrip(b'/')
self.compress = compress

# To enable connection reuse, all requests must be created from same session.
Expand Down
3 changes: 2 additions & 1 deletion src/python/pants/fs/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from twitter.common.lang import AbstractClass

from pants.util.contextutil import open_tar, open_zip
from pants.util.dirutil import safe_walk


class Archiver(AbstractClass):
Expand Down Expand Up @@ -84,7 +85,7 @@ def __init__(self, compression):
def create(self, basedir, outdir, name, prefix=None):
zippath = os.path.join(outdir, '%s.zip' % name)
with open_zip(zippath, 'w', compression=ZIP_DEFLATED) as zip:
for root, _, files in os.walk(basedir):
for root, _, files in safe_walk(basedir):
root = root.decode('utf-8')
for file in files:
file = file.decode('utf-8')
Expand Down
1 change: 1 addition & 0 deletions src/python/pants/util/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ python_library(
name = 'dirutil',
sources = ['dirutil.py'],
dependencies = [
':strutil',
],
)

Expand Down
18 changes: 18 additions & 0 deletions src/python/pants/util/dirutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import tempfile
import threading

from pants.util.strutil import ensure_text


def safe_mkdir(directory, clean=False):
"""Ensure a directory is present.
Expand All @@ -36,6 +38,22 @@ def safe_mkdir_for(path, clean=False):
safe_mkdir(os.path.dirname(path), clean)


def safe_walk(path, **kwargs):
"""Just like os.walk, but ensures that the returned values are unicode objects.
This isn't strictly safe, in that it is possible that some paths
will not be decodeable, but that case is rare, and the only
alternative is to somehow avoid all interaction between paths and
unicode objects, which seems especially tough in the presence of
unicode_literals. See e.g.
https://mail.python.org/pipermail/python-dev/2008-December/083856.html
"""
# If os.walk is given a text argument, it yields text values; if it
# is given a binary argument, it yields binary values.
return os.walk(ensure_text(path), **kwargs)


_MKDTEMP_CLEANER = None
_MKDTEMP_DIRS = defaultdict(set)
_MKDTEMP_LOCK = threading.RLock()
Expand Down
9 changes: 9 additions & 0 deletions src/python/pants/util/strutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,15 @@ def ensure_binary(text_or_binary):
raise TypeError('Argument is neither text nor binary type.')


def ensure_text(text_or_binary):
if isinstance(text_or_binary, six.binary_type):
return text_or_binary.decode('utf-8')
elif isinstance(text_or_binary, six.text_type):
return text_or_binary
else:
raise TypeError('Argument is neither text nor binary type.')


def safe_shlex_split(text_or_binary):
"""Split a string using shell-like syntax.
Expand Down
Empty file.
4 changes: 2 additions & 2 deletions tests/python/pants_test/fs/test_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@

from pants.fs.archive import TAR, TBZ2, TGZ, ZIP
from pants.util.contextutil import temporary_dir
from pants.util.dirutil import safe_mkdir, touch
from pants.util.dirutil import safe_mkdir, safe_walk, touch


class ArchiveTest(unittest.TestCase):
def _listtree(self, root, empty_dirs):
listing = set()
for path, dirs, files in os.walk(root):
for path, dirs, files in safe_walk(root):
relpath = os.path.normpath(os.path.relpath(path, root)).decode('utf-8')
if empty_dirs:
listing.update(os.path.normpath(os.path.join(relpath, d.decode('utf-8'))) for d in dirs)
Expand Down
4 changes: 2 additions & 2 deletions tests/python/pants_test/jvm/jvm_tool_task_test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from pants.backend.jvm.tasks.bootstrap_jvm_tools import BootstrapJvmTools
from pants.base.dev_backend_loader import load_build_configuration_from_source
from pants.util.dirutil import safe_mkdir, safe_mkdtemp
from pants.util.dirutil import safe_mkdir, safe_mkdtemp, safe_walk
from pants_test.task_test_base import TaskTestBase


Expand Down Expand Up @@ -44,7 +44,7 @@ def link(path, optional=False, force=False):
def link_tree(path, optional=False, force=False):
src = os.path.join(self.real_build_root, path)
if not optional or os.path.exists(src):
for abspath, dirs, files in os.walk(src):
for abspath, dirs, files in safe_walk(src):
for f in files:
link(os.path.relpath(os.path.join(abspath, f), self.real_build_root), force=force)

Expand Down
8 changes: 4 additions & 4 deletions tests/python/pants_test/tasks/test_jar_publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pants.base.exceptions import TaskError
from pants.scm.scm import Scm
from pants.util.contextutil import temporary_dir
from pants.util.dirutil import safe_mkdir
from pants.util.dirutil import safe_mkdir, safe_walk
from pants_test.tasks.test_base import TaskTest


Expand Down Expand Up @@ -131,7 +131,7 @@ def test_publish_local_dryrun(self):
# Nothing is written to the pushdb during a dryrun publish
# (maybe some directories are created, but git will ignore them)
files = []
for _, _, filenames in os.walk(self.push_db_basedir):
for _, _, filenames in safe_walk(self.push_db_basedir):
files.extend(filenames)
self.assertEquals(0, len(files),
"Nothing should be written to the pushdb during a dryrun publish")
Expand All @@ -156,7 +156,7 @@ def test_publish_local(self):
#Nothing is written to the pushdb during a local publish
#(maybe some directories are created, but git will ignore them)
files = []
for _, _, filenames in os.walk(self.push_db_basedir):
for _, _, filenames in safe_walk(self.push_db_basedir):
files.extend(filenames)
self.assertEquals(0, len(files),
"Nothing should be written to the pushdb during a local publish")
Expand All @@ -179,7 +179,7 @@ def test_publish_remote(self):

# One file per task is written to the pushdb during a local publish
files = []
for _, _, filenames in os.walk(self.push_db_basedir):
for _, _, filenames in safe_walk(self.push_db_basedir):
files.extend(filenames)
self.assertEquals(len(targets), len(files),
"During a remote publish, one pushdb should be written per target")
Expand Down
1 change: 1 addition & 0 deletions tests/python/pants_test/util/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@ python_tests(
'3rdparty/python:mox',
'3rdparty/python:pytest',
'src/python/pants/util:dirutil',
'src/python/pants/util:contextutil',
]
)
11 changes: 10 additions & 1 deletion tests/python/pants_test/util/test_dirutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
import unittest2 as unittest

from pants.util import dirutil
from pants.util.dirutil import _mkdtemp_unregister_cleaner
from pants.util.contextutil import temporary_dir
from pants.util.dirutil import _mkdtemp_unregister_cleaner, safe_mkdir


class DirutilTest(unittest.TestCase):
Expand Down Expand Up @@ -59,3 +60,11 @@ def faux_cleaner():
dirutil._mkdtemp_unregister_cleaner()

self._mox.VerifyAll()

def test_safe_walk(self):
with temporary_dir() as tmpdir:
safe_mkdir(os.path.join(tmpdir, '中文'))
if isinstance(tmpdir, unicode):
tmpdir = tmpdir.encode('utf-8')
for _, dirs, _ in dirutil.safe_walk(tmpdir):
self.assertTrue(all(isinstance(dirname, unicode) for dirname in dirs))

0 comments on commit eaa2fc5

Please sign in to comment.