Skip to content

Commit

Permalink
Remove the global codegen strategy from simple_codegen_task
Browse files Browse the repository at this point in the history
We had noticed a few issues with versioned target invalidation, first in antlr and then in the newly ported scrooge task. Now that all consumers are ported to isolated codegen, we can fix and prevent those issues by switching away from the manual caching strategy.

- Drop codegen strategies entirely
- Detect duplicate sources using the synthetic targets, rather than re-scanning the workdirs
- Prune duplicate sources from disk when they are "allowed", to avoid having to deal with them in subsequent incremental runs
- Deprecate but preserve the strategy flag
- Enable `cache_target_dirs` for simpler/more-correct caching/invalidation
- In test_simple_codegen_task: move DummyTask and DummyTarget to top level classes in order to add the `ensure_cached` annotation

Testing Done:
pantsbuild#2386

For a context containing 1549 gen targets, the time to execute the task improves signficantly:

    Clean:
      Before: 777 seconds
      After: 220 seconds
    Noop:
      Before: 25 seconds
      After: 12 seconds

Bugs closed: 2337, 2386

Reviewed at https://rbcommons.com/s/twitter/r/2985/
  • Loading branch information
stuhood committed Oct 18, 2015
1 parent 86ff14d commit 8f89227
Show file tree
Hide file tree
Showing 19 changed files with 544 additions and 959 deletions.
140 changes: 47 additions & 93 deletions contrib/scrooge/src/python/pants/contrib/scrooge/tasks/scrooge_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,7 @@
class ScroogeGen(SimpleCodegenTask, NailgunTask):

DepInfo = namedtuple('DepInfo', ['service', 'structs'])

class PartialCmd(namedtuple('PC', ['language', 'rpc_style', 'namespace_map'])):

@property
def relative_outdir(self):
namespace_sig = None
if self.namespace_map:
sha = hashlib.sha1()
for ns_from, ns_to in sorted(self.namespace_map):
sha.update(ns_from)
sha.update(ns_to)
namespace_sig = sha.hexdigest()
output_style = '-'.join(filter(None, (self.language, self.rpc_style, namespace_sig)))
return output_style
PartialCmd = namedtuple('PartialCmd', ['language', 'rpc_style', 'namespace_map'])

@classmethod
def register_options(cls, register):
Expand Down Expand Up @@ -92,9 +79,6 @@ def _tempname(self):
os.close(fd)
return path

def _outdir(self, target):
return os.path.join(self.workdir, self.codegen_strategy.codegen_workdir_suffix(target))

def _resolve_deps(self, depmap):
"""Given a map of gen-key=>target specs, resolves the target specs into references."""
deps = defaultdict(lambda: OrderedSet())
Expand All @@ -112,91 +96,61 @@ def _resolve_deps(self, depmap):
))
return deps

def execute_codegen(self, invalid_targets):
self._validate_compiler_configs(invalid_targets)
self._must_have_sources(invalid_targets)

gentargets_by_dependee = self.context.dependents(
on_predicate=self.is_gentarget,
from_predicate=lambda t: not self.is_gentarget(t))

dependees_by_gentarget = defaultdict(set)
for dependee, tgts in gentargets_by_dependee.items():
for gentarget in tgts:
dependees_by_gentarget[gentarget].add(dependee)
def execute_codegen(self, target, target_workdir):
self._validate_compiler_configs([target])
self._must_have_sources(target)

partial_cmds = defaultdict(set)
gentargets = filter(self.is_gentarget, invalid_targets)
if not gentargets:
return

for target in gentargets:
language = self._thrift_defaults.language(target)
rpc_style = self._thrift_defaults.rpc_style(target)
partial_cmd = self.PartialCmd(
language=language,
rpc_style=rpc_style,
namespace_map=tuple(sorted(target.namespace_map.items()) if target.namespace_map else ()))
partial_cmds[partial_cmd].add(target)

for partial_cmd, tgts in partial_cmds.items():
self.gen(partial_cmd, tgts)

def gen(self, partial_cmd, invalid_targets):
language = self._thrift_defaults.language(target)
rpc_style = self._thrift_defaults.rpc_style(target)
partial_cmd = self.PartialCmd(
language=language,
rpc_style=rpc_style,
namespace_map=tuple(sorted(target.namespace_map.items()) if target.namespace_map else ()))

for vt in invalid_targets:
outdir = self.codegen_workdir(vt)
import_paths, dummy_changed_srcs = calculate_compile_sources(invalid_targets, self.is_gentarget)
changed_srcs = vt.sources_relative_to_buildroot()
self.gen(partial_cmd, target, target_workdir)

if changed_srcs:
args = []
def gen(self, partial_cmd, target, target_workdir):
import_paths, _ = calculate_compile_sources([target], self.is_gentarget)

for import_path in import_paths:
args.extend(['--import-path', import_path])
args = []

args.extend(['--language', partial_cmd.language])
for import_path in import_paths:
args.extend(['--import-path', import_path])

for lhs, rhs in partial_cmd.namespace_map:
args.extend(['--namespace-map', '%s=%s' % (lhs, rhs)])
args.extend(['--language', partial_cmd.language])

if partial_cmd.rpc_style == 'ostrich':
args.append('--finagle')
args.append('--ostrich')
elif partial_cmd.rpc_style == 'finagle':
args.append('--finagle')
for lhs, rhs in partial_cmd.namespace_map:
args.extend(['--namespace-map', '%s=%s' % (lhs, rhs)])

args.extend(['--dest', outdir])
safe_mkdir(outdir)
if partial_cmd.rpc_style == 'ostrich':
args.append('--finagle')
args.append('--ostrich')
elif partial_cmd.rpc_style == 'finagle':
args.append('--finagle')

if not self.get_options().strict:
args.append('--disable-strict')
args.extend(['--dest', target_workdir])

if self.get_options().verbose:
args.append('--verbose')
if not self.get_options().strict:
args.append('--disable-strict')

gen_file_map_path = os.path.relpath(self._tempname())
args.extend(['--gen-file-map', gen_file_map_path])
if self.get_options().verbose:
args.append('--verbose')

args.extend(changed_srcs)
gen_file_map_path = os.path.relpath(self._tempname())
args.extend(['--gen-file-map', gen_file_map_path])

classpath = self.tool_classpath('scrooge-gen')
jvm_options = list(self.get_options().jvm_options)
jvm_options.append('-Dfile.encoding=UTF-8')
returncode = self.runjava(classpath=classpath,
main='com.twitter.scrooge.Main',
jvm_options=jvm_options,
args=args,
workunit_name='scrooge-gen')
try:
# TODO: This entire block appears to have no effect.
if 0 == returncode:
self.parse_gen_file_map(gen_file_map_path, outdir)
finally:
os.remove(gen_file_map_path)
args.extend(target.sources_relative_to_buildroot())

if 0 != returncode:
raise TaskError('Scrooge compiler exited non-zero ({0})'.format(returncode))
classpath = self.tool_classpath('scrooge-gen')
jvm_options = list(self.get_options().jvm_options)
jvm_options.append('-Dfile.encoding=UTF-8')
returncode = self.runjava(classpath=classpath,
main='com.twitter.scrooge.Main',
jvm_options=jvm_options,
args=args,
workunit_name='scrooge-gen')
if 0 != returncode:
raise TaskError('Scrooge compiler exited non-zero for {} ({})'.format(target, returncode))

SERVICE_PARSER = re.compile(r'^\s*service\s+(?:[^\s{]+)')

Expand Down Expand Up @@ -229,6 +183,7 @@ def is_gentarget(self, target):
return True

def _validate_compiler_configs(self, targets):
assert len(targets) == 1, ("TODO: This method now only ever receives one target. Simplify.")
ValidateCompilerConfig = namedtuple('ValidateCompilerConfig', ['language', 'rpc_style'])

def compiler_config(tgt):
Expand Down Expand Up @@ -259,16 +214,15 @@ def collect(dep):
msg.append(' %s - %s\n' % (dep, compiler_config(dep)))
raise TaskError(''.join(msg))

def _must_have_sources(self, targets):
for target in targets:
if isinstance(target, JavaThriftLibrary) and not target.payload.sources.source_paths:
raise TargetDefinitionException(target, 'no thrift files found')
def _must_have_sources(self, target):
if isinstance(target, JavaThriftLibrary) and not target.payload.sources.source_paths:
raise TargetDefinitionException(target, 'no thrift files found')

def synthetic_target_type(self, target):
language = self._thrift_defaults.language(target)
return _TARGET_TYPE_FOR_LANG[language]

def synthetic_target_extra_dependencies(self, target):
def synthetic_target_extra_dependencies(self, target, target_workdir):
has_service = False
for source in target.sources_relative_to_buildroot():
has_service = has_service or self._declares_service(source)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,17 @@ def _test_help(self, build_string, library_type, sources):

saved_add_new_target = Context.add_new_target
try:
Context.add_new_target = MagicMock()
mock = MagicMock()
Context.add_new_target = mock
task.execute()
address = task.get_synthetic_address(target)

Context.add_new_target.assert_called_once_with(address=address,
target_type=library_type,
dependencies=OrderedSet(),
provides=None,
sources=[],
derived_from=target)

self.assertEquals(1, mock.call_count)
_, call_kwargs = mock.call_args
self.assertEquals(call_kwargs['target_type'], library_type)
self.assertEquals(call_kwargs['dependencies'], OrderedSet())
self.assertEquals(call_kwargs['provides'], None)
self.assertEquals(call_kwargs['sources'], [])
self.assertEquals(call_kwargs['derived_from'], target)

finally:
Context.add_new_target = saved_add_new_target
100 changes: 49 additions & 51 deletions src/python/pants/backend/codegen/tasks/antlr_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pants.backend.jvm.targets.java_library import JavaLibrary
from pants.backend.jvm.tasks.nailgun_task import NailgunTask
from pants.base.exceptions import TaskError
from pants.util.dirutil import safe_walk


logger = logging.getLogger(__name__)
Expand All @@ -36,10 +37,9 @@ class AntlrGen(SimpleCodegenTask, NailgunTask):
class AmbiguousPackageError(TaskError):
"""Raised when a java package cannot be unambiguously determined for a JavaAntlrLibrary."""

class AntlrIsolatedCodegenStrategy(SimpleCodegenTask.IsolatedCodegenStrategy):
def find_sources(self, target):
sources = super(AntlrGen.AntlrIsolatedCodegenStrategy, self).find_sources(target)
return [source for source in sources if source.endswith('.java')]
def find_sources(self, target, target_dir):
sources = super(AntlrGen, self).find_sources(target, target_dir)
return [source for source in sources if source.endswith('.java')]

@classmethod
def register_options(cls, register):
Expand All @@ -53,42 +53,36 @@ def is_gentarget(self, target):
def synthetic_target_type(self, target):
return JavaLibrary

@classmethod
def supported_strategy_types(cls):
return [cls.AntlrIsolatedCodegenStrategy]

def execute_codegen(self, targets):
for target in targets:
args = ['-o', self.codegen_workdir(target)]
compiler = target.compiler
if compiler == 'antlr3':
if target.package is not None:
logger.warn("The 'package' attribute is not supported for antlr3 and will be ignored.")
java_main = 'org.antlr.Tool'
elif compiler == 'antlr4':
args.append('-visitor') # Generate Parse Tree Visitor As Well
# Note that this assumes that there is no package set in the antlr file itself,
# which is considered an ANTLR best practice.
args.append('-package')
if target.package is None:
args.append(self._get_sources_package(target))
else:
args.append(target.package)
java_main = 'org.antlr.v4.Tool'

antlr_classpath = self.tool_classpath(compiler)
sources = self._calculate_sources([target])
args.extend(sources)
result = self.runjava(classpath=antlr_classpath, main=java_main, args=args,
workunit_name='antlr')
if result != 0:
raise TaskError('java {} ... exited non-zero ({})'.format(java_main, result))

if compiler == 'antlr3':
for source in list(self.codegen_strategy.find_sources(target)):
self._scrub_generated_timestamp(source)

def synthetic_target_extra_dependencies(self, target):
def execute_codegen(self, target, target_workdir):
args = ['-o', target_workdir]
compiler = target.compiler
if compiler == 'antlr3':
if target.package is not None:
logger.warn("The 'package' attribute is not supported for antlr3 and will be ignored.")
java_main = 'org.antlr.Tool'
elif compiler == 'antlr4':
args.append('-visitor') # Generate Parse Tree Visitor As Well
# Note that this assumes that there is no package set in the antlr file itself,
# which is considered an ANTLR best practice.
args.append('-package')
if target.package is None:
args.append(self._get_sources_package(target))
else:
args.append(target.package)
java_main = 'org.antlr.v4.Tool'

antlr_classpath = self.tool_classpath(compiler)
sources = self._calculate_sources([target])
args.extend(sources)
result = self.runjava(classpath=antlr_classpath, main=java_main, args=args,
workunit_name='antlr')
if result != 0:
raise TaskError('java {} ... exited non-zero ({})'.format(java_main, result))

if compiler == 'antlr3':
self._scrub_generated_timestamps(target_workdir)

def synthetic_target_extra_dependencies(self, target, target_workdir):
# Fetch the right java dependency from the target's compiler option
compiler_classpath_spec = self.get_options()[target.compiler]
return self.resolve_deps([compiler_classpath_spec])
Expand All @@ -115,14 +109,18 @@ def collect_sources(target):

_COMMENT_WITH_TIMESTAMP_RE = re.compile('^//.*\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d')

def _scrub_generated_timestamp(self, source):
# Removes the first line of comment if it contains a timestamp.
with open(source) as f:
lines = f.readlines()
if len(lines) < 1:
return
with open(source, 'w') as f:
if not self._COMMENT_WITH_TIMESTAMP_RE.match(lines[0]):
f.write(lines[0])
for line in lines[1:]:
f.write(line)
def _scrub_generated_timestamps(self, target_workdir):
"""Remove the first line of comment from each file if it contains a timestamp."""
for root, _, filenames in safe_walk(target_workdir):
for filename in filenames:
source = os.path.join(root, filename)

with open(source) as f:
lines = f.readlines()
if len(lines) < 1:
return
with open(source, 'w') as f:
if not self._COMMENT_WITH_TIMESTAMP_RE.match(lines[0]):
f.write(lines[0])
for line in lines[1:]:
f.write(line)
19 changes: 8 additions & 11 deletions src/python/pants/backend/codegen/tasks/apache_thrift_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def _declares_service(self, source):
with open(source) as thrift:
return any(line for line in thrift if self.SERVICE_PARSER.search(line))

def synthetic_target_extra_dependencies(self, target):
def synthetic_target_extra_dependencies(self, target, target_workdir):
for source in target.sources_relative_to_buildroot():
if self._declares_service(os.path.join(get_buildroot(), source)):
return self._service_deps
Expand All @@ -116,16 +116,14 @@ def _thrift_cmd(self):
cmd.append('-verbose')
return cmd

def _generate_thrift(self, target):
def _generate_thrift(self, target, target_workdir):
target_cmd = self._thrift_cmd[:]

bases = OrderedSet(tgt.target_base for tgt in target.closure() if self.is_gentarget(tgt))
for base in bases:
target_cmd.extend(('-I', base))

work_dir = self.codegen_workdir(target)
safe_mkdir(work_dir, clean=True)
target_cmd.extend(('-o', work_dir))
target_cmd.extend(('-o', target_workdir))

for source in target.sources_relative_to_buildroot():
cmd = target_cmd[:]
Expand All @@ -140,13 +138,12 @@ def _generate_thrift(self, target):
raise TaskError('{} ... exited non-zero ({})'.format(self._thrift_binary, result))

# The thrift compiler generates sources to a gen-[lang] subdir of the `-o` argument. We
# relocate the generated java sources to the root of the `work_dir` so that our base class
# relocate the generated java sources to the root of the `target_workdir` so that our base class
# maps them properly for source jars and the like.
gen_dir = os.path.join(work_dir, 'gen-java')
gen_dir = os.path.join(target_workdir, 'gen-java')
for path in os.listdir(gen_dir):
shutil.move(os.path.join(gen_dir, path), work_dir)
shutil.move(os.path.join(gen_dir, path), target_workdir)
os.rmdir(gen_dir)

def execute_codegen(self, invalid_targets):
for target in invalid_targets:
self._generate_thrift(target)
def execute_codegen(self, target, target_workdir):
self._generate_thrift(target, target_workdir)
Loading

0 comments on commit 8f89227

Please sign in to comment.