Skip to content

Commit

Permalink
Faster coalesce_targets in GroupTask
Browse files Browse the repository at this point in the history
Using a linear-time BFS to sort targets topologically and group them by type, instead of the quadratic-time greedy rearrangement algorithm.

Also
- splitting sort_targets into invert_dependencies+sort_targets.
- changing a test because there are multiple correct answers, but the test only checks for one

Testing Done:
./pants test tests/python/pants_test/tasks:group_task

Green CI: https://travis-ci.org/pantsbuild/pants/builds/68883423

I used this command for benchmarking:

./pants compile src/scala/com/twitter/ads/batch/job/targeting/audience_extension/profile:audience-extension-profile-deploy --config-override=pants.ini.nocache

Bugs closed: 1733

Reviewed at https://rbcommons.com/s/twitter/r/2413/
  • Loading branch information
megaserg authored and stuhood committed Jun 29, 2015
1 parent aedb181 commit c9cdaea
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 61 deletions.
118 changes: 69 additions & 49 deletions src/python/pants/backend/core/tasks/group_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@

import os
from abc import abstractmethod, abstractproperty
from collections import defaultdict

from six.moves import range
from collections import defaultdict, deque

from pants.backend.core.tasks.task import Task, TaskBase
from pants.base.build_graph import sort_targets
from pants.base.build_graph import invert_dependencies
from pants.base.workunit import WorkUnit
from pants.goal.goal import Goal

Expand Down Expand Up @@ -77,58 +75,73 @@ class GroupIterator(object):

@staticmethod
def coalesce_targets(targets, discriminator):
  """Returns a list of Targets that `targets` depend on sorted from least dependent to most.

  The targets are grouped where possible by target type as categorized by the given
  discriminator.

  This algorithm was historically known as the "bang" algorithm from a time when it was
  optionally enabled by appending a '!' (bang) to the command line target.

  :param targets: The root targets whose dependency closure should be ordered.
  :param discriminator: A function mapping a target to its group type, or a falsy value when
    the target is claimed by no group.
  :returns: The topologically sorted dependency closure of `targets`, restricted to targets
    claimed by some group, with same-type targets grouped into runs wherever the dependency
    order allows.
  """
  # We want to sort targets topologically, grouping targets of the same type when possible.
  #
  # Algorithm: Kahn-style BFS on the dependency graph with a separate queue per type.
  # First, enqueue the least dependent targets (the roots).  Then repeatedly choose a type
  # with a non-empty queue and drain it: each processed target is appended to the result and
  # "satisfies" one dependency of each of its direct dependees; a dependee with all of its
  # dependencies satisfied is enqueued under its own type.  Since the graph is acyclic,
  # every target is eventually processed.
  #
  # This linear-complexity algorithm replaces a worst-case-quadratic one that topologically
  # sorted via DFS and then greedily rearranged the resulting list, repeatedly computing
  # full dependency closures.  Benchmarked with "./pants compile" on a target with ~1K nodes
  # in the dependency graph (2013 MBP with SSD): the quadratic implementation took about
  # 18 seconds on average, this one about 1 second.
  roots, inverted_deps = invert_dependencies(targets)

  queues = defaultdict(deque)
  queues_total_size = 0

  # Seed the per-type queues with the roots (targets having no dependencies).
  for root in roots:
    queues[discriminator(root)].append(root)
    queues_total_size += 1

  sorted_targets = []
  satisfied_deps = defaultdict(int)
  current_type = None

  # Loop until every enqueued target has been processed.
  while queues_total_size > 0:
    # Choose a type with a non-empty queue.
    for potential_type in queues.keys():
      if queues[potential_type]:
        current_type = potential_type
        break

    # Process targets of this type while possible - they will form a single chunk.
    while queues[current_type]:
      target = queues[current_type].popleft()
      queues_total_size -= 1
      sorted_targets.append(target)

      # Let the dependees know that one more dependency is satisfied.
      for dependee in inverted_deps.get(target, ()):
        satisfied_deps[dependee] += 1
        # Does the dependee have all its dependencies satisfied now?
        if satisfied_deps[dependee] == len(dependee.dependencies):
          queues[discriminator(dependee)].append(dependee)
          queues_total_size += 1

  # Remove targets that are not claimed by any group member.  Materialize as a list so the
  # documented "returns a list" contract holds on both Python 2 and 3.
  return [target for target in sorted_targets if discriminator(target)]

Expand All @@ -149,13 +162,20 @@ def __iter__(self):
yield group_member, chunk

def _create_chunks(self):
# memoized mapping from target to its type (i.e. member)
target_to_member = dict()

def discriminator(tgt):
if tgt in target_to_member:
return target_to_member[tgt]
for member in self._group_members:
if member.select(tgt):
target_to_member[tgt] = member
return member
target_to_member[tgt] = None
return None

coalesced = list(reversed(self.coalesce_targets(self._targets, discriminator)))
coalesced = self.coalesce_targets(self._targets, discriminator)

chunks = []

Expand Down
23 changes: 15 additions & 8 deletions src/python/pants/base/build_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,10 +409,9 @@ def __init__(self, cycle):
' ->\n\t'.join(target.address.spec for target in cycle)
))


def sort_targets(targets):
""":return: the targets that targets depend on sorted from most dependent to least."""
roots = OrderedSet()
def invert_dependencies(targets):
""":return: the full graph of dependencies for `targets` and the list of roots."""
roots = set()
inverted_deps = defaultdict(OrderedSet) # target -> dependent targets
visited = set()
path = OrderedSet()
Expand All @@ -426,18 +425,26 @@ def invert(target):
path.add(target)
if target not in visited:
visited.add(target)
for dependency in target.dependencies:
inverted_deps[dependency].add(target)
invert(dependency)
if target.dependencies:
for dependency in target.dependencies:
inverted_deps[dependency].add(target)
invert(dependency)
else:
roots.add(target)

path.remove(target)

for target in targets:
invert(target)

return roots, inverted_deps

def sort_targets(targets):
""":return: the targets that `targets` depend on sorted from most dependent to least."""

roots, inverted_deps = invert_dependencies(targets)
ordered = []
visited.clear()
visited = set()

def topological_sort(target):
if target not in visited:
Expand Down
8 changes: 4 additions & 4 deletions tests/python/pants_test/tasks/test_group_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,13 @@ def test(self):
self.assertEqual({a_blue}, set(targets))

group_member, targets = chunks[2]
self.assertEqual(self.green, type(group_member))
self.assertEqual({a_green}, set(targets))

group_member, targets = chunks[3]
self.assertEqual(self.red, type(group_member))
self.assertEqual({b_red, c_red}, set(targets))

group_member, targets = chunks[3]
self.assertEqual(self.green, type(group_member))
self.assertEqual({a_green}, set(targets))


class BaseGroupTaskTest(BaseTest):
def create_targets(self):
Expand Down

0 comments on commit c9cdaea

Please sign in to comment.