Skip to content

Commit

Permalink
Faster coalesce_targets
Browse files Browse the repository at this point in the history
Using a linear-time BFS to sort targets topologically and group them by type, instead of the quadratic-time greedy rearrangement algorithm.

Also
- splitting `sort_targets` into `invert_dependencies` + `sort_targets`.
- changing a test to check for both correct answers

Testing Done:
./pants test tests/python/pants_test/tasks:group_task

Green CI: https://travis-ci.org/pantsbuild/pants/builds/69215674

I used this command for benchmarking:

./pants compile src/scala/com/twitter/ads/batch/job/targeting/audience_extension/profile:audience-extension-profile-deploy --config-override=pants.ini.nocache

Bugs closed: 1733

Reviewed at https://rbcommons.com/s/twitter/r/2413/
  • Loading branch information
megaserg authored and stuhood committed Jul 6, 2015
1 parent fe454fe commit 63cc6f4
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 72 deletions.
118 changes: 69 additions & 49 deletions src/python/pants/backend/core/tasks/group_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@

import os
from abc import abstractmethod, abstractproperty
from collections import defaultdict

from six.moves import range
from collections import defaultdict, deque

from pants.backend.core.tasks.task import Task, TaskBase
from pants.base.build_graph import sort_targets
from pants.base.build_graph import invert_dependencies
from pants.base.workunit import WorkUnit
from pants.goal.goal import Goal

Expand Down Expand Up @@ -77,58 +75,73 @@ class GroupIterator(object):

@staticmethod
def coalesce_targets(targets, discriminator):
  """Returns a list of Targets that `targets` depend on sorted from least dependent to most.

  The targets are grouped where possible by target type as categorized by the given
  discriminator.

  This algorithm was historically known as the "bang" algorithm from a time when it was
  optionally enabled by appending a '!' (bang) to the command line target.

  :param targets: The root targets whose transitive dependency graph should be ordered.
  :param discriminator: A function from target to its "type" (group member); targets for
    which it returns a falsy value are dropped from the result.
  :returns: A topologically sorted list of the dependency graph's targets, grouped by type
    where the dependency constraints allow it.
  """
  # We want to sort targets topologically, grouping targets of the same type where possible.
  #
  # Algorithm: BFS (Kahn-style) topological sort over the dependency graph, with a separate
  # queue per target type. First enqueue the least dependent targets (the roots). Pick a
  # type with a non-empty queue and drain that queue before moving on to another type, so
  # that consecutive targets of one type coalesce into a single chunk. Processing a target
  # appends it to the result and "satisfies" one dependency for each of its direct
  # dependees; a dependee whose dependencies are all satisfied is enqueued under its own
  # type. Since the graph is acyclic, every target is eventually processed.
  #
  # This linear-complexity algorithm replaces a worst-case-quadratic one that did a DFS
  # topological sort and then repeatedly rearranged the resulting list, computing full
  # dependency closures as it went. Benchmarked on a ~1K-node graph (2013 MBP with SSD):
  # ~1 second for this implementation versus ~18 seconds for the quadratic one.
  roots, inverted_deps = invert_dependencies(targets)

  queues = defaultdict(deque)
  queues_total_size = 0

  # Seed the per-type queues with the roots (targets having no dependencies).
  for root in roots:
    queues[discriminator(root)].append(root)
    queues_total_size += 1

  sorted_targets = []
  satisfied_deps = defaultdict(int)
  current_type = None

  # Loop until every enqueued target has been processed.
  while queues_total_size > 0:
    # Choose a type with a non-empty queue.
    for potential_type in queues.keys():
      if queues[potential_type]:
        current_type = potential_type
        break

    # Drain all targets of this type - they will form a single chunk.
    while queues[current_type]:
      target = queues[current_type].popleft()
      queues_total_size -= 1
      sorted_targets.append(target)

      # Let the dependees know that one more of their dependencies is satisfied.
      # NB: the membership test avoids materializing empty entries in the defaultdict.
      if target in inverted_deps:
        for dependee in inverted_deps[target]:
          satisfied_deps[dependee] += 1
          # Once all of a dependee's dependencies are satisfied, enqueue it under its type.
          if satisfied_deps[dependee] == len(dependee.dependencies):
            queues[discriminator(dependee)].append(dependee)
            queues_total_size += 1

  # Remove targets that are not claimed by any member. A list comprehension (rather than
  # filter()) keeps the documented list return type under both python 2 and 3.
  return [target for target in sorted_targets if discriminator(target)]

Expand All @@ -149,13 +162,20 @@ def __iter__(self):
yield group_member, chunk

def _create_chunks(self):
# memoized mapping from target to its type (i.e. member)
target_to_member = dict()

def discriminator(tgt):
if tgt in target_to_member:
return target_to_member[tgt]
for member in self._group_members:
if member.select(tgt):
target_to_member[tgt] = member
return member
target_to_member[tgt] = None
return None

coalesced = list(reversed(self.coalesce_targets(self._targets, discriminator)))
coalesced = self.coalesce_targets(self._targets, discriminator)

chunks = []

Expand Down
23 changes: 15 additions & 8 deletions src/python/pants/base/build_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,10 +409,9 @@ def __init__(self, cycle):
' ->\n\t'.join(target.address.spec for target in cycle)
))


def sort_targets(targets):
""":return: the targets that targets depend on sorted from most dependent to least."""
roots = OrderedSet()
def invert_dependencies(targets):
""":return: the full graph of dependencies for `targets` and the list of roots."""
roots = set()
inverted_deps = defaultdict(OrderedSet) # target -> dependent targets
visited = set()
path = OrderedSet()
Expand All @@ -426,18 +425,26 @@ def invert(target):
path.add(target)
if target not in visited:
visited.add(target)
for dependency in target.dependencies:
inverted_deps[dependency].add(target)
invert(dependency)
if target.dependencies:
for dependency in target.dependencies:
inverted_deps[dependency].add(target)
invert(dependency)
else:
roots.add(target)

path.remove(target)

for target in targets:
invert(target)

return roots, inverted_deps

def sort_targets(targets):
""":return: the targets that `targets` depend on sorted from most dependent to least."""

roots, inverted_deps = invert_dependencies(targets)
ordered = []
visited.clear()
visited = set()

def topological_sort(target):
if target not in visited:
Expand Down
29 changes: 14 additions & 15 deletions tests/python/pants_test/tasks/test_group_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,21 +82,20 @@ def test(self):
chunks = self.iterate(c_red, a_green)
self.assertEqual(4, len(chunks))

group_member, targets = chunks[0]
self.assertEqual(self.red, type(group_member))
self.assertEqual({a_red}, set(targets))

group_member, targets = chunks[1]
self.assertEqual(self.blue, type(group_member))
self.assertEqual({a_blue}, set(targets))

group_member, targets = chunks[2]
self.assertEqual(self.green, type(group_member))
self.assertEqual({a_green}, set(targets))

group_member, targets = chunks[3]
self.assertEqual(self.red, type(group_member))
self.assertEqual({b_red, c_red}, set(targets))
group_members, target_lists = zip(*chunks)
group_member_types = [type(group_member) for group_member in group_members]
target_sets = [set(target_list) for target_list in target_lists]

# There are two possible topological orders, both correct.
first_possible_group_member_types = [self.red, self.blue, self.red, self.green]
first_possible_target_sets = [{a_red}, {a_blue}, {b_red, c_red}, {a_green}]
second_possible_group_member_types = [self.red, self.blue, self.green, self.red]
second_possible_target_sets = [{a_red}, {a_blue}, {a_green}, {b_red, c_red}]

self.assertIn(
(group_member_types, target_sets),
[(first_possible_group_member_types, first_possible_target_sets),
(second_possible_group_member_types, second_possible_target_sets)])


class BaseGroupTaskTest(BaseTest):
Expand Down

0 comments on commit 63cc6f4

Please sign in to comment.