Bug 1619554 Improve performance of |mach try fuzzy| preview r=ahal
Remove use of the requests module in the preview pane.

Reformat task duration data to avoid reprocessing in the preview pane.

Avoid loading the task durations JSON more than once.

Increase the required fzf version; pass selections through a temporary file instead of the argument list.

Differential Revision: https://phabricator.services.mozilla.com/D65094

--HG--
extra : moz-landing-system : lando
Simon Fraser committed Mar 5, 2020
1 parent 01d1683 commit 54f21ea
Showing 5 changed files with 141 additions and 119 deletions.
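
Of these changes, the task-duration reformat is the one aimed squarely at the preview pane's hot path: the history file switches from a list of records to a flat name-to-seconds mapping, so each preview render can look a duration up directly instead of rebuilding a dict first. A minimal sketch of the two shapes (the task names are illustrative, not from the commit):

    import json

    # Old shape: a list of dicts, reprocessed on every preview render.
    old = [
        {"name": "build-linux64/opt", "mean_duration_seconds": 1520.0},
        {"name": "test-linux64/opt-mochitest-1", "mean_duration_seconds": 870.0},
    ]

    # New shape: a single mapping, written once by download_task_history_data().
    new = {d["name"]: d["mean_duration_seconds"] for d in old}
    assert new["build-linux64/opt"] == 1520.0
    print(json.dumps(new, indent=4))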
tools/tryselect/push.py (5 changes: 2 additions & 3 deletions)
@@ -11,12 +11,11 @@
 from mozboot.util import get_state_dir
 from mozbuild.base import MozbuildObject
 from mozversioncontrol import get_repository_object, MissingVCSExtension
-from .util.estimates import (
-    duration_summary,
+from .util.manage_estimates import (
     download_task_history_data,
     make_trimmed_taskgraph_cache
 )

+from .util.estimates import duration_summary

 GIT_CINNABAR_NOT_FOUND = """
 Could not detect `git-cinnabar`.
tools/tryselect/selectors/fuzzy.py (14 changes: 7 additions & 7 deletions)
@@ -20,7 +20,7 @@
 from ..cli import BaseTryParser
 from ..tasks import generate_tasks, filter_tasks_by_paths
 from ..push import check_working_directory, push_to_try, generate_try_task_config
-from ..util.estimates import download_task_history_data, make_trimmed_taskgraph_cache
+from ..util.manage_estimates import download_task_history_data, make_trimmed_taskgraph_cache

 terminal = Terminal()

@@ -61,7 +61,7 @@
 FZF_VERSION_FAILED = """
 Could not obtain the 'fzf' version.
-The 'mach try fuzzy' command depends on fzf, and requires version > 0.18.0
+The 'mach try fuzzy' command depends on fzf, and requires version > 0.20.0
 for some of the features. Please install it following the appropriate
 instructions for your platform:
@@ -202,9 +202,9 @@ def should_force_fzf_update(fzf_bin):
     # Some fzf versions have extra, e.g 0.18.0 (ff95134)
     fzf_version = fzf_version.split()[0]

-    # 0.18.0 introduced FZF_PREVIEW_COLUMNS as an env variable
-    # in preview subprocesses, which is a feature we use.
-    if StrictVersion(fzf_version) < StrictVersion('0.18.0'):
+    # 0.20.0 introduced passing selections through a temporary file,
+    # which is good for large ctrl-a actions.
+    if StrictVersion(fzf_version) < StrictVersion('0.20.0'):
         print("fzf version is old, forcing update.")
         return True
     return False
@@ -341,12 +341,12 @@ def run(update=False, query=None, intersect_query=None, try_config=None, full=Fa

     if show_estimates:
         base_cmd.extend([
-            '--preview', 'python {} -g {} -s -c {} "{{+}}"'.format(
+            '--preview', 'python {} -g {} -s -c {} -t "{{+f}}"'.format(
                 PREVIEW_SCRIPT, dep_cache, cache_dir),
         ])
     else:
         base_cmd.extend([
-            '--preview', 'python {} "{{+}}"'.format(PREVIEW_SCRIPT),
+            '--preview', 'python {} -t "{{+f}}"'.format(PREVIEW_SCRIPT),
         ])

     if exact:
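
With fzf 0.20.0, the "{+f}" placeholder makes fzf write the current selection to a temporary file and substitute that file's path into the preview command, so a large ctrl-a selection no longer has to fit through the argument list. A rough sketch of the mechanism (the selection and script path are hypothetical):

    import subprocess
    import tempfile

    # Stand-in for what fzf does when it expands "{+f}":
    selected = ["test/mochitest-1", "test/mochitest-2"]  # hypothetical selection
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as tmp:
        tmp.write("\n".join(selected))  # one selected entry per line

    # fzf then runs the preview command with the file's path substituted in:
    subprocess.call(["python", "preview.py", "-t", tmp.name])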
tools/tryselect/selectors/preview.py (22 changes: 13 additions & 9 deletions)
@@ -12,16 +12,20 @@

 here = os.path.abspath(os.path.dirname(__file__))
 sys.path.insert(0, os.path.join(os.path.dirname(here), 'util'))
-from estimates import duration_summary, task_duration_data
+from estimates import duration_summary


 def process_args():
     """Process preview arguments."""
     argparser = argparse.ArgumentParser()
-    argparser.add_argument('-s', '--show-estimates', action="store_true")
-    argparser.add_argument('-g', '--graph-cache', type=str, default=None)
-    argparser.add_argument('-c', '--cache_dir', type=str, default=None)
-    argparser.add_argument('tasklist', type=str)
+    argparser.add_argument('-s', '--show-estimates', action="store_true",
+                           help="Show task duration estimates (default: False)")
+    argparser.add_argument('-g', '--graph-cache', type=str, default=None,
+                           help="Filename of task graph dependencies")
+    argparser.add_argument('-c', '--cache_dir', type=str, default=None,
+                           help="Path to cache directory containing task durations")
+    argparser.add_argument('-t', '--tasklist', type=str, default=None,
+                           help="Path to temporary file containing the selected tasks")
     return argparser.parse_args()


@@ -30,9 +34,10 @@ def plain_display(tasklist):
     print("\n".join(sorted(s.strip("'") for s in tasklist.split())))


-def duration_display(graph_cache_file, tasklist, cache_dir):
+def duration_display(graph_cache_file, taskfile, cache_dir):
     """Preview window display with task durations + metadata."""
-    tasklist = [t.strip("'") for t in tasklist.split()]
+    with open(taskfile, "r") as f:
+        tasklist = [line.strip() for line in f]

     durations = duration_summary(graph_cache_file, tasklist, cache_dir)
     output = ""
@@ -51,10 +56,9 @@ def duration_display(graph_cache_file, tasklist, cache_dir):
                           durations["eta_datetime"].strftime("%H:%M"))

     duration_width = 5  # show five numbers at most.
-    task_durations = task_duration_data(cache_dir)
     output += "{:>{width}}\n".format("Duration", width=max_columns)
     for task in tasklist:
-        duration = int(task_durations.get(task, 0.0))
+        duration = durations["task_durations"].get(task, 0.0)
         output += "{:{align}{width}} {:{nalign}{nwidth}}s\n".format(
             task,
             duration,
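
Because the preview script is a plain argparse program, the new interface can also be exercised by hand; a hedged example invocation (the file paths are placeholders):

    python tools/tryselect/selectors/preview.py -s -g dep_cache.json \
        -c ~/.mozbuild/cache -t selected_tasks.txt

Here selected_tasks.txt holds one task label per line, matching what fzf writes for "{+f}".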
tools/tryselect/util/estimates.py (103 changes: 3 additions & 100 deletions)
@@ -5,111 +5,14 @@
 from __future__ import absolute_import, print_function

 import os
-import requests
 import json
 from datetime import datetime, timedelta


-TASK_DURATION_URL = 'https://storage.googleapis.com/mozilla-mach-data/task_duration_history.json'
-GRAPH_QUANTILES_URL = 'https://storage.googleapis.com/mozilla-mach-data/machtry_quantiles.csv'
 TASK_DURATION_CACHE = 'task_duration_history.json'
 GRAPH_QUANTILE_CACHE = 'graph_quantile_cache.csv'
 TASK_DURATION_TAG_FILE = 'task_duration_tag.json'


-def check_downloaded_history(tag_file, duration_cache, quantile_cache):
-    if not os.path.isfile(tag_file):
-        return False
-
-    try:
-        with open(tag_file) as f:
-            duration_tags = json.load(f)
-        download_date = datetime.strptime(duration_tags.get('download_date'), '%Y-%M-%d')
-        if download_date < datetime.now() - timedelta(days=30):
-            return False
-    except (IOError, ValueError):
-        return False
-
-    if not os.path.isfile(duration_cache):
-        return False
-    if not os.path.isfile(quantile_cache):
-        return False
-
-    return True
-
-
-def download_task_history_data(cache_dir):
-    """Fetch task duration data exported from BigQuery."""
-    task_duration_cache = os.path.join(cache_dir, TASK_DURATION_CACHE)
-    task_duration_tag_file = os.path.join(cache_dir, TASK_DURATION_TAG_FILE)
-    graph_quantile_cache = os.path.join(cache_dir, GRAPH_QUANTILE_CACHE)
-
-    if check_downloaded_history(task_duration_tag_file, task_duration_cache, graph_quantile_cache):
-        return
-
-    try:
-        os.unlink(task_duration_tag_file)
-        os.unlink(task_duration_cache)
-        os.unlink(graph_quantile_cache)
-    except OSError:
-        print("No existing task history to clean up.")
-
-    try:
-        r = requests.get(TASK_DURATION_URL, stream=True)
-    except requests.exceptions.RequestException as exc:
-        # This is fine, the durations just won't be in the preview window.
-        print("Error fetching task duration cache from {}: {}".format(TASK_DURATION_URL, exc))
-        return
-
-    # The data retrieved from google storage is a newline-separated
-    # list of json entries, which Python's json module can't parse.
-    duration_data = list()
-    for line in r.content.splitlines():
-        duration_data.append(json.loads(line))
-
-    with open(task_duration_cache, 'w') as f:
-        json.dump(duration_data, f, indent=4)
-
-    try:
-        r = requests.get(GRAPH_QUANTILES_URL, stream=True)
-    except requests.exceptions.RequestException as exc:
-        # This is fine, the percentile just won't be in the preview window.
-        print("Error fetching task group percentiles from {}: {}".format(GRAPH_QUANTILES_URL, exc))
-        return
-
-    with open(graph_quantile_cache, 'w') as f:
-        f.write(r.content)
-
-    with open(task_duration_tag_file, 'w') as f:
-        json.dump({
-            'download_date': datetime.now().strftime('%Y-%m-%d')
-        }, f, indent=4)
-
-
-def make_trimmed_taskgraph_cache(graph_cache, dep_cache, target_file=None):
-    """Trim the taskgraph cache used for dependencies.
-    Speeds up the fzf preview window to less human-perceptible
-    ranges."""
-    if not os.path.isfile(graph_cache):
-        return
-
-    target_task_set = set()
-    if target_file:
-        with open(target_file) as f:
-            target_task_set = set(json.load(f).keys())
-
-    with open(graph_cache) as f:
-        graph = json.load(f)
-    graph = {
-        name: list(defn['dependencies'].values())
-        for name, defn in graph.items()
-        if name in target_task_set
-    }
-    with open(dep_cache, 'w') as f:
-        json.dump(graph, f, indent=4)
-
-
 def find_all_dependencies(graph, tasklist):
     all_dependencies = dict()

@@ -176,8 +79,7 @@ def determine_quantile(quantiles_file, duration):

 def task_duration_data(cache_dir):
     with open(os.path.join(cache_dir, TASK_DURATION_CACHE)) as f:
-        durations = json.load(f)
-    return {d['name']: d['mean_duration_seconds'] for d in durations}
+        return json.load(f)


 def duration_summary(graph_cache_file, tasklist, cache_dir):
@@ -217,6 +119,7 @@ def duration_summary(graph_cache_file, tasklist, cache_dir):

     output["wall_duration_seconds"] = timedelta(seconds=int(longest_path))
     output["eta_datetime"] = datetime.now()+timedelta(seconds=longest_path)
-    # (datetime.now()+timedelta(seconds=longest_path)).strftime("%H:%M")
+
+    output["task_durations"] = {task: int(durations.get(task, 0.0)) for task in tasklist}

     return output
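
With this hunk, duration_summary() becomes the preview pane's single data source: the per-task durations ride along with the aggregate figures, which is what lets duration_display() above drop its separate task_duration_data() call. A sketch of the relevant output keys (values are illustrative, and other keys are omitted):

    from datetime import datetime, timedelta

    summary = {
        "wall_duration_seconds": timedelta(seconds=5400),  # longest dependency path
        "eta_datetime": datetime(2020, 3, 5, 14, 30),      # now + wall duration
        "task_durations": {"build-linux64/opt": 1520},     # per-task integer seconds
    }
    # The lookup duration_display() performs for each selected task:
    print(summary["task_durations"].get("build-linux64/opt", 0.0))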
tools/tryselect/util/manage_estimates.py (116 changes: 116 additions & 0 deletions)
@@ -0,0 +1,116 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import, print_function
+
+import os
+import requests
+import json
+from datetime import datetime, timedelta
+
+
+TASK_DURATION_URL = 'https://storage.googleapis.com/mozilla-mach-data/task_duration_history.json'
+GRAPH_QUANTILES_URL = 'https://storage.googleapis.com/mozilla-mach-data/machtry_quantiles.csv'
+from .estimates import TASK_DURATION_CACHE, GRAPH_QUANTILE_CACHE, TASK_DURATION_TAG_FILE
+
+
+def check_downloaded_history(tag_file, duration_cache, quantile_cache):
+    if not os.path.isfile(tag_file):
+        return False
+
+    try:
+        with open(tag_file) as f:
+            duration_tags = json.load(f)
+        download_date = datetime.strptime(duration_tags.get('download_date'), '%Y-%M-%d')
+        if download_date < datetime.now() - timedelta(days=7):
+            return False
+    except (IOError, ValueError):
+        return False
+
+    if not os.path.isfile(duration_cache):
+        return False
+    # Check for old format version of file.
+    with open(duration_cache) as f:
+        data = json.load(f)
+        if isinstance(data, list):
+            return False
+    if not os.path.isfile(quantile_cache):
+        return False
+
+    return True
+
+
+def download_task_history_data(cache_dir):
+    """Fetch task duration data exported from BigQuery."""
+    task_duration_cache = os.path.join(cache_dir, TASK_DURATION_CACHE)
+    task_duration_tag_file = os.path.join(cache_dir, TASK_DURATION_TAG_FILE)
+    graph_quantile_cache = os.path.join(cache_dir, GRAPH_QUANTILE_CACHE)
+
+    if check_downloaded_history(task_duration_tag_file, task_duration_cache, graph_quantile_cache):
+        return
+
+    try:
+        os.unlink(task_duration_tag_file)
+        os.unlink(task_duration_cache)
+        os.unlink(graph_quantile_cache)
+    except OSError:
+        print("No existing task history to clean up.")
+
+    try:
+        r = requests.get(TASK_DURATION_URL, stream=True)
+    except requests.exceptions.RequestException as exc:
+        # This is fine, the durations just won't be in the preview window.
+        print("Error fetching task duration cache from {}: {}".format(TASK_DURATION_URL, exc))
+        return
+
+    # The data retrieved from google storage is a newline-separated
+    # list of json entries, which Python's json module can't parse.
+    duration_data = list()
+    for line in r.content.splitlines():
+        duration_data.append(json.loads(line))
+
+    # Reformat duration data to avoid list of dicts, as this is slow in the preview window
+    duration_data = {d['name']: d['mean_duration_seconds'] for d in duration_data}
+
+    with open(task_duration_cache, 'w') as f:
+        json.dump(duration_data, f, indent=4)
+
+    try:
+        r = requests.get(GRAPH_QUANTILES_URL, stream=True)
+    except requests.exceptions.RequestException as exc:
+        # This is fine, the percentile just won't be in the preview window.
+        print("Error fetching task group percentiles from {}: {}".format(GRAPH_QUANTILES_URL, exc))
+        return
+
+    with open(graph_quantile_cache, 'w') as f:
+        f.write(r.content)
+
+    with open(task_duration_tag_file, 'w') as f:
+        json.dump({
+            'download_date': datetime.now().strftime('%Y-%m-%d')
+        }, f, indent=4)
+
+
+def make_trimmed_taskgraph_cache(graph_cache, dep_cache, target_file=None):
+    """Trim the taskgraph cache used for dependencies.
+    Speeds up the fzf preview window to less human-perceptible
+    ranges."""
+    if not os.path.isfile(graph_cache):
+        return
+
+    target_task_set = set()
+    if target_file:
+        with open(target_file) as f:
+            target_task_set = set(json.load(f).keys())
+
+    with open(graph_cache) as f:
+        graph = json.load(f)
+    graph = {
+        name: list(defn['dependencies'].values())
+        for name, defn in graph.items()
+        if name in target_task_set
+    }
+    with open(dep_cache, 'w') as f:
+        json.dump(graph, f, indent=4)
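
A sketch of how callers such as push.py and fuzzy.py are expected to drive these helpers, based on the imports this commit rewires; the flat import path, cache location, and file names are assumptions for illustration:

    import os
    from manage_estimates import (  # tools/tryselect/util assumed on sys.path
        download_task_history_data,
        make_trimmed_taskgraph_cache,
    )

    cache_dir = os.path.expanduser("~/.mozbuild/cache")  # assumed location
    download_task_history_data(cache_dir)  # no-op while caches are < 7 days old

    # Trim the full task graph down to the targeted tasks' dependency lists:
    make_trimmed_taskgraph_cache(
        "target_task_graph.json",                   # full graph cache, if present
        os.path.join(cache_dir, "dep_cache.json"),  # trimmed output
        target_file="target_tasks.json",            # optional target task set
    )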
