Add benchmark upload util to Bigquery. (tensorflow#3776)
* Add benchmark upload util to bigquery.

Also update the benchmark logger and bigquery schema for the
errors found during the integration test.

* Fix lint error.

* Update test to clear all the env vars during test.

This was causing errors since the Kokoro test has TF_PKG=tf-nightly
injected during the test.

* Update lintrc to ignore google-related packages.

* Another attempt to fix lint import error.

* Address the review comment.

* Fix lint error.

* Another fix for lint.

* Update test comment for env var cleanup.
qlzh727 authored Mar 28, 2018
1 parent 03781c7 commit 932364b
Showing 9 changed files with 237 additions and 32 deletions.
26 changes: 23 additions & 3 deletions official/benchmark/datastore/schema/benchmark_run.json
@@ -15,7 +15,7 @@
"description": "The date when the test of the model is started",
"mode": "REQUIRED",
"name": "run_date",
"type": "DATETIME"
"type": "TIMESTAMP"
},
{
"description": "The tensorflow version information.",
@@ -58,7 +58,7 @@
"type": "RECORD"
},
{
"description": "Enviornment variables when the benchmark run is executed.",
"description": "Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
@@ -74,7 +74,27 @@
}
],
"mode": "REPEATED",
"name": "enviornment_variable",
"name": "environment_variable",
"type": "RECORD"
},
{
"description": "TF Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "tensorflow_environment_variables",
"type": "RECORD"
},
{
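Usage note (not part of this commit): the updated schema file could be used to create the destination table with the same google-cloud-bigquery client the new uploader relies on. A hedged sketch, assuming a client version that exposes SchemaField.from_api_repr; the project, dataset, and table names are placeholders:

# Hedged sketch: create the benchmark_run table from the schema file above.
import json

from google.cloud import bigquery

client = bigquery.Client(project="my-gcp-project")  # placeholder project
with open("official/benchmark/datastore/schema/benchmark_run.json") as f:
    fields = json.load(f)
schema = [bigquery.SchemaField.from_api_repr(field) for field in fields]
table_ref = client.dataset("test_benchmark").table("benchmark_run")
client.create_table(bigquery.Table(table_ref, schema=schema))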
3 changes: 2 additions & 1 deletion official/requirements.txt
@@ -1,2 +1,3 @@
psutil>=5.4.3
py-cpuinfo>=3.3.0
py-cpuinfo>=3.3.0
google-cloud-bigquery>=0.31.0
9 changes: 7 additions & 2 deletions official/resnet/resnet_run_loop.py
@@ -348,6 +348,12 @@ def resnet_main(flags, model_function, input_function):
'version': flags.version,
})

if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
benchmark_logger.log_run_info("resnet")
else:
benchmark_logger = None

for _ in range(flags.train_epochs // flags.epochs_between_evals):
train_hooks = hooks_helper.get_train_hooks(
flags.hooks,
@@ -380,8 +386,7 @@ def input_fn_eval():
steps=flags.max_train_steps)
print(eval_results)

if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
if benchmark_logger:
benchmark_logger.log_estimator_evaluation_result(eval_results)


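Read together, the two hunks above move BenchmarkLogger construction ahead of the training loop, so the run environment is logged once up front rather than only after evaluation. A condensed sketch of the resulting flow (training and hook setup elided):

if flags.benchmark_log_dir is not None:
    benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info("resnet")  # capture run/environment info once
else:
    benchmark_logger = None

for _ in range(flags.train_epochs // flags.epochs_between_evals):
    # ... train and evaluate as before ...
    eval_results = ...  # evaluation result, unchanged by this commit
    if benchmark_logger:
        benchmark_logger.log_estimator_evaluation_result(eval_results)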
28 changes: 27 additions & 1 deletion official/utils/arg_parsers/parsers.py
@@ -234,11 +234,37 @@ class BenchmarkParser(argparse.ArgumentParser):
benchmark_log_dir: Create a flag to specify location for benchmark logging.
"""

def __init__(self, add_help=False, benchmark_log_dir=True):
def __init__(self, add_help=False, benchmark_log_dir=True,
bigquery_uploader=True):
super(BenchmarkParser, self).__init__(add_help=add_help)
if benchmark_log_dir:
self.add_argument(
"--benchmark_log_dir", "-bld", default=None,
help="[default: %(default)s] The location of the benchmark logging.",
metavar="<BLD>"
)
if bigquery_uploader:
self.add_argument(
"--gcp_project", "-gp", default=None,
help="[default: %(default)s] The GCP project name where the benchmark"
" will be uploaded.",
metavar="<GP>"
)
self.add_argument(
"--bigquery_data_set", "-bds", default="test_benchmark",
help="[default: %(default)s] The Bigquery dataset name where the"
" benchmark will be uploaded.",
metavar="<BDS>"
)
self.add_argument(
"--bigquery_run_table", "-brt", default="benchmark_run",
help="[default: %(default)s] The Bigquery table name where the"
" benchmark run information will be uploaded.",
metavar="<BRT>"
)
self.add_argument(
"--bigquery_metric_table", "-bmt", default="benchmark_metric",
help="[default: %(default)s] The Bigquery table name where the"
" benchmark metric information will be uploaded.",
metavar="<BMT>"
)
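A minimal sketch of how the extended parser exposes the new flags (assuming the official package is importable; the log directory and project name are placeholders, and the remaining flags fall back to the defaults defined above):

from official.utils.arg_parsers import parsers

parser = parsers.BenchmarkParser(benchmark_log_dir=True, bigquery_uploader=True)
flags = parser.parse_args([
    "--benchmark_log_dir", "/tmp/benchmark_logs",
    "--gcp_project", "my-gcp-project",
])
assert flags.bigquery_data_set == "test_benchmark"        # default
assert flags.bigquery_run_table == "benchmark_run"        # default
assert flags.bigquery_metric_table == "benchmark_metric"  # default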
5 changes: 3 additions & 2 deletions official/utils/arg_parsers/parsers_test.py
@@ -29,7 +29,7 @@ def __init__(self):
parsers.PerformanceParser(num_parallel_calls=True, inter_op=True,
intra_op=True, use_synthetic_data=True),
parsers.ImageModelParser(data_format=True),
parsers.BenchmarkParser(benchmark_log_dir=True)
parsers.BenchmarkParser(benchmark_log_dir=True, bigquery_uploader=True)
])


@@ -62,7 +62,8 @@ def test_default_setting(self):
def test_benchmark_setting(self):
defaults = dict(
hooks=["LoggingMetricHook"],
benchmark_log_dir="/tmp/12345"
benchmark_log_dir="/tmp/12345",
gcp_project="project_abc",
)

parser = TestParser()
129 changes: 129 additions & 0 deletions official/utils/logging/benchmark_uploader.py
@@ -0,0 +1,129 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Library to upload benchmark generated by BenchmarkLogger to remote repo.
This library require google cloud bigquery lib as dependency, which can be
installed with:
> pip install --upgrade google-cloud-bigquery
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os
import sys
import uuid

from google.cloud import bigquery

import tensorflow as tf # pylint: disable=g-bad-import-order

from official.utils.arg_parsers import parsers
from official.utils.logging import logger


class BigQueryUploader(object):
"""Upload the benchmark and metric info to BigQuery."""

def __init__(self, logging_dir, gcp_project=None, credentials=None):
"""Initialized BigQueryUploader with proper setting.
Args:
logging_dir: string, logging directory that contains the benchmark log.
gcp_project: string, the name of the GCP project that the log will be
uploaded to. The default project name will be detected from local
environment if no value is provided.
credentials: google.auth.credentials. The credential to access the
BigQuery service. The default service account credential will be
detected from the local environment if no value is provided. Please use
google.oauth2.service_account.Credentials to load credentials from a local
file when the test is run outside of GCP.
"""
self._logging_dir = logging_dir
self._bq_client = bigquery.Client(
project=gcp_project, credentials=credentials)

def upload_benchmark_run(self, dataset_name, table_name, run_id):
"""Upload benchmark run information to Bigquery.
Args:
dataset_name: string, the name of bigquery dataset where the data will be
uploaded.
table_name: string, the name of bigquery table under the dataset where
the data will be uploaded.
run_id: string, a unique ID that will be attached to the data, usually
in UUID4 format.
"""
expected_file = os.path.join(
self._logging_dir, logger.BENCHMARK_RUN_LOG_FILE_NAME)
with tf.gfile.GFile(expected_file) as f:
benchmark_json = json.load(f)
benchmark_json["model_id"] = run_id
table_ref = self._bq_client.dataset(dataset_name).table(table_name)
errors = self._bq_client.insert_rows_json(table_ref, [benchmark_json])
if errors:
tf.logging.error(
"Failed to upload benchmark info to bigquery: {}".format(errors))

def upload_metric(self, dataset_name, table_name, run_id):
"""Upload metric information to Bigquery.
Args:
dataset_name: string, the name of bigquery dataset where the data will be
uploaded.
table_name: string, the name of bigquery table under the dataset where
the metric data will be uploaded. This is different from the
benchmark_run table.
run_id: string, a unique ID that will be attached to the data, usually
in UUID4 format. This should be the same as the benchmark run_id.
"""
expected_file = os.path.join(
self._logging_dir, logger.METRIC_LOG_FILE_NAME)
with tf.gfile.GFile(expected_file) as f:
lines = f.readlines()
metrics = []
for line in filter(lambda l: l.strip(), lines):
metric = json.loads(line)
metric["run_id"] = run_id
metrics.append(metric)
table_ref = self._bq_client.dataset(dataset_name).table(table_name)
errors = self._bq_client.insert_rows_json(table_ref, metrics)
if errors:
tf.logging.error(
"Failed to upload benchmark info to bigquery: {}".format(errors))


def main(argv):
parser = parsers.BenchmarkParser()
flags = parser.parse_args(args=argv[1:])
if not flags.benchmark_log_dir:
print("Usage: benchmark_uploader.py --benchmark_log_dir=/some/dir")
sys.exit(1)

uploader = BigQueryUploader(
flags.benchmark_log_dir,
gcp_project=flags.gcp_project)
run_id = str(uuid.uuid4())
uploader.upload_benchmark_run(
flags.bigquery_data_set, flags.bigquery_run_table, run_id)
uploader.upload_metric(
flags.bigquery_data_set, flags.bigquery_metric_table, run_id)


if __name__ == "__main__":
main(argv=sys.argv)
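A minimal programmatic usage sketch, mirroring main() above; the project, dataset, and table names are placeholders, and default application credentials are assumed:

import uuid

from official.utils.logging import benchmark_uploader

uploader = benchmark_uploader.BigQueryUploader(
    "/tmp/benchmark_logs", gcp_project="my-gcp-project")
run_id = str(uuid.uuid4())
uploader.upload_benchmark_run("test_benchmark", "benchmark_run", run_id)
uploader.upload_metric("test_benchmark", "benchmark_metric", run_id)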
31 changes: 19 additions & 12 deletions official/utils/logging/logger.py
@@ -31,8 +31,8 @@
import tensorflow as tf
from tensorflow.python.client import device_lib

_METRIC_LOG_FILE_NAME = "metric.log"
_BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
METRIC_LOG_FILE_NAME = "metric.log"
BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"


@@ -81,9 +81,12 @@ def log_metric(self, name, value, unit=None, global_step=None, extras=None):
tf.logging.warning(
"Metric value to log should be a number. Got %s", type(value))
return

if extras:
extras = [{"name": k, "value": v} for k, v in sorted(extras.items())]
else:
extras = []
with tf.gfile.GFile(
os.path.join(self._logging_dir, _METRIC_LOG_FILE_NAME), "a") as f:
os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
metric = {
"name": name,
"value": float(value),
@@ -107,15 +110,18 @@ def log_run_info(self, model_name):
Args:
model_name: string, the name of the model.
"""
run_info = {"model_name": model_name}
run_info = {
"model_name": model_name,
"machine_config": {},
"run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
_collect_tensorflow_info(run_info)
_collect_tensorflow_environment_variables(run_info)
_collect_cpu_info(run_info)
_collect_gpu_info(run_info)
_collect_memory_info(run_info)

with tf.gfile.GFile(os.path.join(
self._logging_dir, _BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
try:
json.dump(run_info, f)
f.write("\n")
@@ -130,8 +136,9 @@ def _collect_tensorflow_info(run_info):


def _collect_tensorflow_environment_variables(run_info):
run_info["tensorflow_environment_variables"] = {
k: v for k, v in os.environ.items() if k.startswith("TF_")}
run_info["tensorflow_environment_variables"] = [
{"name": k, "value": v}
for k, v in sorted(os.environ.items()) if k.startswith("TF_")]


# The following code is mirrored from tensorflow/tools/test/system_info_lib
@@ -150,7 +157,7 @@ def _collect_cpu_info(run_info):
cpu_info["cpu_info"] = info["brand"]
cpu_info["mhz_per_cpu"] = info["hz_advertised_raw"][0] / 1.0e6

run_info["cpu_info"] = cpu_info
run_info["machine_config"]["cpu_info"] = cpu_info


def _collect_gpu_info(run_info):
@@ -168,16 +175,16 @@ def _collect_gpu_info(run_info):
gpu_info["model"] = _parse_gpu_model(d.physical_device_desc)
# Assume all the GPU connected are same model
break
run_info["gpu_info"] = gpu_info
run_info["machine_config"]["gpu_info"] = gpu_info


def _collect_memory_info(run_info):
# Note: psutil is not installed in the TensorFlow OSS tree.
# It is installable via pip.
import psutil # pylint: disable=g-import-not-at-top
vmem = psutil.virtual_memory()
run_info["memory_total"] = vmem.total
run_info["memory_available"] = vmem.available
run_info["machine_config"]["memory_total"] = vmem.total
run_info["machine_config"]["memory_available"] = vmem.available


def _parse_gpu_model(physical_device_desc):
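To illustrate the combined effect of these logger changes, the record written to benchmark_run.log now nests machine details under machine_config and stores TF_* variables as a repeated name/value list, matching the updated BigQuery schema. A hypothetical record (illustrative values only) might look like:

run_info = {
    "model_name": "resnet",
    "run_date": "2018-03-28T12:00:00.000000Z",  # TIMESTAMP-compatible string
    "tensorflow_environment_variables": [
        {"name": "TF_PKG", "value": "tf-nightly"},  # sorted name/value pairs
    ],
    "machine_config": {
        "cpu_info": {"cpu_info": "Example CPU @ 2.30GHz", "mhz_per_cpu": 2300.0},
        "gpu_info": {"model": "Example GPU"},
        "memory_total": 68719476736,        # bytes, from psutil.virtual_memory()
        "memory_available": 60129542144,
    },
}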
(Diffs for the remaining 2 changed files are not shown.)