Add benchmark upload util to Bigquery. (tensorflow#3776)
* Add benchmark upload util to bigquery.

Also update the benchmark logger and bigquery schema for the
errors found during the integration test.

* Fix lint error.

* Update test to clear all the env vars during test.

This was causing errors since the Kokoro test has TF_PKG=tf-nightly
injected during the test.

* Update lintrc to ignore google-related packages.

* Another attempt to fix lint import error.

* Address the review comment.

* Fix lint error.

* Another fix for lint.

* Update test comment for env var cleanup.
qlzh727 authored Mar 28, 2018
1 parent 03781c7 commit 932364b
Showing 9 changed files with 237 additions and 32 deletions.
26 changes: 23 additions & 3 deletions official/benchmark/datastore/schema/benchmark_run.json
@@ -15,7 +15,7 @@
"description": "The date when the test of the model is started",
"mode": "REQUIRED",
"name": "run_date",
"type": "DATETIME"
"type": "TIMESTAMP"
},
{
"description": "The tensorflow version information.",
@@ -58,7 +58,7 @@
"type": "RECORD"
},
{
"description": "Enviornment variables when the benchmark run is executed.",
"description": "Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
@@ -74,7 +74,27 @@
}
],
"mode": "REPEATED",
"name": "enviornment_variable",
"name": "environment_variable",
"type": "RECORD"
},
{
"description": "TF Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "tensorflow_environment_variables",
"type": "RECORD"
},
{
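Usage note (not part of this commit): the updated schema file could be used to create the destination table with the same google-cloud-bigquery client the new uploader relies on. A hedged sketch, assuming a client version that exposes SchemaField.from_api_repr; the project, dataset, and table names are placeholders:

# Hedged sketch: create the benchmark_run table from the schema file above.
import json

from google.cloud import bigquery

client = bigquery.Client(project="my-gcp-project")  # placeholder project
with open("official/benchmark/datastore/schema/benchmark_run.json") as f:
    fields = json.load(f)
schema = [bigquery.SchemaField.from_api_repr(field) for field in fields]
table_ref = client.dataset("test_benchmark").table("benchmark_run")
client.create_table(bigquery.Table(table_ref, schema=schema))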
3 changes: 2 additions & 1 deletion official/requirements.txt
@@ -1,2 +1,3 @@
psutil>=5.4.3
py-cpuinfo>=3.3.0
py-cpuinfo>=3.3.0
google-cloud-bigquery>=0.31.0
9 changes: 7 additions & 2 deletions official/resnet/resnet_run_loop.py
@@ -348,6 +348,12 @@ def resnet_main(flags, model_function, input_function):
'version': flags.version,
})

if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
benchmark_logger.log_run_info("resnet")
else:
benchmark_logger = None

for _ in range(flags.train_epochs // flags.epochs_between_evals):
train_hooks = hooks_helper.get_train_hooks(
flags.hooks,
@@ -380,8 +386,7 @@ def input_fn_eval():
steps=flags.max_train_steps)
print(eval_results)

if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
if benchmark_logger:
benchmark_logger.log_estimator_evaluation_result(eval_results)


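Read together, the two hunks above move BenchmarkLogger construction ahead of the training loop, so the run environment is logged once up front rather than only after evaluation. A condensed sketch of the resulting flow (training and hook setup elided):

if flags.benchmark_log_dir is not None:
    benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info("resnet")  # capture run/environment info once
else:
    benchmark_logger = None

for _ in range(flags.train_epochs // flags.epochs_between_evals):
    # ... train and evaluate as before ...
    eval_results = ...  # evaluation result, unchanged by this commit
    if benchmark_logger:
        benchmark_logger.log_estimator_evaluation_result(eval_results)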
28 changes: 27 additions & 1 deletion official/utils/arg_parsers/parsers.py
@@ -234,11 +234,37 @@ class BenchmarkParser(argparse.ArgumentParser):
benchmark_log_dir: Create a flag to specify location for benchmark logging.
"""

def __init__(self, add_help=False, benchmark_log_dir=True):
def __init__(self, add_help=False, benchmark_log_dir=True,
bigquery_uploader=True):
super(BenchmarkParser, self).__init__(add_help=add_help)
if benchmark_log_dir:
self.add_argument(
"--benchmark_log_dir", "-bld", default=None,
help="[default: %(default)s] The location of the benchmark logging.",
metavar="<BLD>"
)
if bigquery_uploader:
self.add_argument(
"--gcp_project", "-gp", default=None,
help="[default: %(default)s] The GCP project name where the benchmark"
" will be uploaded.",
metavar="<GP>"
)
self.add_argument(
"--bigquery_data_set", "-bds", default="test_benchmark",
help="[default: %(default)s] The Bigquery dataset name where the"
" benchmark will be uploaded.",
metavar="<BDS>"
)
self.add_argument(
"--bigquery_run_table", "-brt", default="benchmark_run",
help="[default: %(default)s] The Bigquery table name where the"
" benchmark run information will be uploaded.",
metavar="<BRT>"
)
self.add_argument(
"--bigquery_metric_table", "-bmt", default="benchmark_metric",
help="[default: %(default)s] The Bigquery table name where the"
" benchmark metric information will be uploaded.",
metavar="<BMT>"
)
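A minimal sketch of how the extended parser exposes the new flags (assuming the official package is importable; the log directory and project name are placeholders, and the remaining flags fall back to the defaults defined above):

from official.utils.arg_parsers import parsers

parser = parsers.BenchmarkParser(benchmark_log_dir=True, bigquery_uploader=True)
flags = parser.parse_args([
    "--benchmark_log_dir", "/tmp/benchmark_logs",
    "--gcp_project", "my-gcp-project",
])
assert flags.bigquery_data_set == "test_benchmark"        # default
assert flags.bigquery_run_table == "benchmark_run"        # default
assert flags.bigquery_metric_table == "benchmark_metric"  # default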
5 changes: 3 additions & 2 deletions official/utils/arg_parsers/parsers_test.py
@@ -29,7 +29,7 @@ def __init__(self):
parsers.PerformanceParser(num_parallel_calls=True, inter_op=True,
intra_op=True, use_synthetic_data=True),
parsers.ImageModelParser(data_format=True),
parsers.BenchmarkParser(benchmark_log_dir=True)
parsers.BenchmarkParser(benchmark_log_dir=True, bigquery_uploader=True)
])


@@ -62,7 +62,8 @@ def test_default_setting(self):
def test_benchmark_setting(self):
defaults = dict(
hooks=["LoggingMetricHook"],
benchmark_log_dir="/tmp/12345"
benchmark_log_dir="/tmp/12345",
gcp_project="project_abc",
)

parser = TestParser()
129 changes: 129 additions & 0 deletions official/utils/logging/benchmark_uploader.py
@@ -0,0 +1,129 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Library to upload benchmark generated by BenchmarkLogger to remote repo.
This library require google cloud bigquery lib as dependency, which can be
installed with:
> pip install --upgrade google-cloud-bigquery
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os
import sys
import uuid

from google.cloud import bigquery

import tensorflow as tf # pylint: disable=g-bad-import-order

from official.utils.arg_parsers import parsers
from official.utils.logging import logger


class BigQueryUploader(object):
"""Upload the benchmark and metric info to BigQuery."""

def __init__(self, logging_dir, gcp_project=None, credentials=None):
"""Initialized BigQueryUploader with proper setting.
Args:
logging_dir: string, logging directory that contains the benchmark log.
gcp_project: string, the name of the GCP project that the log will be
uploaded to. The default project name will be detected from local
environment if no value is provided.
credentials: google.auth.credentials. The credential to access the
BigQuery service. The default service account credential will be
detected from the local environment if no value is provided. Please use
google.oauth2.service_account.Credentials to load credentials from a local
file when the test is run outside of GCP.
"""
self._logging_dir = logging_dir
self._bq_client = bigquery.Client(
project=gcp_project, credentials=credentials)

def upload_benchmark_run(self, dataset_name, table_name, run_id):
"""Upload benchmark run information to Bigquery.
Args:
dataset_name: string, the name of bigquery dataset where the data will be
uploaded.
table_name: string, the name of bigquery table under the dataset where
the data will be uploaded.
run_id: string, a unique ID that will be attached to the data, usually
in UUID4 format.
"""
expected_file = os.path.join(
self._logging_dir, logger.BENCHMARK_RUN_LOG_FILE_NAME)
with tf.gfile.GFile(expected_file) as f:
benchmark_json = json.load(f)
benchmark_json["model_id"] = run_id
table_ref = self._bq_client.dataset(dataset_name).table(table_name)
errors = self._bq_client.insert_rows_json(table_ref, [benchmark_json])
if errors:
tf.logging.error(
"Failed to upload benchmark info to bigquery: {}".format(errors))

def upload_metric(self, dataset_name, table_name, run_id):
"""Upload metric information to Bigquery.
Args:
dataset_name: string, the name of bigquery dataset where the data will be
uploaded.
table_name: string, the name of bigquery table under the dataset where
the metric data will be uploaded. This is different from the
benchmark_run table.
run_id: string, a unique ID that will be attached to the data, usually
in UUID4 format. This should be the same as the benchmark run_id.
"""
expected_file = os.path.join(
self._logging_dir, logger.METRIC_LOG_FILE_NAME)
with tf.gfile.GFile(expected_file) as f:
lines = f.readlines()
metrics = []
for line in filter(lambda l: l.strip(), lines):
metric = json.loads(line)
metric["run_id"] = run_id
metrics.append(metric)
table_ref = self._bq_client.dataset(dataset_name).table(table_name)
errors = self._bq_client.insert_rows_json(table_ref, metrics)
if errors:
tf.logging.error(
"Failed to upload benchmark info to bigquery: {}".format(errors))


def main(argv):
parser = parsers.BenchmarkParser()
flags = parser.parse_args(args=argv[1:])
if not flags.benchmark_log_dir:
print("Usage: benchmark_uploader.py --benchmark_log_dir=/some/dir")
sys.exit(1)

uploader = BigQueryUploader(
flags.benchmark_log_dir,
gcp_project=flags.gcp_project)
run_id = str(uuid.uuid4())
uploader.upload_benchmark_run(
flags.bigquery_data_set, flags.bigquery_run_table, run_id)
uploader.upload_metric(
flags.bigquery_data_set, flags.bigquery_metric_table, run_id)


if __name__ == "__main__":
main(argv=sys.argv)
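A minimal programmatic usage sketch, mirroring main() above; the project, dataset, and table names are placeholders, and default application credentials are assumed:

import uuid

from official.utils.logging import benchmark_uploader

uploader = benchmark_uploader.BigQueryUploader(
    "/tmp/benchmark_logs", gcp_project="my-gcp-project")
run_id = str(uuid.uuid4())
uploader.upload_benchmark_run("test_benchmark", "benchmark_run", run_id)
uploader.upload_metric("test_benchmark", "benchmark_metric", run_id)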
31 changes: 19 additions & 12 deletions official/utils/logging/logger.py
@@ -31,8 +31,8 @@
import tensorflow as tf
from tensorflow.python.client import device_lib

_METRIC_LOG_FILE_NAME = "metric.log"
_BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
METRIC_LOG_FILE_NAME = "metric.log"
BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"


@@ -81,9 +81,12 @@ def log_metric(self, name, value, unit=None, global_step=None, extras=None):
tf.logging.warning(
"Metric value to log should be a number. Got %s", type(value))
return

if extras:
extras = [{"name": k, "value": v} for k, v in sorted(extras.items())]
else:
extras = []
with tf.gfile.GFile(
os.path.join(self._logging_dir, _METRIC_LOG_FILE_NAME), "a") as f:
os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
metric = {
"name": name,
"value": float(value),
@@ -107,15 +110,18 @@ def log_run_info(self, model_name):
Args:
model_name: string, the name of the model.
"""
run_info = {"model_name": model_name}
run_info = {
"model_name": model_name,
"machine_config": {},
"run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
_collect_tensorflow_info(run_info)
_collect_tensorflow_environment_variables(run_info)
_collect_cpu_info(run_info)
_collect_gpu_info(run_info)
_collect_memory_info(run_info)

with tf.gfile.GFile(os.path.join(
self._logging_dir, _BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
try:
json.dump(run_info, f)
f.write("\n")
@@ -130,8 +136,9 @@ def _collect_tensorflow_info(run_info):


def _collect_tensorflow_environment_variables(run_info):
run_info["tensorflow_environment_variables"] = {
k: v for k, v in os.environ.items() if k.startswith("TF_")}
run_info["tensorflow_environment_variables"] = [
{"name": k, "value": v}
for k, v in sorted(os.environ.items()) if k.startswith("TF_")]


# The following code is mirrored from tensorflow/tools/test/system_info_lib
@@ -150,7 +157,7 @@ def _collect_cpu_info(run_info):
cpu_info["cpu_info"] = info["brand"]
cpu_info["mhz_per_cpu"] = info["hz_advertised_raw"][0] / 1.0e6

run_info["cpu_info"] = cpu_info
run_info["machine_config"]["cpu_info"] = cpu_info


def _collect_gpu_info(run_info):
@@ -168,16 +175,16 @@ def _collect_gpu_info(run_info):
gpu_info["model"] = _parse_gpu_model(d.physical_device_desc)
# Assume all the GPU connected are same model
break
run_info["gpu_info"] = gpu_info
run_info["machine_config"]["gpu_info"] = gpu_info


def _collect_memory_info(run_info):
# Note: psutil is not installed in the TensorFlow OSS tree.
# It is installable via pip.
import psutil # pylint: disable=g-import-not-at-top
vmem = psutil.virtual_memory()
run_info["memory_total"] = vmem.total
run_info["memory_available"] = vmem.available
run_info["machine_config"]["memory_total"] = vmem.total
run_info["machine_config"]["memory_available"] = vmem.available


def _parse_gpu_model(physical_device_desc):
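To illustrate the combined effect of these logger changes, the record written to benchmark_run.log now nests machine details under machine_config and stores TF_* variables as a repeated name/value list, matching the updated BigQuery schema. A hypothetical record (illustrative values only) might look like:

run_info = {
    "model_name": "resnet",
    "run_date": "2018-03-28T12:00:00.000000Z",  # TIMESTAMP-compatible string
    "tensorflow_environment_variables": [
        {"name": "TF_PKG", "value": "tf-nightly"},  # sorted name/value pairs
    ],
    "machine_config": {
        "cpu_info": {"cpu_info": "Example CPU @ 2.30GHz", "mhz_per_cpu": 2300.0},
        "gpu_info": {"model": "Example GPU"},
        "memory_total": 68719476736,        # bytes, from psutil.virtual_memory()
        "memory_available": 60129542144,
    },
}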
(Diffs for the remaining 2 changed files are not shown.)