diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py
index 949cedf..1091b68 100644
--- a/mlperf_logging/benchmark_meta.py
+++ b/mlperf_logging/benchmark_meta.py
@@ -10,6 +10,7 @@
     'minigo': 10,
     'resnet': 5,
     'ssd': 5,
+    'stable_diffusion': 10,
     'transformer': 10,
     'ncf': 10,
     'rnnt': 10,
diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md
index fb832b4..3fc4f31 100644
--- a/mlperf_logging/compliance_checker/README.md
+++ b/mlperf_logging/compliance_checker/README.md
@@ -23,7 +23,7 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
 ### Existing config files for training submissions
 
     3.1.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
-    3.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks 
+    3.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
     3.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
     3.1.0/closed_resnet.yaml - Per-benchmark rules, closed submissions.
     3.1.0/closed_ssd.yaml
@@ -33,6 +33,7 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
     3.1.0/closed_bert.yaml
     3.1.0/closed_dlrm_dcnv2.yaml
     3.1.0/closed_gpt3.yaml
+    3.1.0/closed_stable_diffusion.yaml
     3.1.0/open_resnet.yaml - Per-benchmark rules, open submissions.
     3.1.0/open_ssd.yaml
     3.1.0/open_maskrcnn.yaml
@@ -41,6 +42,7 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
     3.1.0/open_bert.yaml
     3.1.0/open_dlrm_dcnv2.yaml
     3.1.0/open_gpt3.yaml
+    3.1.0/open_stable_diffusion.yaml
 
 ### Existing config files for HPC submissions
 
@@ -64,7 +66,7 @@ Compliance checking is done following below algorithm.
    2. If present, evaluate `CHECK` section, and raise an exception if the result is false
 7. Print all warning messages
 
-Possible side effects of yaml sections execution can be [printing output](#other-operations), or [enqueueing 
+Possible side effects of yaml sections execution can be [printing output](#other-operations), or [enqueueing
 additional yaml files to be verified](#enqueuing-additional-config-files).
 
 ### Config file syntax
@@ -72,7 +74,7 @@ Rules to be checked are provided in yaml (config) file. A config file contains t
 #### `BEGIN` record
 
 Defines `CODE` to be executed before any other rules defined in the current file. This record is optional
-and there can be up to a single `BEGIN` record per config file. 
+and there can be at most one `BEGIN` record per config file.
 
 Example:
@@ -87,6 +89,7 @@ The following fields are optional:
 - `REQ` - specifies the requirement regarding occurrence. Possible values :
   - `EXACTLY_ONE` - current key has to appear exactly once
   - `AT_LEAST_ONE` - current key has to appear at least once
+  - `AT_LEAST(n)` - current key has to appear at least n times
   - `AT_LEAST_ONE_OR(alternatives)` - current key or one of the alternatives has to appear at least once;
     alternatives is a comma separated list of keys
 - `PRE` - code to be executed before performing checks
@@ -112,14 +115,14 @@ The following fields are optional:
 
 #### Global and local state access
 
-During processing of the records there is a global state `s` maintained, accessible from 
+During processing of the records there is a global state `s` maintained, accessible from
 code provided in yaml. In addition, rules can access the information fields (values) `v` of the record,
 as well as the timestamp and the original line string as part of the record `ll`.
 
-Global state `s` can be used to enforce any cross keys rules, by updating the global state 
+Global state `s` can be used to enforce any cross-key rules, by updating the global state
 in `POST` (or `PRE`) of one `KEY` and using that information for `CHECK` of another `KEY`.
-For each config file, `s` starts as an empty dictionary, so in order to track global state 
-it would require adding an entry to `s`. 
+For each config file, `s` starts as an empty dictionary, so tracking global state
+requires adding an entry to `s`.
 
 Example:
@@ -152,7 +155,7 @@ Config files in the queue are processed independently, meaning that they do not
 Each config file may define its `BEGIN` and `END` records, as well as any other `KEY` rules.
 
-Example: 
+Example:
 
 - KEY:
     NAME: submission_benchmark
     REQ: EXACTLY_ONE
@@ -164,7 +167,7 @@ Example:
 #### Other operations
 
 `CODE`, `REQ`, and `POST` fields are executed using python's `exec` function. `CHECK` is performed
-using `eval` call. As such, any legal python code would be suitable for use. 
+using an `eval` call. As such, any legal python code would be suitable for use.
 
 For instance, one can define rules that would print out information as shown in the
 [example above](#global-and-local-state-access).
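As context for the README changes above: the `CHECK`/`CODE` semantics reduce to plain `eval`/`exec`. A minimal standalone sketch (illustrative only; the rule dict is hypothetical, and the real checker also binds the global state `s` and the record `ll`):

    # Sketch of how a yaml rule's CHECK string is evaluated.
    # Hypothetical rule; only `v` is bound here, unlike the real checker.
    rule = {"NAME": "submission_benchmark",
            "CHECK": " v['value'] in ['resnet', 'ssd', 'stable_diffusion'] "}
    v = {"value": "stable_diffusion"}
    assert eval(rule["CHECK"].strip(), {}, {"v": v})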
diff --git a/mlperf_logging/compliance_checker/mlp_compliance.py b/mlperf_logging/compliance_checker/mlp_compliance.py
index af0269a..1a402ac 100644
--- a/mlperf_logging/compliance_checker/mlp_compliance.py
+++ b/mlperf_logging/compliance_checker/mlp_compliance.py
@@ -147,6 +147,11 @@ def parse_alternatives(self, string):
         alternatives = in_pharentises.split(',')
         return [s.strip() for s in alternatives]
 
+    def parse_at_least(self, string):
+        n_string = string[len('AT_LEAST(') : -1]
+        n = int(n_string)
+        return n
+
     def configured_checks(self, loglines, config_file):
         with open(config_file) as f:
             checks = yaml.load(f, Loader=yaml.BaseLoader)
@@ -164,7 +169,7 @@ def configured_checks(self, loglines, config_file):
         begin_blocks = [x for x in checks if list(x)[0]=='BEGIN']
         assert(len(begin_blocks)<=1) # up to one begin block
         if len(begin_blocks)==1:
-            exec(begin_blocks[0]['BEGIN']['CODE'].strip(), state)
+            exec(begin_blocks[0]['BEGIN']['CODE'].strip(), state, locals())
 
         key_records = {}
         for k in checks:
@@ -231,6 +236,12 @@ def configured_checks(self, loglines, config_file):
                     self.put_message(f"Required AT_LEAST_ONE occurrence of '{k}' but found {len(reported_values[k])}",
                                      key=k)
 
+                if v['REQ'].startswith('AT_LEAST('):
+                    n = self.parse_at_least(v['REQ'])
+                    if len(reported_values[k]) < n:
+                        self.put_message(f"Required {v['REQ']} occurrence of '{k}' but found {len(reported_values[k])}",
+                                         key=k)
+
diff --git a/mlperf_logging/compliance_checker/training_3.1.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_3.1.0/closed_common.yaml
--- a/mlperf_logging/compliance_checker/training_3.1.0/closed_common.yaml
+++ b/mlperf_logging/compliance_checker/training_3.1.0/closed_common.yaml
@@ -2,6 +2,6 @@
 - KEY:
     NAME: submission_benchmark
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
+    CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
     POST: " enqueue_config('training_3.1.0/closed_{}.yaml'.format(v['value'])) "
diff --git a/mlperf_logging/compliance_checker/training_3.1.0/closed_stable_diffusion.yaml b/mlperf_logging/compliance_checker/training_3.1.0/closed_stable_diffusion.yaml
new file mode 100644
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_3.1.0/closed_stable_diffusion.yaml
@@ -0,0 +1,74 @@
+# Stable diffusion uses two metrics, FID and CLIP.
+# These metrics can be calculated offline, using different scripts
+# and logged separately. Therefore, we create a virtual key
+# called aggregated_eval_accuracy, which aggregates
+# both metrics into a single log line
+
+- BEGIN:
+    CODE: |
+        from dataclasses import replace
+        agg_eval_lines = {}
+        for line in loglines:
+            if line.key == "eval_accuracy":
+                step_num = line.value['metadata']['step_num']
+                if step_num not in agg_eval_lines:
+                    new_line = replace(line) # Make a copy
+                    new_line.key = "aggregated_eval_accuracy"
+                    new_line.full_string = "" # Not needed
+                    new_line.lineno = -1 # Not needed
+                    new_line.value = {'value': {'step_num': step_num}, 'metadata':{}}
+                    agg_eval_lines[step_num] = new_line
+
+                agg_eval_lines[step_num].timestamp = max(line.timestamp, agg_eval_lines[step_num].timestamp)
+                agg_eval_lines[step_num].value['value'][line.value['metadata']['metric']] = line.value['value']
+        loglines.extend(agg_eval_lines.values())
+
+- KEY:
+    NAME: global_batch_size
+    REQ: AT_LEAST_ONE
+    CHECK: " v['value'] >= 0 "
+
+- KEY:
+    NAME: opt_name
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'adamw' "
+
+- KEY:
+    NAME: opt_adamw_beta_1
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.9 "
+
+- KEY:
+    NAME: opt_adamw_beta_2
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.999 "
+
+- KEY:
+    NAME: opt_adamw_epsilon
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1e-08 "
+
+- KEY:
+    NAME: opt_adamw_weight_decay
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.01 "
+
+- KEY:
+    NAME: opt_base_learning_rate
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] >= 0.0 "
+
+- KEY:
+    NAME: opt_learning_rate_warmup_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] >= 0 "
+
+- KEY:
+    NAME: aggregated_eval_accuracy
+    REQ: AT_LEAST(2)
+    CHECK:
+        - "'FID' in v['value']"
+        - "'CLIP' in v['value']"
+        - "'step_num' in v['value']"
+    ATLEAST_ONE_CHECK: "(0.0 <= v['value']['FID'] <= 90.0) and (0.15 <= v['value']['CLIP'] <= 1.0)"
+
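A standalone sketch of the new `AT_LEAST(n)` requirement used by `aggregated_eval_accuracy` above, mirroring `parse_at_least` and the occurrence check added to `mlp_compliance.py` (simplified, outside the checker class):

    # Mirrors the AT_LEAST(n) handling added above, stripped of class context.
    def parse_at_least(req: str) -> int:
        # 'AT_LEAST(2)' -> 2
        return int(req[len('AT_LEAST('):-1])

    def occurrences_ok(req: str, count: int) -> bool:
        if req == 'EXACTLY_ONE':
            return count == 1
        if req == 'AT_LEAST_ONE':
            return count >= 1
        if req.startswith('AT_LEAST('):
            return count >= parse_at_least(req)
        return True

    # aggregated_eval_accuracy uses REQ: AT_LEAST(2), i.e. at least two
    # aggregated eval lines must be present in the log.
    assert occurrences_ok('AT_LEAST(2)', 2)
    assert not occurrences_ok('AT_LEAST(2)', 1)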
diff --git a/mlperf_logging/compliance_checker/training_3.1.0/open_common.yaml b/mlperf_logging/compliance_checker/training_3.1.0/open_common.yaml
index b2bb177..e272cb6 100644
--- a/mlperf_logging/compliance_checker/training_3.1.0/open_common.yaml
+++ b/mlperf_logging/compliance_checker/training_3.1.0/open_common.yaml
@@ -2,6 +2,6 @@
 - KEY:
     NAME: submission_benchmark
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] in ['resnet', 'ssd', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
+    CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
     POST: " enqueue_config('training_3.1.0/open_{}.yaml'.format(v['value'])) "
diff --git a/mlperf_logging/compliance_checker/training_3.1.0/open_stable_diffusion.yaml b/mlperf_logging/compliance_checker/training_3.1.0/open_stable_diffusion.yaml
new file mode 100644
index 0000000..391a4a9
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_3.1.0/open_stable_diffusion.yaml
@@ -0,0 +1,34 @@
+# Stable diffusion uses two metrics, FID and CLIP.
+# These metrics can be calculated offline, using different scripts
+# and logged separately. Therefore, we create a virtual key
+# called aggregated_eval_accuracy, which aggregates
+# both metrics into a single log line
+
+- BEGIN:
+    CODE: |
+        from dataclasses import replace
+        agg_eval_lines = {}
+        for line in loglines:
+            if line.key == "eval_accuracy":
+                step_num = line.value['metadata']['step_num']
+                if step_num not in agg_eval_lines:
+                    new_line = replace(line) # Make a copy
+                    new_line.key = "aggregated_eval_accuracy"
+                    new_line.full_string = "" # Not needed
+                    new_line.lineno = -1 # Not needed
+                    new_line.value = {'value': {'step_num': step_num}, 'metadata':{}}
+                    agg_eval_lines[step_num] = new_line
+
+                agg_eval_lines[step_num].timestamp = max(line.timestamp, agg_eval_lines[step_num].timestamp)
+                agg_eval_lines[step_num].value['value'][line.value['metadata']['metric']] = line.value['value']
+        loglines.extend(agg_eval_lines.values())
+
+- KEY:
+    NAME: aggregated_eval_accuracy
+    REQ: AT_LEAST(2)
+    CHECK:
+        - "'FID' in v['value']"
+        - "'CLIP' in v['value']"
+        - "'step_num' in v['value']"
+    ATLEAST_ONE_CHECK: "v['value']['FID'] >= 0.0 and v['value']['CLIP'] <= 1.0"
+
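To see what the `BEGIN` block above produces, here is a self-contained sketch; `LogLine` is a stand-in for the parser's record type, and the FID/CLIP numbers are made up:

    from dataclasses import dataclass, replace

    # Stand-in for the log parser's record type; field names follow the
    # BEGIN block above, the values are invented for illustration.
    @dataclass
    class LogLine:
        key: str
        value: dict
        timestamp: float
        full_string: str = ""
        lineno: int = -1

    loglines = [
        LogLine("eval_accuracy", {"value": 84.2,
                "metadata": {"step_num": 5000, "metric": "FID"}}, 1000.0),
        LogLine("eval_accuracy", {"value": 0.16,
                "metadata": {"step_num": 5000, "metric": "CLIP"}}, 1020.0),
    ]

    agg_eval_lines = {}
    for line in loglines:
        if line.key == "eval_accuracy":
            step_num = line.value['metadata']['step_num']
            if step_num not in agg_eval_lines:
                new_line = replace(line)  # Make a copy
                new_line.key = "aggregated_eval_accuracy"
                new_line.value = {'value': {'step_num': step_num}, 'metadata': {}}
                agg_eval_lines[step_num] = new_line
            agg_eval_lines[step_num].timestamp = max(line.timestamp,
                                                     agg_eval_lines[step_num].timestamp)
            agg_eval_lines[step_num].value['value'][line.value['metadata']['metric']] = line.value['value']
    loglines.extend(agg_eval_lines.values())

    # The synthetic line now carries both metrics for the step:
    # {'step_num': 5000, 'FID': 84.2, 'CLIP': 0.16}
    print(agg_eval_lines[5000].value['value'])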
diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py
index 6658cdb..c06cd40 100644
--- a/mlperf_logging/rcp_checker/rcp_checker.py
+++ b/mlperf_logging/rcp_checker/rcp_checker.py
@@ -3,6 +3,7 @@
 '''
 
 import argparse
+from collections import defaultdict
 import glob
 import json
 import logging
@@ -26,6 +27,7 @@
     'ssd' : 5,
     'unet3d' : 40,
     'rnnt': 10,
+    'stable_diffusion': 10,
   },
   "hpc": {
     'cosmoflow': 10,
@@ -48,6 +50,10 @@ def read_submission_file(result_file, use_train_samples):
     bs = -1
     benchmark = None
 
+    # FID and CLIP metrics for stable diffusion are logged asynchronously
+    # and independently of each other. We track the eval results
+    # so we can get the first eval step that passes the convergence criteria
+    stable_diffusion_eval_results = defaultdict(dict)
     with open(result_file, 'r', encoding='latin-1') as f:
         # TODO: use mlperf_logging.compliance_checker.mlp_parser instead
         file_contents = f.readlines()
@@ -63,23 +69,39 @@ def read_submission_file(result_file, use_train_samples):
                 benchmark = json.loads(str)["value"]
                 if benchmark != "bert" and use_train_samples:
                     use_train_samples = False
-            if not use_train_samples and ("eval_error" in str or "eval_accuracy" in str):
+
+            if benchmark == "stable_diffusion" and ("eval_error" in str or "eval_accuracy" in str):
+                eval_accuracy_str = str
+                eval_step = json.loads(eval_accuracy_str)["metadata"]["step_num"]
+                eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"]
+                eval_score = json.loads(eval_accuracy_str)["value"]
+                stable_diffusion_eval_results[eval_step][eval_metric] = eval_score
+            elif not use_train_samples and ("eval_error" in str or "eval_accuracy" in str):
                 eval_accuracy_str = str
                 conv_epoch = json.loads(eval_accuracy_str)["metadata"]["epoch_num"]
                 conv_epoch = round(conv_epoch, 3)
-            if use_train_samples and "train_samples" in str:
+            elif use_train_samples and "train_samples" in str:
                 eval_accuracy_str = str
                 conv_epoch = json.loads(eval_accuracy_str)["value"]
+
             if "run_stop" in str:
-                # Epochs to converge is the the last epochs value on
-                # eval_accuracy line before run_stop
                 conv_result = json.loads(str)["metadata"]["status"]
                 if conv_result == "success":
-                    subm_epochs = conv_epoch
                     not_converged = 0
+                    # Epochs to converge is the last epochs value on the
+                    # eval_accuracy line before run_stop, except for Stable Diffusion,
+                    # where we use the first eval step that passes the convergence criteria
+                    if benchmark == "stable_diffusion":
+                        passing_epochs = []
+                        for eval_step, eval_result in stable_diffusion_eval_results.items():
+                            # TODO: we shouldn't hardcode the convergence criteria here!
+                            if eval_result["FID"] <= 90.0 and eval_result["CLIP"] >= 0.15:
+                                passing_epochs.append(eval_step)
+                        conv_epoch = min(passing_epochs)
+                    subm_epochs = conv_epoch
                 else:
-                    subm_epochs = 1e9
                     not_converged = 1
+                    subm_epochs = 1e9
                 if not_converged:
                     logging.warning(' Run incomplete or did not converge. Marking as infinite.')
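The convergence selection added to `rcp_checker.py` above, restated as a standalone sketch over made-up eval results:

    from collections import defaultdict

    # Invented per-step eval results; in the checker these are collected
    # from eval_accuracy log lines into stable_diffusion_eval_results.
    eval_results = defaultdict(dict)
    eval_results[2500].update({"FID": 104.3, "CLIP": 0.12})  # fails both criteria
    eval_results[5000].update({"FID": 88.7, "CLIP": 0.16})   # first passing step
    eval_results[7500].update({"FID": 61.2, "CLIP": 0.20})   # also passes

    # First eval step meeting FID <= 90.0 and CLIP >= 0.15 counts as convergence.
    passing_steps = [step for step, result in eval_results.items()
                     if result["FID"] <= 90.0 and result["CLIP"] >= 0.15]
    conv_epoch = min(passing_steps)
    assert conv_epoch == 5000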
diff --git a/mlperf_logging/rcp_checker/training_3.1.0/rcps_stable_diffusion.json b/mlperf_logging/rcp_checker/training_3.1.0/rcps_stable_diffusion.json
new file mode 100644
index 0000000..0679ec5
--- /dev/null
+++ b/mlperf_logging/rcp_checker/training_3.1.0/rcps_stable_diffusion.json
@@ -0,0 +1,66 @@
+{
+
+    "sd_ref_512":
+    {
+        "Benchmark": "stable_diffusion",
+        "Creator": "NVIDIA",
+        "When": "Reference RCPs before v3.1",
+        "Platform": "32xDGX-A100",
+        "BS": 512,
+        "Hyperparams": {
+            "opt_adamw_beta_1": 0.9,
+            "opt_adamw_beta_2": 0.999,
+            "opt_adamw_epsilon": 1e-08,
+            "opt_adamw_weight_decay": 0.01,
+            "opt_base_learning_rate": 1.25e-7,
+            "opt_learning_rate_warmup_steps": 1000
+        },
+        "Epochs to converge": [
+            5000, 5000, 5000, 5000, 5000,
+            5000, 5000, 5000, 5000, 5000,
+            5000, 5000, 5000, 6000]
+    },
+
+    "sd_ref_1024":
+    {
+        "Benchmark": "stable_diffusion",
+        "Creator": "NVIDIA",
+        "When": "Reference RCPs before v3.1",
+        "Platform": "32xDGX-A100",
+        "BS": 1024,
+        "Hyperparams": {
+            "opt_adamw_beta_1": 0.9,
+            "opt_adamw_beta_2": 0.999,
+            "opt_adamw_epsilon": 1e-08,
+            "opt_adamw_weight_decay": 0.01,
+            "opt_base_learning_rate": 1.25e-7,
+            "opt_learning_rate_warmup_steps": 1000
+        },
+        "Epochs to converge": [
+            2500, 2500, 2500, 2500, 2500,
+            3000, 3000, 3000, 3000, 3000,
+            3000, 3000, 2500]
+    },
+
+    "sd_ref_2048":
+    {
+        "Benchmark": "stable_diffusion",
+        "Creator": "NVIDIA",
+        "When": "Reference RCPs before v3.1",
+        "Platform": "32xDGX-A100",
+        "BS": 2048,
+        "Hyperparams": {
+            "opt_adamw_beta_1": 0.9,
+            "opt_adamw_beta_2": 0.999,
+            "opt_adamw_epsilon": 1e-08,
+            "opt_adamw_weight_decay": 0.01,
+            "opt_base_learning_rate": 1.25e-7,
+            "opt_learning_rate_warmup_steps": 1000
+        },
+        "Epochs to converge": [
+            1750, 1750, 1750, 1750, 2000,
+            2000, 2000, 2000, 2000, 2000,
+            2000, 2250, 2250]
+    }
+
+}
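A quick sanity check of the new RCP file (illustrative only; the actual rcp_checker applies its own pruning and statistics to these samples):

    import json
    import statistics

    # Path as added by this diff; assumes the script runs from the repo root.
    path = "mlperf_logging/rcp_checker/training_3.1.0/rcps_stable_diffusion.json"
    with open(path) as f:
        rcps = json.load(f)

    for name, rcp in rcps.items():
        epochs = rcp["Epochs to converge"]
        print(name, "BS", rcp["BS"],
              "samples", len(epochs),
              "mean", round(statistics.mean(epochs), 1))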