Commit
[Feat] add regression test and change saving logic related to `output_path` (EvolvingLMMs-Lab#259)

* feat: add new output_path saving logic and add an evaluation tracker to manage the sample-saving process

* add: regression test

* add: regression test

* clean: remove unused code

* 🚫 Remove unused import for cleaner code

Eliminated the commented-out WandbLogger import to tidy up the code and improve readability; stale, unused code only invites confusion.

No functional changes, just a step towards a more streamlined codebase.
Luodian authored Sep 17, 2024
1 parent a2a881c · commit e20d5d6
Showing 6 changed files with 244 additions and 50 deletions.
22 changes: 22 additions & 0 deletions docs/commands.md
@@ -251,4 +251,26 @@ pip install httpx==0.23.3
pip install protobuf==3.20
```

## Regression Test

After each PR, run the regression test to make sure model performance has not degraded.

```bash
python3 tools/regression.py
```

Example output:

```bash
Already on 'dev/fix_output_path'

|task|llava-onevision-qwen2-0.5b-ov|
|--|--|
|ocrbench (dev/fix_output_path)|0.70 ± 0.70|
|mmmu_val (dev/fix_output_path)|50.00 ± 50.00|
|ai2d (dev/fix_output_path)|50.00 ± 50.00|
|muirbench (dev/fix_output_path)|12.50 ± 12.50|
|videomme (dev/fix_output_path)|2500.00 ± 2500.00|

|branch|runtime|%|
|--|--|--|
|dev/fix_output_path|87.7s|100%|
```
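
The script itself (`tools/regression.py`, presumably the sixth changed file, whose diff did not render on this page) is not shown here. As a rough illustration of what such a branch-comparison harness does, a minimal sketch follows; the branch list, task subset, model name, and flags are assumptions, not the script's actual contents:

```python
# Illustrative sketch only; the real tools/regression.py is not rendered in
# this diff. Branch names, tasks, model, and flags below are assumptions.
import json
import subprocess
import time
from pathlib import Path

BRANCHES = ["main", "dev/fix_output_path"]  # assumed branches to compare
TASKS = "ai2d,ocrbench"                     # assumed small task subset


def run_eval(branch: str, output_dir: Path) -> float:
    """Check out `branch`, run a small evaluation, and return the runtime in seconds."""
    subprocess.run(["git", "checkout", branch], check=True)
    start = time.time()
    subprocess.run(
        [
            "python3", "-m", "lmms_eval",
            "--model", "llava_onevision",   # assumed model name
            "--tasks", TASKS,
            "--limit", "4",                 # tiny limit: a smoke test, not a full eval
            "--output_path", str(output_dir / branch.replace("/", "__")),
        ],
        check=True,
    )
    return time.time() - start


if __name__ == "__main__":
    out = Path("./regression_logs")
    runtimes = {branch: run_eval(branch, out) for branch in BRANCHES}
    print(json.dumps(runtimes, indent=2))  # compare runtimes across branches
```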
64 changes: 30 additions & 34 deletions lmms_eval/__main__.py
@@ -25,8 +25,6 @@
from lmms_eval.api.registry import ALL_TASKS
from lmms_eval.evaluator import request_caching_arg_to_dict
from lmms_eval.loggers import EvaluationTracker, WandbLogger

# from lmms_eval.logging_utils import WandbLogger
from lmms_eval.tasks import TaskManager
from lmms_eval.utils import (
handle_non_serializable,
@@ -230,7 +228,7 @@ def parse_eval_args() -> argparse.Namespace:
parser.add_argument(
"--timezone",
default="Asia/Singapore",
help="Timezone for datetime string, e.g. Asia/Singapore, America/New_York, America/Los_Angeles",
help="Timezone for datetime string, e.g. Asia/Singapore, America/New_York, America/Los_Angeles. You can check the full list via `import pytz; print(pytz.common_timezones)`",
)
parser.add_argument(
"--hf_hub_log_args",
@@ -349,7 +347,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
for args, results in zip(args_list, results_list):
# cli_evaluate will return none if the process is not the main process (rank 0)
if results is not None:
print_results(args, results)
print(f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " f"batch_size: {args.batch_size}")
print(make_table(results))
if "groups" in results:
print(make_table(results, "groups"))

if args.wandb_args:
wandb_logger.run.finish()
@@ -462,22 +463,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:

eval_logger.info(f"Selected Tasks: {task_names}")
request_caching_args = request_caching_arg_to_dict(cache_requests=args.cache_requests)

# set datetime before evaluation
datetime_str = utils.get_datetime_str(timezone=args.timezone)
if args.output_path:
if args.log_samples_suffix and len(args.log_samples_suffix) > 15:
eval_logger.warning("The suffix for log_samples is too long. It is recommended to keep it under 15 characters.")
args.log_samples_suffix = args.log_samples_suffix[:5] + "..." + args.log_samples_suffix[-5:]

hash_input = f"{args.model_args}".encode("utf-8")
hash_output = hashlib.sha256(hash_input).hexdigest()[:6]
path = Path(args.output_path)
path = path.expanduser().resolve().joinpath(f"{datetime_str}_{args.log_samples_suffix}_{args.model}_model_args_{hash_output}")
args.output_path = path

elif args.log_samples and not args.output_path:
assert args.output_path, "Specify --output_path"

results = evaluator.simple_evaluate(
model=args.model,
@@ -505,6 +491,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
torch_random_seed=args.seed[2],
fewshot_random_seed=args.seed[3],
cli_args=args,
datetime_str=datetime_str,
**request_caching_args,
)

@@ -517,21 +504,30 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
if args.show_config:
print(dumped)

if args.output_path:
args.output_path.mkdir(parents=True, exist_ok=True)
result_file_path = path.joinpath("results.json")
if result_file_path.exists():
eval_logger.warning(f"Output file {result_file_path} already exists and will be overwritten.")

result_file_path.open("w").write(dumped)
if args.log_samples:
for task_name, config in results["configs"].items():
filename = args.output_path.joinpath(f"{task_name}.json")
# Structure the data with 'args' and 'logs' keys
data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
filename.open("w", encoding="utf-8").write(samples_dumped)
eval_logger.info(f"Saved samples to {filename}")
batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

# Add W&B logging
if args.wandb_args:
try:
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
if args.log_samples:
wandb_logger.log_eval_samples(samples)
except Exception as e:
eval_logger.info(f"Logging to Weights and Biases failed due to {e}")

evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None, datetime_str=datetime_str)

if args.log_samples:
for task_name, config in results["configs"].items():
evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name])

if evaluation_tracker.push_results_to_hub or evaluation_tracker.push_samples_to_hub:
evaluation_tracker.recreate_metadata_card()

if args.wandb_args:
# Tear down wandb run once all the logging is done.
wandb_logger.run.finish()

return results, samples
return None, None
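
For reference, here is a runnable sketch of the on-disk layout the tracker-based flow produces, pieced together from the diffs below; the base directory and the model/task names are illustrative assumptions:

```python
from pathlib import Path

# What get_datetime_str() returns after this commit (see lmms_eval/utils.py below).
datetime_str = "20240917_153045"
date_id = datetime_str.replace(":", "-")  # the tracker strips colons for filenames

# Illustrative names; the tracker derives the real ones from the model config.
model_name_sanitized = "llava-onevision-qwen2-0.5b-ov"
task_name = "ai2d"

base = Path("./logs") / model_name_sanitized
print(base / f"{date_id}_results.json")               # was results_{date}.json
print(base / f"{date_id}_samples_{task_name}.jsonl")  # was samples_{task}_{date}.jsonl
```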
3 changes: 2 additions & 1 deletion lmms_eval/evaluator.py
@@ -76,6 +76,7 @@ def simple_evaluate(
numpy_random_seed: int = 1234,
torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
datetime_str: str = get_datetime_str(),
cli_args=None,
):
"""Instantiate and evaluate a model on a list of tasks.
@@ -292,7 +293,7 @@ def _adjust_config(task_dict):
}
)
results["git_hash"] = get_git_commit_hash()
results["date"] = get_datetime_str()
results["date"] = datetime_str
# add_env_info(results) # additional environment info to results
# add_tokenizer_info(results, lm) # additional info about tokenizer
return results
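
One subtlety worth flagging in `datetime_str: str = get_datetime_str()`: Python evaluates default-argument expressions once, at function definition time, so every caller that omits the argument shares the import-time timestamp. That is presumably why `__main__.py` now computes the value and passes it explicitly. A minimal demonstration of the semantics:

```python
import datetime


def stamp(now: str = datetime.datetime.now().isoformat()) -> str:
    # The default is computed once, when `def` executes, not on each call.
    return now


a = stamp()
b = stamp()
assert a == b  # both calls share the definition-time timestamp
```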
22 changes: 12 additions & 10 deletions lmms_eval/loggers/evaluation_tracker.py
@@ -15,6 +15,7 @@

from lmms_eval.utils import (
eval_logger,
get_datetime_str,
get_file_datetime,
get_file_task_name,
get_results_filenames,
@@ -154,7 +155,7 @@ def __init__(
eval_logger.warning(f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'.")

if hub_repo_name == "":
details_repo_name = details_repo_name if details_repo_name != "" else "lm-eval-results"
details_repo_name = details_repo_name if details_repo_name != "" else "lmms-eval-results"
results_repo_name = results_repo_name if results_repo_name != "" else details_repo_name
else:
details_repo_name = hub_repo_name
@@ -170,13 +171,15 @@ def save_results_aggregated(
self,
results: dict,
samples: dict,
datetime_str: str,
) -> None:
"""
Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
Args:
results (dict): The aggregated results to save.
samples (dict): The samples results to save.
datetime_str (str): The datetime string to use for the results file.
"""
self.general_config_tracker.log_end_time()

@@ -205,8 +208,8 @@ def save_results_aggregated(
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)

self.date_id = datetime.now().isoformat().replace(":", "-")
file_results_aggregated = path.joinpath(f"results_{self.date_id}.json")
self.date_id = datetime_str.replace(":", "-")
file_results_aggregated = path.joinpath(f"{self.date_id}_results.json")
file_results_aggregated.open("w", encoding="utf-8").write(dumped)

if self.api and self.push_results_to_hub:
@@ -219,10 +222,10 @@
)
self.api.upload_file(
repo_id=repo_id,
path_or_fileobj=str(path.joinpath(f"results_{self.date_id}.json")),
path_or_fileobj=str(path.joinpath(f"{self.date_id}_results.json")),
path_in_repo=os.path.join(
self.general_config_tracker.model_name,
f"results_{self.date_id}.json",
f"{self.date_id}_results.json",
),
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
@@ -255,18 +258,17 @@ def save_results_samples(
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)

file_results_samples = path.joinpath(f"samples_{task_name}_{self.date_id}.jsonl")
file_results_samples = path.joinpath(f"{self.date_id}_samples_{task_name}.jsonl")

for sample in samples:
# we first need to sanitize arguments and resps
# otherwise we won't be able to load the dataset
# using the datasets library
arguments = {}
for i, arg in enumerate(sample["arguments"]):
arguments[f"gen_args_{i}"] = {}
for j, tmp in enumerate(arg):
arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp
for key, value in enumerate(sample["arguments"][1]): # update metadata into args
arguments[key] = value

sample["input"] = sample["arguments"][0]
sample["resps"] = sanitize_list(sample["resps"])
sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
sample["arguments"] = arguments
14 changes: 9 additions & 5 deletions lmms_eval/utils.py
@@ -33,6 +33,7 @@
import gc
from itertools import islice

import numpy as np
import pytz
import torch
import transformers
@@ -238,11 +239,14 @@ def get_file_datetime(filename: str) -> str:
return filename[filename.rfind("_") + 1 :].replace(".jsonl", "")


def sanitize_model_name(model_name: str) -> str:
def sanitize_model_name(model_name: str, full_path: bool = False) -> str:
"""
Given the model name, returns a sanitized version of it.
"""
return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
if full_path:
return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
else:
return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name.split("/")[-1])


def sanitize_task_name(task_name: str) -> str:
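
To illustrate the new `full_path` switch: by default only the basename after the last `/` is kept, which shortens on-disk directory names for Hub-style model IDs. A runnable example (the model ID is illustrative):

```python
import re


def sanitize_model_name(model_name: str, full_path: bool = False) -> str:
    """Given the model name, returns a sanitized version of it."""
    if full_path:
        return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
    else:
        return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name.split("/")[-1])


print(sanitize_model_name("lmms-lab/llava-onevision-qwen2-0.5b-ov"))
# -> "llava-onevision-qwen2-0.5b-ov" (short directory name by default)
print(sanitize_model_name("lmms-lab/llava-onevision-qwen2-0.5b-ov", full_path=True))
# -> "lmms-lab__llava-onevision-qwen2-0.5b-ov"
```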
@@ -263,14 +267,14 @@ def get_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to aggregated results.
"""
return [f for f in filenames if "/results_" in f and ".json" in f]
return [f for f in filenames if "results" in f and ".json" in f]


def get_sample_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to sample results.
"""
return [f for f in filenames if "/samples_" in f and ".json" in f]
return [f for f in filenames if "samples" in f and ".json" in f]


def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
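
Because aggregated files are now named `{datetime}_results.json` rather than `results_{datetime}.json` (and samples files likewise), the substring filters are loosened to match either layout. A quick runnable check with illustrative filenames:

```python
filenames = [
    "llava__model/20240917_153045_results.json",
    "llava__model/20240917_153045_samples_ai2d.jsonl",
    "README.md",
]

results = [f for f in filenames if "results" in f and ".json" in f]
samples = [f for f in filenames if "samples" in f and ".json" in f]

print(results)  # ['llava__model/20240917_153045_results.json']
print(samples)  # ['llava__model/20240917_153045_samples_ai2d.jsonl']
```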
@@ -588,7 +592,7 @@ def get_datetime_str(timezone="Asia/Singapore"):
tz = pytz.timezone(timezone)
utc_now = datetime.datetime.now(datetime.timezone.utc)
local_time = utc_now.astimezone(tz)
return local_time.strftime("%m%d_%H%M")
return local_time.strftime("%Y%m%d_%H%M%S")


def ignore_constructor(loader, node):
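
The timestamp format widens from month-day plus hour-minute to a full second-resolution stamp, so repeated runs on the same day no longer collide. A runnable comparison of the two formats:

```python
import datetime

import pytz

tz = pytz.timezone("Asia/Singapore")
local_time = datetime.datetime.now(datetime.timezone.utc).astimezone(tz)

print(local_time.strftime("%m%d_%H%M"))      # old: e.g. "0917_1530"
print(local_time.strftime("%Y%m%d_%H%M%S"))  # new: e.g. "20240917_153045"
```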