From fe9122d424c80f1abb2c1f16a8acea3b7aeefa11 Mon Sep 17 00:00:00 2001
From: Daiyi Peng <daiyip@google.com>
Date: Thu, 2 May 2024 15:51:13 -0700
Subject: [PATCH] `lf.eval`: Fix a few issues

- Dryrun now uses the copy's examples to avoid messing up the current evaluation's state.
- Reset examples when starting to run.
- Fix usage aggregation logic when the LLM does not provide usage information.

This prevents the state of the current evaluation from getting messed up if calling `self.examples` has a side effect.

PiperOrigin-RevId: 630202707
---
 langfun/core/eval/base.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/langfun/core/eval/base.py b/langfun/core/eval/base.py
index c6bae9a..3fa8bf7 100644
--- a/langfun/core/eval/base.py
+++ b/langfun/core/eval/base.py
@@ -549,7 +549,7 @@ def _render_message(self, message: lf.Message, s: io.StringIO) -> None:
         )
         s.write(html.escape(pg.format(m.result)))
         s.write('</div>')
-      if 'usage' in m.metadata:
+      if 'usage' in m.metadata and m.usage is not None:
         s.write(
             '<div style="background-color: #EEEEEE; color: black; '
             'white-space: pre-wrap; padding: 10px; border: 0px solid; '
@@ -1056,11 +1056,11 @@ def _dryrun(
       verbose: bool,
       **kwargs,
   ) -> None:
-    # Set the example for dryrun.
-    example = example or self.examples[0]
-
     # We make a copy to avoid pollute the state of current object.
     copy: Evaluation = self.clone()
+
+    # Set the example for dryrun.
+    example = example or copy.examples[0]
     copy.__dict__['examples'] = [example]
 
     # We set the symbolic parent of the cloned to access contextual information
@@ -1126,6 +1126,9 @@ def _run(
       **kwargs,
   ) -> None:
     # Setup examples.
+    # Reset examples so it could be read from the input functor.
+    self.__dict__.pop('examples', None)
+
     if end is None:
       end = len(self.examples)
     examples = self.examples[start:end]
@@ -1434,8 +1437,9 @@ def audit(
     self._num_completed += 1
 
   def audit_usage(self, message: lf.Message, dryrun: bool = False) -> None:
+    del dryrun
     for m in message.trace():
-      if 'usage' in m.metadata:
+      if m.metadata.get('usage', None) is not None:
         self._total_prompt_tokens += m.usage.prompt_tokens
         self._total_completion_tokens += m.usage.completion_tokens
         self._num_usages += 1