frontend: Add feedback reaction buttons and improve chat history handling

Signed-off-by: error9098x <[email protected]>
error9098x committed Dec 3, 2024
1 parent e2af48f commit 56c1b1e
Showing 4 changed files with 172 additions and 89 deletions.
7 changes: 4 additions & 3 deletions frontend/requirements-test.txt
@@ -1,7 +1,7 @@
streamlit==1.37.0
streamlit==1.40.2
requests==2.32.3
requests-oauthlib==2.0.0
Pillow==10.3.0
Pillow==11.0.0
pytz==2024.1
google-auth==2.30.0
google-auth-httplib2==0.2.0
@@ -13,4 +13,5 @@ flask==3.0.3
types-pytz==2024.1.0.20240417
types-requests==2.32.0.20240622
pre-commit==3.7.1
ruff==0.5.1
ruff==0.5.1
mypy==1.10.1
4 changes: 2 additions & 2 deletions frontend/requirements.txt
@@ -1,7 +1,7 @@
streamlit==1.37.0
streamlit==1.40.2
requests==2.32.3
requests-oauthlib==2.0.0
Pillow==10.3.0
Pillow==11.0.0
pytz==2024.1
google-auth==2.30.0
google-auth-httplib2==0.2.0
212 changes: 144 additions & 68 deletions frontend/streamlit_app.py
@@ -6,7 +6,11 @@
import os
import ast
from PIL import Image
from utils.feedback import show_feedback_form
from utils.feedback import (
show_feedback_form,
submit_feedback_to_google_sheet,
get_git_commit_hash,
)
from dotenv import load_dotenv
from typing import Callable, Any

@@ -22,6 +26,24 @@ def wrapper(*args: Any, **kwargs: Any) -> tuple[Any, float]:
return wrapper


def translate_chat_history_to_api(chat_history, max_pairs=4):
api_format = []
relevant_history = [
msg for msg in chat_history[1:] if msg["role"] in ["user", "ai"]
]

i = len(relevant_history) - 1
while i > 0 and len(api_format) < max_pairs:
ai_msg = relevant_history[i]
user_msg = relevant_history[i - 1]
if ai_msg["role"] == "ai" and user_msg["role"] == "user":
api_format.insert(0, {"User": user_msg["content"], "AI": ai_msg["content"]})
i -= 2
else:
i -= 1
return api_format
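# Illustrative sketch (annotation, not part of the commit): skipping the
# greeting at index 0, the helper walks the history backwards and keeps at
# most `max_pairs` complete user/AI turns, oldest first. For example:
#   history = [
#       {"role": "ai", "content": "How can I help?"},
#       {"role": "user", "content": "What is OpenROAD?"},
#       {"role": "ai", "content": "An RTL-to-GDSII flow."},
#   ]
#   translate_chat_history_to_api(history)
#   # -> [{"User": "What is OpenROAD?", "AI": "An RTL-to-GDSII flow."}]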


@measure_response_time
def response_generator(user_input: str) -> tuple[str, str] | tuple[None, None]:
"""
@@ -34,74 +56,47 @@ def response_generator(user_input: str) -> tuple[str, str] | tuple[None, None]:
- tuple: Contains the AI response and sources.
"""
url = f"{st.session_state.base_url}{st.session_state.selected_endpoint}"

headers = {"accept": "application/json", "Content-Type": "application/json"}

payload = {"query": user_input, "list_sources": True, "list_context": True}

chat_history = translate_chat_history_to_api(st.session_state.chat_history)
payload = {
"query": user_input,
"list_sources": True,
"list_context": True,
"chat_history": chat_history,
}
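# Illustrative (annotation): the request body now carries prior turns, e.g.
#   {"query": "...", "list_sources": true, "list_context": true,
#    "chat_history": [{"User": "...", "AI": "..."}]}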
try:
response = requests.post(url, headers=headers, json=payload)
response.raise_for_status()

try:
data = response.json()
if not isinstance(data, dict):
st.error("Invalid response format")
return None, None
except ValueError:
st.error("Failed to decode JSON response")
data = response.json()
if not isinstance(data, dict):
st.error("Invalid response format")
return None, None

sources = data.get("sources", "")
st.session_state.metadata[user_input] = {
"sources": sources,
"context": data.get("context", ""),
}

return data.get("response", ""), sources

except requests.exceptions.RequestException as e:
st.error(f"Request failed: {e}")
return None, None


def fetch_endpoints() -> tuple[str, list[str]]:
base_url = os.getenv("CHAT_ENDPOINT", "http://localhost:8000")
url = f"{base_url}/chains/listAll"
try:
response = requests.get(url)
response.raise_for_status()
endpoints = response.json()
return base_url, endpoints
except requests.exceptions.RequestException as e:
st.error(f"Failed to fetch endpoints: {e}")
return base_url, []


def main() -> None:
load_dotenv()

img = Image.open("assets/or_logo.png")
st.set_page_config(page_title="OR Assistant", page_icon=img)

deployment_time = datetime.datetime.now(pytz.timezone("UTC"))
st.info(f"Deployment time: {deployment_time.strftime('%m/%d/%Y %H:%M')} UTC")
st.info(f'Deployment time: {deployment_time.strftime("%m/%d/%Y %H:%M")} UTC')

st.title("OR Assistant")

base_url, endpoints = fetch_endpoints()

selected_endpoint = st.selectbox(
"Select preferred endpoint",
options=endpoints,
index=0,
format_func=lambda x: x.split("/")[-1].capitalize(),
)
base_url = os.getenv("CHAT_ENDPOINT", "http://localhost:8000")
selected_endpoint = "/graphs/agent-retriever"

if "selected_endpoint" not in st.session_state:
st.session_state.selected_endpoint = selected_endpoint
else:
st.session_state.selected_endpoint = selected_endpoint

if "base_url" not in st.session_state:
st.session_state.base_url = base_url
@@ -115,6 +110,8 @@ def main() -> None:
st.session_state.chat_history = []
if "metadata" not in st.session_state:
st.session_state.metadata = {}
if "sources" not in st.session_state:
st.session_state.sources = {}

if not st.session_state.chat_history:
st.session_state.chat_history.append(
@@ -124,10 +121,42 @@
}
)

for message in st.session_state.chat_history:
for idx, message in enumerate(st.session_state.chat_history):
with st.chat_message(message["role"]):
st.markdown(message["content"])

if message["role"] == "ai" and idx > 0:
user_message = st.session_state.chat_history[idx - 1]
if user_message["role"] == "user":
user_input = user_message["content"]
sources = st.session_state.sources.get(user_input)
with st.expander("Sources:"):
try:
if sources:
if isinstance(sources, str):
cleaned_sources = sources.replace("{", "[").replace(
"}", "]"
)
parsed_sources = ast.literal_eval(cleaned_sources)
else:
parsed_sources = sources
if (
isinstance(parsed_sources, (list, set))
and parsed_sources
):
sources_list = "\n".join(
f"- [{link}]({link})"
for link in parsed_sources
if link.strip()
)
st.markdown(sources_list)
else:
st.markdown("No Sources Attached.")
else:
st.markdown("No Sources Attached.")
except (ValueError, SyntaxError) as e:
st.markdown(f"Failed to parse sources: {e}")

user_input = st.chat_input("Enter your queries ...")

if user_input:
@@ -146,62 +175,72 @@
):
response, sources = response_tuple
if response is not None:
response_buffer = ""
response_buffer = response

with st.chat_message("ai"):
message_placeholder = st.empty()

response_buffer = ""
for chunk in response.split(" "):
response_buffer += chunk + " "
if chunk.endswith("\n"):
response_buffer += " "
message_placeholder.markdown(response_buffer)
time.sleep(0.05)

message_placeholder.markdown(response_buffer)

# Display response time
response_time_text = (
f"Response Time: {response_time / 1000:.2f} seconds"
)
response_time_colored = f":{'green' if response_time < 5000 else 'orange' if response_time < 10000 else 'red'}[{response_time_text}]"
st.markdown(response_time_colored)
if response_time < 5000:
color = "green"
elif response_time < 10000:
color = "orange"
else:
color = "red"
st.markdown(
f"<span style='color:{color}'>{response_time_text}</span>",
unsafe_allow_html=True,
)

st.session_state.chat_history.append(
{
"content": response_buffer,
"role": "ai",
}
)

if sources:
with st.expander("Sources:"):
try:
st.session_state.sources[user_input] = sources

with st.expander("Sources:"):
try:
if sources:
if isinstance(sources, str):
cleaned_sources = sources.replace("{", "[").replace(
"}", "]"
)
parsed_sources = ast.literal_eval(cleaned_sources)
else:
parsed_sources = sources
if isinstance(parsed_sources, (list, set)):
if (
isinstance(parsed_sources, (list, set))
and parsed_sources
):
sources_list = "\n".join(
f"- [{link}]({link})"
for link in parsed_sources
if link.strip()
)
st.markdown(sources_list)
else:
st.markdown("No valid sources found.")
except (ValueError, SyntaxError) as e:
st.markdown(f"Failed to parse sources: {e}")
else:
st.error("Invalid response from the API")

st.markdown("No Sources Attached.")
else:
st.markdown("No Sources Attached.")
except (ValueError, SyntaxError) as e:
st.markdown(f"Failed to parse sources: {e}")
else:
st.error("Invalid response from the API")

# Reaction buttons and feedback form
question_dict = {
interaction["content"]: i
for i, interaction in enumerate(st.session_state.chat_history)
if interaction["role"] == "user"
}
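# Illustrative (annotation): question_dict maps each user message to its index
# in chat_history, e.g. {"What is OpenROAD?": 1, "How do I run routing?": 3},
# so the feedback form can reference a specific question.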

if question_dict and os.getenv("FEEDBACK_SHEET_ID"):
if "feedback_button" not in st.session_state:
st.session_state.feedback_button = False
@@ -212,10 +251,47 @@ def update_state() -> None:
"""
st.session_state.feedback_button = True

if (
st.button("Feedback", on_click=update_state)
or st.session_state.feedback_button
):
# Display reaction buttons
col1, col2, col3 = st.columns([1, 1, 2])
with col1:
thumbs_up = st.button("👍", key="thumbs_up")
with col2:
thumbs_down = st.button("👎", key="thumbs_down")
with col3:
feedback_clicked = st.button("Feedback", on_click=update_state)

# Handle thumbs up and thumbs down reactions
if thumbs_up or thumbs_down:
try:
selected_question = st.session_state.chat_history[-2][
"content"
] # Last user question
gen_ans = st.session_state.chat_history[-1][
"content"
] # Last AI response
sources = st.session_state.metadata.get(selected_question, {}).get(
"sources", ["N/A"]
)
context = st.session_state.metadata.get(selected_question, {}).get(
"context", ["N/A"]
)
reaction = "upvote" if thumbs_up else "downvote"

submit_feedback_to_google_sheet(
question=selected_question,
answer=gen_ans,
sources=sources if isinstance(sources, list) else [sources],
context=context if isinstance(context, list) else [context],
issue="", # Leave issue blank
version=os.getenv("RAG_VERSION", get_git_commit_hash()),
reaction=reaction, # Pass the reaction
)
st.success("Thank you for your feedback!")
except Exception as e:
st.error(f"Failed to submit feedback: {e}")

# Feedback form logic
if feedback_clicked or st.session_state.feedback_button:
try:
show_feedback_form(
question_dict,

1 comment on commit 56c1b1e


luarss (Collaborator) commented on 56c1b1e, Dec 18, 2024


===================================
==> Dataset: EDA Corpus
==> Running tests for agent-retriever
/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/__init__.py:49: UserWarning: You are using deepeval version 1.4.9, however version 2.0.5 is available. You should consider upgrading via the "pip install --upgrade deepeval" command.
warnings.warn(

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 8.89it/s]

Evaluating: 100%|██████████| 100/100 [18:46<00:00, 11.27s/it]
✨ You're running DeepEval's latest Contextual Precision Metric! (using
gemini-1.5-pro-002, strict=False, async_mode=True)...
✨ You're running DeepEval's latest Contextual Recall Metric! (using
gemini-1.5-pro-002, strict=False, async_mode=True)...
✨ You're running DeepEval's latest Hallucination Metric! (using
gemini-1.5-pro-002, strict=False, async_mode=True)...

Evaluating 100 test case(s) in parallel: |██████████|100% (100/100) [Time Taken: 00:55, 1.82test case/s]
✓ Tests finished 🎉! Run 'deepeval login' to save and analyze evaluation results
on Confident AI.
‼️ Friendly reminder 😇: You can also run evaluations with ALL of deepeval's
metrics directly on Confident AI instead.
Average Metric Scores:
Contextual Precision 0.7070555555555554
Contextual Recall 0.8928333333333334
Hallucination 0.5250240800865801
Metric Passrates:
Contextual Precision 0.67
Contextual Recall 0.86
Hallucination 0.6
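
For context, a minimal sketch of how an average score and passrate like the above could be derived from per-test-case scores (assuming a fixed 0.5 pass threshold; deepeval's thresholds are configurable and may differ, and for Hallucination lower scores pass, so the comparison would flip):

scores = [0.9, 0.4, 0.8, 0.7]  # hypothetical per-test-case metric scores
threshold = 0.5
average = sum(scores) / len(scores)                           # -> 0.7
passrate = sum(s >= threshold for s in scores) / len(scores)  # -> 0.75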
