Commit
finish config file
crazycth committed Sep 17, 2023
1 parent 125fcd6 commit 93579e3
Showing 13 changed files with 108 additions and 53 deletions.
52 changes: 41 additions & 11 deletions demo/configs/translate_quiz.yml
@@ -14,27 +14,57 @@ dataset:
**Only provide the dictionary.**"
input_function:
description:
Given an English sentence and its corresponding standard Chinese translation, translate the sentence into Chinese.
The current task assesses the model's translation ability. Please provide a complex and vivid English sentence and its corresponding Chinese translation, which together can fully evaluate translation ability from English to Chinese.
name: translation_english_to_chinese
parameters:
teacher_quiz: str
number_of_examples: 2
teacher_answer: str
expected_param_name: teacher_answer
number_of_examples: 5
output_path: null
source_type: machine_generated


variations:
- name: model_name
variations:
- instantiated_value: gpt-3.5-turbo
value: gpt-3.5-turbo
- instantiated_value: a16z-infra/llama-2-13b-chat:9dff94b1bed5af738655d4a7cbcdcde2bd503aa85c94334fe1f42af7f3dd5ee3
value: a16z-infra/llama-2-13b-chat:9dff94b1bed5af738655d4a7cbcdcde2bd503aa85c94334fe1f42af7f3dd5ee3
value_type: str
variation_id: null

# evaluators: # compare the teacher's answer (expected result) generated by the data generator with the student's answer generated by the custom function?
# - evaluator_type: individual
# metric_calculators:
# - method: AVERAGE
# name: rouge_evaluator
evaluators:
- evaluator_type: individual
name: bertscore_evaluator
metric_calculators:
- method: AVERAGE
display_name: p
indicator: p
- evaluator_type: individual
name: bertscore_evaluator
metric_calculators:
- method: AVERAGE
display_name: r
indicator: r
- evaluator_type: individual
name: bertscore_evaluator
metric_calculators:
- method: AVERAGE
display_name: f
indicator: f

# improver: # different from our default improver; how do we connect openai_finetune_utils to the improver?
# name:
selection_strategy:
ahp_selection:
criteria:
- "bertscore_evaluator: p"
- "bertscore_evaluator: r"
- "bertscore_evaluator: f"
criteria_maximization:
"bertscore_evaluator: p": true
"bertscore_evaluator: r": true
"bertscore_evaluator: f": true
criteria_weights:
"bertscore_evaluator: p": 0.33
"bertscore_evaluator: r": 0.33
"bertscore_evaluator: f": 0.33
normalize_func: null
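
For context, the selection strategy above weights the three BERTScore criteria equally and maximizes each. Below is a minimal sketch of the kind of weighted aggregation this implies, assuming a plain weighted sum; the actual AHPSelection logic lives in src/yival/result_selectors/ahp_selection.py and may differ:

# Sketch only: combine per-criterion averages into one score using the
# weights from the config above. Equal 0.33 weights mean p, r, and f
# contribute equally; the weighted sum here is an assumption, not yival's code.
criteria_weights = {
    "bertscore_evaluator: p": 0.33,
    "bertscore_evaluator: r": 0.33,
    "bertscore_evaluator: f": 0.33,
}

def weighted_score(criteria_values: dict) -> float:
    """criteria_values maps each criterion name to its averaged metric."""
    return sum(criteria_weights[name] * value for name, value in criteria_values.items())

# Example: a combination that averaged p=0.81, r=0.78, f=0.79
print(weighted_score({
    "bertscore_evaluator: p": 0.81,
    "bertscore_evaluator: r": 0.78,
    "bertscore_evaluator: f": 0.79,
}))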
59 changes: 26 additions & 33 deletions demo/translate_quiz.py
@@ -2,7 +2,9 @@

import openai

from yival.common.model_utils import llm_completion
from yival.logger.token_logger import TokenLogger
from yival.schemas.model_configs import Request
from yival.wrappers.string_wrapper import StringWrapper


@@ -11,40 +13,31 @@ def translate_quiz(teacher_quiz: str) -> str:
logger.reset()
# Ensure you have your OpenAI API key set up
openai.api_key = os.getenv("OPENAI_API_KEY")
messages = [{
"role":
"system",
"content":
"You are a diligent student that translates English to Chinese."
}, {
"role":
"user",
"content":
str(
StringWrapper(
"Translate this English text to Chinese: ", name="task"
)
) + f'{teacher_quiz}'
}]
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo", messages=messages
)
# prompt = str(
# StringWrapper(
# "Translate this English text to Chinese:",
# name="task",
# variables={
# "teacher_quiz": teacher_quiz,
# }
# )
# messages = [{
# "role":
# "system",
# "content":
# "You are a diligent student that translates English to Chinese."
# }, {
# "role":
# "user",
# "content":
# str(
# StringWrapper(
# "Translate this English text to Chinese: ", name="task"
# )
# ) + f'{teacher_quiz}'
# }]
# response = openai.ChatCompletion.create(
# model="gpt-3.5-turbo", messages=messages
# )
# # Use the chat-based completion
# response = llm_completion(
# Request(
# model_name=str(StringWrapper("a16z-infra/llama-2-13b-chat:9dff94b1bed5af738655d4a7cbcdcde2bd503aa85c94334fe1f42af7f3dd5ee3", name="model_name")),
# prompt=prompt
# )
# ).output

prompt = f"Translate this English text to Chinese: \n Please only output the translated sentence, do not reply with any other content. {teacher_quiz}"
model_name = str(StringWrapper("", name="model_name"))
# Use the chat-based completion
response = llm_completion(
Request(model_name=model_name, prompt=prompt)
).output
print(f"[DEBUG]response:{response}")
res = response['choices'][0]['message']['content']
# token_usage = response['usage']['total_tokens']
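The rewritten function above leaves the model name as an empty StringWrapper so that the experiment runner can substitute each model_name variation from the YAML config. A rough illustration of that intent, assuming the substitution happens outside this function; the explicit loop below is illustrative, not yival's mechanism:

# Illustrative only: the same custom function is run once per configured model.
# In the real code, str(StringWrapper("", name="model_name")) resolves to the
# active variation's value; iterating explicitly here is an assumption for clarity.
variations = [
    "gpt-3.5-turbo",
    "a16z-infra/llama-2-13b-chat:9dff94b1bed5af738655d4a7cbcdcde2bd503aa85c94334fe1f42af7f3dd5ee3",
]

for model_name in variations:
    prompt = ("Translate this English text to Chinese: \n Please only output the "
              "translated sentence, do not reply with any other content. Hello, world.")
    # response = llm_completion(Request(model_name=model_name, prompt=prompt)).output
    # res = response['choices'][0]['message']['content']
    print(f"would call {model_name} with prompt: {prompt[:40]}...")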
6 changes: 5 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -44,6 +44,8 @@ torch = ">=2.0.0, !=2.0.1"
transformers = "^4.33.0"
psutil = "^5.9.5"
replicate = "^0.12.0"
bert-score = "^0.3.13"
rouge = "^1.0.1"

[tool.poetry.group.doc.dependencies]
mkdocs = "*"
4 changes: 4 additions & 0 deletions src/yival/cli/init.py
@@ -9,10 +9,12 @@
from ..data_generators.openai_prompt_data_generator import (
OpenAIPromptBasedGeneratorConfig,
)
from ..evaluators.bertscore_evaluator import BertScoreEvaluator
from ..evaluators.openai_elo_evaluator import OpenAIEloEvaluator
from ..evaluators.openai_prompt_based_evaluator import (
OpenAIPromptBasedEvaluator,
)
from ..evaluators.rouge_evaluator import RougeEvaluator
from ..evaluators.string_expected_result_evaluator import (
StringExpectedResultEvaluator,
)
@@ -27,6 +29,8 @@
def _prevent_unused_imports():
_ = StringWrapper
_ = StringExpectedResultEvaluator
_ = RougeEvaluator
_ = BertScoreEvaluator
_ = CSVReader
_ = OpenAIPromptBasedGeneratorConfig
_ = OpenAIPromptBasedVariationGenerator
12 changes: 11 additions & 1 deletion src/yival/data_generators/openai_prompt_data_generator.py
@@ -144,11 +144,20 @@ def process_output(
self.config.input_function.get('parameters', {}).keys()
):
return

# choose expected_value
expected_value = generated_example.get(
self.config.expected_param_name, None
)
print(f"[DEBUG] generated_example:{generated_example}")

if expected_value:
generated_example.pop(self.config.expected_param_name)
print(f"[DEBUG] expected_value:{expected_value}")
input_data_instance = InputData(
example_id=super().generate_example_id(output_content),
content=generated_example,
expected_result=None
expected_result=expected_value
)
all_data.append(input_data_instance)
chunk.append(input_data_instance)
@@ -210,6 +219,7 @@ def generate_examples(self) -> Iterator[List[InputData]]:
c = extract_dict_from_gpt_output(
output.choices[0].message.content
)
print(f"[DEBUG]output:{output}")
if c:
all_data_content.append(c)
pbar.update(1)
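In plain terms, the process_output change above splits each generated dictionary into function inputs plus an expected result, keyed by expected_param_name from the config. A minimal illustration with a made-up example; the dictionary contents are assumptions:

# Illustration of the expected-value split added above, not the generator itself.
expected_param_name = "teacher_answer"   # from the YAML config

generated_example = {
    "teacher_quiz": "The quick brown fox jumps over the lazy dog.",
    "teacher_answer": "敏捷的棕色狐狸跳过了懒惰的狗。",
}

expected_value = generated_example.get(expected_param_name, None)
if expected_value:
    generated_example.pop(expected_param_name)

# generated_example now holds only the custom function's inputs, while
# expected_value becomes InputData.expected_result for the evaluators.
print(generated_example)   # {'teacher_quiz': 'The quick brown fox ...'}
print(expected_value)      # 敏捷的棕色狐狸跳过了懒惰的狗。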
11 changes: 7 additions & 4 deletions src/yival/evaluators/bertscore_evaluator.py
@@ -4,7 +4,7 @@
It has been shown to correlate with human judgment on sentence-level and system-level evaluation.
"""

from bert_score import score
from bert_score import score # type:ignore

from ..schemas.common_structures import InputData
from ..schemas.evaluator_config import (
@@ -35,11 +35,14 @@ def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
verbose=True)

scores = {"p": p.item(), "r": r.item(), "f": f1.item()}
result = scores.get(self.config.indicator, .0)

print(f"[INFO] bert_score: {scores}, result:{result}")

return EvaluatorOutput(
name=self.config.name,
display_name="bertscore",
result=scores,
display_name=self.config.display_name,
result=result,
metric_calculators=self.config.metric_calculators
)

@@ -52,7 +55,7 @@ def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
def main():
"""Main function to test the bertscore evaluator"""
evaluator_config = BertScoreEvaluatorConfig(
name="roge_evaluator",
name="bertscore_evaluator",
display_name="rouge",
metric_calculators=[],
)
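For reference, a hedged usage sketch of the bert_score call this evaluator wraps, with lang="zh" as in the config and a single indicator picked out the way the edited evaluate() does. Running it requires the bert-score package added to pyproject.toml plus a model download on first use; the sentences are made up:

from bert_score import score  # type: ignore

candidates = ["敏捷的棕色狐狸跳过了懒惰的狗。"]      # raw model output
references = ["一只敏捷的棕色狐狸跳过了那只懒狗。"]    # expected translation

# score() returns precision/recall/F1 tensors with one entry per pair.
p, r, f1 = score(candidates, references, lang="zh", verbose=True)

scores = {"p": p.item(), "r": r.item(), "f": f1.item()}
indicator = "f"   # mirrors BertScoreEvaluatorConfig.indicator
print(scores.get(indicator, 0.0))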
7 changes: 6 additions & 1 deletion src/yival/evaluators/rouge_evaluator.py
@@ -30,15 +30,20 @@ def __init__(self, config: RougeEvaluatorConfig):

def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
"""Evaluate the experiment result using rouge evaluat"""
print("[INFO] ROUGE_EVALUATOR")
assert isinstance(self.config, RougeEvaluatorConfig)
input_data = experiment_result.input_data
raw_output = experiment_result.raw_output
expected_result = input_data.expected_result

print(f"[INFO] hyps:{raw_output}, refs:{expected_result}")

scores = self.rough.get_scores(
hyps=raw_output, refs=expected_result, avg=True
)

print(f"[INFO] scores:{scores}")

return EvaluatorOutput(
name=self.config.name,
display_name="rouge",
@@ -55,7 +60,7 @@ def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
def main():
"""Main function to test the rouge evaluator"""
evaluator_config = RougeEvaluatorConfig(
name="roge_evaluator",
name="rouge_evaluator",
display_name="rouge",
metric_calculators=[],
)
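Similarly, a minimal usage sketch of the rouge package call behind get_scores, shown with English strings since the package tokenizes on whitespace; in the evaluator the hypothesis and reference come from raw_output and expected_result:

from rouge import Rouge

rouge = Rouge()
hyps = "the quick brown fox jumps over the lazy dog"
refs = "a quick brown fox jumped over a lazy dog"

# With avg=True a single dict of averaged rouge-1 / rouge-2 / rouge-l scores
# is returned, each carrying 'r', 'p', and 'f' fields.
scores = rouge.get_scores(hyps=hyps, refs=refs, avg=True)
print(scores["rouge-l"]["f"])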
1 change: 0 additions & 1 deletion src/yival/experiment/experiment_runner.py
@@ -138,7 +138,6 @@ def run(
improver = get_improver(self.config)
print(f"[INFO] improver: {improver}")
if improver:
print("[INFO] improve")
experiment.improver_output = improver.improve(
experiment, self.config, evaluator, logger
)
1 change: 0 additions & 1 deletion src/yival/experiment/utils.py
@@ -268,7 +268,6 @@ def run_single_input(
config["custom_function"], # type: ignore
**d.content
)
print(f"[INFO]res: {res}")
end_time = time.time()
latency = end_time - start_time # Time in seconds

2 changes: 2 additions & 0 deletions src/yival/result_selectors/ahp_selection.py
@@ -37,6 +37,8 @@ def select(self, experiment: Experiment) -> SelectionOutput:
combination_data = {}
alternatives_data = []

print(f"[INFO] experiment:{experiment}")

# Convert each combination's metrics to a criteria vector
for combo in experiment.combination_aggregated_metrics:
data = self._extract_data(combo)
3 changes: 3 additions & 0 deletions src/yival/schemas/data_generator_configs.py
@@ -28,5 +28,8 @@ class OpenAIPromptBasedGeneratorConfig(BaseDataGeneratorConfig):
diversify: bool = True
max_token = 2000

# Expected Value name
expected_param_name: str = ""

def asdict(self):
return asdict(self)
1 change: 1 addition & 0 deletions src/yival/schemas/evaluator_config.py
@@ -164,3 +164,4 @@ class BertScoreEvaluatorConfig(EvaluatorConfig):
evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
description: str = " This is the description of the evaluator"
lan: str = 'zh'
indicator: str = 'p' # p,r,f
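
With the new indicator field, the same evaluator class can be configured three times, one per BERTScore component, which is exactly what the updated YAML does. A hedged construction example follows; the keyword arguments mirror the fields of this dataclass, but the exact instantiation pattern is an assumption:

from yival.schemas.evaluator_config import BertScoreEvaluatorConfig

# Hypothetical: one config per indicator, matching the three evaluator
# entries in demo/configs/translate_quiz.yml.
configs = [
    BertScoreEvaluatorConfig(
        name="bertscore_evaluator",
        display_name=indicator,
        metric_calculators=[],
        indicator=indicator,   # 'p', 'r', or 'f'
    )
    for indicator in ("p", "r", "f")
]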
