forked from patrickrchao/JailbreakingLLMs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
judges.py
71 lines (59 loc) · 2.59 KB
/
judges.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from fastchat.model import (
get_conversation_template
)
import re
from system_prompts import get_judge_system_prompt
from language_models import GPT
def load_judge(args):
if "gpt" in args.judge_model:
return GPTJudge(args)
elif args.judge_model == "no-judge":
return NoJudge(args)
else:
raise NotImplementedError
class JudgeBase:
def __init__(self, args):
self.max_n_tokens = args.judge_max_n_tokens
self.temperature = args.judge_temperature
self.judge_name = args.judge_model
self.goal = args.goal
self.target_str = args.target_str
self.system_prompt = get_judge_system_prompt(args.goal, args.target_str)
def get_judge_prompt(self, attack_prompt, target_response):
return f"[PROMPT]:{attack_prompt}\n[ASSISTANT'S RESPONSE]: {target_response}"
def score(self, prompt_list, response_list):
raise NotImplementedError
def process_output(self, raw_output):
pattern = r'\[\[(\d+)\]\]'
match = re.search(pattern, raw_output)
output = int(match.group(1)) if match else None
if output is None:
print(f"Error in processing judge output: {raw_output}" )
output = 1
return output
class NoJudge(JudgeBase):
def __init__(self, args):
super(NoJudge, self).__init__(args)
self.judge_model = None
def score(self, prompt_list, response_list):
return [1 for _ in prompt_list]
class GPTJudge(JudgeBase):
def __init__(self, args):
super(GPTJudge, self).__init__(args)
self.judge_model = GPT(model_name = self.judge_name)
def create_conv(self, full_prompt):
conv = get_conversation_template(self.judge_name)
conv.set_system_message(self.system_prompt)
conv.append_message(conv.roles[0], full_prompt)
return conv.to_openai_api_messages()
def score(self, attack_prompt_list, target_response_list):
convs_list = [self.create_conv(self.get_judge_prompt(prompt, response)) for prompt, response in zip(attack_prompt_list, target_response_list)]
raw_outputs = self.judge_model.batched_generate(convs_list,
max_n_tokens = self.max_n_tokens,
temperature = self.temperature)
outputs = [self.process_output(raw_output) for raw_output in raw_outputs]
return outputs
class OpenSourceJudge(JudgeBase):
def __init__(self, judge_model, judge_tokenizer, args):
# TODO: Implement open source judge
raise NotImplementedError