peek_data.py
import json

import matplotlib.pyplot as plt
import tqdm

from template import generate_prompt
from transformers import LlamaTokenizer

# Load the LLaMA tokenizer; add_eos_token=True appends the end-of-sequence
# token, so the measured prompt lengths match what the model sees in training.
tokenizer = LlamaTokenizer.from_pretrained('decapoda-research/llama-7b-hf', add_eos_token=True)


def his_len():
    """Plot a histogram of tokenized prompt lengths for the raw dataset."""
    with open(r'raw_data\metra_aplaca_10000.json') as f:
        data = json.load(f)
    prompt_size = []
    for d in tqdm.tqdm(data):
        result = tokenizer(generate_prompt(d['input'], d['output']), truncation=False)
        prompt_size.append(len(result['input_ids']))
    # Fraction of prompts that would survive a 256-token cutoff untruncated.
    print('frac of lower than 256: ', len([s for s in prompt_size if s < 256]) / len(prompt_size))
    plt.hist(prompt_size, bins=20)
    plt.show()


def peek_data():
    """Inspect the merged sample data: count examples with an empty input."""
    import re  # only needed for the optional colon check below
    with open(r'sample\merge.json') as f:
        examples = json.load(f)
    # Optional: print examples whose input contains a colon.
    # for d in examples:
    #     if re.match('.*[\:|:].*', d['input']):
    #         print(d)
    print(len([d for d in examples if d['input'] == ""]))


if __name__ == '__main__':
    his_len()
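
# Note: `generate_prompt` is imported from template.py, which is not included
# here. As a rough, hypothetical sketch only (the real template.py may differ),
# an Alpaca-style template for (input, output) pairs typically looks like:
#
# def generate_prompt(input_text, output_text):
#     return (
#         "Below is an instruction that describes a task. "
#         "Write a response that appropriately completes the request.\n\n"
#         f"### Instruction:\n{input_text}\n\n"
#         f"### Response:\n{output_text}"
#     )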