Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/stanfordnlp/dspy
Browse files Browse the repository at this point in the history
  • Loading branch information
okhat committed Nov 28, 2024
2 parents 8845531 + 9138b64 commit 7f4e163
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 63 deletions.
64 changes: 31 additions & 33 deletions dspy/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,49 +20,44 @@ def __init__(self, train_seed=0, train_size=None, eval_seed=0, dev_size=None, te
self.name = self.__class__.__name__

def reset_seeds(self, train_seed=None, train_size=None, eval_seed=None, dev_size=None, test_size=None):
    """Override split sizes/seeds and invalidate the cached splits.

    Every argument is optional; passing ``None`` keeps the current value.
    ``eval_seed`` is applied to both ``dev_seed`` and ``test_seed``.

    Args:
        train_seed: New seed for the train split, or ``None`` to keep it.
        train_size: New size for the train split, or ``None`` to keep it.
        eval_seed: New seed for the dev and test splits, or ``None`` to keep them.
        dev_size: New size for the dev split, or ``None`` to keep it.
        test_size: New size for the test split, or ``None`` to keep it.
    """

    def _override(new_value, current_value):
        # Compare against None explicitly: 0 is a legitimate seed and size,
        # and ``new_value or current_value`` would silently discard it.
        return current_value if new_value is None else new_value

    self.train_size = _override(train_size, self.train_size)
    self.train_seed = _override(train_seed, self.train_seed)
    self.dev_size = _override(dev_size, self.dev_size)
    self.dev_seed = _override(eval_seed, self.dev_seed)
    self.test_size = _override(test_size, self.test_size)
    self.test_seed = _override(eval_seed, self.test_seed)

    # Drop any memoized splits so the next access re-samples with the new settings.
    for cached_attr in ("_train_", "_dev_", "_test_"):
        if hasattr(self, cached_attr):
            delattr(self, cached_attr)

@property
def train(self):
    """Return the train split, sampling it on first access.

    The sampled list is stored on ``_train_`` and returned as-is on later
    accesses for as long as that attribute exists.
    """
    if hasattr(self, "_train_"):
        return self._train_

    self._train_ = self._shuffle_and_sample("train", self._train, self.train_size, self.train_seed)
    return self._train_

@property
def dev(self):
    """Return the dev split, sampling it on first access.

    The sampled list is stored on ``_dev_`` and returned as-is on later
    accesses for as long as that attribute exists.
    """
    if hasattr(self, "_dev_"):
        return self._dev_

    self._dev_ = self._shuffle_and_sample("dev", self._dev, self.dev_size, self.dev_seed)
    return self._dev_

@property
def test(self):
    """Return the test split, sampling it on first access.

    The sampled list is stored on ``_test_`` and returned as-is on later
    accesses for as long as that attribute exists.
    """
    if hasattr(self, "_test_"):
        return self._test_

    self._test_ = self._shuffle_and_sample("test", self._test, self.test_size, self.test_seed)
    return self._test_

def _shuffle_and_sample(self, split, data, size, seed=0):
'''
The setting (seed=s, size=N) is always a subset
of the setting (seed=s, size=M) for N < M.
'''

data = list(data)

# Shuffle the data irrespective of the requested size.
Expand All @@ -84,15 +79,18 @@ def _shuffle_and_sample(self, split, data, size, seed=0):
# a uuid field that would respect this in some way. This means that we need a more refined concept than
# uuid (each example is unique), something more like a group_uuid.

# rng = random.Random(seed)
# rng.shuffle(data)

return output

@classmethod
def prepare_by_seed(cls, train_seeds=[1,2,3,4,5], train_size=16, dev_size=1000,
divide_eval_per_seed=True, eval_seed=2023, **kwargs):

def prepare_by_seed(
cls,
train_seeds=[1, 2, 3, 4, 5],
train_size=16,
dev_size=1000,
divide_eval_per_seed=True,
eval_seed=2023,
**kwargs,
):
data_args = dotdict(train_size=train_size, eval_seed=eval_seed, dev_size=dev_size, test_size=0, **kwargs)
dataset = cls(**data_args)

Expand All @@ -106,12 +104,12 @@ def prepare_by_seed(cls, train_seeds=[1,2,3,4,5], train_size=16, dev_size=1000,
data_args.train_seed = train_seed
dataset.reset_seeds(**data_args)

eval_sets.append(eval_set[eval_offset:eval_offset+examples_per_seed])
eval_sets.append(eval_set[eval_offset : eval_offset + examples_per_seed])
train_sets.append(dataset.train)

assert len(eval_sets[-1]) == examples_per_seed, len(eval_sets[-1])
assert len(train_sets[-1]) == train_size, len(train_sets[-1])

if divide_eval_per_seed:
eval_offset += examples_per_seed

Expand Down
53 changes: 23 additions & 30 deletions dspy/datasets/gsm8k.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,35 @@


class GSM8K:
def __init__(self) -> None:
super().__init__()
def __init__(self):
self.do_shuffle = False

dataset = load_dataset("gsm8k", 'main')
dataset = load_dataset("gsm8k", "main")

hf_official_train = dataset['train']
hf_official_test = dataset['test']
hf_official_train = dataset["train"]
hf_official_test = dataset["test"]
official_train = []
official_test = []

for example in tqdm.tqdm(hf_official_train):
question = example['question']
question = example["question"]

answer = example['answer'].strip().split()
assert answer[-2] == '####'
gold_reasoning = ' '.join(answer[:-2])
answer = str(int(answer[-1].replace(',', '')))
answer = example["answer"].strip().split()
assert answer[-2] == "####"

gold_reasoning = " ".join(answer[:-2])
answer = str(int(answer[-1].replace(",", "")))

official_train.append(dict(question=question, gold_reasoning=gold_reasoning, answer=answer))

for example in tqdm.tqdm(hf_official_test):
question = example['question']
question = example["question"]

answer = example["answer"].strip().split()
assert answer[-2] == "####"

answer = example['answer'].strip().split()
assert answer[-2] == '####'

gold_reasoning = ' '.join(answer[:-2])
answer = str(int(answer[-1].replace(',', '')))
gold_reasoning = " ".join(answer[:-2])
answer = str(int(answer[-1].replace(",", "")))

official_test.append(dict(question=question, gold_reasoning=gold_reasoning, answer=answer))

Expand All @@ -50,35 +49,29 @@ def __init__(self) -> None:

import dspy

trainset = [dspy.Example(**x).with_inputs('question') for x in trainset]
devset = [dspy.Example(**x).with_inputs('question') for x in devset]
testset = [dspy.Example(**x).with_inputs('question') for x in testset]

# print(f"Trainset size: {len(trainset)}")
# print(f"Devset size: {len(devset)}")
# print(f"Testset size: {len(testset)}")
trainset = [dspy.Example(**x).with_inputs("question") for x in trainset]
devset = [dspy.Example(**x).with_inputs("question") for x in devset]
testset = [dspy.Example(**x).with_inputs("question") for x in testset]

self.train = trainset
self.dev = devset
self.test = testset



def parse_integer_answer(answer, only_first_line=True):
    """Extract an integer from free-form answer text.

    The last whitespace-separated token containing a digit is selected,
    anything after its first ``.`` is dropped, and the remaining digit
    characters are joined and converted with ``int``. Non-digit characters
    such as ``,``, ``$`` or a leading ``-`` are discarded, so the result is
    never negative.

    Args:
        answer: The text to parse. Non-string values are converted with
            ``str`` first, so ``None`` or a number does not raise.
        only_first_line: If true, only the first line of the stripped text
            is considered.

    Returns:
        The parsed integer, or ``0`` when no integer can be extracted.
    """
    try:
        text = answer if isinstance(answer, str) else str(answer)

        if only_first_line:
            text = text.strip().split("\n")[0]

        # Keep only tokens that contain a digit; the last one is taken as the answer.
        numeric_tokens = [token for token in text.split() if any(c.isdigit() for c in token)]
        integer_part = numeric_tokens[-1].split(".")[0]

        return int("".join(c for c in integer_part if c.isdigit()))

    except (ValueError, IndexError):
        # IndexError: no token had a digit. ValueError: nothing int() can parse
        # was left (e.g. the token was ".5").
        return 0


Expand Down

0 comments on commit 7f4e163

Please sign in to comment.