
Commit

new code
qnguyen3 committed Aug 16, 2024
1 parent 14bd801 commit 5418ff5
Showing 3 changed files with 51 additions and 9 deletions.
45 changes: 43 additions & 2 deletions README.md
@@ -8,9 +8,50 @@ OpenAutoEvol is an innovative framework for automatically enhancing the complexi
- Integration with open-source LLMs
- Streamlined fine-tuning process

## Getting Started
## Running AutoEvol

[Instructions on how to set up and use OpenAutoEvol]
To run the AutoEvol script, use the following command structure:

```bash
python run_autoevol.py <dataset_name> [options]
```

### Parameters:

- `<dataset_name>`: (Required) The name of the dataset on Hugging Face to use.

### Options:

- `--batch_size <int>`: Number of instructions to process in each batch. Default is 5.
- `--num_methods <int>`: Number of evolution methods to use. Default is 2.
- `--max_concurrent_batches <int>`: Maximum number of batches to process concurrently. Default is 2.
- `--dev_set_size <int>`: Number of samples to use in the development set. Default is 5.
- `--evolve_epoch <int>`: Maximum number of evolution epochs per instruction. Default is 2.
- `--use_reward_model`: (Flag) Use a reward model to select the best evolution method in each round; see the second example below.

### Example Usage:

To run AutoEvol on the 'small_tomb' dataset with custom parameters:

```bash
python run_autoevol.py qnguyen3/small_tomb --batch_size 10 --num_methods 3 --max_concurrent_batches 2 --dev_set_size 3
```
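
To exercise the two options introduced in this commit, append them to the same command; for instance, to allow up to 3 evolution epochs per instruction and let a reward model pick the winning method each round (same dataset assumed):

```bash
python run_autoevol.py qnguyen3/small_tomb --evolve_epoch 3 --use_reward_model
```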

This command will:
1. Load the 'qnguyen3/small_tomb' dataset from Hugging Face.
2. Use 3 samples for the development set.
3. Process the remaining samples in the training set.
4. Use a batch size of 10 for processing.
5. Apply 3 evolution methods to each sample in the development set to find the best method.
6. Process 2 batches concurrently.

### Output:

The script will output progress information to the console and save the results in a JSON file named `instruction_evolution_results-<dataset_name>.json` in the current directory.
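
Based on the result dictionaries assembled in `src/autoevol.py`, each record in that file should look roughly like the following; the top-level keys come straight from the diff, while the per-stage contents are abbreviated here because the full stage-building code is collapsed in this view:

```json
[
  {
    "original_instruction": "Write a short story about a robot.",
    "stages": [
      { "stage": 1 },
      { "stage": 2 }
    ],
    "total_time": 12.3
  }
]
```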

### Note:

Ensure you have the required dependencies installed and have access to the specified Hugging Face dataset before running the script.
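
Exact setup steps are not part of this commit; assuming the repository ships a standard `requirements.txt` (a hypothetical filename here) and the dataset is public, something like the following should suffice:

```bash
# Hypothetical setup: install dependencies, then verify the dataset is reachable
pip install -r requirements.txt
python -c "from datasets import load_dataset; load_dataset('qnguyen3/small_tomb')"
```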

## Contributing

3 changes: 2 additions & 1 deletion run_autoevol.py
@@ -56,6 +56,7 @@ async def main():
parser.add_argument("--batch_size", type=int, default=5, help="Batch size for processing")
parser.add_argument("--num_methods", type=int, default=2, help="Number of methods to use")
parser.add_argument("--max_concurrent_batches", type=int, default=2, help="Maximum number of concurrent batches")
parser.add_argument("--evolve_epoch", type=int, default=2, help="Maximum number of epoch for each instruction")
parser.add_argument("--use_reward_model",action="store_true",help="just a flag argument")

args = parser.parse_args()
@@ -81,7 +82,7 @@ async def main():
print(f"Max concurrent batches: {args.max_concurrent_batches}")

start_time = time.time()
results = await auto_evol.run(train_set, batch_size=args.batch_size, num_methods=args.num_methods, max_concurrent_batches=args.max_concurrent_batches)
results = await auto_evol.run(train_set, batch_size=args.batch_size, num_methods=args.num_methods, max_concurrent_batches=args.max_concurrent_batches, evolve_epoch=args.evolve_epoch)
end_time = time.time()
total_time = end_time - start_time

12 changes: 6 additions & 6 deletions src/autoevol.py
@@ -10,7 +10,7 @@ class AutoEvol:
def __init__(self, components: Dict[str, Any]):
self.components = components

async def process_instruction(self, instruction: str, num_methods: int) -> Dict[str, Any]:
async def process_instruction(self, instruction: str, num_methods: int, evolve_epoch: int = 2) -> Dict[str, Any]:
start_time = time.time()
instruction_stages = [instruction]
methods = [INITIAL_EVOLVE_METHOD.format(instruction=instruction_stages[0])]
@@ -21,7 +21,7 @@ async def process_instruction(self, instruction: str, num_methods: int) -> Dict[
"stages": []
}

for i in range(2):
for i in range(evolve_epoch):
stage_start_time = time.time()
stage_result = {
"stage": i + 1,
@@ -75,12 +75,12 @@ async def process_instruction(self, instruction: str, num_methods: int) -> Dict[
result["total_time"] = end_time - start_time
return result

async def process_batch(self, batch: List[str], num_methods: int, pbar: tqdm) -> List[Dict[str, Any]]:
batch_results = await asyncio.gather(*[self.process_instruction(instruction, num_methods) for instruction in batch])
async def process_batch(self, batch: List[str], num_methods: int, evolve_epoch: int, pbar: tqdm) -> List[Dict[str, Any]]:
batch_results = await asyncio.gather(*[self.process_instruction(instruction, num_methods, evolve_epoch) for instruction in batch])
pbar.update(len(batch))
return batch_results

async def run(self, dataset: List[str], batch_size: int = 10, num_methods: int = 5, max_concurrent_batches: int = 2) -> List[Dict[str, Any]]:
async def run(self, dataset: List[str], batch_size: int = 10, num_methods: int = 5, max_concurrent_batches: int = 2, evolve_epoch: int = 2) -> List[Dict[str, Any]]:
print(f"Starting dataset processing. Dataset size: {len(dataset)}, Max concurrent batches: {max_concurrent_batches}")
start_time = time.time()

@@ -91,7 +91,7 @@ async def run(self, dataset: List[str], batch_size: int = 10, num_methods: int =

async def process_batch_with_semaphore(batch):
async with semaphore:
return await self.process_batch(batch, num_methods, pbar)
return await self.process_batch(batch, num_methods, evolve_epoch, pbar)

results = await asyncio.gather(*[process_batch_with_semaphore(batch) for batch in batches])

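Taken together, the `autoevol.py` changes thread the new `evolve_epoch` parameter from `run` down through `process_batch` to `process_instruction`, while a semaphore keeps at most `max_concurrent_batches` batches in flight. A minimal, self-contained sketch of that pattern, with placeholder stand-ins rather than the real `AutoEvol` internals:

```python
import asyncio
from typing import Any, Dict, List

async def process_instruction(instruction: str, evolve_epoch: int) -> Dict[str, Any]:
    # Stand-in for AutoEvol.process_instruction: run evolve_epoch evolution rounds.
    stages = []
    for i in range(evolve_epoch):
        await asyncio.sleep(0)  # placeholder for the real LLM calls
        stages.append({"stage": i + 1})
    return {"original_instruction": instruction, "stages": stages}

async def run(dataset: List[str], batch_size: int = 10,
              max_concurrent_batches: int = 2,
              evolve_epoch: int = 2) -> List[Dict[str, Any]]:
    batches = [dataset[i:i + batch_size] for i in range(0, len(dataset), batch_size)]
    semaphore = asyncio.Semaphore(max_concurrent_batches)  # caps in-flight batches

    async def process_batch(batch: List[str]) -> List[Dict[str, Any]]:
        async with semaphore:  # acquired per batch, so only N batches run at once
            return await asyncio.gather(
                *[process_instruction(ins, evolve_epoch) for ins in batch]
            )

    nested = await asyncio.gather(*[process_batch(b) for b in batches])
    return [record for batch in nested for record in batch]  # flatten batch results

if __name__ == "__main__":
    print(asyncio.run(run([f"instruction {i}" for i in range(7)], batch_size=3)))
```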
