Skip to content

Commit

Permalink
Switch to hf-audio/esb-datasets-test-only-sorted dataset
Browse files: browse the repository at this point in the history
  • Loading branch information
jordimas committed Jul 31, 2024
1 parent 20a7543 commit 6551c74
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 15 deletions.
12 changes: 6 additions & 6 deletions ctranslate2/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@ def main(args) -> None:
for batch in tqdm(dataset_iterator(dataset), desc=f"Evaluating {args.model_id}"):
segments, _ = asr_model.transcribe(batch["array"], language="en")
outputs = [segment._asdict() for segment in segments]
predictions.extend(
data_utils.normalizer(
"".join([segment["text"] for segment in outputs])
).strip()
)
references.extend(batch["reference"][0])
transcription = data_utils.normalizer(
"".join([segment["text"] for segment in outputs])
).strip()

predictions.append(transcription)
references.append(batch["reference"])

# Write manifest results
manifest_path = data_utils.write_manifest(
Expand Down
18 changes: 9 additions & 9 deletions ctranslate2/run_whisper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ do

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="ami" \
--split="test" \
--device=${DEVICE_INDEX} \
Expand All @@ -23,7 +23,7 @@ do

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="earnings22" \
--split="test" \
--device=${DEVICE_INDEX} \
Expand All @@ -32,7 +32,7 @@ do

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="gigaspeech" \
--split="test" \
--device=${DEVICE_INDEX} \
Expand All @@ -41,7 +41,7 @@ do

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="librispeech" \
--split="test.clean" \
--device=${DEVICE_INDEX} \
Expand All @@ -50,7 +50,7 @@ do

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="librispeech" \
--split="test.other" \
--device=${DEVICE_INDEX} \
Expand All @@ -59,7 +59,7 @@ do

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="spgispeech" \
--split="test" \
--device=${DEVICE_INDEX} \
Expand All @@ -68,7 +68,7 @@ do

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="tedlium" \
--split="test" \
--device=${DEVICE_INDEX} \
Expand All @@ -77,7 +77,7 @@ do

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="voxpopuli" \
--split="test" \
--device=${DEVICE_INDEX} \
Expand All @@ -86,7 +86,7 @@ do

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="common_voice" \
--split="test" \
--device=${DEVICE_INDEX} \
Expand Down

0 comments on commit 6551c74

Please sign in to comment.