Skip to content

Commit

Permalink
Use the config.ci.yml for the training defaults (#856)
Browse files Browse the repository at this point in the history
  • Loading branch information
gregtatum authored Sep 23, 2024
1 parent 9d355d8 commit 78b9232
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 138 deletions.
50 changes: 31 additions & 19 deletions taskcluster/configs/config.ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@ experiment:

teacher-ensemble: 1

mono-max-sentences-src: 10000
mono-max-sentences-trg: 10000
spm-sample-size: 10000
mono-max-sentences-src:
total: 10000
per-dataset: 10000
mono-max-sentences-trg:
total: 10000
per-dataset: 10000
spm-sample-size: 1000
spm-vocab-size: 1000

best-model: chrf
Expand All @@ -20,50 +24,57 @@ experiment:
opuscleaner-mode: "custom"
teacher-mode: "two-stage"


bicleaner:
default-threshold: 0.5
dataset-thresholds:
opus_ada83/v1: 0.0
opus_ELRC-3075-wikipedia_health/v1: 0.6

min-fluency-threshold:
mono-src: 0.8
mono-trg: 0.9

marian-args:
training-backward:
disp-freq: "1"
save-freq: "5"
valid-freq: "10"
after: 10u
# Run training for 10 updates, and display 5 updates. Only validate and save the
# model once.
disp-freq: "2"
save-freq: "25"
valid-freq: "50"
after: 50u
dim-vocabs: "1000 1000"
training-teacher:
disp-freq: "1"
save-freq: "5"
valid-freq: "10"
after: 10u
save-freq: "25"
valid-freq: "50"
after: 50u
dim-vocabs: "1000 1000"
task: transformer-base
training-student:
disp-freq: "1"
save-freq: "5"
valid-freq: "10"
after: 10u
save-freq: "25"
valid-freq: "50"
after: 50u
dim-vocabs: "1000 1000"
training-student-finetuned:
disp-freq: "1"
save-freq: "5"
valid-freq: "10"
after: 10u
dim-vocabs: "1000 1000"
save-freq: "25"
valid-freq: "50"
after: 50u
dim-vocabs: "1000 1000"
decoding-backward:
mini-batch-words: "2000"
decoding-teacher:
mini-batch-words: "1000"
precision: float16

# Ensure that we have adequate coverage for dataset types in CI.
datasets:
train:
- opus_ada83/v1
- opus_ELRC-3075-wikipedia_health/v1
- url_https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst
- mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus
devtest:
- flores_dev
- sacrebleu_aug-upper_wmt19
Expand All @@ -76,7 +87,8 @@ datasets:
- news-crawl_news.2007
- opus_tldr-pages/v2023-08-29 # 39,646 sentences

wandb-publication: false
# Publishes to the "ci" project.
wandb-publication: true
target-stage: all
taskcluster:
split-chunks: 2
Expand Down
4 changes: 2 additions & 2 deletions taskcluster/test/test_default_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from taskgraph.taskgraph import TaskGraph

from translations_taskgraph.parameters import get_defaults
from translations_taskgraph.parameters import get_ci_training_config

PARAMS = deepcopy(get_defaults(None))
PARAMS = deepcopy(get_ci_training_config())
PARAMS["target_tasks_method"] = "train-target-tasks"

MOCK_REQUESTS = [
Expand Down
4 changes: 2 additions & 2 deletions taskcluster/test/test_target_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from taskgraph.taskgraph import TaskGraph

from translations_taskgraph.parameters import get_defaults
from translations_taskgraph.parameters import get_ci_training_config

PARAMS = deepcopy(get_defaults(None))
PARAMS = deepcopy(get_ci_training_config())
PARAMS["target_tasks_method"] = "train-target-tasks"
PARAMS["training_config"]["target-stage"] = "train-teacher"

Expand Down
4 changes: 2 additions & 2 deletions taskcluster/test/test_training_continuation_backwards.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from taskgraph.taskgraph import TaskGraph

from translations_taskgraph.parameters import get_defaults
from translations_taskgraph.parameters import get_ci_training_config

PARAMS = deepcopy(get_defaults(None))
PARAMS = deepcopy(get_ci_training_config())
PARAMS["target_tasks_method"] = "train-target-tasks"
PARAMS["training_config"]["experiment"]["pretrained-models"] = {
"train-backwards": {
Expand Down
4 changes: 2 additions & 2 deletions taskcluster/translations_taskgraph/actions/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from taskgraph.taskgraph import TaskGraph
from taskgraph.util.taskcluster import get_ancestors, get_artifact

from translations_taskgraph.parameters import get_defaults
from translations_taskgraph.parameters import get_ci_training_config

logger = logging.getLogger(__name__)

Expand All @@ -34,7 +34,7 @@ def can_train(parameters):
)


defaults = get_defaults("")["training_config"]
defaults = get_ci_training_config()["training_config"]


def validate_pretrained_models(params):
Expand Down
124 changes: 13 additions & 111 deletions taskcluster/translations_taskgraph/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,119 +2,21 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from pathlib import Path
from taskgraph.parameters import extend_parameters_schema
from voluptuous import Extra, Optional, Required
import yaml


# These defaults line up with the `config.ci.yml` pipeline as much as possible.
# Their purpose is to provide a minimal config with a few datasets that can run
# the entire pipeline reasonably quickly to validate changes to the pipeline
# itself. Any real training should be overriding most, if not all, of these
# via the input to the `train` action.
def get_defaults(_) -> dict:
return {
"training_config": {
"target-stage": "all",
"experiment": {
"name": "ci",
"src": "ru",
"trg": "en",
"teacher-ensemble": 1,
"teacher-mode": "two-stage",
"mono-max-sentences-trg": {"total": 10000, "per-dataset": 10000},
"mono-max-sentences-src": {"total": 10000, "per-dataset": 10000},
"spm-sample-size": 10000,
"spm-vocab-size": 1000,
"best-model": "chrf",
"use-opuscleaner": "true",
"opuscleaner-mode": "custom",
"bicleaner": {
"default-threshold": 0.5,
"dataset-thresholds": {
"opus_ada83/v1": 0.0,
"opus_ELRC-3075-wikipedia_health/v1": 0.6,
},
},
"min-fluency-threshold": {
"mono-src": 0.8,
"mono-trg": 0.9,
},
},
"marian-args": {
"training-backward": {
"disp-freq": "2",
"save-freq": "25",
"valid-freq": "50",
"after": "50u",
"dim-vocabs": "1000 1000",
},
"training-teacher": {
"disp-freq": "1",
"save-freq": "25",
"valid-freq": "50",
"after": "50u",
"dim-vocabs": "1000 1000",
"task": "transformer-base",
},
"training-student": {
"disp-freq": "1",
"save-freq": "25",
"valid-freq": "50",
"after": "50u",
"dim-vocabs": "1000 1000",
},
"training-student-finetuned": {
"disp-freq": "1",
"save-freq": "25",
"valid-freq": "50",
"after": "50u",
"dim-vocabs": "1000 1000",
},
"decoding-backward": {
"mini-batch-words": "2000",
},
"decoding-teacher": {
"mini-batch-words": "1000",
"precision": "float16",
},
},
# These will never be used in practice, but specifying them ensures
# that we always generate at least one task for each kind, which helps
# to avoid bustage that doesn't show up until we run the training action.
"datasets": {
"train": [
"opus_ada83/v1",
"opus_ELRC-3075-wikipedia_health/v1",
"url_https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst",
"mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus",
],
"devtest": [
"flores_dev",
"sacrebleu_aug-upper_wmt19",
],
"test": [
"flores_devtest",
],
"mono-src": [
"news-crawl_news.2008",
"opus_tldr-pages/v2023-08-29",
],
"mono-trg": [
"news-crawl_news.2007",
"opus_tldr-pages/v2023-08-29",
],
},
# Taskcluster-specific configuration
"taskcluster": {
"split-chunks": 2,
"worker-classes": {
"default": "gcp-spot",
},
},
# Disable Weight & Biases publication on CI
"wandb-publication": True,
},
}
# By default, provide a very minimal config for CI that runs very quickly. This allows
# the pipeline to be validated in CI. The production training configs should override
# all of these values.
def get_ci_training_config(_=None) -> dict:
    """Load the CI training defaults from ``taskcluster/configs/config.ci.yml``.

    The single parameter is unused: taskgraph calls the ``defaults_fn`` with an
    argument, and the ``None`` default also lets this function be called
    directly with no arguments (as the actions and tests do).

    Returns:
        dict: ``{"training_config": <parsed contents of config.ci.yml>}``.
    """
    # parameters.py lives at taskcluster/translations_taskgraph/, so two
    # levels up is the repository root.
    vcs_path = (Path(__file__).parent / "../..").resolve()
    config_path = vcs_path / "taskcluster/configs/config.ci.yml"

    # Open explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with config_path.open(encoding="utf-8") as file:
        return {"training_config": yaml.safe_load(file)}


extend_parameters_schema(
Expand Down Expand Up @@ -184,7 +86,7 @@ def get_defaults(_) -> dict:
Optional("wandb-publication"): bool,
},
},
defaults_fn=get_defaults,
defaults_fn=get_ci_training_config,
)


Expand All @@ -198,4 +100,4 @@ def deep_setdefault(dict_, defaults):

def get_decision_parameters(graph_config, parameters):
parameters.setdefault("training_config", {})
deep_setdefault(parameters, get_defaults(""))
deep_setdefault(parameters, get_ci_training_config())

0 comments on commit 78b9232

Please sign in to comment.