Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Experiment] CJK with ICU segmenter #959

Draft
wants to merge 22 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add zh-en config
  • Loading branch information
eu9ene committed Nov 23, 2024
commit 120deb8844f020d919b776c3df8c59cdb1682cd3
205 changes: 205 additions & 0 deletions configs/autogenerated/zh-en-cjk.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
# The initial configuration was generated using:
# task config-generator -- zh en --name cjk
#
# The documentation for this config can be found here:
# https://github.com/mozilla/translations/blob/3598c784dedd8ccf73e9c21a368fab46bcaa12c8/taskcluster/configs/config.prod.yml
# Experiment-level settings for the zh -> en "cjk" run.
# Child mappings are indented under their parent so that repeated key names
# (`total`, `per-dataset`, `mono-src`, `mono-trg`) stay scoped to one parent.
experiment:
  name: cjk
  src: zh
  trg: en
  best-model: chrf
  # Quoted, so this is the string 'true' rather than a YAML boolean.
  use-opuscleaner: 'true'
  opuscleaner-mode: custom
  bicleaner:
    default-threshold: 0.5
    # Empty mapping: no per-dataset overrides of the default threshold.
    dataset-thresholds: {}
  # Limits on monolingual source-side sentences.
  mono-max-sentences-src:
    total: 500_000_000
    per-dataset: 200_000_000
  # Limits on monolingual target-side sentences.
  mono-max-sentences-trg:
    total: 200_000_000
    per-dataset: 200_000_000
  min-fluency-threshold:
    mono-src: 0.8
    mono-trg: 0.9
  spm-sample-size: 10_000_000
  spm-vocab-size: 64000
  teacher-ensemble: 2
  teacher-mode: two-stage
  student-model: tiny
  pretrained-models: {}
# Dataset selection: each child key holds a list of dataset identifiers.
datasets:
  devtest:
    # NOTE(review): unlike every other devtest entry, this one carries no
    # `aug-mix-cjk` modifier - confirm that is intended.
    - mtdata_Neulab-tedtalks_dev-1-eng-zho
    - flores_aug-mix-cjk_dev
    - sacrebleu_aug-mix-cjk_wmt22
    - sacrebleu_aug-mix-cjk_wmt21
    - sacrebleu_aug-mix-cjk_wmt21/AB
    - sacrebleu_aug-mix-cjk_wmt20
    - sacrebleu_aug-mix-cjk_wmt18
    - sacrebleu_aug-mix-cjk_wmt17
    - sacrebleu_aug-mix-cjk_wmt17/dev
    - sacrebleu_aug-mix-cjk_iwslt17
    - sacrebleu_aug-mix-cjk_iwslt17/tst2015
    - sacrebleu_aug-mix-cjk_iwslt17/tst2013
    - sacrebleu_aug-mix-cjk_iwslt17/tst2011
    - sacrebleu_aug-mix-cjk_iwslt17/dev2010
  test:
    - mtdata_Neulab-tedtalks_test-1-eng-zho
    - mtdata_Statmt-generaltest-2022_refA-eng-zho
    - mtdata_Statmt-generaltest-2022_refB-eng-zho
    - mtdata_Statmt-generaltest-2022_refA-zho-eng
    - mtdata_Statmt-generaltest-2022_refB-zho-eng
    - mtdata_Statmt-generaltest-2023_refA-eng-zho
    - mtdata_Statmt-generaltest-2023_refA-zho-eng
    - flores_devtest
    - flores_aug-mix-cjk_devtest
    - flores_aug-noise_devtest
    - flores_aug-inline-noise_devtest
    - sacrebleu_wmt23
    - sacrebleu_wmt21/systems
    - sacrebleu_wmt21/B
    - sacrebleu_wmt20/tworefs
    - sacrebleu_wmt19
    - sacrebleu_wmt18/test-ts
    - sacrebleu_wmt17/improved
    - sacrebleu_wmt17/ms
    - sacrebleu_iwslt17/tst2016
    - sacrebleu_iwslt17/tst2014
    - sacrebleu_iwslt17/tst2012
    - sacrebleu_iwslt17/tst2010

  # The training data contains:
  # 127,323,968 sentences
  #
  # Skipped datasets:
  # - opus_CCMatrix/v1 - ignored datasets (71,383,325 sentences)
  # - opus_WMT-News/v2019 - ignored datasets (19,965 sentences)
  # - opus_SPC/v1 - ignored datasets (2,228 sentences)
  # - opus_ELRC-3056-wikipedia_health/v1 - not enough data (145 sentences)
  # - opus_ELRC-wikipedia_health/v1 - not enough data (145 sentences)
  # - opus_ELRC_2922/v1 - not enough data (144 sentences)
  # - opus_EUbookshop/v2 - not enough data (0 sentences)
  # - opus_Ubuntu/v14.10 - not enough data (0 sentences)
  # - opus_WikiTitles/v3 - ignored datasets (0 sentences)
  # - mtdata_ELRC-wikipedia_health-1-eng-zho - duplicate with opus
  # - mtdata_Facebook-wikimatrix-1-eng-zho - duplicate with opus
  # - mtdata_LinguaTools-wikititles-2014-eng-zho - duplicate with opus
  # - mtdata_Neulab-tedtalks_train-1-eng-zho - duplicate with opus
  # - mtdata_ParaCrawl-paracrawl-1_bonus-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-14-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-15-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-16-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-17-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-18-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-18.1-eng-zho - duplicate with opus
  # - mtdata_Statmt-wiki_titles-1-zho-eng - duplicate with opus
  # - mtdata_Statmt-wiki_titles-2-zho-eng - duplicate with opus
  # - mtdata_Statmt-wikititles-3-zho-eng - duplicate with opus
  # - mtdata_Statmt-backtrans_enzh-wmt20-eng-zho - Error fetching (('https://data.statmt.org/wmt20/translation-task/back-translation/zh-en/news.en.gz', 'https://data.statmt.org/wmt20/translation-task/back-translation/zh-en/news.translatedto.zh.gz'))
  # - mtdata_UN-un_dev-1-eng-zho - Error fetching (https://drive.google.com/uc?export=download&id=13GI1F1hvwpMUGBSa0QC6ov4eE57GC_Zx)
  # - mtdata_UN-un_test-1-eng-zho - Error fetching (https://drive.google.com/uc?export=download&id=13GI1F1hvwpMUGBSa0QC6ov4eE57GC_Zx)
  train:
    - opus_NLLB/v1  # 71,383,325 sentences
    - opus_UNPC/v1.0  # 17,451,549 sentences
    - opus_ParaCrawl/v9  # 14,170,869 sentences
    - opus_MultiUN/v1  # 9,564,315 sentences
    - opus_LinguaTools-WikiTitles/v2014  # 6,664,332 sentences
    - opus_XLEnt/v1.2  # 6,292,330 sentences
    - opus_WikiMatrix/v1  # 786,512 sentences
    - opus_wikimedia/v20230407  # 302,259 sentences
    - opus_Tanzil/v1  # 187,092 sentences
    - opus_TED2013/v1.1  # 154,579 sentences
    - opus_News-Commentary/v16  # 125,996 sentences
    - opus_bible-uedin/v1  # 124,378 sentences
    - opus_PHP/v1  # 41,706 sentences
    - opus_infopankki/v1  # 29,907 sentences
    - opus_TED2020/v1  # 16,382 sentences
    - opus_QED/v2.0a  # 13,123 sentences
    - opus_NeuLab-TedTalks/v1  # 8,076 sentences
    - opus_tldr-pages/v2023-08-29  # 4,167 sentences
    - opus_tico-19/v2020-10-28  # 3,071 sentences
    - mtdata_ELRC-hrw_dataset_v1-1-eng-zho  # ~631,760 sentences (71.4 MB)
    - mtdata_Microsoft-ntrex-128-eng-zho_CN  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng-zho_TW  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_GB-zho_CN  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_GB-zho_TW  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_IN-zho_CN  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_IN-zho_TW  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_US-zho_CN  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_US-zho_TW  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Statmt-news_commentary_wmt18-13-zho-eng  # ~1,001,393 sentences (113.2 MB)
    - mtdata_Statmt-newsdev_zhen-2017-zho-eng  # ~402,756 sentences (45.5 MB)
    - mtdata_Statmt-newsdev_enzh-2017-eng-zho  # ~402,756 sentences (45.5 MB)
    - mtdata_Statmt-ccaligned-1-eng-zho_CN  # ~5,669,496 sentences (640.7 MB)
    - mtdata_Statmt-ccaligned-1-eng-zho_TW  # ~2,407,082 sentences (272.0 MB)

  # The monolingual data contains:
  # ~46,677,484 sentences
  # Up to 200,000,000 sentences from HPLT
  mono-src:
    - news-crawl_news.2010  # ~6,141 sentences
    - news-crawl_news.2011  # ~60,176 sentences
    - news-crawl_news.2012  # ~57,522 sentences
    - news-crawl_news.2016  # ~73,451 sentences
    - news-crawl_news.2017  # ~77,876 sentences
    - news-crawl_news.2019  # ~1,672,566 sentences
    - news-crawl_news.2020  # ~3,398,230 sentences
    - news-crawl_news.2021  # ~1,849,557 sentences
    - news-crawl_news.2022  # ~2,327,433 sentences
    - news-crawl_news.2023  # ~4,345,132 sentences
    - hplt_mono/v1.2  # Up to 200,000,000 sentences
    - opus_NLLB/v1  # ~32,809,400 sentences

  # The monolingual data contains:
  # ~195,823,002 sentences
  # Up to 200,000,000 sentences from HPLT
  #
  # Skipped datasets:
  # - opus_NLLB/v1 - data may have lower quality, disable for back-translations (462,447,416 sentences)
  mono-trg:
    - news-crawl_news.2007  # ~1,557,522 sentences
    - news-crawl_news.2008  # ~5,389,380 sentences
    - news-crawl_news.2009  # ~6,557,522 sentences
    - news-crawl_news.2010  # ~3,247,787 sentences
    - news-crawl_news.2011  # ~6,318,584 sentences
    - news-crawl_news.2012  # ~6,407,079 sentences
    - news-crawl_news.2013  # ~10,619,469 sentences
    - news-crawl_news.2014  # ~10,619,469 sentences
    - news-crawl_news.2015  # ~10,619,469 sentences
    - news-crawl_news.2016  # ~7,982,300 sentences
    - news-crawl_news.2017  # ~11,504,424 sentences
    - news-crawl_news.2018  # ~7,920,353 sentences
    - news-crawl_news.2019  # ~17,699,115 sentences
    - news-crawl_news.2020  # ~22,123,893 sentences
    - news-crawl_news.2021  # ~21,238,938 sentences
    - news-crawl_news.2022  # ~23,008,849 sentences
    - news-crawl_news.2023  # ~23,008,849 sentences
    - hplt_mono/v1.2  # Up to 200,000,000 sentences
# Per-stage Marian argument overrides. Each stage is its own mapping so that
# keys repeated across stages (`mini-batch-words`, `early-stopping`) do not
# collide. Numeric values are quoted and therefore kept as strings.
marian-args:
  decoding-backward:
    beam-size: '12'
    mini-batch-words: '2000'
  decoding-teacher:
    precision: float16
    mini-batch-words: '5000'
    maxi-batch: '10000'
    # NOTE(review): this looks redundant with `precision: float16` above -
    # confirm both settings are needed.
    fp16: true
  training-backward:
    early-stopping: '5'
  training-teacher:
    early-stopping: '20'
  training-student:
    early-stopping: '20'
  training-student-finetuned:
    early-stopping: '20'
# Top-level run controls.
target-stage: all
wandb-publication: true
# Taskcluster settings; `worker-classes` maps a task name to a worker class,
# with `default` listed alongside the named tasks.
taskcluster:
  split-chunks: 20
  worker-classes:
    default: gcp-spot
    alignments-original: gcp-standard
    alignments-backtranslated: gcp-standard
    alignments-student: gcp-standard
    shortlist: gcp-standard
181 changes: 181 additions & 0 deletions configs/cjk/zh-en-cjk.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# The initial configuration was generated using:
# task config-generator -- zh en --name cjk
#
# The documentation for this config can be found here:
# https://github.com/mozilla/translations/blob/3598c784dedd8ccf73e9c21a368fab46bcaa12c8/taskcluster/configs/config.prod.yml
# Experiment-level settings for the zh -> en "cjk" run.
# Child mappings are indented under their parent so that repeated key names
# (`total`, `per-dataset`, `mono-src`, `mono-trg`) stay scoped to one parent.
experiment:
  name: cjk
  src: zh
  trg: en
  best-model: chrf
  # Quoted, so this is the string 'true' rather than a YAML boolean.
  use-opuscleaner: 'true'
  opuscleaner-mode: custom
  bicleaner:
    default-threshold: 0.5
    # Empty mapping: no per-dataset overrides of the default threshold.
    dataset-thresholds: {}
  # Limits on monolingual source-side sentences.
  mono-max-sentences-src:
    total: 500_000_000
    per-dataset: 200_000_000
  # Limits on monolingual target-side sentences.
  mono-max-sentences-trg:
    total: 200_000_000
    per-dataset: 200_000_000
  min-fluency-threshold:
    mono-src: 0.8
    mono-trg: 0.9
  spm-sample-size: 10_000_000
  spm-vocab-size: 64000
  teacher-ensemble: 2
  teacher-mode: two-stage
  student-model: tiny
  pretrained-models: {}
# Dataset selection: each child key holds a list of dataset identifiers.
datasets:
  devtest:
    - mtdata_aug-mix-cjk_Neulab-tedtalks_dev-1-eng-zho
    - flores_aug-mix-cjk_dev
    - sacrebleu_aug-mix-cjk_wmt22
    - sacrebleu_aug-mix-cjk_wmt20
    - sacrebleu_aug-mix-cjk_wmt18
  test:
    - mtdata_Neulab-tedtalks_test-1-eng-zho
    - flores_devtest
    - sacrebleu_wmt21
    - sacrebleu_wmt19
    - sacrebleu_wmt17
    - flores_aug-mix-cjk_devtest
    - flores_aug-noise_devtest
    - flores_aug-inline-noise_devtest

  # The training data contains:
  # 127,323,968 sentences
  #
  # Skipped datasets:
  # - opus_CCMatrix/v1 - ignored datasets (71,383,325 sentences)
  # - opus_WMT-News/v2019 - ignored datasets (19,965 sentences)
  # - opus_SPC/v1 - ignored datasets (2,228 sentences)
  # - opus_ELRC-3056-wikipedia_health/v1 - not enough data (145 sentences)
  # - opus_ELRC-wikipedia_health/v1 - not enough data (145 sentences)
  # - opus_ELRC_2922/v1 - not enough data (144 sentences)
  # - opus_EUbookshop/v2 - not enough data (0 sentences)
  # - opus_Ubuntu/v14.10 - not enough data (0 sentences)
  # - opus_WikiTitles/v3 - ignored datasets (0 sentences)
  # - mtdata_ELRC-wikipedia_health-1-eng-zho - duplicate with opus
  # - mtdata_Facebook-wikimatrix-1-eng-zho - duplicate with opus
  # - mtdata_LinguaTools-wikititles-2014-eng-zho - duplicate with opus
  # - mtdata_Neulab-tedtalks_train-1-eng-zho - duplicate with opus
  # - mtdata_ParaCrawl-paracrawl-1_bonus-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-14-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-15-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-16-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-17-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-18-eng-zho - duplicate with opus
  # - mtdata_Statmt-news_commentary-18.1-eng-zho - duplicate with opus
  # - mtdata_Statmt-wiki_titles-1-zho-eng - duplicate with opus
  # - mtdata_Statmt-wiki_titles-2-zho-eng - duplicate with opus
  # - mtdata_Statmt-wikititles-3-zho-eng - duplicate with opus
  # - mtdata_Statmt-backtrans_enzh-wmt20-eng-zho - Error fetching (('https://data.statmt.org/wmt20/translation-task/back-translation/zh-en/news.en.gz', 'https://data.statmt.org/wmt20/translation-task/back-translation/zh-en/news.translatedto.zh.gz'))
  # - mtdata_UN-un_dev-1-eng-zho - Error fetching (https://drive.google.com/uc?export=download&id=13GI1F1hvwpMUGBSa0QC6ov4eE57GC_Zx)
  # - mtdata_UN-un_test-1-eng-zho - Error fetching (https://drive.google.com/uc?export=download&id=13GI1F1hvwpMUGBSa0QC6ov4eE57GC_Zx)
  train:
    - opus_NLLB/v1  # 71,383,325 sentences
    - opus_UNPC/v1.0  # 17,451,549 sentences
    - opus_ParaCrawl/v9  # 14,170,869 sentences
    - opus_MultiUN/v1  # 9,564,315 sentences
    - opus_LinguaTools-WikiTitles/v2014  # 6,664,332 sentences
    - opus_XLEnt/v1.2  # 6,292,330 sentences
    - opus_WikiMatrix/v1  # 786,512 sentences
    - opus_wikimedia/v20230407  # 302,259 sentences
    - opus_Tanzil/v1  # 187,092 sentences
    - opus_TED2013/v1.1  # 154,579 sentences
    - opus_News-Commentary/v16  # 125,996 sentences
    - opus_bible-uedin/v1  # 124,378 sentences
    - opus_PHP/v1  # 41,706 sentences
    - opus_infopankki/v1  # 29,907 sentences
    - opus_TED2020/v1  # 16,382 sentences
    - opus_QED/v2.0a  # 13,123 sentences
    - opus_NeuLab-TedTalks/v1  # 8,076 sentences
    - opus_tldr-pages/v2023-08-29  # 4,167 sentences
    - opus_tico-19/v2020-10-28  # 3,071 sentences
    - mtdata_ELRC-hrw_dataset_v1-1-eng-zho  # ~631,760 sentences (71.4 MB)
    - mtdata_Microsoft-ntrex-128-eng-zho_CN  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng-zho_TW  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_GB-zho_CN  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_GB-zho_TW  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_IN-zho_CN  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_IN-zho_TW  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_US-zho_CN  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Microsoft-ntrex-128-eng_US-zho_TW  # No Content-Length reported (https://github.com/MicrosoftTranslator/NTREX/archive/52b9c57c.tar.gz)
    - mtdata_Statmt-news_commentary_wmt18-13-zho-eng  # ~1,001,393 sentences (113.2 MB)
    - mtdata_Statmt-newsdev_zhen-2017-zho-eng  # ~402,756 sentences (45.5 MB)
    - mtdata_Statmt-newsdev_enzh-2017-eng-zho  # ~402,756 sentences (45.5 MB)
    - mtdata_Statmt-ccaligned-1-eng-zho_CN  # ~5,669,496 sentences (640.7 MB)
    - mtdata_Statmt-ccaligned-1-eng-zho_TW  # ~2,407,082 sentences (272.0 MB)

  # The monolingual data contains:
  # ~46,677,484 sentences
  # Up to 200,000,000 sentences from HPLT
  mono-src:
    - news-crawl_news.2010  # ~6,141 sentences
    - news-crawl_news.2011  # ~60,176 sentences
    - news-crawl_news.2012  # ~57,522 sentences
    - news-crawl_news.2016  # ~73,451 sentences
    - news-crawl_news.2017  # ~77,876 sentences
    - news-crawl_news.2019  # ~1,672,566 sentences
    - news-crawl_news.2020  # ~3,398,230 sentences
    - news-crawl_news.2021  # ~1,849,557 sentences
    - news-crawl_news.2022  # ~2,327,433 sentences
    - news-crawl_news.2023  # ~4,345,132 sentences
    - hplt_mono/v1.2  # Up to 200,000,000 sentences
    - opus_NLLB/v1  # ~32,809,400 sentences

  # The monolingual data contains:
  # ~195,823,002 sentences
  # Up to 200,000,000 sentences from HPLT
  #
  # Skipped datasets:
  # - opus_NLLB/v1 - data may have lower quality, disable for back-translations (462,447,416 sentences)
  mono-trg:
    - news-crawl_news.2007  # ~1,557,522 sentences
    - news-crawl_news.2008  # ~5,389,380 sentences
    - news-crawl_news.2009  # ~6,557,522 sentences
    - news-crawl_news.2010  # ~3,247,787 sentences
    - news-crawl_news.2011  # ~6,318,584 sentences
    - news-crawl_news.2012  # ~6,407,079 sentences
    - news-crawl_news.2013  # ~10,619,469 sentences
    - news-crawl_news.2014  # ~10,619,469 sentences
    - news-crawl_news.2015  # ~10,619,469 sentences
    - news-crawl_news.2016  # ~7,982,300 sentences
    - news-crawl_news.2017  # ~11,504,424 sentences
    - news-crawl_news.2018  # ~7,920,353 sentences
    - news-crawl_news.2019  # ~17,699,115 sentences
    - news-crawl_news.2020  # ~22,123,893 sentences
    - news-crawl_news.2021  # ~21,238,938 sentences
    - news-crawl_news.2022  # ~23,008,849 sentences
    - news-crawl_news.2023  # ~23,008,849 sentences
    - hplt_mono/v1.2  # Up to 200,000,000 sentences
# Per-stage Marian argument overrides. Each stage is its own mapping so that
# keys repeated across stages (`mini-batch-words`, `early-stopping`) do not
# collide. Numeric values are quoted and therefore kept as strings.
marian-args:
  decoding-backward:
    beam-size: '12'
    mini-batch-words: '2000'
  decoding-teacher:
    precision: float16
    mini-batch-words: '5000'
    maxi-batch: '10000'
    # NOTE(review): this looks redundant with `precision: float16` above -
    # confirm both settings are needed.
    fp16: true
  training-backward:
    early-stopping: '5'
  training-teacher:
    early-stopping: '20'
  training-student:
    # NOTE(review): configs/autogenerated/zh-en-cjk.yml uses '20' for this
    # stage - confirm '15' is a deliberate choice for this config.
    early-stopping: '15'
  training-student-finetuned:
    early-stopping: '20'
# Top-level run controls.
target-stage: all
wandb-publication: true
# Taskcluster settings; `worker-classes` maps a task name to a worker class,
# with `default` listed alongside the named tasks.
taskcluster:
  split-chunks: 20
  worker-classes:
    default: gcp-spot
    alignments-original: gcp-standard
    alignments-backtranslated: gcp-standard
    alignments-student: gcp-standard
    shortlist: gcp-standard