forked from facebookresearch/fairseq
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.sh
45 lines (37 loc) · 1.6 KB
/
preprocess.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Install the package found in the current directory in editable mode.
pip install --editable ./

# Run the data-preparation script from inside examples/translation/.
# NOTE(review): presumably this produces the iwslt14.tokenized.de-en directory
# used by the later steps -- confirm against prepare-iwslt14-pe.sh.
#
# The directory change is confined to a subshell. Previously a failed `cd`
# still ran the prepare script from the wrong directory, and the following
# `cd ../..` then moved the script two levels above where it started, breaking
# every relative path used afterwards. With the subshell the caller's working
# directory never changes, so no `cd ../..` is needed.
(
  cd examples/translation/ || exit 1
  bash prepare-iwslt14-pe.sh
)
TEXT=examples/translation/iwslt14.tokenized.de-en

#######################################
# Build the combined "<split>_all.<lang>" corpora that the dictionary is
# built from, by concatenating the base, _mt and _pe files of every split
# (train, valid, test) and language (en, de), in that order.
# Arguments: $1 - directory holding the input files; outputs go there too
# Returns:   0 if every combined file was written without error, 1 otherwise
#######################################
build_all_corpora() {
  local dir=$1
  local split lang
  local status=0

  for split in train valid test; do
    for lang in en de; do
      # Write with a truncating '>' rather than appending with '>>': with
      # '>>' every re-run of the script duplicated the data already present
      # in the *_all files.
      cat -- \
        "$dir/$split.$lang" \
        "$dir/${split}_mt.$lang" \
        "$dir/${split}_pe.$lang" \
        > "$dir/${split}_all.$lang" || status=1
    done
  done

  return "$status"
}

# run with all data to build a dictionary
build_all_corpora "$TEXT"
# Run fairseq-preprocess (en -> de) on the combined *_all corpora. No
# --srcdict/--tgtdict is passed here; the later per-variant runs read
# dict.en.txt and dict.de.txt from this run's --destdir.
# "$TEXT" is quoted so a data path containing spaces or glob characters is
# passed as a single argument.
fairseq-preprocess --source-lang en --target-lang de \
  --trainpref "$TEXT/train_all" \
  --validpref "$TEXT/valid_all" \
  --testpref "$TEXT/test_all" \
  --destdir data-bin/iwslt14.tokenized.en-de.all \
  --workers 20
#######################################
# Run fairseq-preprocess (en -> de) on one corpus variant, reusing the
# dictionaries stored under data-bin/iwslt14.tokenized.en-de.all so that
# every variant is processed with the same source and target dictionaries.
# Globals:   TEXT (read) - directory holding the train/valid/test files
# Arguments: $1 - variant name: "" for the base corpus, or e.g. "mt" / "pe".
#                 A non-empty variant selects the "<split>_<variant>" input
#                 files and the "...en-de.<variant>" destination directory.
# Returns:   the exit status of fairseq-preprocess
#######################################
preprocess_variant() {
  local variant=${1-}
  local -r dict_dir=data-bin/iwslt14.tokenized.en-de.all
  # Both suffixes expand to nothing for the base corpus (empty variant).
  local file_suffix=${variant:+_${variant}}
  local dest_suffix=${variant:+.${variant}}

  fairseq-preprocess --source-lang en --target-lang de \
    --trainpref "$TEXT/train${file_suffix}" \
    --validpref "$TEXT/valid${file_suffix}" \
    --testpref "$TEXT/test${file_suffix}" \
    --destdir "data-bin/iwslt14.tokenized.en-de${dest_suffix}" \
    --workers 20 \
    --tgtdict "$dict_dir/dict.de.txt" \
    --srcdict "$dict_dir/dict.en.txt"
}

# Base corpus first, then the _mt and _pe variants (same order as before).
for variant in "" mt pe; do
  preprocess_variant "$variant"
done