Skip to content

Commit

Permalink
adding scripts to get mt large dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
bmccann committed Feb 16, 2018
1 parent dc0b5e1 commit 7cb5432
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 0 deletions.
36 changes: 36 additions & 0 deletions get_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Fetch the Moses tokenizer, lowercaser, BLEU scorer and the de/en
# non-breaking-prefix lists into the current directory.
wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/tokenizer/tokenizer.perl
wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/tokenizer/lowercase.perl
# Strip the "/../share/nonbreaking_prefixes" suffix from the prefix directory
# inside tokenizer.perl so it looks for the nonbreaking_prefix.* files next to
# itself (they are downloaded just below). The pattern is single-quoted and
# fully literal: the previous double-quoted form only worked because the shell
# expanded an (unset) $RealBin to the empty string before sed saw it.
sed -i 's|/\.\./share/nonbreaking_prefixes||' tokenizer.perl
wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.de
wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.en
wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl

# Download the parallel training corpora and the dev sets into data/wmt17,
# unpack them, and gather every de-en related file into data/wmt17/de-en.
mkdir -p data/wmt17
# Stop here rather than download and unpack into the wrong directory.
cd data/wmt17 || exit 1
wget http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz
wget http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
wget http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz
wget http://data.statmt.org/wmt17/translation-task/rapid2016.tgz
wget http://data.statmt.org/wmt17/translation-task/dev.tgz
tar -xzf training-parallel-europarl-v7.tgz
tar -xzf training-parallel-commoncrawl.tgz
tar -xzf training-parallel-nc-v12.tgz
tar -xzf rapid2016.tgz
tar -xzf dev.tgz
# -p so that a re-run does not fail when the directory already exists.
mkdir -p de-en
# NOTE: this glob also matches the de-en directory itself; mv reports an error
# for that single entry and still moves the remaining matches.
mv *de-en* de-en
mv training/*de-en* de-en
mv dev/*deen* de-en
mv dev/*ende* de-en
mv dev/*.de de-en
mv dev/*.en de-en
# These two previously had no destination, so mv either failed or, when the
# glob expanded to exactly two files, overwrote the second with the first.
mv dev/newstest2009*.en* de-en
mv dev/news-test2008*.en* de-en

# Filter, tokenize and lowercase the corpora. The working directory is
# data/wmt17 at this point, so ../../ refers to the directory the tools were
# downloaded into.
# The trailing slash matters: wmt_clean.py builds its file names by appending
# the subset name directly to this prefix.
python ../../wmt_clean.py de-en/
# Tokenize every cleaned training file, per language.
for l in de en; do for f in de-en/*.clean.$l; do perl ../../tokenizer.perl -no-escape -l "$l" -q < "$f" > "$f.tok"; done; done
# Lowercase the tokenized training files.
for l in en de; do for f in de-en/*.clean.$l.tok; do perl ../../lowercase.perl < "$f" > "$f.low"; done; done
# Apply the same tokenize + lowercase steps to newstest2013.
for l in en de; do perl ../../tokenizer.perl -no-escape -l "$l" -q < "de-en/newstest2013.$l" > "de-en/newstest2013.$l.tok"; done
for l in en de; do perl ../../lowercase.perl < "de-en/newstest2013.$l.tok" > "de-en/newstest2013.$l.tok.low"; done
# Concatenate the four processed subsets into a single training file per language.
for l in en de; do cat de-en/commoncraw*clean.$l.tok.low de-en/europarl*.clean.$l.tok.low de-en/news-commentary*.clean.$l.tok.low de-en/rapid*.clean.$l.tok.low > "de-en/train.clean.$l.tok.low"; done
38 changes: 38 additions & 0 deletions wmt_clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os
from argparse import ArgumentParser
from collections import Counter

import pycld2
import unicodeblock.blocks

# Filter the parallel training subsets: a sentence pair is kept only when both
# sides pass the language and script checks below. For every subset the files
# <prefix>/<subset>.de-en.<lang> are read and <prefix>/<subset>.de-en.clean.<lang>
# are written.

parser = ArgumentParser()
# nargs='?' makes the positional optional; without it argparse treats the
# argument as required and the default is never used.
parser.add_argument('prefix', nargs='?', default='data/wmt17/de-en/',
                    help='directory containing the <subset>.de-en.<lang> files')
args = parser.parse_args()

langs = ('de', 'en')
lang_fix = '.' + '-'.join(langs)
subsets = 'commoncrawl', 'europarl-v7', 'news-commentary-v12', 'rapid2016'


def latin(s):
    """Return True if every character of ``s`` sits in an accepted block.

    A character is accepted when its unicodeblock name contains "LATIN",
    "PUNCT", "DIGIT" or "SPAC". Characters for which no block is reported
    (``None``) are ignored.
    """
    return all(
        "LATIN" in b or "PUNCT" in b or "DIGIT" in b or "SPAC" in b
        for b in map(unicodeblock.blocks.of, s)
        if b is not None
    )


def _good(line, lang):
    """Return True if the raw line (bytes) is acceptable text for ``lang``.

    The top language code reported by pycld2 must be ``lang`` or 'un', the
    decoded text must pass ``latin``, and the raw line must be longer than one
    byte (which rejects a line holding only a newline). Raises whatever
    pycld2 or the UTF-8 decode raise on malformed input.
    """
    return (
        pycld2.detect(line)[2][0][1] in [lang, 'un']
        and latin(line.decode())
        and len(line) > 1
    )


def good_src(s):
    """Check a source-side line against the first language in ``langs``."""
    return _good(s, langs[0])


def good_trg(s):
    """Check a target-side line against the second language in ``langs``."""
    return _good(s, langs[1])


for x in subsets:
    # os.path.join tolerates a prefix with or without a trailing separator;
    # plain concatenation produced e.g. "de-encommoncrawl.de-en" for "de-en".
    path_prefix = os.path.join(args.prefix, x + lang_fix)
    paths_in = [path_prefix + '.' + lang for lang in langs]
    paths_out = [path_prefix + '.clean.' + lang for lang in langs]

    with open(paths_in[0], 'rb') as src, \
            open(paths_in[1], 'rb') as trg, \
            open(paths_out[0], 'wb') as src_out, \
            open(paths_out[1], 'wb') as trg_out:
        for srcline, trgline in zip(src, trg):
            try:
                keep = good_src(srcline) and good_trg(trgline)
            except Exception:
                # Best-effort retry: re-encode both lines utf-8 -> latin-1
                # (presumably to undo double-encoded text) and check again.
                # Pairs that still fail are dropped. Catching Exception rather
                # than using a bare except lets Ctrl-C stop the job.
                try:
                    srcline = srcline.decode("utf-8").encode("latin-1")
                    trgline = trgline.decode("utf-8").encode("latin-1")
                    keep = good_src(srcline) and good_trg(trgline)
                except Exception:
                    continue
            # Writes happen outside the try blocks so that I/O errors
            # (e.g. a full disk) surface instead of being swallowed.
            if keep:
                src_out.write(srcline)
                trg_out.write(trgline)

0 comments on commit 7cb5432

Please sign in to comment.