tests/test_tokenization_fsmt.py

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import os
import unittest

from transformers.file_utils import cached_property
from transformers.testing_utils import slow
from transformers.tokenization_fsmt import VOCAB_FILES_NAMES, FSMTTokenizer

from .test_tokenization_common import TokenizerTesterMixin


class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = FSMTTokenizer

    def setUp(self):
        super().setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = [
            "l",
            "o",
            "w",
            "e",
            "r",
            "s",
            "t",
            "i",
            "d",
            "n",
            "w</w>",
            "r</w>",
            "t</w>",
            "lo",
            "low",
            "er</w>",
            "low</w>",
            "lowest</w>",
            "newer</w>",
            "wider</w>",
            "<unk>",
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

        self.langs = ["en", "ru"]
        config = {
            "langs": self.langs,
            "src_vocab_size": 10,
            "tgt_vocab_size": 20,
        }

        self.src_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"])
        self.tgt_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"])
        config_file = os.path.join(self.tmpdirname, "tokenizer_config.json")
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.src_vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.tgt_vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))
        with open(config_file, "w") as fp:
            fp.write(json.dumps(config))

    @cached_property
    def tokenizer_ru_en(self):
        return FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en")

    @cached_property
    def tokenizer_en_ru(self):
        return FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru")

    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        tokenizer = FSMTTokenizer(self.langs, self.src_vocab_file, self.tgt_vocab_file, self.merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_ru_en

        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == text + [2]
        assert encoded_pair == text + [2] + text_2 + [2]

    @slow
    def test_match_encode_decode(self):
        tokenizer_enc = self.tokenizer_en_ru
        tokenizer_dec = self.tokenizer_ru_en

        targets = [
            [
                "Here's a little song I wrote. Don't worry, be happy.",
                [2470, 39, 11, 2349, 7222, 70, 5979, 7, 8450, 1050, 13160, 5, 26, 6445, 7, 2],
            ],
            ["This is it. No more. I'm done!", [132, 21, 37, 7, 1434, 86, 7, 70, 6476, 1305, 427, 2]],
        ]

        # if data needs to be recreated or added, run:
        # import torch
        # model = torch.hub.load("pytorch/fairseq", "transformer.wmt19.en-ru", checkpoint_file="model4.pt", tokenizer="moses", bpe="fastbpe")
        # for src_text, _ in targets: print(f"""[\n"{src_text}",\n {model.encode(src_text).tolist()}\n],""")

        for src_text, tgt_input_ids in targets:
            input_ids = tokenizer_enc.encode(src_text, return_tensors="pt")[0].tolist()
            self.assertListEqual(input_ids, tgt_input_ids)

            # and decode backward, using the reversed languages model
            decoded_text = tokenizer_dec.decode(input_ids, skip_special_tokens=True)
            self.assertEqual(decoded_text, src_text)

    @unittest.skip("FSMTConfig.__init__  requires non-optional args")
    def test_torch_encode_plus_sent_to_model(self):
        pass

    @unittest.skip("FSMTConfig.__init__  requires non-optional args")
    def test_np_encode_plus_sent_to_model(self):
        pass