From 3d9c62f2aca9492db5c22676416974005b9dcbae Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 15 Jun 2017 11:27:28 -0700 Subject: [PATCH] initial push --- AUTHORS | 7 + CONTRIBUTING.md | 23 + LICENSE | 202 +++ README.md | 103 +- setup.py | 23 + tensor2tensor/__init__.py | 14 + tensor2tensor/bin/t2t-datagen | 361 +++++ tensor2tensor/bin/t2t-trainer | 56 + tensor2tensor/data_generators/README.md | 71 + tensor2tensor/data_generators/__init__.py | 14 + tensor2tensor/data_generators/algorithmic.py | 178 +++ .../data_generators/algorithmic_math.py | 580 +++++++ .../data_generators/algorithmic_math_test.py | 84 ++ .../data_generators/algorithmic_test.py | 84 ++ tensor2tensor/data_generators/audio.py | 156 ++ tensor2tensor/data_generators/audio_test.py | 62 + .../data_generators/concatenate_examples.py | 180 +++ .../data_generators/generator_utils.py | 264 ++++ .../data_generators/generator_utils_test.py | 88 ++ tensor2tensor/data_generators/image.py | 306 ++++ tensor2tensor/data_generators/image_test.py | 71 + tensor2tensor/data_generators/lm_example.py | 123 ++ .../data_generators/problem_hparams.py | 702 +++++++++ .../data_generators/problem_hparams_test.py | 48 + tensor2tensor/data_generators/replace_oov.py | 76 + tensor2tensor/data_generators/snli.py | 167 ++ tensor2tensor/data_generators/text_encoder.py | 451 ++++++ .../text_encoder_build_subword.py | 67 + .../text_encoder_inspect_subword.py | 64 + tensor2tensor/data_generators/tokenizer.py | 117 ++ .../data_generators/tokenizer_test.py | 64 + tensor2tensor/data_generators/wmt.py | 269 ++++ tensor2tensor/data_generators/wmt_test.py | 72 + tensor2tensor/data_generators/wsj_parsing.py | 109 ++ tensor2tensor/models/README.md | 16 + tensor2tensor/models/__init__.py | 14 + tensor2tensor/models/attention_lm.py | 169 +++ tensor2tensor/models/baseline.py | 72 + tensor2tensor/models/baseline_test.py | 55 + tensor2tensor/models/bytenet.py | 112 ++ tensor2tensor/models/bytenet_test.py | 54 + tensor2tensor/models/common_attention.py | 344 +++++ tensor2tensor/models/common_hparams.py | 193 +++ tensor2tensor/models/common_layers.py | 1340 +++++++++++++++++ tensor2tensor/models/common_layers_test.py | 290 ++++ tensor2tensor/models/models.py | 32 + tensor2tensor/models/multimodel.py | 159 ++ tensor2tensor/models/multimodel_test.py | 55 + tensor2tensor/models/neural_gpu.py | 123 ++ tensor2tensor/models/neural_gpu_test.py | 62 + tensor2tensor/models/slicenet.py | 391 +++++ tensor2tensor/models/slicenet_test.py | 54 + tensor2tensor/models/transformer.py | 495 ++++++ tensor2tensor/models/transformer_test.py | 63 + tensor2tensor/models/xception.py | 89 ++ tensor2tensor/models/xception_test.py | 54 + tensor2tensor/utils/__init__.py | 14 + tensor2tensor/utils/avg_checkpoints.py | 98 ++ tensor2tensor/utils/beam_search.py | 419 ++++++ tensor2tensor/utils/beam_search_test.py | 281 ++++ tensor2tensor/utils/bleu_hook.py | 123 ++ tensor2tensor/utils/bleu_hook_test.py | 59 + tensor2tensor/utils/data_reader.py | 346 +++++ tensor2tensor/utils/data_reader_test.py | 147 ++ tensor2tensor/utils/expert_utils.py | 1284 ++++++++++++++++ tensor2tensor/utils/metrics.py | 155 ++ tensor2tensor/utils/metrics_test.py | 88 ++ tensor2tensor/utils/modality.py | 564 +++++++ tensor2tensor/utils/modality_test.py | 88 ++ tensor2tensor/utils/registry.py | 184 +++ tensor2tensor/utils/registry_test.py | 202 +++ tensor2tensor/utils/t2t_model.py | 429 ++++++ tensor2tensor/utils/trainer_utils.py | 1302 ++++++++++++++++ tensor2tensor/utils/trainer_utils_test.py | 41 + 74 files changed, 15315 
insertions(+), 1 deletion(-) create mode 100644 AUTHORS create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 setup.py create mode 100644 tensor2tensor/__init__.py create mode 100644 tensor2tensor/bin/t2t-datagen create mode 100644 tensor2tensor/bin/t2t-trainer create mode 100644 tensor2tensor/data_generators/README.md create mode 100644 tensor2tensor/data_generators/__init__.py create mode 100644 tensor2tensor/data_generators/algorithmic.py create mode 100644 tensor2tensor/data_generators/algorithmic_math.py create mode 100644 tensor2tensor/data_generators/algorithmic_math_test.py create mode 100644 tensor2tensor/data_generators/algorithmic_test.py create mode 100644 tensor2tensor/data_generators/audio.py create mode 100644 tensor2tensor/data_generators/audio_test.py create mode 100644 tensor2tensor/data_generators/concatenate_examples.py create mode 100644 tensor2tensor/data_generators/generator_utils.py create mode 100644 tensor2tensor/data_generators/generator_utils_test.py create mode 100644 tensor2tensor/data_generators/image.py create mode 100644 tensor2tensor/data_generators/image_test.py create mode 100644 tensor2tensor/data_generators/lm_example.py create mode 100644 tensor2tensor/data_generators/problem_hparams.py create mode 100644 tensor2tensor/data_generators/problem_hparams_test.py create mode 100644 tensor2tensor/data_generators/replace_oov.py create mode 100644 tensor2tensor/data_generators/snli.py create mode 100644 tensor2tensor/data_generators/text_encoder.py create mode 100644 tensor2tensor/data_generators/text_encoder_build_subword.py create mode 100644 tensor2tensor/data_generators/text_encoder_inspect_subword.py create mode 100644 tensor2tensor/data_generators/tokenizer.py create mode 100644 tensor2tensor/data_generators/tokenizer_test.py create mode 100644 tensor2tensor/data_generators/wmt.py create mode 100644 tensor2tensor/data_generators/wmt_test.py create mode 100644 tensor2tensor/data_generators/wsj_parsing.py create mode 100644 tensor2tensor/models/README.md create mode 100644 tensor2tensor/models/__init__.py create mode 100644 tensor2tensor/models/attention_lm.py create mode 100644 tensor2tensor/models/baseline.py create mode 100644 tensor2tensor/models/baseline_test.py create mode 100644 tensor2tensor/models/bytenet.py create mode 100644 tensor2tensor/models/bytenet_test.py create mode 100644 tensor2tensor/models/common_attention.py create mode 100644 tensor2tensor/models/common_hparams.py create mode 100644 tensor2tensor/models/common_layers.py create mode 100644 tensor2tensor/models/common_layers_test.py create mode 100644 tensor2tensor/models/models.py create mode 100644 tensor2tensor/models/multimodel.py create mode 100644 tensor2tensor/models/multimodel_test.py create mode 100644 tensor2tensor/models/neural_gpu.py create mode 100644 tensor2tensor/models/neural_gpu_test.py create mode 100644 tensor2tensor/models/slicenet.py create mode 100644 tensor2tensor/models/slicenet_test.py create mode 100644 tensor2tensor/models/transformer.py create mode 100644 tensor2tensor/models/transformer_test.py create mode 100644 tensor2tensor/models/xception.py create mode 100644 tensor2tensor/models/xception_test.py create mode 100644 tensor2tensor/utils/__init__.py create mode 100644 tensor2tensor/utils/avg_checkpoints.py create mode 100644 tensor2tensor/utils/beam_search.py create mode 100644 tensor2tensor/utils/beam_search_test.py create mode 100644 tensor2tensor/utils/bleu_hook.py create mode 100644 tensor2tensor/utils/bleu_hook_test.py 
create mode 100644 tensor2tensor/utils/data_reader.py create mode 100644 tensor2tensor/utils/data_reader_test.py create mode 100644 tensor2tensor/utils/expert_utils.py create mode 100644 tensor2tensor/utils/metrics.py create mode 100644 tensor2tensor/utils/metrics_test.py create mode 100644 tensor2tensor/utils/modality.py create mode 100644 tensor2tensor/utils/modality_test.py create mode 100644 tensor2tensor/utils/registry.py create mode 100644 tensor2tensor/utils/registry_test.py create mode 100644 tensor2tensor/utils/t2t_model.py create mode 100644 tensor2tensor/utils/trainer_utils.py create mode 100644 tensor2tensor/utils/trainer_utils_test.py diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..38e5bc724 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,7 @@ +# This is the list of T2T authors for copyright purposes. +# +# This does not necessarily list everyone who has contributed code, since in +# some cases, their employer may be the copyright holder. To see the full list +# of contributors, see the revision history in source control. + +Google Inc. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..ae319c70a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,23 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to <https://cla.developers.google.com/> to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files.
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 1a992ff9a..1a650a5c2 100644 --- a/README.md +++ b/README.md @@ -1 +1,102 @@ -# tensor2tensor +# T2T: Tensor2Tensor Transformers + +[T2T](https://github.com/tensorflow/tensor2tensor) is a modular and extensible library and +set of binaries for supervised learning with TensorFlow, with a focus on sequence +tasks. Actively used and maintained by researchers and engineers within Google +Brain, T2T strives to maximize idea bandwidth and minimize execution latency. + +T2T is particularly well-suited to researchers working on sequence tasks. We're +eager to collaborate with you on extending T2T's powers, so please feel free to +open an issue on GitHub to kick off a discussion and send along pull requests. +See [our contribution doc](CONTRIBUTING.md) for details and our +[open issues](https://github.com/tensorflow/tensor2tensor/issues).
+ +## T2T overview + +``` +pip install tensor2tensor + +DATA_DIR=$HOME/data +PROBLEM=wmt_ende_tokens_32k +MODEL=transformer +HPARAMS=transformer_base +TRAIN_DIR=$HOME/train + +# Generate data +t2t-datagen \ + --data_dir=$DATA_DIR \ + --problem=$PROBLEM + +# Train +t2t-trainer \ + --data_dir=$DATA_DIR \ + --problems=$PROBLEM \ + --model=$MODEL \ + --hparams_set=$HPARAMS \ + --output_dir=$TRAIN_DIR + +# Decode +t2t-trainer \ + --data_dir=$DATA_DIR \ + --problems=$PROBLEM \ + --model=$MODEL \ + --hparams_set=$HPARAMS \ + --output_dir=$TRAIN_DIR \ + --decode_from_file=$DATA_DIR/decode_this.txt +``` + +T2T modularizes training into several components, each of which can be seen in +use in the above commands. + +### Datasets + +**Datasets** are all standardized on TFRecord files with `tensorflow.Example` +protocol buffers. All datasets are registered and generated with the +[`t2t-datagen`](tensor2tensor/bin/t2t-datagen) binary and many common +sequence datasets are already available for generation and use. + +### Problems and Modalities + +**Problems** define training-time hyperparameters for the dataset and task, +mainly by setting input and output **modalities** (e.g. symbol, image, audio, +label) and vocabularies, if applicable. All problems are defined in +[`problem_hparams.py`](tensor2tensor/data_generators/problem_hparams.py). **Modalities**, +defined in [`modality.py`](tensor2tensor/utils/modality.py), abstract away the input and +output data types so that **models** may deal with modality-independent tensors. + +### Models + +**`T2TModel`s** define the core tensor-to-tensor transformation, independent of +input/output modality or task. Models take dense tensors in and produce dense +tensors that may then be transformed in a final step by a **modality** depending +on the task (e.g. fed through a final linear transform to produce logits for a +softmax over classes). All models are imported in +[`models.py`](tensor2tensor/models/models.py), inherit from `T2TModel` - defined in +[`t2t_model.py`](tensor2tensor/utils/t2t_model.py) - and are registered with +[`@registry.register_model`](tensor2tensor/utils/registry.py). + +### Hyperparameter Sets + +**Hyperparameter sets** are defined and registered in code with +[`@registry.register_hparams`](tensor2tensor/utils/registry.py) and are encoded in +[`tf.contrib.training.HParams`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/training/python/training/hparam.py) +objects. The `HParams` are available to both the problem specification and the +model. A basic set of hyperparameters is defined in +[`common_hparams.py`](tensor2tensor/models/common_hparams.py) and hyperparameter set +functions can compose other hyperparameter set functions (see the example +sketch at the end of this README). + +### Trainer + +The **trainer** binary is the main entrypoint for training, evaluation, and +inference. Users can easily switch between problems, models, and hyperparameter +sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific +hyperparameters can be overridden with the `--hparams` flag. `--schedule` and +related flags control local and distributed training/evaluation. + +## Adding a dataset + +See the data generators [README](tensor2tensor/data_generators/README.md).
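+
+## Example: registering a hyperparameter set
+
+To make the registry mechanism above concrete, here is a minimal sketch of a
+custom hyperparameter set. It relies only on what is described above: that
+`transformer_base` is a registered hyperparameter set function and that such
+functions can compose one another. The module path, the name
+`transformer_base_batch8k`, and the overridden value are illustrative, not
+part of the library.
+
+```
+# Hypothetical module, e.g. tensor2tensor/models/my_hparams.py
+from tensor2tensor.models import transformer
+from tensor2tensor.utils import registry
+
+
+@registry.register_hparams
+def transformer_base_batch8k():
+  """transformer_base with a larger batch size (illustrative override)."""
+  hparams = transformer.transformer_base()  # compose an existing hparams set
+  hparams.batch_size = 8192  # override a single hyperparameter
+  return hparams
+```
+
+After the defining module is imported (e.g. from `models.py`), the new set can
+be selected with `--hparams_set=transformer_base_batch8k`, and individual
+values can still be overridden at run time with the `--hparams` flag.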
+ +--- + +*Note: This is not an official Google product.* diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..cac1a5125 --- /dev/null +++ b/setup.py @@ -0,0 +1,23 @@ +"""Install tensor2tensor.""" + +from distutils.core import setup + +setup( + name='tensor2tensor', + version='1.0', + description='Tensor2Tensor', + author='Google Inc.', + author_email='no-reply@google.com', + url='http://github.com/tensorflow/tensor2tensor', + license='Apache 2.0', + packages=[ + 'tensor2tensor', 'tensor2tensor.utils', 'tensor2tensor.data_generators', + 'tensor2tensor.models' + ], + scripts=['tensor2tensor/bin/t2t-trainer', 'tensor2tensor/bin/t2t-datagen'], + install_requires=[ + 'numpy', + 'sympy', + 'six', + 'tensorflow-gpu>=1.2.0rc1', + ],) diff --git a/tensor2tensor/__init__.py b/tensor2tensor/__init__.py new file mode 100644 index 000000000..27d533abc --- /dev/null +++ b/tensor2tensor/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen new file mode 100644 index 000000000..002544052 --- /dev/null +++ b/tensor2tensor/bin/t2t-datagen @@ -0,0 +1,361 @@ +#!/usr/bin/env python +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Produces the training and dev data for --problem into --data_dir. + +This binary produces sharded and shuffled TFRecord files of tensorflow.Example +protocol buffers for a variety of datasets registered in this file. + +All datasets are registered in _SUPPORTED_PROBLEM_GENERATORS. Each entry maps a +string name (selectable on the command-line with --problem) to a function that +takes 2 arguments - input_directory and mode (one of "train" or "dev") - and +yields for each training example a dictionary mapping string feature names to +lists of {string, int, float}. The generator will be run once for each mode.
+""" + +import random +import tempfile + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import algorithmic +from tensor2tensor.data_generators import algorithmic_math +from tensor2tensor.data_generators import audio +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import image +from tensor2tensor.data_generators import snli +from tensor2tensor.data_generators import wmt +from tensor2tensor.data_generators import wsj_parsing + +import tensorflow as tf + +flags = tf.flags +FLAGS = flags.FLAGS + +flags.DEFINE_string("data_dir", "", "Data directory.") +flags.DEFINE_string("tmp_dir", + tempfile.gettempdir(), "Temporary storage directory.") +flags.DEFINE_string("problem", "", + "The name of the problem to generate data for.") +flags.DEFINE_integer("num_shards", 1, "How many shards to use.") +flags.DEFINE_integer("max_cases", 0, + "Maximum number of cases to generate (unbounded if 0).") +flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") + +# Mapping from problems that we can generate data for to their generators. +# pylint: disable=g-long-lambda +_SUPPORTED_PROBLEM_GENERATORS = { + "algorithmic_identity_binary40": ( + lambda: algorithmic.identity_generator(2, 40, 100000), + lambda: algorithmic.identity_generator(2, 400, 10000)), + "algorithmic_identity_decimal40": ( + lambda: algorithmic.identity_generator(10, 40, 100000), + lambda: algorithmic.identity_generator(10, 400, 10000)), + "algorithmic_shift_decimal40": ( + lambda: algorithmic.shift_generator(20, 10, 40, 100000), + lambda: algorithmic.shift_generator(20, 10, 80, 10000)), + "algorithmic_reverse_binary40": ( + lambda: algorithmic.reverse_generator(2, 40, 100000), + lambda: algorithmic.reverse_generator(2, 400, 10000)), + "algorithmic_reverse_decimal40": ( + lambda: algorithmic.reverse_generator(10, 40, 100000), + lambda: algorithmic.reverse_generator(10, 400, 10000)), + "algorithmic_addition_binary40": ( + lambda: algorithmic.addition_generator(2, 40, 100000), + lambda: algorithmic.addition_generator(2, 400, 10000)), + "algorithmic_addition_decimal40": ( + lambda: algorithmic.addition_generator(10, 40, 100000), + lambda: algorithmic.addition_generator(10, 400, 10000)), + "algorithmic_multiplication_binary40": ( + lambda: algorithmic.multiplication_generator(2, 40, 100000), + lambda: algorithmic.multiplication_generator(2, 400, 10000)), + "algorithmic_multiplication_decimal40": ( + lambda: algorithmic.multiplication_generator(10, 40, 100000), + lambda: algorithmic.multiplication_generator(10, 400, 10000)), + "algorithmic_algebra_inverse": ( + lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), + lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), + "algorithmic_algebra_simplify": ( + lambda: algorithmic_math.algebra_simplify(8, 0, 2, 100000), + lambda: algorithmic_math.algebra_simplify(8, 3, 3, 10000)), + "algorithmic_calculus_integrate": ( + lambda: algorithmic_math.calculus_integrate(8, 0, 2, 100000), + lambda: algorithmic_math.calculus_integrate(8, 3, 3, 10000)), + "wmt_parsing_characters": ( + lambda: wmt.parsing_character_generator(FLAGS.tmp_dir, True), + lambda: wmt.parsing_character_generator(FLAGS.tmp_dir, False)), + "wmt_parsing_tokens_8k": ( + lambda: wmt.parsing_token_generator(FLAGS.tmp_dir, True, 2**13), + lambda: wmt.parsing_token_generator(FLAGS.tmp_dir, False, 2**13)), + "wsj_parsing_tokens_16k": ( + lambda: wsj_parsing.parsing_token_generator(FLAGS.tmp_dir, True, + 2**14, 2**9), + lambda: 
wsj_parsing.parsing_token_generator(FLAGS.tmp_dir, False, + 2**14, 2**9)), + "wsj_parsing_tokens_32k": ( + lambda: wsj_parsing.parsing_token_generator(FLAGS.tmp_dir, True, + 2**15, 2**9), + lambda: wsj_parsing.parsing_token_generator(FLAGS.tmp_dir, False, + 2**15, 2**9)), + "wmt_enfr_characters": ( + lambda: wmt.enfr_character_generator(FLAGS.tmp_dir, True), + lambda: wmt.enfr_character_generator(FLAGS.tmp_dir, False)), + "wmt_enfr_tokens_8k": ( + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**13), + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**13) + ), + "wmt_enfr_tokens_32k": ( + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**15), + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**15) + ), + "wmt_enfr_tokens_128k": ( + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**17), + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**17) + ), + "wmt_ende_characters": ( + lambda: wmt.ende_character_generator(FLAGS.tmp_dir, True), + lambda: wmt.ende_character_generator(FLAGS.tmp_dir, False)), + "wmt_ende_bpe32k": ( + lambda: wmt.ende_bpe_token_generator(FLAGS.tmp_dir, True), + lambda: wmt.ende_bpe_token_generator(FLAGS.tmp_dir, False)), + "wmt_ende_tokens_8k": ( + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**13), + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**13) + ), + "wmt_ende_tokens_32k": ( + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**15), + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**15) + ), + "wmt_ende_tokens_128k": ( + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**17), + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**17) + ), + "image_mnist_tune": ( + lambda: image.mnist_generator(FLAGS.tmp_dir, True, 55000), + lambda: image.mnist_generator(FLAGS.tmp_dir, True, 5000, 55000)), + "image_mnist_test": ( + lambda: image.mnist_generator(FLAGS.tmp_dir, True, 60000), + lambda: image.mnist_generator(FLAGS.tmp_dir, False, 10000)), + "image_cifar10_tune": ( + lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 48000), + lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 2000, 48000)), + "image_cifar10_test": ( + lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 50000), + lambda: image.cifar10_generator(FLAGS.tmp_dir, False, 10000)), + "image_mscoco_characters_tune": ( + lambda: image.mscoco_generator(FLAGS.tmp_dir, True, 70000), + lambda: image.mscoco_generator(FLAGS.tmp_dir, True, 10000, 70000)), + "image_mscoco_characters_test": ( + lambda: image.mscoco_generator(FLAGS.tmp_dir, True, 80000), + lambda: image.mscoco_generator(FLAGS.tmp_dir, False, 40000)), + "image_mscoco_tokens_8k_tune": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 70000, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 10000, + 70000, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13)), + "image_mscoco_tokens_8k_test": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 80000, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + False, + 40000, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13)), + "image_mscoco_tokens_32k_tune": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 70000, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15), + lambda: 
image.mscoco_generator( + FLAGS.tmp_dir, + True, + 10000, + 70000, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15)), + "image_mscoco_tokens_32k_test": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 80000, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + False, + 40000, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15)), + "image_mscoco_tokens_128k_tune": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 70000, + vocab_filename="tokens.vocab.%d" % 2**17, + vocab_size=2**17), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 10000, + 70000, + vocab_filename="tokens.vocab.%d" % 2**17, + vocab_size=2**17)), + "image_mscoco_tokens_128k_test": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 80000, + vocab_filename="tokens.vocab.%d" % 2**17, + vocab_size=2**17), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + False, + 40000, + vocab_filename="tokens.vocab.%d" % 2**17, + vocab_size=2**17)), + "snli_32k": ( + lambda: snli.snli_token_generator(FLAGS.tmp_dir, True, 2**15), + lambda: snli.snli_token_generator(FLAGS.tmp_dir, False, 2**15), + ), + "audio_timit_characters_tune": ( + lambda: audio.timit_generator(FLAGS.tmp_dir, True, 1374), + lambda: audio.timit_generator(FLAGS.tmp_dir, True, 344, 1374)), + "audio_timit_characters_test": ( + lambda: audio.timit_generator(FLAGS.tmp_dir, True, 1718), + lambda: audio.timit_generator(FLAGS.tmp_dir, False, 626)), + "audio_timit_tokens_8k_tune": ( + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 1374, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13), + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 344, + 1374, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13)), + "audio_timit_tokens_8k_test": ( + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 1718, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13), + lambda: audio.timit_generator( + FLAGS.tmp_dir, + False, + 626, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13)), + "audio_timit_tokens_32k_tune": ( + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 1374, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15), + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 344, + 1374, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15)), + "audio_timit_tokens_32k_test": ( + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 1718, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15), + lambda: audio.timit_generator( + FLAGS.tmp_dir, + False, + 626, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15)), +} + +# pylint: enable=g-long-lambda + +UNSHUFFLED_SUFFIX = "-unshuffled" + + +def set_random_seed(): + """Set the random seed from flag everywhere.""" + tf.set_random_seed(FLAGS.random_seed) + random.seed(FLAGS.random_seed) + np.random.seed(FLAGS.random_seed) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + if FLAGS.problem not in _SUPPORTED_PROBLEM_GENERATORS: + problems_str = "\n * ".join(sorted(_SUPPORTED_PROBLEM_GENERATORS)) + error_msg = ("You must specify one of the supported problems to " + "generate data for:\n * " + problems_str + "\n") + raise ValueError(error_msg) + + if not FLAGS.data_dir: + FLAGS.data_dir = tempfile.gettempdir() + tf.logging.warning("It is strongly recommended to specify --data_dir. 
" + "Data will be written to default data_dir=%s.", + FLAGS.data_dir) + + set_random_seed() + + training_gen, dev_gen = _SUPPORTED_PROBLEM_GENERATORS[FLAGS.problem] + + tf.logging.info("Generating training data for %s.", FLAGS.problem) + train_output_files = generator_utils.generate_files( + training_gen(), FLAGS.problem + UNSHUFFLED_SUFFIX + "-train", + FLAGS.data_dir, FLAGS.num_shards, FLAGS.max_cases) + + tf.logging.info("Generating development data for %s.", FLAGS.problem) + dev_output_files = generator_utils.generate_files( + dev_gen(), FLAGS.problem + UNSHUFFLED_SUFFIX + "-dev", FLAGS.data_dir, 1) + + tf.logging.info("Shuffling data...") + for fname in train_output_files + dev_output_files: + records = generator_utils.read_records(fname) + random.shuffle(records) + out_fname = fname.replace(UNSHUFFLED_SUFFIX, "") + generator_utils.write_records(records, out_fname) + tf.gfile.Remove(fname) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer new file mode 100644 index 000000000..c14fac783 --- /dev/null +++ b/tensor2tensor/bin/t2t-trainer @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Trainer for T2T models. + +This binary perform training, evaluation, and inference using +the Estimator API with tf.learn Experiment objects. + +To train your model, for example: + t2t-trainer \ + --data_dir ~/data \ + --problems=algorithmic_identity_binary40 \ + --model=transformer + --hparams_set=transformer_base +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.utils import trainer_utils as utils + +import tensorflow as tf + +FLAGS = tf.flags.FLAGS + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + utils.log_registry() + utils.validate_flags() + # TODO(rsepassi): Document distributed training + utils.run( + data_dir=FLAGS.data_dir, + model=FLAGS.model, + output_dir=FLAGS.output_dir, + train_steps=FLAGS.train_steps, + eval_steps=FLAGS.eval_steps, + schedule=FLAGS.schedule) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/README.md b/tensor2tensor/data_generators/README.md new file mode 100644 index 000000000..813eb4f7e --- /dev/null +++ b/tensor2tensor/data_generators/README.md @@ -0,0 +1,71 @@ +# Data generators for T2T models. + +This directory contains data generators for a number of problems. We use a +naming scheme for the problems, they have names of the form +`[task-family]_[task]_[specifics]`. Data for all currently supported problems +can be generated by calling the main generator binary (`t2t-datagen`). 
For +example: + +``` +t2t-datagen \ + --problem=algorithmic_identity_binary40 \ + --data_dir=/tmp +``` + +will generate training and development data for the algorithmic copy task - +`/tmp/algorithmic_identity_binary40-dev-00000-of-00001` and +`/tmp/algorithmic_identity_binary40-train-00000-of-00001`. +All tasks produce TFRecord files of `tensorflow.Example` protocol buffers. + + +## Adding a new problem + +1. Implement and register a Python generator for the dataset +1. Add a problem specification to `problem_hparams.py` specifying input and + output modalities + +To add a new problem, you first need to create python generators for training +and development data for the problem. The python generators should yield +dictionaries with string keys and values being lists of {int, float, str}. +Here is a very simple generator for a data-set where inputs are lists of 1s with +length upto 100 and targets are lists of length 1 with an integer denoting the +length of the input list. + +``` +def length_generator(nbr_cases): + for _ in xrange(nbr_cases): + length = np.random.randint(100) + 1 + yield {"inputs": [1] * length, "targets": [length]} +``` + +Note that our data reader uses 0 for padding, so it is a good idea to never +generate 0s, except if all your examples have the same size (in which case +they'll never be padded anyway) or if you're doing padding on your own (in which +case please use 0s for padding). When adding the python generator function, +please also add unit tests to check if the code runs. + +The generator can do arbitrary setup before beginning to yield examples - for +example, downloading data, generating vocabulary files, etc. + +Some examples: + +* [Algorithmic generators](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/algorithmic.py) + and their [unit tests](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/algorithmic_test.py) +* [WMT generators](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/wmt.py) + and their [unit tests](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/wmt_test.py) + +When your python generator is ready and tested, add it to the +`_SUPPORTED_PROBLEM_GENERATORS` dictionary in +[generator.py](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/generator.py). +The keys are problem names, and the values are pairs of (training-set-generator +function, dev-set-generator function). For the generator above, one could add +the following lines: + +``` + "algorithmic_length_upto100": + (lambda: algorithmic.length_generator(10000), + lambda: algorithmic.length_generator(1000)), +``` + +Note the lambdas above: we don't want to call the generators too early. + diff --git a/tensor2tensor/data_generators/__init__.py b/tensor2tensor/data_generators/__init__.py new file mode 100644 index 000000000..27d533abc --- /dev/null +++ b/tensor2tensor/data_generators/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py new file mode 100644 index 000000000..46ebb27a3 --- /dev/null +++ b/tensor2tensor/data_generators/algorithmic.py @@ -0,0 +1,178 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Algorithmic data generators.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from six.moves import xrange # pylint: disable=redefined-builtin + + +def identity_generator(nbr_symbols, max_length, nbr_cases): + """Generator for the identity (copy) task on sequences of symbols. + + The length of the sequence is drawn uniformly at random from [1, max_length] + and then symbols are drawn uniformly at random from [1, nbr_symbols] until + nbr_cases sequences have been produced. + + Args: + nbr_symbols: number of symbols to use in each sequence. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list and target-list are the same. + """ + for _ in xrange(nbr_cases): + l = np.random.randint(max_length) + 1 + inputs = [np.random.randint(nbr_symbols) + 1 for _ in xrange(l)] + yield {"inputs": inputs, "targets": inputs} + + +def shift_generator(nbr_symbols, shift, max_length, nbr_cases): + """Generator for the shift task on sequences of symbols. + + The length of the sequence is drawn uniformly at random from [1, max_length] + and then symbols are drawn uniformly at random from [1, nbr_symbols - shift] + until nbr_cases sequences have been produced (output[i] = input[i] + shift). + + Args: + nbr_symbols: number of symbols to use in each sequence (input + output). + shift: by how much to shift the input. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + target-list[i] = input-list[i] + shift. + """ + for _ in xrange(nbr_cases): + l = np.random.randint(max_length) + 1 + inputs = [np.random.randint(nbr_symbols - shift) + 1 for _ in xrange(l)] + yield {"inputs": inputs, "targets": [i + shift for i in inputs]} + + +def reverse_generator(nbr_symbols, max_length, nbr_cases): + """Generator for the reversing task on sequences of symbols. + + The length of the sequence is drawn uniformly at random from [1, max_length] + and then symbols are drawn uniformly at random from [1, nbr_symbols] until + nbr_cases sequences have been produced. + + Args: + nbr_symbols: number of symbols to use in each sequence. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. 
+ + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + target-list is input-list reversed. + """ + for _ in xrange(nbr_cases): + l = np.random.randint(max_length) + 1 + inputs = [np.random.randint(nbr_symbols) + 1 for _ in xrange(l)] + yield {"inputs": inputs, "targets": list(reversed(inputs))} + + +def lower_endian_to_number(l, base): + """Helper function: convert a list of digits in the given base to a number.""" + return sum([d * (base**i) for i, d in enumerate(l)]) + + +def number_to_lower_endian(n, base): + """Helper function: convert a number to a list of digits in the given base.""" + if n < base: + return [n] + return [n % base] + number_to_lower_endian(n // base, base) + + +def random_number_lower_endian(length, base): + """Helper function: generate a random number as a lower-endian digits list.""" + if length == 1: # Last digit can be 0 only if length is 1. + return [np.random.randint(base)] + prefix = [np.random.randint(base) for _ in xrange(length - 1)] + return prefix + [np.random.randint(base - 1) + 1] # Last digit is not 0. + + +def addition_generator(base, max_length, nbr_cases): + """Generator for the addition task. + + The length of each number is drawn uniformly at random from [1, max_length/2] + and then digits are drawn uniformly at random. The numbers are added and + separated by [base+1] in the input. Stops at nbr_cases. + + Args: + base: in which base are the numbers. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list are the 2 numbers and target-list is the result of adding them. + + Raises: + ValueError: if max_length is lower than 3. + """ + if max_length < 3: + raise ValueError("Maximum length must be at least 3.") + for _ in xrange(nbr_cases): + l1 = np.random.randint(max_length // 2) + 1 + l2 = np.random.randint(max_length - l1 - 1) + 1 + n1 = random_number_lower_endian(l1, base) + n2 = random_number_lower_endian(l2, base) + result = lower_endian_to_number(n1, base) + lower_endian_to_number(n2, base) + # We shift digits by 1 on input and output to leave 0 for padding. + inputs = [i + 1 for i in n1] + [base + 1] + [i + 1 for i in n2] + targets = [i + 1 for i in number_to_lower_endian(result, base)] + yield {"inputs": inputs, "targets": targets} + + +def multiplication_generator(base, max_length, nbr_cases): + """Generator for the multiplication task. + + The length of each number is drawn uniformly at random from [1, max_length/2] + and then digits are drawn uniformly at random. The numbers are multiplied + and separated by [base+1] in the input. Stops at nbr_cases. + + Args: + base: in which base are the numbers. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list are the 2 numbers and target-list is the result of multiplying + them. + + Raises: + ValueError: if max_length is lower than 3. + """ + if max_length < 3: + raise ValueError("Maximum length must be at least 3.") + for _ in xrange(nbr_cases): + l1 = np.random.randint(max_length // 2) + 1 + l2 = np.random.randint(max_length - l1 - 1) + 1 + n1 = random_number_lower_endian(l1, base) + n2 = random_number_lower_endian(l2, base) + result = lower_endian_to_number(n1, base) * lower_endian_to_number(n2, base) + # We shift digits by 1 on input and output to leave 0 for padding. 
+ inputs = [i + 1 for i in n1] + [base + 1] + [i + 1 for i in n2] + targets = [i + 1 for i in number_to_lower_endian(result, base)] + yield {"inputs": inputs, "targets": targets} diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py new file mode 100644 index 000000000..932c080e1 --- /dev/null +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -0,0 +1,580 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Algorithmic data generators for symbolic math tasks. + +See go/symbolic-math-dataset +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import namedtuple +import random + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import sympy + + +class ExprOp(object): + """Represents an algebraic operation, such as '+', '-', etc.""" + + def __init__(self, symbol, precedence, associative=False): + """Constructor. + + Args: + symbol: The character which represents this operation, such as '+' for + addition. + precedence: Operator precedence. This will determine where parentheses + are used. + associative: If true, the order of the operands does not matter. + """ + self.symbol = symbol + self.precedence = precedence + self.associative = associative + + def __str__(self): + return self.symbol + + def __eq__(self, other): + return isinstance(other, ExprOp) and self.symbol == other.symbol + + +class ExprNode(object): + """A node in an expression tree. + + ExprNode always holds an operator. Leaves are strings. + """ + + def __init__(self, left, right, op): + self.left = left + self.right = right + self.op = op + left_depth = left.depth if isinstance(left, ExprNode) else 0 + right_depth = right.depth if isinstance(right, ExprNode) else 0 + self.depth = max(left_depth, right_depth) + 1 + + def __str__(self): + left_str = str(self.left) + right_str = str(self.right) + left_use_parens = (isinstance(self.left, ExprNode) and + self.left.op.precedence < self.op.precedence) + right_use_parens = (isinstance(self.right, ExprNode) and + self.right.op.precedence <= self.op.precedence and + not (self.op.associative and self.right.op == self.op)) + left_final = "(" + left_str + ")" if left_use_parens else left_str + right_final = "(" + right_str + ")" if right_use_parens else right_str + return left_final + str(self.op) + right_final + + def is_in(self, expr): + """Returns True if `expr` is a subtree.""" + if expr == self: + return True + is_in_left = is_in_expr(self.left, expr) + is_in_right = is_in_expr(self.right, expr) + return is_in_left or is_in_right + + +def is_in_expr(expr, find): + """Returns True if `find` is a subtree of `expr`.""" + return expr == find or (isinstance(expr, ExprNode) and expr.is_in(find)) + + +def random_expr_with_required_var(depth, required_var, optional_list, ops): + """Generate a random expression tree with a required variable. 
+ + The required variable appears exactly once in the expression. + + Args: + depth: At least one leaf will be this many levels down from the top. + required_var: A char. This char is guaranteed to be placed exactly once at + a leaf somewhere in the tree. This is the var to solve for. + optional_list: A list of chars. These chars are randomly selected as leaf + values. These are constant vars. + ops: A list of ExprOp instances. + + Returns: + An ExprNode instance which is the root of the generated expression tree. + """ + if not depth: + if required_var: + return required_var + return str(optional_list[random.randrange(len(optional_list))]) + + max_depth_side = random.randrange(2) + other_side_depth = random.randrange(depth) + + required_var_side = random.randrange(2) + + left = random_expr_with_required_var( + depth - 1 if max_depth_side else other_side_depth, required_var + if required_var_side else None, optional_list, ops) + right = random_expr_with_required_var( + depth - 1 if not max_depth_side else other_side_depth, required_var + if not required_var_side else None, optional_list, ops) + + op = ops[random.randrange(len(ops))] + return ExprNode(left, right, op) + + +def random_expr(depth, vlist, ops): + """Generate a random expression tree. + + Args: + depth: At least one leaf will be this many levels down from the top. + vlist: A list of chars. These chars are randomly selected as leaf values. + ops: A list of ExprOp instances. + + Returns: + An ExprNode instance which is the root of the generated expression tree. + """ + if not depth: + return str(vlist[random.randrange(len(vlist))]) + + max_depth_side = random.randrange(2) + other_side_depth = random.randrange(depth) + + left = random_expr(depth - 1 + if max_depth_side else other_side_depth, vlist, ops) + right = random_expr(depth - 1 + if not max_depth_side else other_side_depth, vlist, ops) + + op = ops[random.randrange(len(ops))] + return ExprNode(left, right, op) + + +def algebra_inverse_solve(left, right, var, solve_ops): + """Solves for the value of the given var in an expression. + + See go/symbolic-math-dataset. + + Args: + left: The root of the ExprNode tree on the left side of the equals sign. + right: The root of the ExprNode tree on the right side of the equals sign. + var: A char. The variable to solve for. + solve_ops: A dictionary with the following properties. + * For each operator in the expression, there is a rule that determines + how to cancel out a value either to the left or the right of that + operator. + * For each rule, there is an entry in the dictionary. The key is two + chars- the op char, and either 'l' or 'r' meaning rule for canceling + out the left or right sides. For example, '+l', '+r', '-l', '-r'. + * The value of each entry is a function with the following signature: + (left, right, to_tree) -> (new_from_tree, new_to_tree) + left- Expression on left side of the op. + right- Expression on the right side of the op. + to_tree- The tree on the other side of the equal sign. The canceled + out expression will be moved here. + new_from_tree- The resulting from_tree after the algebraic + manipulation. + new_to_tree- The resulting to_tree after the algebraic manipulation. + + Returns: + The root of an ExprNode tree which holds the value of `var` after solving. + + Raises: + ValueError: If `var` does not appear exactly once in the equation (which + includes the left and right sides).
+ """ + is_in_left = is_in_expr(left, var) + is_in_right = is_in_expr(right, var) + if is_in_left == is_in_right: + if is_in_left: + raise ValueError("Solve-variable '%s' is on both sides of the equation. " + "Only equations where the solve variable-appears once " + "are supported by this solver. Left: '%s', right: '%s'" % + (var, str(left), str(right))) + else: + raise ValueError("Solve-variable '%s' is not present in the equation. It " + "must appear once. Left: '%s', right: '%s'" % + (var, str(left), str(right))) + + from_tree = left if is_in_left else right + to_tree = left if not is_in_left else right + while from_tree != var: + is_in_left = is_in_expr(from_tree.left, var) + is_in_right = is_in_expr(from_tree.right, var) + from_tree, to_tree = (solve_ops[str(from_tree.op) + + ("l" if is_in_left else "r")]( + from_tree.left, from_tree.right, + to_tree)) + return to_tree + + +def format_sympy_expr(sympy_expr, functions=None): + """Convert sympy expression into a string which can be encoded. + + Args: + sympy_expr: Any sympy expression tree or string. + functions: Defines special functions. A dict mapping human readable string + names, like "log", "exp", "sin", "cos", etc., to single chars. Each + function gets a unique token, like "L" for "log". + + Returns: + A string representation of the expression suitable for encoding as a + sequence input. + """ + if functions is None: + functions = {} + str_expr = str(sympy_expr) + result = str_expr.replace(" ", "") + for fn_name, char in six.iteritems(functions): + result = result.replace(fn_name, char) + return result + + +def generate_algebra_inverse_sample(vlist, ops, solve_ops, min_depth, + max_depth): + """Randomly generate an algebra inverse dataset sample. + + Given an input equation and variable, produce the expression equal to the + variable. + + See go/symbolic-math-dataset. + + Args: + vlist: Variable list. List of chars that can be used in the expression. + ops: List of ExprOp instances. The allowed operators for the expression. + solve_ops: See `solve_ops` documentation in `algebra_inverse_solve`. + min_depth: Expression trees will not have a smaller depth than this. 0 means + there is just a variable. 1 means there is one operation. + max_depth: Expression trees will not have a larger depth than this. To make + all trees have the same depth, set this equal to `min_depth`. + + Returns: + sample: String representation of the input. Will be of the form + 'solve_var:left_side=right_side'. + target: String representation of the solution. + """ + side = random.randrange(2) + left_depth = random.randrange(min_depth if side else 0, max_depth + 1) + right_depth = random.randrange(min_depth if not side else 0, max_depth + 1) + + var_index = random.randrange(len(vlist)) + var = vlist[var_index] + consts = vlist[:var_index] + vlist[var_index + 1:] + + left = random_expr_with_required_var(left_depth, var + if side else None, consts, ops) + right = random_expr_with_required_var(right_depth, var + if not side else None, consts, ops) + + left_str = str(left) + right_str = str(right) + target = str(algebra_inverse_solve(left, right, var, solve_ops)) + sample = var + ":" + left_str + "=" + right_str + + return sample, target + + +def generate_algebra_simplify_sample(vlist, ops, min_depth, max_depth): + """Randomly generate an algebra simplify dataset sample. + + Given an input expression, produce the simplified expression. + + See go/symbolic-math-dataset. + + Args: + vlist: Variable list. List of chars that can be used in the expression. 
+def generate_algebra_simplify_sample(vlist, ops, min_depth, max_depth):
+  """Randomly generate an algebra simplify dataset sample.
+
+  Given an input expression, produce the simplified expression.
+
+  See go/symbolic-math-dataset.
+
+  Args:
+    vlist: Variable list. List of chars that can be used in the expression.
+    ops: List of ExprOp instances. The allowed operators for the expression.
+    min_depth: Expression trees will not have a smaller depth than this. 0 means
+      there is just a variable. 1 means there is one operation.
+    max_depth: Expression trees will not have a larger depth than this. To make
+      all trees have the same depth, set this equal to `min_depth`.
+
+  Returns:
+    sample: String representation of the input.
+    target: String representation of the solution.
+  """
+  depth = random.randrange(min_depth, max_depth + 1)
+  expr = random_expr(depth, vlist, ops)
+
+  sample = str(expr)
+  target = format_sympy_expr(sympy.simplify(sample))
+  return sample, target
+
+
+def generate_calculus_integrate_sample(vlist, ops, min_depth, max_depth,
+                                       functions):
+  """Randomly generate a symbolic integral dataset sample.
+
+  Given an input expression, produce the indefinite integral.
+
+  See go/symbolic-math-dataset.
+
+  Args:
+    vlist: Variable list. List of chars that can be used in the expression.
+    ops: List of ExprOp instances. The allowed operators for the expression.
+    min_depth: Expression trees will not have a smaller depth than this. 0 means
+      there is just a variable. 1 means there is one operation.
+    max_depth: Expression trees will not have a larger depth than this. To make
+      all trees have the same depth, set this equal to `min_depth`.
+    functions: Defines special functions. A dict mapping human readable string
+      names, like "log", "exp", "sin", "cos", etc., to single chars. Each
+      function gets a unique token, like "L" for "log".
+
+  Returns:
+    sample: String representation of the input. Will be of the form
+      'var:expression'.
+    target: String representation of the solution.
+  """
+  var_index = random.randrange(len(vlist))
+  var = vlist[var_index]
+  consts = vlist[:var_index] + vlist[var_index + 1:]
+
+  depth = random.randrange(min_depth, max_depth + 1)
+  expr = random_expr_with_required_var(depth, var, consts, ops)
+
+  expr_str = str(expr)
+  sample = var + ":" + expr_str
+  target = format_sympy_expr(
+      sympy.integrate(expr_str, sympy.Symbol(var)), functions=functions)
+  return sample, target
+
+
+# AlgebraConfig holds objects required to generate the algebra inverse
+# dataset. See go/symbolic-math-dataset.
+# vlist: Variable list. A list of chars.
+# dlist: Numerical digit list. A list of chars.
+# flist: List of special function names. A list of chars.
+# functions: Dict of special function names. Maps human readable string names to
+#   single char names used in flist.
+# ops: Dict mapping op symbols (chars) to ExprOp instances.
+# solve_ops: Encodes rules for how to algebraically cancel out each operation. See
+#   docstring for `algebra_inverse_solve`.
+# int_encoder: Function that maps a string to a list of tokens. Use this to
+#   encode an expression to feed into a model.
+# int_decoder: Function that maps a list of tokens to a string. Use this to
+#   convert model input or output into a human readable string.
+AlgebraConfig = namedtuple("AlgebraConfig", [
+    "vlist", "dlist", "flist", "functions", "ops", "solve_ops", "int_encoder",
+    "int_decoder"
+])
+
+
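For intuition, here is one inverse step in isolation. The `'+l'` rule below mirrors the entry built in `math_dataset_init` later in this file; it moves a canceled term to the other side of the equals sign (illustrative sketch only):

```python
add = ExprOp("+", 0, True)
sub = ExprOp("-", 0, False)
plus_l = lambda l, r, to: (l, ExprNode(to, r, sub))

# Solve x + a = b for x: cancel "+ a" by subtracting it on the other side.
from_tree = ExprNode("x", "a", add)  # x+a
new_from, new_to = plus_l(from_tree.left, from_tree.right, "b")
print(new_from, str(new_to))  # -> x b-a
```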
+def math_dataset_init(alphabet_size=26, digits=None, functions=None):
+  """Initializes required objects to generate symbolic math datasets.
+
+  See go/symbolic-math-dataset.
+
+  Produces token set, ExprOp instances, solve_op dictionary, encoders, and
+  decoders needed to generate the algebra inverse dataset.
+
+  Args:
+    alphabet_size: How many possible variables there are. Max 52.
+    digits: How many numerical digits to encode as tokens, "0" through
+      str(digits-1), or None to encode no digits.
+    functions: Defines special functions. A dict mapping human readable string
+      names, like "log", "exp", "sin", "cos", etc., to single chars. Each
+      function gets a unique token, like "L" for "log".
+      WARNING: Make sure these tokens do not conflict with the list of
+      possible variable names.
+
+  Returns:
+    AlgebraConfig instance holding all the objects listed above.
+
+  Raises:
+    ValueError: If `alphabet_size` is not in range [2, 52].
+  """
+  ops_list = ["+", "-", "*", "/"]
+  ops = {
+      "+": ExprOp("+", 0, True),
+      "-": ExprOp("-", 0, False),
+      "*": ExprOp("*", 1, True),
+      "/": ExprOp("/", 1, False)
+  }
+  solve_ops = {
+      "+l": lambda l, r, to: (l, ExprNode(to, r, ops["-"])),
+      "+r": lambda l, r, to: (r, ExprNode(to, l, ops["-"])),
+      "-l": lambda l, r, to: (l, ExprNode(to, r, ops["+"])),
+      "-r": lambda l, r, to: (r, ExprNode(l, to, ops["-"])),
+      "*l": lambda l, r, to: (l, ExprNode(to, r, ops["/"])),
+      "*r": lambda l, r, to: (r, ExprNode(to, l, ops["/"])),
+      "/l": lambda l, r, to: (l, ExprNode(to, r, ops["*"])),
+      "/r": lambda l, r, to: (r, ExprNode(l, to, ops["/"])),
+  }
+  alphabet = (
+      [six.int2byte(ord("a") + c)
+       for c in range(26)] + [six.int2byte(ord("A") + c) for c in range(26)])
+  if alphabet_size > 52:
+    raise ValueError(
+        "alphabet_size cannot be greater than 52. Got %s." % alphabet_size)
+  if alphabet_size < 2:
+    raise ValueError(
+        "alphabet_size cannot be less than 2. Got %s." % alphabet_size)
+  if digits is not None and not 1 <= digits <= 10:
+    raise ValueError("digits must be between 1 and 10. Got %s." % digits)
+  vlist = alphabet[:alphabet_size]
+  if digits is not None:
+    dlist = [str(d) for d in xrange(digits)]
+  else:
+    dlist = []
+  if functions is None:
+    functions = {}
+  flist = sorted(functions.values())
+  pad = "_"
+  tokens = [pad] + [":", "(", ")", "="] + ops_list + vlist + dlist + flist
+  if len(tokens) != len(set(tokens)):
+    raise ValueError("Duplicate token. Tokens: %s" % tokens)
+  token_map = dict([(t, i) for i, t in enumerate(tokens)])
+
+  def int_encoder(sequence):
+    return [token_map[s] for s in sequence]
+
+  def int_decoder(tensor_1d):
+    return "".join([tokens[i] for i in tensor_1d])
+
+  return AlgebraConfig(
+      vlist=vlist,
+      dlist=dlist,
+      flist=flist,
+      functions=functions,
+      ops=ops,
+      solve_ops=solve_ops,
+      int_encoder=int_encoder,
+      int_decoder=int_decoder)
+
+
+def algebra_inverse(alphabet_size=26, min_depth=0, max_depth=2,
+                    nbr_cases=10000):
+  """Generate the algebra inverse dataset.
+
+  Each sample is a symbolic math equation involving unknown variables. The
+  task is to solve for the given variable. The target is the resulting
+  expression.
+
+  Args:
+    alphabet_size: How many possible variables there are. Max 52.
+    min_depth: Minimum depth of the expression trees on both sides of the
+      equals sign in the equation.
+    max_depth: Maximum depth of the expression trees on both sides of the
+      equals sign in the equation.
+    nbr_cases: The number of cases to generate.
+
+  Yields:
+    A dictionary {"inputs": input-list, "targets": target-list} where
+    input-list are the tokens encoding the variable to solve for and the math
+    equation, and target-list is a list of tokens encoding the resulting math
+    expression after solving for the variable.
+
+  Raises:
+    ValueError: If `max_depth` < `min_depth`.
+  """
+
+  if max_depth < min_depth:
+    raise ValueError("max_depth must be greater than or equal to min_depth. 
" + "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth)) + + alg_cfg = math_dataset_init(alphabet_size) + for _ in xrange(nbr_cases): + sample, target = generate_algebra_inverse_sample( + alg_cfg.vlist, + list(alg_cfg.ops.values()), alg_cfg.solve_ops, min_depth, max_depth) + yield { + "inputs": alg_cfg.int_encoder(sample), + "targets": alg_cfg.int_encoder(target) + } + + +def algebra_simplify(alphabet_size=26, + min_depth=0, + max_depth=2, + nbr_cases=10000): + """Generate the algebra simplify dataset. + + Each sample is a symbolic math expression involving unknown variables. The + task is to simplify the expression. The target is the resulting expression. + + Args: + alphabet_size: How many possible variables there are. Max 52. + min_depth: Minimum depth of the expression trees on both sides of the + equals sign in the equation. + max_depth: Maximum depth of the expression trees on both sides of the + equals sign in the equation. + nbr_cases: The number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list are the tokens encoding the expression to simplify, and + target-list is a list of tokens encoding the resulting math expression after + simplifying. + + Raises: + ValueError: If `max_depth` < `min_depth`. + """ + if max_depth < min_depth: + raise ValueError("max_depth must be greater than or equal to min_depth. " + "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth)) + + alg_cfg = math_dataset_init(alphabet_size, digits=5) + for _ in xrange(nbr_cases): + sample, target = generate_algebra_simplify_sample( + alg_cfg.vlist, list(alg_cfg.ops.values()), min_depth, max_depth) + yield { + "inputs": alg_cfg.int_encoder(sample), + "targets": alg_cfg.int_encoder(target) + } + + +def calculus_integrate(alphabet_size=26, + min_depth=0, + max_depth=2, + nbr_cases=10000): + """Generate the calculus integrate dataset. + + Each sample is a symbolic math expression involving unknown variables. The + task is to take the indefinite integral of the expression. The target is the + resulting expression. + + Args: + alphabet_size: How many possible variables there are. Max 26. + min_depth: Minimum depth of the expression trees on both sides of the + equals sign in the equation. + max_depth: Maximum depth of the expression trees on both sides of the + equals sign in the equation. + nbr_cases: The number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list are the tokens encoding the variable to integrate with respect + to and the expression to integrate, and target-list is a list of tokens + encoding the resulting math expression after integrating. + + Raises: + ValueError: If `max_depth` < `min_depth`, or if alphabet_size > 26. + """ + if max_depth < min_depth: + raise ValueError("max_depth must be greater than or equal to min_depth. " + "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth)) + + # Don't allow alphabet to use capital letters. Those are reserved for function + # names. + if alphabet_size > 26: + raise ValueError( + "alphabet_size must not be greater than 26. Got %s." 
% alphabet_size) + + functions = {"log": "L"} + alg_cfg = math_dataset_init(alphabet_size, digits=5, functions=functions) + for _ in xrange(nbr_cases): + sample, target = generate_calculus_integrate_sample( + alg_cfg.vlist, + list(alg_cfg.ops.values()), min_depth, max_depth, alg_cfg.functions) + yield { + "inputs": alg_cfg.int_encoder(sample), + "targets": alg_cfg.int_encoder(target) + } diff --git a/tensor2tensor/data_generators/algorithmic_math_test.py b/tensor2tensor/data_generators/algorithmic_math_test.py new file mode 100644 index 000000000..6c4b63054 --- /dev/null +++ b/tensor2tensor/data_generators/algorithmic_math_test.py @@ -0,0 +1,84 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for tensor2tensor.data_generators.algorithmic_math.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import six +import sympy +from tensor2tensor.data_generators import algorithmic_math + +import tensorflow as tf + + +class AlgorithmicMathTest(tf.test.TestCase): + + def testAlgebraInverse(self): + dataset_objects = algorithmic_math.math_dataset_init(26) + counter = 0 + for d in algorithmic_math.algebra_inverse(26, 0, 3, 10): + counter += 1 + decoded_input = dataset_objects.int_decoder(d["inputs"]) + solve_var, expression = decoded_input.split(":") + lhs, rhs = expression.split("=") + + # Solve for the solve-var. + result = sympy.solve("%s-(%s)" % (lhs, rhs), solve_var) + target_expression = dataset_objects.int_decoder(d["targets"]) + + # Check that the target and sympy's solutions are equivalent. + self.assertEqual( + 0, sympy.simplify(str(result[0]) + "-(%s)" % target_expression)) + self.assertEqual(counter, 10) + + def testAlgebraSimplify(self): + dataset_objects = algorithmic_math.math_dataset_init(8, digits=5) + counter = 0 + for d in algorithmic_math.algebra_simplify(8, 0, 3, 10): + counter += 1 + expression = dataset_objects.int_decoder(d["inputs"]) + target = dataset_objects.int_decoder(d["targets"]) + + # Check that the input and output are equivalent expressions. + self.assertEqual(0, sympy.simplify("%s-(%s)" % (expression, target))) + self.assertEqual(counter, 10) + + def testCalculusIntegrate(self): + dataset_objects = algorithmic_math.math_dataset_init( + 8, digits=5, functions={"log": "L"}) + counter = 0 + for d in algorithmic_math.calculus_integrate(8, 0, 3, 10): + counter += 1 + decoded_input = dataset_objects.int_decoder(d["inputs"]) + var, expression = decoded_input.split(":") + target = dataset_objects.int_decoder(d["targets"]) + + for fn_name, fn_char in six.iteritems(dataset_objects.functions): + target = target.replace(fn_char, fn_name) + + # Take the derivative of the target. + derivative = str(sympy.diff(target, var)) + + # Check that the derivative of the integral equals the input. 
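The assertion just below relies on a sympy round-trip: integrate, differentiate, and check that the difference simplifies to zero. The same invariant as a standalone sketch (illustrative only):

```python
import sympy

x = sympy.Symbol("x")
expr = sympy.sympify("a*x + b")
integral = sympy.integrate(expr, x)  # a*x**2/2 + b*x
assert sympy.simplify(sympy.diff(integral, x) - expr) == 0
```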
+ self.assertEqual(0, sympy.simplify("%s-(%s)" % (expression, derivative))) + self.assertEqual(counter, 10) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/algorithmic_test.py b/tensor2tensor/data_generators/algorithmic_test.py new file mode 100644 index 000000000..7bc2fb5bb --- /dev/null +++ b/tensor2tensor/data_generators/algorithmic_test.py @@ -0,0 +1,84 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Algorithmic generators test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.data_generators import algorithmic + +import tensorflow as tf + + +class AlgorithmicTest(tf.test.TestCase): + + def testIdentityGenerator(self): + counter = 0 + for d in algorithmic.identity_generator(3, 8, 10): + counter += 1 + self.assertEqual(d["inputs"], d["targets"]) + self.assertEqual(counter, 10) + + def testReverseGenerator(self): + counter = 0 + for d in algorithmic.reverse_generator(3, 8, 10): + counter += 1 + self.assertEqual(list(reversed(d["inputs"])), d["targets"]) + self.assertEqual(counter, 10) + + def testLowerEndianToNumber(self): + self.assertEqual(algorithmic.lower_endian_to_number([0], 2), 0) + self.assertEqual(algorithmic.lower_endian_to_number([0], 7), 0) + self.assertEqual(algorithmic.lower_endian_to_number([1], 2), 1) + self.assertEqual(algorithmic.lower_endian_to_number([5], 8), 5) + self.assertEqual(algorithmic.lower_endian_to_number([0, 1], 2), 2) + self.assertEqual(algorithmic.lower_endian_to_number([0, 1, 1], 2), 6) + self.assertEqual(algorithmic.lower_endian_to_number([7, 3, 1, 2], 10), 2137) + + def testNumberToLowerEndian(self): + self.assertEqual(algorithmic.number_to_lower_endian(0, 2), [0]) + self.assertEqual(algorithmic.number_to_lower_endian(0, 7), [0]) + self.assertEqual(algorithmic.number_to_lower_endian(1, 2), [1]) + self.assertEqual(algorithmic.number_to_lower_endian(5, 8), [5]) + self.assertEqual(algorithmic.number_to_lower_endian(2, 2), [0, 1]) + self.assertEqual(algorithmic.number_to_lower_endian(6, 2), [0, 1, 1]) + self.assertEqual(algorithmic.number_to_lower_endian(2137, 10), [7, 3, 1, 2]) + + def testAdditionGenerator(self): + counter = 0 + for d in algorithmic.addition_generator(4, 8, 10): + counter += 1 + self.assertEqual(d["inputs"].count(5), 1) + self.assertEqual(d["inputs"].count(0), 0) + self.assertEqual(d["targets"].count(5), 0) + self.assertEqual(d["targets"].count(0), 0) + self.assertEqual(counter, 10) + + def testMultiplicationGenerator(self): + counter = 0 + for d in algorithmic.multiplication_generator(4, 8, 10): + counter += 1 + self.assertEqual(d["inputs"].count(5), 1) + self.assertEqual(d["inputs"].count(0), 0) + self.assertEqual(d["targets"].count(5), 0) + self.assertEqual(d["targets"].count(0), 0) + self.assertEqual(counter, 10) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/audio.py 
b/tensor2tensor/data_generators/audio.py
new file mode 100644
index 000000000..12e0c7b43
--- /dev/null
+++ b/tensor2tensor/data_generators/audio.py
@@ -0,0 +1,156 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TIMIT data generator."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from subprocess import call
+import tarfile
+import wave
+
+# Dependency imports
+
+from tensor2tensor.data_generators import generator_utils
+
+import tensorflow as tf
+
+flags = tf.flags
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("timit_paths", "",
+                    "Comma-separated list of tarfiles containing TIMIT "
+                    "datasets")
+
+_TIMIT_TRAIN_DATASETS = [
+    ["timit/TIMIT/TRAIN", (".WAV", ".WRD")],
+]
+_TIMIT_TEST_DATASETS = [
+    ["timit/TIMIT/TEST", (".WAV", ".WRD")],
+]
+
+
+def _get_timit(directory):
+  """Extract TIMIT datasets to directory unless directory/timit exists."""
+  if os.path.exists(os.path.join(directory, "timit")):
+    return
+
+  assert FLAGS.timit_paths
+  for path in FLAGS.timit_paths.split(","):
+    with tf.gfile.GFile(path) as f:
+      with tarfile.open(fileobj=f, mode="r:gz") as timit_compressed:
+        timit_compressed.extractall(directory)
+
+
+def _collect_data(directory, input_ext, target_ext):
+  """Traverses directory collecting input and target files."""
+  # Directory from string to tuple pair of strings
+  # key: the filepath to a datafile including the datafile's basename. Example,
+  #   if the datafile was "/path/to/datafile.wav" then the key would be
+  #   "/path/to/datafile"
+  # value: a pair of strings (input_filepath, target_filepath)
+  data_files = dict()
+  for root, _, filenames in os.walk(directory):
+    input_files = [filename for filename in filenames if input_ext in filename]
+    for input_filename in input_files:
+      basename = input_filename[:-len(input_ext)]
+      input_file = os.path.join(root, input_filename)
+      target_file = os.path.join(root, basename + target_ext)
+      key = os.path.join(root, basename)
+      assert os.path.exists(target_file)
+      assert key not in data_files
+      data_files[key] = (input_file, target_file)
+  return data_files
+
+
+def _get_audio_data(filepath):
+  # Construct a true .wav file.
+  out_filepath = filepath[:-len(".WAV")] + ".wav"
+  # Assumes sox is installed on system. Sox converts from NIST SPHERE to WAV.
+ call(["sox", filepath, out_filepath]) + wav_file = wave.open(open(out_filepath)) + frame_count = wav_file.getnframes() + byte_array = wav_file.readframes(frame_count) + data = [int(b.encode("hex"), base=16) for b in byte_array] + return data, frame_count, wav_file.getsampwidth(), wav_file.getnchannels() + + +def _get_text_data(filepath): + with tf.gfile.GFile(filepath, mode="r") as text_file: + words = [] + for line in text_file: + word = line.strip().split()[2] + words.append(word) + return " ".join(words) + + +def timit_generator(tmp_dir, + training, + how_many, + start_from=0, + eos_list=None, + vocab_filename=None, + vocab_size=0): + """Data generator for TIMIT transcription problem. + + Args: + tmp_dir: path to temporary storage directory. + training: a Boolean; if true, we use the train set, otherwise the test set. + how_many: how many inputs and labels to generate. + start_from: from which input to start. + eos_list: optional list of end of sentence tokens, otherwise use default + value `1`. + vocab_filename: file within `tmp_dir` to read vocabulary from. If this is + not provided then the target sentence will be encoded by character. + vocab_size: integer target to generate vocabulary size to. + + Yields: + A dictionary representing the images with the following fields: + * inputs: a float sequence containing the audio data + * audio/channel_count: an integer + * audio/sample_count: an integer + * audio/sample_width: an integer + * targets: an integer sequence representing the encoded sentence + """ + eos_list = [1] if eos_list is None else eos_list + if vocab_filename is not None: + vocab_symbolizer = generator_utils.get_or_generate_vocab( + tmp_dir, vocab_filename, vocab_size) + _get_timit(tmp_dir) + datasets = (_TIMIT_TRAIN_DATASETS if training else _TIMIT_TEST_DATASETS) + i = 0 + for data_dir, (audio_ext, transcription_ext) in datasets: + data_dir = os.path.join(tmp_dir, data_dir) + data_files = _collect_data(data_dir, audio_ext, transcription_ext) + data_pairs = data_files.values() + for input_file, target_file in sorted(data_pairs)[start_from:]: + if i == how_many: + return + i += 1 + audio_data, sample_count, sample_width, num_channels = _get_audio_data( + input_file) + text_data = _get_text_data(target_file) + if vocab_filename is None: + label = [ord(c) for c in text_data] + eos_list + else: + label = vocab_symbolizer.encode(text_data) + eos_list + yield { + "inputs": audio_data, + "audio/channel_count": [num_channels], + "audio/sample_count": [sample_count], + "audio/sample_width": [sample_width], + "targets": label + } diff --git a/tensor2tensor/data_generators/audio_test.py b/tensor2tensor/data_generators/audio_test.py new file mode 100644 index 000000000..f1830043f --- /dev/null +++ b/tensor2tensor/data_generators/audio_test.py @@ -0,0 +1,62 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for tensor2tensor.data_generators.audio.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import io +import os + +# Dependency imports + +from tensor2tensor.data_generators import audio + +import tensorflow as tf + + +class AudioTest(tf.test.TestCase): + + def testDataCollection(self): + # Generate a trivial source and target file. + tmp_dir = self.get_temp_dir() + test_files = [ + "dir1/file1", + "dir1/file2", + "dir1/dir2/file3", + "dir1/dir2/dir3/file4", + ] + for filename in test_files: + input_filename = os.path.join(tmp_dir, filename + ".WAV") + target_filename = os.path.join(tmp_dir, filename + ".WRD") + directories = os.path.dirname(input_filename) + if not os.path.exists(directories): + os.makedirs(directories) + io.open(input_filename, "wb") + io.open(target_filename, "wb") + + data_dict = audio._collect_data(tmp_dir, ".WAV", ".WRD") + expected = [os.path.join(tmp_dir, filename) for filename in test_files] + self.assertEqual(sorted(list(data_dict)), sorted(expected)) + + # Clean up. + for filename in test_files: + os.remove(os.path.join(tmp_dir, "%s.WAV" % filename)) + os.remove(os.path.join(tmp_dir, "%s.WRD" % filename)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/concatenate_examples.py b/tensor2tensor/data_generators/concatenate_examples.py new file mode 100644 index 000000000..b346b6c08 --- /dev/null +++ b/tensor2tensor/data_generators/concatenate_examples.py @@ -0,0 +1,180 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Convert seq-seq examples to "concatenated" examples. + +The concatenated example has no "inputs". +Instead the source is at the beginning of the target. + +We can now use a simple language model. + +Example: +seq-seq mode: +{ + "inputs": subtokenizer.encode("I love you.") + [1] + "targets": subtokenizer.encode("Je t'aime.") + [1] +} +-> +concatenated mode: +{ + "inputs": [0] + "targets": (subtokenizer.encode("source English I love you.") + [1] + + subtokenizer.encode("target French Je t'aime.") + [1]) +} + +We add a dummy feature "inputs"=[0] for compatability with seq-to-seq models. + +If FLAGS.combine_to_length is nonzero, then we combine multiple examples into +examples of a constant length, possibly with some padding at the end. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import random + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import text_encoder +import tensorflow as tf + +tf.app.flags.DEFINE_string("vocab_file", "", + "SubwordTextEncoder vocabulary file") + +tf.app.flags.DEFINE_boolean( + "random_reverse", False, + "If true, write half of the example with source/target reversed") + +tf.app.flags.DEFINE_boolean( + "count_everything", False, + "If true, assign positive weights to designators, source and target. 
" + "If false, assign positive weights only to target.") + +tf.app.flags.DEFINE_string("source_domain_string", "English", "") +tf.app.flags.DEFINE_string("target_domain_string", "French", "") + +tf.app.flags.DEFINE_integer( + "combine_to_length", 0, + "If positive, concatenate examples to form examples with target length " + " equal to this value. Targets are padded with subtoken id=0.") + +tf.app.flags.DEFINE_string("in_file", "", "input filename") + +tf.app.flags.DEFINE_string( + "out_prefix", "/usr/local/google/tmp/concat", + "The output filename is equal to out_prefix plus " + "the last 15 characters of in_file. (e.g. -00001-of-00100)") + +FLAGS = tf.app.flags.FLAGS + + +def _make_example(ids, weights, raw_num_bytes): + if FLAGS.combine_to_length > 0: + ids += [0] * (FLAGS.combine_to_length - len(ids)) + return generator_utils.to_example({ + "targets": ids, + "target_weights": weights, + "inputs": [0], + "raw_num_bytes": [raw_num_bytes] + }).SerializeToString() + + +def main(_): + """Convert a file to examples.""" + subtokenizer = text_encoder.SubwordTextEncoder(FLAGS.vocab_file) + total_bytes = 0 + total_subtokens = 0 + total_examples = 0 + dropped_examples = 0 + + combined_subtokens = [] + combined_num_bytes = 0 + combined_weights = [] + + source_specifier = subtokenizer.encode("source " + FLAGS.source_domain_string) + target_specifier = subtokenizer.encode("target " + FLAGS.target_domain_string) + if FLAGS.random_reverse: + r_source_specifier = subtokenizer.encode("source " + + FLAGS.target_domain_string) + r_target_specifier = subtokenizer.encode("target " + + FLAGS.source_domain_string) + + reader = tf.python_io.tf_record_iterator(FLAGS.in_file) + + out_file = FLAGS.out_prefix + FLAGS.in_file[-15:] + writer = tf.python_io.TFRecordWriter(out_file) + + for record in reader: + total_examples += 1 + if total_examples % 1000 == 0: + tf.logging.info("total_examples: %d", total_examples) + x = tf.train.Example() + x.ParseFromString(record) + inputs = [i for i in x.features.feature["inputs"].int64_list.value] + targets = [i for i in x.features.feature["targets"].int64_list.value] + should_reverse = FLAGS.random_reverse and random.random() < 0.5 + source_bytes = len(subtokenizer.decode(inputs[:-1])) + 1 + target_bytes = len(subtokenizer.decode(targets[:-1])) + 1 + if not should_reverse: + subtokens = source_specifier + inputs + target_specifier + targets + weights = ([0.0] * + (len(source_specifier) + len(inputs) + len(target_specifier)) + + [1.0] * len(targets)) + num_bytes = target_bytes + else: + subtokens = r_source_specifier + targets + r_target_specifier + inputs + weights = ( + [0.0] * + (len(r_source_specifier) + len(targets) + len(r_target_specifier)) + + [1.0] * len(inputs)) + num_bytes = source_bytes + if FLAGS.count_everything: + weights = [1.0] * len(subtokens) + num_bytes = source_bytes + target_bytes + total_bytes += num_bytes + total_subtokens += sum(weights) + if FLAGS.combine_to_length: + if combined_subtokens and (len(combined_subtokens) + len(subtokens) > + FLAGS.combine_to_length): + writer.write( + _make_example(combined_subtokens, combined_weights, + combined_num_bytes)) + combined_subtokens = [] + combined_weights = [] + combined_num_bytes = 0 + if len(subtokens) <= FLAGS.combine_to_length: + combined_subtokens.extend(subtokens) + combined_weights.extend(weights) + combined_num_bytes += num_bytes + else: + dropped_examples += 1 + else: + writer.write(_make_example(subtokens, weights, num_bytes)) + if combined_subtokens: + writer.write( + 
_make_example(combined_subtokens, combined_weights, combined_num_bytes)) + writer.close() + + tf.logging.info("total bytes: %d", total_bytes) + tf.logging.info("total subtokens: %d", total_subtokens) + tf.logging.info("bytes per subtoken: %f", total_bytes / total_subtokens) + tf.logging.info("total documents: %d", total_examples) + tf.logging.info("dropped documents: %d", dropped_examples) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py new file mode 100644 index 000000000..487546e16 --- /dev/null +++ b/tensor2tensor/data_generators/generator_utils.py @@ -0,0 +1,264 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for data generators.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gzip +import io +import os +import tarfile +import urllib + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder +from tensor2tensor.data_generators.tokenizer import Tokenizer + +import tensorflow as tf + + +def to_example(dictionary): + """Helper: build tf.Example from (string -> int/float/str list) dictionary.""" + features = {} + for (k, v) in six.iteritems(dictionary): + if not v: + raise ValueError("Empty generated field: %s", str((k, v))) + if isinstance(v[0], six.integer_types): + features[k] = tf.train.Feature(int64_list=tf.train.Int64List(value=v)) + elif isinstance(v[0], float): + features[k] = tf.train.Feature(float_list=tf.train.FloatList(value=v)) + elif isinstance(v[0], six.string_types): + features[k] = tf.train.Feature(bytes_list=tf.train.BytesList(value=v)) + else: + raise ValueError("Value is neither an int nor a float; v: %s type: %s" % + (str(v[0]), str(type(v[0])))) + return tf.train.Example(features=tf.train.Features(feature=features)) + + +def generate_files_distributed(generator, + output_name, + output_dir, + num_shards=1, + max_cases=None, + task_id=0): + """generate_files but with a single writer writing to shard task_id.""" + assert task_id < num_shards + output_filename = "%s-%.5d-of-%.5d" % (output_name, task_id, num_shards) + output_file = os.path.join(output_dir, output_filename) + tf.logging.info("Writing to file %s", output_file) + writer = tf.python_io.TFRecordWriter(output_file) + + counter = 0 + for case in generator: + if counter % 100000 == 0: + tf.logging.info("Generating case %d for %s." % (counter, output_name)) + counter += 1 + if max_cases and counter > max_cases: + break + sequence_example = to_example(case) + writer.write(sequence_example.SerializeToString()) + + writer.close() + return output_file + + +def generate_files(generator, + output_name, + output_dir, + num_shards=1, + max_cases=None): + """Generate cases from a generator and save as TFRecord files. 
+ + Generated cases are transformed to tf.Example protos and saved as TFRecords + in sharded files named output_dir/output_name-00..N-of-00..M=num_shards. + + Args: + generator: a generator yielding (string -> int/float/str list) dictionaries. + output_name: the file name prefix under which output will be saved. + output_dir: directory to save the output to. + num_shards: how many shards to use (defaults to 1). + max_cases: maximum number of cases to get from the generator; + if None (default), we use the generator until StopIteration is raised. + + Returns: + List of output file paths. + """ + writers = [] + output_files = [] + for shard in xrange(num_shards): + output_filename = "%s-%.5d-of-%.5d" % (output_name, shard, num_shards) + output_file = os.path.join(output_dir, output_filename) + output_files.append(output_file) + writers.append(tf.python_io.TFRecordWriter(output_file)) + + counter, shard = 0, 0 + for case in generator: + if counter % 100000 == 0: + tf.logging.info("Generating case %d for %s." % (counter, output_name)) + counter += 1 + if max_cases and counter > max_cases: + break + sequence_example = to_example(case) + writers[shard].write(sequence_example.SerializeToString()) + shard = (shard + 1) % num_shards + + for writer in writers: + writer.close() + + return output_files + + +def maybe_download(directory, filename, url): + """Download filename from url unless it's already in directory. + + Args: + directory: path to the directory that will be used. + filename: name of the file to download to (do nothing if it already exists). + url: URL to download from. + + Returns: + The path to the downloaded file. + """ + if not tf.gfile.Exists(directory): + tf.logging.info("Creating directory %s" % directory) + os.mkdir(directory) + filepath = os.path.join(directory, filename) + if not tf.gfile.Exists(filepath): + tf.logging.info("Downloading %s to %s" % (url, filepath)) + filepath, _ = urllib.urlretrieve(url, filepath) + statinfo = os.stat(filepath) + tf.logging.info("Succesfully downloaded %s, %s bytes." % (filename, + statinfo.st_size)) + else: + tf.logging.info("Not downloading, file already found: %s" % filepath) + return filepath + + +def gunzip_file(gz_path, new_path): + """Unzips from gz_path into new_path. + + Args: + gz_path: path to the zipped file. + new_path: path to where the file will be unzipped. 
+ """ + tf.logging.info("Unpacking %s to %s" % (gz_path, new_path)) + with gzip.open(gz_path, "rb") as gz_file: + with io.open(new_path, "wb") as new_file: + for line in gz_file: + new_file.write(line) + + +# TODO(aidangomez): en-fr tasks are significantly over-represented below +_DATA_FILE_URLS = [ + # German-English + [ + "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + [ + "training-parallel-nc-v11/news-commentary-v11.de-en.en", + "training-parallel-nc-v11/news-commentary-v11.de-en.de" + ] + ], + # German-English & French-English + [ + "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", [ + "commoncrawl.de-en.en", "commoncrawl.de-en.de", + "commoncrawl.fr-en.en", "commoncrawl.fr-en.fr" + ] + ], + [ + "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", [ + "training/europarl-v7.de-en.en", "training/europarl-v7.de-en.de", + "training/europarl-v7.fr-en.en", "training/europarl-v7.fr-en.fr" + ] + ], + # French-English + [ + "http://www.statmt.org/wmt10/training-giga-fren.tar", + ["giga-fren.release2.fixed.en.gz", "giga-fren.release2.fixed.fr.gz"] + ], + [ + "http://www.statmt.org/wmt13/training-parallel-un.tgz", + ["un/undoc.2000.fr-en.en", "un/undoc.2000.fr-en.fr"] + ], +] + + +def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size): + """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS.""" + vocab_filepath = os.path.join(tmp_dir, vocab_filename) + if os.path.exists(vocab_filepath): + vocab = SubwordTextEncoder(vocab_filepath) + return vocab + + tokenizer = Tokenizer() + for source in _DATA_FILE_URLS: + url = source[0] + filename = os.path.basename(url) + read_type = "r:gz" if "tgz" in filename else "r" + + compressed_file = maybe_download(tmp_dir, filename, url) + + with tarfile.open(compressed_file, read_type) as corpus_tar: + corpus_tar.extractall(tmp_dir) + + for lang_file in source[1]: + tf.logging.info("Reading file: %s" % lang_file) + filepath = os.path.join(tmp_dir, lang_file) + + # For some datasets a second extraction is necessary. + if ".gz" in lang_file: + tf.logging.info("Unpacking subdirectory %s" % filepath) + new_filepath = os.path.join(tmp_dir, lang_file[:-3]) + gunzip_file(filepath, new_filepath) + filepath = new_filepath + + # Use Tokenizer to count the word occurrences. + with tf.gfile.GFile(filepath, mode="r") as source_file: + file_byte_budget = 3.5e5 if "en" in filepath else 7e5 + for line in source_file: + if file_byte_budget <= 0: + break + line = line.strip() + file_byte_budget -= len(line) + _ = tokenizer.encode(line) + + vocab = SubwordTextEncoder.build_to_target_size( + vocab_size, tokenizer.token_counts, vocab_filepath, 1, 1e3) + return vocab + + +def read_records(filename): + reader = tf.python_io.tf_record_iterator(filename) + records = [] + for record in reader: + records.append(record) + if len(records) % 10000 == 0: + tf.logging.info("read: %d", len(records)) + return records + + +def write_records(records, out_filename): + writer = tf.python_io.TFRecordWriter(out_filename) + for count, record in enumerate(records): + writer.write(record) + if count % 10000 == 0: + tf.logging.info("write: %d", count) + writer.close() diff --git a/tensor2tensor/data_generators/generator_utils_test.py b/tensor2tensor/data_generators/generator_utils_test.py new file mode 100644 index 000000000..726763f7a --- /dev/null +++ b/tensor2tensor/data_generators/generator_utils_test.py @@ -0,0 +1,88 @@ +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generator utilities test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gzip +import io +import os +import tempfile + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils + +import tensorflow as tf + + +class GeneratorUtilsTest(tf.test.TestCase): + + def testGenerateFiles(self): + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + tmp_file_name = os.path.basename(tmp_file_path) + + # Generate a trivial file and assert the file exists. + def test_generator(): + yield {"inputs": [1], "target": [1]} + + generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir) + self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001")) + + # Clean up. + os.remove(tmp_file_path + "-00000-of-00001") + os.remove(tmp_file_path) + + def testMaybeDownload(self): + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + tmp_file_name = os.path.basename(tmp_file_path) + + # Download Google index to the temporary file.http. + res_path = generator_utils.maybe_download(tmp_dir, tmp_file_name + ".http", + "http://google.com") + self.assertEqual(res_path, tmp_file_path + ".http") + + # Clean up. + os.remove(tmp_file_path + ".http") + os.remove(tmp_file_path) + + def testGunzipFile(self): + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + + # Create a test zip file and unzip it. + with gzip.open(tmp_file_path + ".gz", "wb") as gz_file: + gz_file.write("test line") + generator_utils.gunzip_file(tmp_file_path + ".gz", tmp_file_path + ".txt") + + # Check that the unzipped result is as expected. + lines = [] + for line in io.open(tmp_file_path + ".txt", "rb"): + lines.append(line.strip()) + self.assertEqual(len(lines), 1) + self.assertEqual(lines[0], "test line") + + # Clean up. + os.remove(tmp_file_path + ".gz") + os.remove(tmp_file_path + ".txt") + os.remove(tmp_file_path) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py new file mode 100644 index 000000000..55b5f2fc7 --- /dev/null +++ b/tensor2tensor/data_generators/image.py @@ -0,0 +1,306 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
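All of the generators in this patch yield plain dicts that `generator_utils.to_example` (shown earlier) turns into `tf.train.Example` protos; a minimal sketch of that contract, with made-up feature values:

```python
from tensor2tensor.data_generators import generator_utils

example = generator_utils.to_example(
    {"inputs": [1, 2, 3], "targets": [4, 5]})
serialized = example.SerializeToString()  # bytes, ready for a TFRecordWriter
```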
+ +"""Data generators for image data-sets.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cPickle +import gzip +import io +import json +import os +import random +import tarfile +import zipfile + +# Dependency imports + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import zip # pylint: disable=redefined-builtin +from tensor2tensor.data_generators import generator_utils + +import tensorflow as tf + + +def image_generator(images, labels): + """Generator for images that takes image and labels lists and creates pngs. + + Args: + images: list of images given as [width x height x channels] numpy arrays. + labels: list of ints, same length as images. + + Yields: + A dictionary representing the images with the following fields: + * image/encoded: the string encoding the image as PNG, + * image/format: the string "png" representing image format, + * image/class/label: an integer representing the label, + * image/height: an integer representing the height, + * image/width: an integer representing the width. + Every field is actually a singleton list of the corresponding type. + + Raises: + ValueError: if images is an empty list. + """ + if not images: + raise ValueError("Must provide some images for the generator.") + (width, height, channels) = images[0].shape + with tf.Graph().as_default(): + image_t = tf.placeholder(dtype=tf.uint8, shape=(width, height, channels)) + encoded_image_t = tf.image.encode_png(image_t) + with tf.Session() as sess: + for (image, label) in zip(images, labels): + enc_string = sess.run(encoded_image_t, feed_dict={image_t: image}) + yield { + "image/encoded": [enc_string], + "image/format": ["png"], + "image/class/label": [label], + "image/height": [height], + "image/width": [width] + } + + +# URLs and filenames for MNIST data. +_MNIST_URL = "http://yann.lecun.com/exdb/mnist/" +_MNIST_TRAIN_DATA_FILENAME = "train-images-idx3-ubyte.gz" +_MNIST_TRAIN_LABELS_FILENAME = "train-labels-idx1-ubyte.gz" +_MNIST_TEST_DATA_FILENAME = "t10k-images-idx3-ubyte.gz" +_MNIST_TEST_LABELS_FILENAME = "t10k-labels-idx1-ubyte.gz" +_MNIST_IMAGE_SIZE = 28 + + +def _get_mnist(directory): + """Download all MNIST files to directory unless they are there.""" + for filename in [ + _MNIST_TRAIN_DATA_FILENAME, _MNIST_TRAIN_LABELS_FILENAME, + _MNIST_TEST_DATA_FILENAME, _MNIST_TEST_LABELS_FILENAME + ]: + generator_utils.maybe_download(directory, filename, _MNIST_URL + filename) + + +def _extract_mnist_images(filename, num_images): + """Extract images from an MNIST file into a numpy array. + + Args: + filename: The path to an MNIST images file. + num_images: The number of images in the file. + + Returns: + A numpy array of shape [number_of_images, height, width, channels]. + """ + with gzip.open(filename) as bytestream: + bytestream.read(16) + buf = bytestream.read(_MNIST_IMAGE_SIZE * _MNIST_IMAGE_SIZE * num_images) + data = np.frombuffer(buf, dtype=np.uint8) + data = data.reshape(num_images, _MNIST_IMAGE_SIZE, _MNIST_IMAGE_SIZE, 1) + return data + + +def _extract_mnist_labels(filename, num_labels): + """Extract labels from an MNIST file into integers. + + Args: + filename: The path to an MNIST labels file. + num_labels: The number of labels in the file. 
+ + Returns: + A int64 numpy array of shape [num_labels] + """ + with gzip.open(filename) as bytestream: + bytestream.read(8) + buf = bytestream.read(num_labels) + labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64) + return labels + + +def mnist_generator(tmp_dir, training, how_many, start_from=0): + """Image generator for MNIST. + + Args: + tmp_dir: path to temporary storage directory. + training: a Boolean; if true, we use the train set, otherwise the test set. + how_many: how many images and labels to generate. + start_from: from which image to start. + + Returns: + An instance of image_generator that produces MNIST images. + """ + _get_mnist(tmp_dir) + d = _MNIST_TRAIN_DATA_FILENAME if training else _MNIST_TEST_DATA_FILENAME + l = _MNIST_TRAIN_LABELS_FILENAME if training else _MNIST_TEST_LABELS_FILENAME + data_path = os.path.join(tmp_dir, d) + labels_path = os.path.join(tmp_dir, l) + images = _extract_mnist_images(data_path, 60000 if training else 10000) + labels = _extract_mnist_labels(labels_path, 60000 if training else 10000) + # Shuffle the data to make sure classes are well distributed. + data = list(zip(images, labels)) + random.shuffle(data) + images, labels = list(zip(*data)) + return image_generator(images[start_from:start_from + how_many], + labels[start_from:start_from + how_many]) + + +# URLs and filenames for CIFAR data. +_CIFAR10_URL = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" +_CIFAR10_PREFIX = "cifar-10-batches-py/" +_CIFAR10_TRAIN_FILES = [ + "data_batch_1", "data_batch_2", "data_batch_3", "data_batch_4", + "data_batch_5" +] +_CIFAR10_TEST_FILES = ["test_batch"] +_CIFAR10_IMAGE_SIZE = 32 + + +def _get_cifar10(directory): + """Download and extract CIFAR to directory unless it is there.""" + filename = os.path.basename(_CIFAR10_URL) + path = generator_utils.maybe_download(directory, filename, _CIFAR10_URL) + tarfile.open(path, "r:gz").extractall(directory) + + +def cifar10_generator(tmp_dir, training, how_many, start_from=0): + """Image generator for CIFAR-10. + + Args: + tmp_dir: path to temporary storage directory. + training: a Boolean; if true, we use the train set, otherwise the test set. + how_many: how many images and labels to generate. + start_from: from which image to start. + + Returns: + An instance of image_generator that produces CIFAR-10 images and labels. + """ + _get_cifar10(tmp_dir) + data_files = _CIFAR10_TRAIN_FILES if training else _CIFAR10_TEST_FILES + all_images, all_labels = [], [] + for filename in data_files: + path = os.path.join(tmp_dir, _CIFAR10_PREFIX, filename) + with tf.gfile.Open(path, "r") as f: + data = cPickle.load(f) + images = data["data"] + num_images = images.shape[0] + images = images.reshape((num_images, 3, _CIFAR10_IMAGE_SIZE, + _CIFAR10_IMAGE_SIZE)) + all_images.extend([ + np.squeeze(images[j]).transpose((1, 2, 0)) for j in xrange(num_images) + ]) + labels = data["labels"] + all_labels.extend([labels[j] for j in xrange(num_images)]) + # Shuffle the data to make sure classes are well distributed. + data = zip(all_images, all_labels) + random.shuffle(data) + all_images, all_labels = zip(*data) + return image_generator(all_images[start_from:start_from + how_many], + all_labels[start_from:start_from + how_many]) + + +# URLs and filenames for MSCOCO data. 
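The MNIST and CIFAR generators above compose with `generator_utils.generate_files` like the other generators in this patch; a hedged usage sketch (directory and count are hypothetical):

```python
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import image

# Writes 100 MNIST training digits as image/* features into one shard.
generator_utils.generate_files(
    image.mnist_generator("/tmp/t2t", training=True, how_many=100),
    "mnist_example", "/tmp/t2t", num_shards=1)
```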
+_MSCOCO_ROOT_URL = "http://msvocds.blob.core.windows.net/" +_MSCOCO_URLS = [ + "coco2014/train2014.zip", "coco2014/val2014.zip", "coco2014/test2014.zip", + "annotations-1-0-3/captions_train-val2014.zip" +] +_MSCOCO_TRAIN_PREFIX = "train2014" +_MSCOCO_EVAL_PREFIX = "val2014" +_MSCOCO_TRAIN_CAPTION_FILE = "annotations/captions_train2014.json" +_MSCOCO_EVAL_CAPTION_FILE = "annotations/captions_val2014.json" + + +def _get_mscoco(directory): + """Download and extract MSCOCO datasets to directory unless it is there.""" + for url in _MSCOCO_URLS: + filename = os.path.basename(url) + download_url = os.path.join(_MSCOCO_ROOT_URL, url) + path = generator_utils.maybe_download(directory, filename, download_url) + unzip_dir = os.path.join(directory, filename.strip(".zip")) + if not tf.gfile.Exists(unzip_dir): + zipfile.ZipFile(path, "r").extractall(directory) + + +def mscoco_generator(tmp_dir, + training, + how_many, + start_from=0, + eos_list=None, + vocab_filename=None, + vocab_size=0): + """Image generator for MSCOCO captioning problem with token-wise captions. + + Args: + tmp_dir: path to temporary storage directory. + training: a Boolean; if true, we use the train set, otherwise the test set. + how_many: how many images and labels to generate. + start_from: from which image to start. + eos_list: optional list of end of sentence tokens, otherwise use default + value `1`. + vocab_filename: file within `tmp_dir` to read vocabulary from. + vocab_size: integer target to generate vocabulary size to. + + Yields: + A dictionary representing the images with the following fields: + * image/encoded: the string encoding the image as JPEG, + * image/format: the string "jpeg" representing image format, + * image/class/label: a list of integers representing the caption, + * image/height: an integer representing the height, + * image/width: an integer representing the width. + Every field is actually a list of the corresponding type. + """ + eos_list = [1] if eos_list is None else eos_list + if vocab_filename is not None: + vocab_symbolizer = generator_utils.get_or_generate_vocab( + tmp_dir, vocab_filename, vocab_size) + _get_mscoco(tmp_dir) + caption_filepath = (_MSCOCO_TRAIN_CAPTION_FILE + if training else _MSCOCO_EVAL_CAPTION_FILE) + caption_filepath = os.path.join(tmp_dir, caption_filepath) + prefix = _MSCOCO_TRAIN_PREFIX if training else _MSCOCO_EVAL_PREFIX + caption_file = io.open(caption_filepath) + caption_json = json.load(caption_file) + # Dictionary from image_id to ((filename, height, width), captions). 
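A sketch of the shape `image_dict` takes after the loop below; the id, dimensions, and captions are made up:

```python
# image_id -> [(file_name, height, width), captions]
image_dict = {
    42: [("COCO_train2014_000000000042.jpg", 480, 640),
         ["A man riding a horse.", "A person on horseback."]],
}
```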
+ image_dict = dict() + for image in caption_json["images"]: + image_dict[image["id"]] = [(image["file_name"], image["height"], + image["width"]), []] + annotations = caption_json["annotations"] + annotation_count = len(annotations) + image_count = len(image_dict) + tf.logging.info("Processing %d images and %d labels\n" % (image_count, + annotation_count)) + for annotation in annotations: + image_id = annotation["image_id"] + image_dict[image_id][1].append(annotation["caption"]) + + data = list(image_dict.values())[start_from:start_from + how_many] + random.shuffle(data) + for image_info, labels in data: + image_filename = image_info[0] + image_filepath = os.path.join(tmp_dir, prefix, image_filename) + with tf.gfile.Open(image_filepath, "r") as f: + encoded_image_data = f.read() + height, width = image_info[1], image_info[2] + for label in labels: + if vocab_filename is None: + label = [ord(c) for c in label] + eos_list + else: + label = vocab_symbolizer.encode(label) + eos_list + yield { + "image/encoded": [encoded_image_data], + "image/format": ["jpeg"], + "image/class/label": label, + "image/height": [height], + "image/width": [width] + } diff --git a/tensor2tensor/data_generators/image_test.py b/tensor2tensor/data_generators/image_test.py new file mode 100644 index 000000000..c5b4f14be --- /dev/null +++ b/tensor2tensor/data_generators/image_test.py @@ -0,0 +1,71 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Image generators test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +from tensor2tensor.data_generators import image + +import tensorflow as tf + + +class ImageTest(tf.test.TestCase): + + def testImageGenerator(self): + # 2 random images + np.random.seed(1111) # To avoid any flakiness. + image1 = np.random.randint(0, 255, size=(10, 12, 3)) + image2 = np.random.randint(0, 255, size=(10, 12, 3)) + # Call image generator on the 2 images with labels [1, 2]. + encoded_imgs, labels = [], [] + for dictionary in image.image_generator([image1, image2], [1, 2]): + self.assertEqual( + sorted(list(dictionary)), [ + "image/class/label", "image/encoded", "image/format", + "image/height", "image/width" + ]) + self.assertEqual(dictionary["image/format"], ["png"]) + self.assertEqual(dictionary["image/height"], [12]) + self.assertEqual(dictionary["image/width"], [10]) + encoded_imgs.append(dictionary["image/encoded"]) + labels.append(dictionary["image/class/label"]) + + # Check that the result labels match the inputs. + self.assertEqual(len(labels), 2) + self.assertEqual(labels[0], [1]) + self.assertEqual(labels[1], [2]) + + # Decode images and check that they match the inputs. 
+    self.assertEqual(len(encoded_imgs), 2)
+    image_t = tf.placeholder(dtype=tf.string)
+    decoded_png_t = tf.image.decode_png(image_t)
+    with self.test_session() as sess:
+      encoded_img1 = encoded_imgs[0]
+      self.assertEqual(len(encoded_img1), 1)
+      decoded1 = sess.run(decoded_png_t, feed_dict={image_t: encoded_img1[0]})
+      self.assertAllClose(decoded1, image1)
+      encoded_img2 = encoded_imgs[1]
+      self.assertEqual(len(encoded_img2), 1)
+      decoded2 = sess.run(decoded_png_t, feed_dict={image_t: encoded_img2[0]})
+      self.assertAllClose(decoded2, image2)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensor2tensor/data_generators/lm_example.py b/tensor2tensor/data_generators/lm_example.py
new file mode 100644
index 000000000..9cf930afc
--- /dev/null
+++ b/tensor2tensor/data_generators/lm_example.py
@@ -0,0 +1,123 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Convert language modeling data to tf.Example format.
+
+Uses SubwordTextEncoder.
+
+For each line, we generate a tf.Example, with "targets" equal to a sequence
+of subtokens (integers), ending in subtoken id 1 for end-of-sequence. We add
+a dummy feature "inputs"=[0] for compatibility with seq-to-seq models.
+
+If FLAGS.combine_to_length is positive, then we combine multiple sequences
+into examples of a constant length, possibly with some padding at the end.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from tensor2tensor.data_generators import generator_utils
+from tensor2tensor.data_generators import text_encoder
+
+import tensorflow as tf
+
+tf.app.flags.DEFINE_string(
+    "vocab_file", "", "SubwordTextEncoder vocabulary file")
+
+tf.app.flags.DEFINE_integer(
+    "combine_to_length", 0,
+    "If positive, concatenate documents to form examples with length exactly"
+    " equal to this value. Documents are still suffixed with subtoken id=1. "
+    " Examples are padded with subtoken id=0.")
+
+tf.app.flags.DEFINE_string("in_filepattern", "", "input filename")
+
+tf.app.flags.DEFINE_string(
+    "out_prefix", "", "The output filename is equal to out_prefix plus "
-00001-of-00100)") + +FLAGS = tf.app.flags.FLAGS + + +def _make_example(ids, raw_num_bytes): + if FLAGS.combine_to_length > 0: + ids += [0] * (FLAGS.combine_to_length - len(ids)) + return generator_utils.to_example({ + "targets": ids, + "inputs": [0], + "raw_num_bytes": [raw_num_bytes] + }).SerializeToString() + + +def convert_file(in_file, encoder): + """Convert a file to examples.""" + total_bytes = 0 + total_subtokens = 0 + total_documents = 0 + dropped_documents = 0 + + combined_subtokens = [] + combined_num_bytes = 0 + + out_file = FLAGS.out_prefix + in_file[-15:] + writer = tf.python_io.TFRecordWriter(out_file) + out_file = FLAGS.out_prefix + in_file[-15:] + print ("in_file", in_file, "out_file", out_file) + for line in tf.gfile.Open(in_file): + total_documents += 1 + assert line[-1] == "\n" + num_bytes = len(line) + total_bytes += num_bytes + line = line[:-1] + subtokens = encoder.encode(line) + [1] + total_subtokens += len(subtokens) + if FLAGS.combine_to_length: + if len(combined_subtokens) + len(subtokens) > FLAGS.combine_to_length: + writer.write(_make_example(combined_subtokens, combined_num_bytes)) + combined_subtokens = [] + combined_num_bytes = 0 + if len(subtokens) <= FLAGS.combine_to_length: + combined_subtokens.extend(subtokens) + combined_num_bytes += num_bytes + else: + dropped_documents += 1 + else: + writer.write(_make_example(subtokens, num_bytes)) + if combined_subtokens: + writer.write(_make_example(combined_subtokens, combined_num_bytes)) + writer.close() + + tf.logging.info("total bytes: %d", total_bytes) + tf.logging.info("total subtokens: %d", total_subtokens) + tf.logging.info("bytes per subtoken: %f", total_bytes / total_subtokens) + tf.logging.info("total documents: %d", total_documents) + tf.logging.info("dropped documents: %d", dropped_documents) + + +def main(_): + """Convert a file to examples.""" + encoder = text_encoder.SubwordTextEncoder(FLAGS.vocab_file) + + in_files = tf.gfile.Glob(FLAGS.in_filepattern) + assert in_files, "No matching input files" + for in_file in in_files: + convert_file(in_file, encoder) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py new file mode 100644 index 000000000..26249d2bc --- /dev/null +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -0,0 +1,702 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Hyperparameters defining different problems. + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +# Dependency imports + +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import modality + +import tensorflow as tf + + +def default_problem_hparams(): + """A set of basic model hyperparameters.""" + return tf.contrib.training.HParams( + # Use this parameter to get comparable perplexity numbers with different + # tokenizations. 
+
+
+def default_problem_hparams():
+  """A set of basic model hyperparameters."""
+  return tf.contrib.training.HParams(
+      # Use this parameter to get comparable perplexity numbers with different
+      # tokenizations. This value should be set to the ratio of the number of
+      # tokens in the test set according to the tokenization used, to the
+      # number of tokens in the test set in the "official" tokenization. For
+      # example, if we are using a word-piece based model and we want to
+      # compute per-word perplexity, then we set loss_multiplier to the number
+      # of wordpieces per word in the test set.
+      loss_multiplier=1.0,
+
+      # Use this parameter to allow for larger sequences in the batch. Without
+      # the use of this parameter, the size of the inner two dimensions will be
+      # used to judge the sequence length.
+      batch_size_multiplier=1,
+
+      # To make queues of the right capacity, it's good to know the maximal
+      # expected batch size, as it can vary a lot. It only affects performance
+      # of input readers and memory use. The defaults should be safe and fast,
+      # but decrease if your reader uses a lot of memory and increase if slow.
+      max_expected_batch_size_per_shard=64,
+
+      # Modalities used to map from input features to a space compatible with
+      # chosen model architecture. One modality per feature key.
+      input_modality={},
+
+      # Modality used to map from hidden representation to the target space.
+      target_modality=None,
+
+      # Identifiers used to tell the model which input/target space will be
+      # expected. For example, it can tell that we expect French as characters
+      # as output, or Spanish as sound. An integer with the following
+      # semantics:
+      #   0: Generic / unknown output space (default)
+      #   1: Image labels
+      #   2: English characters
+      #   3: English tokens
+      #   4: English bpe tokens
+      #   5: French characters
+      #   6: French tokens
+      #   7: German characters
+      #   8: German tokens
+      #   9: German bpe tokens
+      #  10: Digit cipher lexicon 0
+      #  11: Digit cipher lexicon 1
+      #  12: Audio waveform domain
+      #  13: Audio spectral domain
+      #  14: Parse characters
+      #  15: Parse tokens
+      # Add more above if needed.
+      input_space_id=0,
+      target_space_id=0,
+
+      # Vocabulary per feature key.
+      # A vocabulary converts to/from human-readable strings.
+      # E.g. {"inputs": text_encoder.ByteTextEncoder(),
+      #       "targets": wordpiece.WordpieceVocab("vocab_filename.txt")}
+      vocabulary={
+          "inputs": text_encoder.TextEncoder(),
+          "targets": text_encoder.TextEncoder()
+      },
+
+      # These markers keep track of whether the problem was reversed or
+      # copied. They are set automatically; do not override the defaults.
+      #
+      # These tags can be combined in order to perform copies of the input or
+      # the targets. For instance `problem_copy` will copy the inputs, but
+      # `problem_rev_copy` will copy the targets.
+      was_reversed=False,
+      was_copy=False,)
+
+
+def parse_problem_name(problem_name):
+  """Determines if problem_name specifies a copy and/or reversal.
+
+  Args:
+    problem_name: A string containing a single problem name from
+      FLAGS.problems.
+
+  Returns:
+    base_name: A string with the base problem name.
+    was_reversed: A boolean.
+    was_copy: A boolean.
+  """
+  # Recursively strip tags until we reach a base name.
+  if len(problem_name) > 4 and problem_name[-4:] == "_rev":
+    base, _, was_copy = parse_problem_name(problem_name[:-4])
+    return base, True, was_copy
+  elif len(problem_name) > 5 and problem_name[-5:] == "_copy":
+    base, was_reversed, _ = parse_problem_name(problem_name[:-5])
+    return base, was_reversed, True
+  else:
+    return problem_name, False, False
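A quick illustration of the tag semantics implemented above ("translate_xy"
is a made-up base name):

    parse_problem_name("translate_xy")           # ("translate_xy", False, False)
    parse_problem_name("translate_xy_rev")       # ("translate_xy", True, False)
    parse_problem_name("translate_xy_rev_copy")  # ("translate_xy", True, True)

problem_hparams below consumes these flags: "_rev" swaps the input/target
modalities, vocabularies and space ids, while "_copy" duplicates the input
side onto the target side.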
+
+
+def problem_hparams(problem_name, model_hparams):
+  """Generate problem hyperparameters based on problem name.
+
+  Args:
+    problem_name: a string
+    model_hparams: a tf.contrib.training.HParams
+
+  Returns:
+    a tf.contrib.training.HParams
+
+  Raises:
+    ValueError: if problem_name is unknown.
+  """
+  base_name, was_reversed, was_copy = parse_problem_name(problem_name)
+  if base_name not in _problem_hparams_map:
+    map_str = "* " + "\n* ".join(_problem_hparams_map.keys())
+    error_msg = "%s not in the supported set of problems:\n%s" % (base_name,
+                                                                  map_str)
+    raise ValueError(error_msg)
+  p = _problem_hparams_map.get(base_name)(model_hparams)
+  if was_reversed:
+    # Swap modalities.
+    input_modality = p.input_modality["inputs"]
+    target_modality = p.target_modality
+    p.input_modality["inputs"] = target_modality
+    p.target_modality = input_modality
+    # Swap vocabularies.
+    input_vocabulary = p.vocabulary["inputs"]
+    target_vocabulary = p.vocabulary["targets"]
+    p.vocabulary["inputs"] = target_vocabulary
+    p.vocabulary["targets"] = input_vocabulary
+    # Swap input/target space ids.
+    input_space_id = p.input_space_id
+    target_space_id = p.target_space_id
+    p.input_space_id = target_space_id
+    p.target_space_id = input_space_id
+    # Mark that p was reversed.
+    p.was_reversed = True
+  if was_copy:
+    # Duplicate input modality.
+    p.target_modality = p.input_modality["inputs"]
+    # Duplicate input vocabulary.
+    p.vocabulary["targets"] = p.vocabulary["inputs"]
+    # Duplicate input space ids.
+    p.target_space_id = p.input_space_id
+    # Mark that p was copied.
+    p.was_copy = True
+  return p
+
+
+def test_problem_hparams(model_hparams, input_vocab_size, target_vocab_size):
+  """Problem hparams for testing model bodies."""
+  p = default_problem_hparams()
+  p.input_modality = {
+      "inputs": modality.SymbolModality(model_hparams, input_vocab_size)
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, target_vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": text_encoder.TextEncoder()
+  }
+  return p
+
+
+def algorithmic(vocab_size, model_hparams):
+  """Default parameters for algorithmic tasks."""
+  p = default_problem_hparams()
+  p.input_modality = {
+      "inputs": modality.SymbolModality(model_hparams, vocab_size)
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(num_reserved_ids=1),
+      "targets": text_encoder.TextEncoder(num_reserved_ids=1),
+  }
+  p.input_space_id = 10
+  p.target_space_id = 11
+  return p
+
+
+def audio_timit_characters(model_hparams):
+  """English audio transcription benchmark."""
+  p = default_problem_hparams()
+  p.input_modality = {
+      "inputs": modality.AudioModality(model_hparams),
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, 256)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": text_encoder.ByteTextEncoder(),
+  }
+  p.batch_size_multiplier = 256
+  p.loss_multiplier = 2.0
+  p.input_space_id = 12
+  p.target_space_id = 2
+  return p
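Before the token-based variants that follow, a note on their
wrong_vocab_size argument: the number only selects the vocab filename;
vocabulary generation aims for that size but rarely hits it exactly, so the
authoritative size is always re-read from the encoder. A sketch, assuming
the data directory contains a file named tokens.vocab.8192:

    vocab_filename = os.path.join(model_hparams.data_dir, "tokens.vocab.8192")
    subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
    # subtokenizer.vocab_size is the real size (e.g. 8267 rather than 8192)
    # and is what gets passed to modality.SymbolModality.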
+
+
+def audio_timit_tokens(model_hparams, wrong_vocab_size):
+  """English audio transcription benchmark.
+
+  Args:
+    model_hparams: a tf.contrib.training.HParams
+    wrong_vocab_size: a number used in the filename indicating the approximate
+      vocabulary size. This is not to be confused with the actual vocabulary
+      size.
+  Returns:
+    a tf.contrib.training.HParams
+  """
+  p = default_problem_hparams()
+  # This vocab file must be present within the data directory.
+  vocab_filename = os.path.join(model_hparams.data_dir,
+                                "tokens.vocab.%d" % wrong_vocab_size)
+  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
+  p.input_modality = {
+      "inputs": modality.AudioModality(model_hparams),
+  }
+  p.target_modality = modality.SymbolModality(model_hparams,
+                                              subtokenizer.vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": subtokenizer,
+  }
+  p.batch_size_multiplier = 256
+  p.loss_multiplier = 2.0
+  p.input_space_id = 12
+  p.target_space_id = 3
+  return p
+
+
+def audio_wsj_characters(model_hparams):
+  """English audio transcription benchmark."""
+  p = default_problem_hparams()
+  p.input_modality = {
+      "inputs": modality.AudioSpectralModality(model_hparams),
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, 256)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": text_encoder.ByteTextEncoder(),
+  }
+  p.batch_size_multiplier = 512
+  p.loss_multiplier = 2.0
+  p.input_space_id = 13
+  p.target_space_id = 2
+  return p
+
+
+def audio_wsj_tokens(model_hparams, wrong_vocab_size):
+  """English audio transcription benchmark.
+
+  Args:
+    model_hparams: a tf.contrib.training.HParams
+    wrong_vocab_size: a number used in the filename indicating the approximate
+      vocabulary size. This is not to be confused with the actual vocabulary
+      size.
+  Returns:
+    a tf.contrib.training.HParams
+  """
+  p = default_problem_hparams()
+  # This vocab file must be present within the data directory.
+  vocab_filename = os.path.join(model_hparams.data_dir,
+                                "tokens.vocab.%d" % wrong_vocab_size)
+  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
+  p.input_modality = {
+      "inputs": modality.AudioModality(model_hparams),
+  }
+  p.target_modality = modality.SymbolModality(model_hparams,
+                                              subtokenizer.vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": subtokenizer,
+  }
+  p.batch_size_multiplier = 512
+  p.loss_multiplier = 2.0
+  p.input_space_id = 13
+  p.target_space_id = 3
+  return p
+
+
+def lm1b_16k(model_hparams):
+  """Billion-word language-modeling benchmark, 16k subtoken vocabulary."""
+  p = default_problem_hparams()
+  p.perplexity_exponent = 1.184206
+  p.input_modality = {}
+  p.target_modality = modality.SymbolModality(model_hparams, 16384)
+  p.vocabulary = {
+      "targets":
+          text_encoder.SubwordTextEncoder(
+              os.path.join(model_hparams.data_dir,
+                           "lm1b_16k.subword_text_encoder"))
+  }
+  p.target_space_id = 3
+  return p
+
+
+def lm1b_64k(model_hparams):
+  """Billion-word language-modeling benchmark, 64k subtoken vocabulary."""
+  p = default_problem_hparams()
+  p.perplexity_exponent = 1.067068
+  p.input_modality = {}
+  p.target_modality = modality.SymbolModality(model_hparams, 65536)
+  p.vocabulary = {
+      "targets":
+          text_encoder.SubwordTextEncoder(
+              os.path.join(model_hparams.data_dir,
+                           "lm1b_64k.subword_text_encoder"))
+  }
+  p.target_space_id = 3
+  return p
+
+
+def wmt_enfr_characters(model_hparams):
+  """English to French translation benchmark."""
+  p = default_problem_hparams()
+  p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)}
+  p.target_modality = modality.SymbolModality(model_hparams, 256)
+  p.vocabulary = {
+      "inputs": text_encoder.ByteTextEncoder(),
+      "targets": text_encoder.ByteTextEncoder(),
+  }
+  p.loss_multiplier = 2.0
+  p.input_space_id = 2
+  p.target_space_id = 5
+  return p
+
+
+def wmt_enfr_tokens(model_hparams, wrong_vocab_size):
+  """English to French translation benchmark.
+ + Args: + model_hparams: a tf.contrib.training.HParams + wrong_vocab_size: a number used in the filename indicating the approximate + vocabulary size. This is not to be confused with the actual vocabulary + size. + Returns: + a tf.contrib.training.HParams + """ + p = default_problem_hparams() + # This vocab file must be present within the data directory. + vocab_filename = os.path.join(model_hparams.data_dir, + "tokens.vocab.%d" % wrong_vocab_size) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + p.input_modality = { + "inputs": modality.SymbolModality(model_hparams, subtokenizer.vocab_size) + } + p.target_modality = modality.SymbolModality(model_hparams, + subtokenizer.vocab_size) + p.vocabulary = { + "inputs": subtokenizer, + "targets": subtokenizer, + } + p.input_space_id = 3 + p.target_space_id = 6 + return p + + +def wmt_ende_bpe32k(model_hparams): + """English to German translation benchmark.""" + p = default_problem_hparams() + # single modality object enables embedding sharing between inputs and target + # when model_hparams.shared_source_target_embedding is True. + vocab_size = 40960 + m = modality.SymbolModality(model_hparams, vocab_size) + p.input_modality = {"inputs": m} + p.target_modality = m + # This vocab file must be present within the data directory. + vocab_filename = os.path.join(model_hparams.data_dir, "vocab.bpe.32000") + p.vocabulary = { + "inputs": text_encoder.TokenTextEncoder(vocab_filename=vocab_filename), + "targets": text_encoder.TokenTextEncoder(vocab_filename=vocab_filename), + } + p.loss_multiplier = 1.4 + p.input_space_id = 4 + p.target_space_id = 9 + return p + + +def wmt_ende_characters(model_hparams): + """English to German translation benchmark.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)} + p.target_modality = modality.SymbolModality(model_hparams, 256) + p.vocabulary = { + "inputs": text_encoder.ByteTextEncoder(), + "targets": text_encoder.ByteTextEncoder(), + } + p.loss_multiplier = 2.0 + p.input_space_id = 2 + p.target_space_id = 7 + return p + + +def wmt_ende_tokens(model_hparams, wrong_vocab_size): + """English to German translation benchmark.""" + p = default_problem_hparams() + # This vocab file must be present within the data directory. + vocab_filename = os.path.join(model_hparams.data_dir, + "tokens.vocab.%d" % wrong_vocab_size) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + p.input_modality = { + "inputs": modality.SymbolModality(model_hparams, subtokenizer.vocab_size) + } + p.target_modality = modality.SymbolModality(model_hparams, + subtokenizer.vocab_size) + p.vocabulary = { + "inputs": subtokenizer, + "targets": subtokenizer, + } + p.input_space_id = 3 + p.target_space_id = 8 + return p + + +def wmt_ende_v2(model_hparams, vocab_size): + """English to German translation benchmark with separate vocabularies.""" + p = default_problem_hparams() + # These vocab files must be present within the data directory. 
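One detail worth flagging from wmt_ende_bpe32k above: a single SymbolModality
instance is installed on both sides, so the model can share one source/target
embedding matrix when model_hparams.shared_source_target_embedding is True.
The aliasing, spelled out:

    m = modality.SymbolModality(model_hparams, vocab_size)
    p.input_modality = {"inputs": m}
    p.target_modality = m
    assert p.target_modality is p.input_modality["inputs"]  # one shared object

wmt_ende_v2, by contrast, loads the two separate per-language vocab files
named on the next lines.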
+  source_vocab_filename = os.path.join(model_hparams.data_dir,
+                                       "wmt_ende_v2.en.vocab.%d" % vocab_size)
+  target_vocab_filename = os.path.join(model_hparams.data_dir,
+                                       "wmt_ende_v2.de.vocab.%d" % vocab_size)
+  p.input_modality = {
+      "inputs": modality.SymbolModality(model_hparams, vocab_size)
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.SubwordTextEncoder(source_vocab_filename),
+      "targets": text_encoder.SubwordTextEncoder(target_vocab_filename),
+  }
+  p.input_space_id = 3
+  p.target_space_id = 8
+  return p
+
+
+def wmt_concat(model_hparams, wrong_vocab_size):
+  """English to German translation benchmark."""
+  p = default_problem_hparams()
+  # This vocab file must be present within the data directory.
+  vocab_filename = os.path.join(model_hparams.data_dir,
+                                "tokens.vocab.%d" % wrong_vocab_size)
+  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
+  vocab_size = subtokenizer.vocab_size
+  p.input_modality = {}
+  p.target_modality = modality.SymbolModality(model_hparams, vocab_size)
+  p.vocabulary = {"targets": subtokenizer}
+  return p
+
+
+def wmt_parsing_characters(model_hparams):
+  """English to parse tree translation benchmark."""
+  p = default_problem_hparams()
+  p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)}
+  p.target_modality = modality.SymbolModality(model_hparams, 256)
+  p.vocabulary = {
+      "inputs": text_encoder.ByteTextEncoder(),
+      "targets": text_encoder.ByteTextEncoder(),
+  }
+  p.loss_multiplier = 2.0
+  p.input_space_id = 2
+  p.target_space_id = 14
+  return p
+
+
+def wmt_parsing_tokens(model_hparams, wrong_vocab_size):
+  """English to parse tree translation benchmark.
+
+  Args:
+    model_hparams: a tf.contrib.training.HParams
+    wrong_vocab_size: a number used in the filename indicating the approximate
+      vocabulary size. This is not to be confused with the actual vocabulary
+      size.
+  Returns:
+    a tf.contrib.training.HParams
+  """
+  p = default_problem_hparams()
+  # This vocab file must be present within the data directory.
+  vocab_filename = os.path.join(model_hparams.data_dir,
+                                "tokens.vocab.%d" % wrong_vocab_size)
+  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
+  p.input_modality = {
+      "inputs": modality.SymbolModality(model_hparams, subtokenizer.vocab_size)
+  }
+  p.target_modality = modality.SymbolModality(model_hparams,
+                                              subtokenizer.vocab_size)
+  p.vocabulary = {
+      "inputs": subtokenizer,
+      "targets": subtokenizer,
+  }
+  p.input_space_id = 3
+  p.target_space_id = 15
+  return p
+
+
+def wsj_parsing_tokens(model_hparams, wrong_source_vocab_size,
+                       wrong_target_vocab_size):
+  """English to parse tree translation benchmark.
+
+  Args:
+    model_hparams: a tf.contrib.training.HParams
+    wrong_source_vocab_size: a number used in the filename indicating the
+      approximate vocabulary size. This is not to be confused with the actual
+      vocabulary size.
+    wrong_target_vocab_size: a number used in the filename indicating the
+      approximate target vocabulary size. This is not to be confused with the
+      actual target vocabulary size.
+  Returns:
+    a tf.contrib.training.HParams
+  """
+  p = default_problem_hparams()
+  # These vocab files must be present within the data directory.
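For reference, vocab files like the two named below are built offline from a
token corpus. A hedged sketch using the SubwordTextEncoder API that appears
later in this patch (the corpus pattern, line limit and output path are
illustrative):

    token_counts = text_encoder.SubwordTextEncoder.get_token_counts(
        "/tmp/wsj_corpus-*", corpus_max_lines=100000)
    text_encoder.SubwordTextEncoder.build_to_target_size(
        2**14, token_counts, "/tmp/data/wsj_source.tokens.vocab.16384",
        min_val=1, max_val=1000)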
+ source_vocab_filename = os.path.join( + model_hparams.data_dir, + "wsj_source.tokens.vocab.%d" % wrong_source_vocab_size) + target_vocab_filename = os.path.join( + model_hparams.data_dir, + "wsj_target.tokens.vocab.%d" % wrong_target_vocab_size) + source_subtokenizer = text_encoder.SubwordTextEncoder( + source_vocab_filename) + target_subtokenizer = text_encoder.SubwordTextEncoder( + target_vocab_filename) + p.input_modality = { + "inputs": modality.SymbolModality(model_hparams, + source_subtokenizer.vocab_size) + } + p.target_modality = modality.SymbolModality(model_hparams, + target_subtokenizer.vocab_size) + p.vocabulary = { + "inputs": source_subtokenizer, + "targets": target_subtokenizer, + } + p.input_space_id = 3 + p.target_space_id = 15 + return p + + +def image_cifar10(model_hparams): + """CIFAR-10.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.SmallImageModality(model_hparams)} + p.target_modality = modality.ClassLabelModality(model_hparams, 10) + p.batch_size_multiplier = 4 + p.max_expected_batch_size_per_shard = 8 + p.loss_multiplier = 3.0 + p.input_space_id = 1 + p.target_space_id = 1 + return p + + +def image_mnist(model_hparams): + """MNIST.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)} + p.target_modality = modality.ClassLabelModality(model_hparams, 10) + p.batch_size_multiplier = 4 + p.max_expected_batch_size_per_shard = 8 + p.loss_multiplier = 3.0 + p.input_space_id = 1 + p.target_space_id = 1 + return p + + +def image_imagenet(model_hparams): + """ImageNet.""" + p = default_problem_hparams() + p.input_modality = { + "inputs": modality.ImageModality(model_hparams), + } + p.target_modality = modality.ClassLabelModality( + model_hparams, 1000, is2d=model_hparams.imagenet_use_2d) + p.batch_size_multiplier = 256 + p.max_expected_batch_size_per_shard = 2 + p.loss_multiplier = 0.7 + p.input_space_id = 1 + p.target_space_id = 1 + return p + + +def image_mscoco_characters(model_hparams): + """COCO image captioning with captions as characters.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.ImageModality(model_hparams)} + p.target_modality = modality.SymbolModality(model_hparams, 256) + p.vocabulary = { + "inputs": text_encoder.TextEncoder(), + "targets": text_encoder.ByteTextEncoder(), + } + p.batch_size_multiplier = 128 + p.max_expected_batch_size_per_shard = 2 + p.loss_multiplier = 2.0 + p.input_space_id = 1 + p.target_space_id = 2 + return p + + +def image_mscoco_tokens(model_hparams, vocab_count): + """COCO image captioning with captions as tokens.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.ImageModality(model_hparams)} + # This vocab file must be present within the data directory. + vocab_filename = os.path.join(model_hparams.data_dir, + "tokens.vocab.%d" % vocab_count) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + p.target_modality = modality.SymbolModality(model_hparams, + subtokenizer.vocab_size) + p.vocabulary = { + "inputs": text_encoder.TextEncoder(), + "targets": subtokenizer, + } + p.batch_size_multiplier = 256 + p.max_expected_batch_size_per_shard = 2 + p.input_space_id = 1 + p.target_space_id = 3 + return p + + +# Dictionary of named hyperparameter settings for various problems. +# This is only accessed through the problem_hparams function below. 
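To tie the registry below to the functions above, here is how one example
flag value resolves (a sketch):

    name = "wmt_ende_tokens_32k_rev"
    base, was_reversed, was_copy = parse_problem_name(name)
    # base == "wmt_ende_tokens_32k", was_reversed == True, was_copy == False
    p = _problem_hparams_map[base](model_hparams)
    # problem_hparams() then swaps the modalities, vocabularies and space
    # ids, making German the source side.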
+_problem_hparams_map = { + "algorithmic_addition_binary40": lambda p: algorithmic(3, p), + "algorithmic_addition_decimal40": lambda p: algorithmic(11, p), + "algorithmic_identity_binary40": lambda p: algorithmic(3, p), + "algorithmic_identity_decimal40": lambda p: algorithmic(11, p), + "algorithmic_multiplication_binary40": lambda p: algorithmic(3, p), + "algorithmic_multiplication_decimal40": lambda p: algorithmic(11, p), + "algorithmic_reverse_binary40": lambda p: algorithmic(3, p), + "algorithmic_reverse_decimal40": lambda p: algorithmic(11, p), + "algorithmic_shift_decimal40": lambda p: algorithmic(21, p), + "audio_timit_characters_tune": audio_timit_characters, + "audio_timit_characters_test": audio_timit_characters, + "audio_timit_tokens_8k_tune": lambda p: audio_timit_tokens(p, 2**13), + "audio_timit_tokens_8k_test": lambda p: audio_timit_tokens(p, 2**13), + "audio_wsj_characters_tune": audio_wsj_characters, + "audio_wsj_characters_test": audio_wsj_characters, + "audio_wsj_tokens_8k_tune": lambda p: audio_wsj_tokens(p, 2**13), + "audio_wsj_tokens_8k_test": lambda p: audio_wsj_tokens(p, 2**13), + "lm1b_16k": lm1b_16k, + "lm1b_64k": lm1b_64k, + "wmt_parsing_characters": wmt_parsing_characters, + "wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), + "wsj_parsing_tokens_16k": lambda p: wsj_parsing_tokens(p, 2**14, 2**9), + "wsj_parsing_tokens_32k": lambda p: wsj_parsing_tokens(p, 2**15, 2**9), + "wmt_enfr_characters": wmt_enfr_characters, + "wmt_enfr_tokens_8k": lambda p: wmt_enfr_tokens(p, 2**13), + "wmt_enfr_tokens_32k": lambda p: wmt_enfr_tokens(p, 2**15), + "wmt_enfr_tokens_32k_shuffled": lambda p: wmt_enfr_tokens(p, 2**15), + "wmt_enfr_tokens_32k_combined": lambda p: wmt_enfr_tokens(p, 2**15), + "wmt_enfr_tokens_128k": lambda p: wmt_enfr_tokens(p, 2**17), + # bytes per subtoken: 3.267350 + "wmt_ende_concat_8k": lambda p: wmt_concat(p, 2**13), + # bytes per subtoken: 4.236272 + "wmt_ende_concat_32k": lambda p: wmt_concat(p, 2**15), + "wmt_ende_characters": wmt_ende_characters, + "wmt_ende_tokens_8k": lambda p: wmt_ende_tokens(p, 2**13), + "wmt_ende_tokens_32k": lambda p: wmt_ende_tokens(p, 2**15), + "wmt_ende_tokens_128k": lambda p: wmt_ende_tokens(p, 2**17), + # bytes per subtoken: 4.59291664162 + "wmt_ende_bpe32k": wmt_ende_bpe32k, + "wmt_ende_bpe32k_shuffled": wmt_ende_bpe32k, + "wmt_ende_bpe32k_combined": wmt_ende_bpe32k, + "wmt_ende_bpe32k_160": wmt_ende_bpe32k, + "wmt_ende_v2_32k_combined": lambda p: wmt_ende_v2(p, 2**15), + "wmt_ende_v2_16k_combined": lambda p: wmt_ende_v2(p, 2**14), + "image_cifar10_tune": image_cifar10, + "image_cifar10_test": image_cifar10, + "image_mnist_tune": image_mnist, + "image_mnist_test": image_mnist, + "image_mscoco_characters_tune": image_mscoco_characters, + "image_mscoco_characters_test": image_mscoco_characters, + "image_mscoco_tokens_8k_tune": lambda p: image_mscoco_tokens(p, 2**13), + "image_mscoco_tokens_8k_test": lambda p: image_mscoco_tokens(p, 2**13), + "image_mscoco_tokens_32k_tune": lambda p: image_mscoco_tokens(p, 2**15), + "image_mscoco_tokens_32k_test": lambda p: image_mscoco_tokens(p, 2**15), + "image_mscoco_tokens_128k_tune": lambda p: image_mscoco_tokens(p, 2**17), + "image_mscoco_tokens_128k_test": lambda p: image_mscoco_tokens(p, 2**17), + "image_imagenet": image_imagenet, +} diff --git a/tensor2tensor/data_generators/problem_hparams_test.py b/tensor2tensor/data_generators/problem_hparams_test.py new file mode 100644 index 000000000..5c8bc5516 --- /dev/null +++ 
b/tensor2tensor/data_generators/problem_hparams_test.py
@@ -0,0 +1,48 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for tensor2tensor.problem_hparams."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from tensor2tensor.data_generators import problem_hparams
+
+import tensorflow as tf
+
+
+class ProblemHparamsTest(tf.test.TestCase):
+
+  def testParseProblemName(self):
+    problem_name = "base"
+    self.assertEqual(problem_hparams.parse_problem_name(problem_name),
+                     ("base", False, False))
+    problem_name = "base_rev"
+    self.assertEqual(
+        problem_hparams.parse_problem_name(problem_name),
+        ("base", True, False))
+    problem_name = "base_copy"
+    self.assertEqual(
+        problem_hparams.parse_problem_name(problem_name),
+        ("base", False, True))
+    problem_name = "base_copy_rev"
+    self.assertEqual(
+        problem_hparams.parse_problem_name(problem_name),
+        ("base", True, True))
+    problem_name = "base_rev_copy"
+    self.assertEqual(
+        problem_hparams.parse_problem_name(problem_name),
+        ("base", True, True))
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensor2tensor/data_generators/replace_oov.py b/tensor2tensor/data_generators/replace_oov.py
new file mode 100644
index 000000000..7e2c8dc50
--- /dev/null
+++ b/tensor2tensor/data_generators/replace_oov.py
@@ -0,0 +1,76 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Data preprocessor for the lm1b benchmark.
+
+Process the raw text file to replace out-of-vocab words with "<UNK>".
+
+The input consists of a tokenized text file, where tokens are separated with
+whitespace.
+
+Outputs a similar text file where the OOV words have been replaced with UNK.
+The whitespace in the output may be different.
+
+This maintains compatibility with the benchmark, which does the same thing.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+import tensorflow as tf
+
+tf.app.flags.DEFINE_string("vocab_file", "",
+                           "text file containing one word per line")
+
+tf.app.flags.DEFINE_string("in_filepattern", "", "input filename")
+
+tf.app.flags.DEFINE_string(
+    "out_prefix", "", "The output filename is equal to out_prefix plus "
-00001-of-00100)") + +FLAGS = tf.app.flags.FLAGS + + +def replace_oov(vocab, in_file): + """Replace out-of-vocab words with .""" + out_file = FLAGS.out_prefix + in_file[-15:] + print ("in_file", in_file, "out_file", out_file) + with tf.gfile.Open(out_file, "w") as out: + for line in tf.gfile.Open(in_file): + words = line.split() + for i in xrange(len(words)): + if not vocab.get(words[i]): + words[i] = "UNK" + out_line = " ".join(words) + "\n" + out.write(out_line) + + +def main(_): + vocab = {} + with tf.gfile.Open(FLAGS.vocab_file) as vocab_file: + for line in vocab_file: + vocab[line.strip()] = True + + in_files = tf.gfile.Glob(FLAGS.in_filepattern) + assert in_files, "No matching input files" + for in_file in in_files: + replace_oov(vocab, in_file) + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/snli.py b/tensor2tensor/data_generators/snli.py new file mode 100644 index 000000000..5613ece4d --- /dev/null +++ b/tensor2tensor/data_generators/snli.py @@ -0,0 +1,167 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data generators for the SNLI data-set.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import zipfile + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import text_encoder + +import tensorflow as tf + +_EOS = 1 +_SEP = 2 + +_LABEL_INDEX = 0 +_PARSE1_INDEX = 3 +_PARSE2_INDEX = 4 +_SENT1_INDEX = 5 +_SENT2_INDEX = 6 + +_LABEL_TO_ID = { + 'contradiction': 0, + 'entailment': 1, + 'neutral': 2, +} + +_EXAMPLES_FILE = 'examples.txt' +_SNLI_DATA_PATH = 'snli_1.0/snli_1.0_%s.txt' +_SNLI_ZIP = 'snli_1.0.zip' +_SNLI_URL = 'https://nlp.stanford.edu/projects/snli/' + _SNLI_ZIP + + +def _download_and_parse_dataset(tmp_dir, train): + """Downloads and prepairs the dataset to be parsed by the data_generator.""" + file_path = generator_utils.maybe_download(tmp_dir, _SNLI_ZIP, _SNLI_URL) + zip_ref = zipfile.ZipFile(file_path, 'r') + zip_ref.extractall(tmp_dir) + zip_ref.close() + + file_name = 'train' if train else 'dev' + dataset_file_path = os.path.join(tmp_dir, _SNLI_DATA_PATH % file_name) + _parse_dataset(dataset_file_path, tmp_dir, train) + + +def _get_tokens_and_tags(parse_str): + """Parse str to tokens and pos tags.""" + tokens = [] + parse_split = parse_str.split(' ') + for p in parse_split: + assert p.startswith('(') or p.endswith(')') + if p.endswith(')'): + token = p.replace(')', '') + tokens.append(token) + + return tokens + + +def _parse_dataset(file_path, tmp_dir, train): + """Convert the dataset in to a simpler format. + + This function creates two files. One for being processed to produce a vocab + and another to generate the data. + + Args: + file_path: string, path to the file to parse. + tmp_dir: string, path to the directory to output the files. + train: bool, indicating if we are parsing the training set. 
+ """ + input_path = file_path + file_name = 'train' if train else 'dev' + gen_output_path = os.path.join(tmp_dir, file_name + '.txt') + example_output_path = os.path.join(tmp_dir, _EXAMPLES_FILE) + + print('input path: ' + input_path) + print('gen_output_path: ' + gen_output_path) + print('example_output_path: ' + example_output_path) + + input_file = tf.gfile.Open(input_path, mode='r') + examples = [] + for counter, line in enumerate(input_file): + if counter == 0: # Ignore first line since its a header. + continue + # Get the token and embedding vector. + line_split = line.split('\t') + + parse1 = line_split[_PARSE1_INDEX] + parse2 = line_split[_PARSE2_INDEX] + consensus_label = line_split[_LABEL_INDEX] + + tokens1 = _get_tokens_and_tags(parse1) + tokens2 = _get_tokens_and_tags(parse2) + + tokens1_str = ' '.join(tokens1) + tokens2_str = ' '.join(tokens2) + + if consensus_label != '-': + examples.append([tokens1_str, tokens2_str, consensus_label]) + + input_file.close() + + # Output tab delimited file of lines of examples (sentence1, sentence2, label) + with tf.gfile.GFile(gen_output_path, 'w') as f: + for tokens1_str, tokens2_str, consensus_label in examples: + f.write('%s\t%s\t%s\n' % (tokens1_str, tokens2_str, consensus_label)) + + if train: + # Output file containing all the sentences for generating the vocab from. + with tf.gfile.GFile(example_output_path, 'w') as f: + for tokens1_str, tokens2_str, consensus_label in examples: + f.write('%s %s\n' % (tokens1_str, tokens2_str)) + + +def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size): + vocab_filepath = os.path.join(tmp_dir, vocab_filename) + print('Vocab file written to: ' + vocab_filepath) + + if tf.gfile.Exists(vocab_filepath): + gs = text_encoder.SubwordTextEncoder(vocab_filepath) + return gs + else: + example_file = os.path.join(tmp_dir, _EXAMPLES_FILE) + gs = text_encoder.SubwordTextEncoder() + token_counts = text_encoder.SubwordTextEncoder.get_token_counts( + example_file, corpus_max_lines=1000000) + gs = gs.build_to_target_size( + vocab_size, token_counts, vocab_filepath, min_val=1, max_val=1e3) + return gs + + +def snli_token_generator(tmp_dir, train, vocab_size): + _download_and_parse_dataset(tmp_dir, train) + + symbolizer_vocab = _get_or_generate_vocab( + tmp_dir, 'vocab.subword_text_encoder', vocab_size) + + file_name = 'train' if train else 'dev' + data_file = os.path.join(tmp_dir, file_name + '.txt') + with tf.gfile.GFile(data_file, mode='r') as f: + for line in f: + sent1, sent2, label = line.strip().split('\t') + sent1_enc = symbolizer_vocab.encode(sent1) + sent2_enc = symbolizer_vocab.encode(sent2) + + inputs = sent1_enc + [_SEP] + sent2_enc + [_EOS] + yield { + 'inputs': inputs, + 'targets': [_LABEL_TO_ID[label]], + } diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py new file mode 100644 index 000000000..6d9ecb4a8 --- /dev/null +++ b/tensor2tensor/data_generators/text_encoder.py @@ -0,0 +1,451 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Encoders for text data.
+
+* TextEncoder: base class
+* ByteTextEncoder: for ascii text
+* TokenTextEncoder: with user-supplied vocabulary file
+* SubwordTextEncoder: invertible
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
+from tensor2tensor.data_generators import tokenizer
+
+import tensorflow as tf
+
+# Reserved tokens for things like padding and EOS symbols.
+PAD = '<pad>'
+EOS = '<EOS>'
+RESERVED_TOKENS = [PAD, EOS]
+
+
+class TextEncoder(object):
+  """Base class for converting between int ids and human readable strings."""
+
+  def __init__(self, num_reserved_ids=2):
+    self._num_reserved_ids = num_reserved_ids
+
+  def encode(self, s):
+    """Transform a human-readable string into a sequence of int ids.
+
+    The ids should be in the range [num_reserved_ids, vocab_size). Ids [0,
+    num_reserved_ids) are reserved.
+
+    EOS is not appended.
+
+    Args:
+      s: human-readable string to be converted.
+
+    Returns:
+      ids: list of integers
+    """
+    return [int(w) + self._num_reserved_ids for w in s.split()]
+
+  def decode(self, ids):
+    """Transform a sequence of int ids into a human-readable string.
+
+    EOS is not expected in ids.
+
+    Args:
+      ids: list of integers to be converted.
+
+    Returns:
+      s: human-readable string.
+    """
+    decoded_ids = []
+    for id_ in ids:
+      if 0 <= id_ < self._num_reserved_ids:
+        decoded_ids.append(RESERVED_TOKENS[int(id_)])
+      else:
+        decoded_ids.append(id_)
+    return '%s' % decoded_ids
+
+  @property
+  def vocab_size(self):
+    raise NotImplementedError()
+
+
+class ByteTextEncoder(TextEncoder):
+  """Encodes each byte to an id. For 8-bit strings only."""
+
+  def encode(self, s):
+    return [ord(c) + self._num_reserved_ids for c in s]
+
+  def decode(self, ids):
+    decoded_ids = []
+    for id_ in ids:
+      if 0 <= id_ < self._num_reserved_ids:
+        decoded_ids.append(RESERVED_TOKENS[int(id_)])
+      else:
+        decoded_ids.append(chr(id_))
+
+    return ''.join(decoded_ids)
+
+  @property
+  def vocab_size(self):
+    return 2**8 + self._num_reserved_ids
+
+
+class TokenTextEncoder(TextEncoder):
+  """Encoder based on a user-supplied vocabulary."""
+
+  def __init__(self, vocab_filename, reverse=False, num_reserved_ids=2):
+    """Initialize from a file, one token per line."""
+    self._reverse = reverse
+    if vocab_filename is not None:
+      self._load_vocab_from_file(vocab_filename)
+
+    super(TokenTextEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
+
+  def encode(self, sentence):
+    """Converts a space-separated string of tokens to a list of ids."""
+    ret = [self._token_to_id[tok] for tok in sentence.strip().split()]
+    if self._reverse:
+      ret = ret[::-1]
+    return ret
+
+  def decode(self, ids):
+    if self._reverse:
+      ids = ids[::-1]
+    return ' '.join([self._safe_id_to_token(i) for i in ids])
+
+  @property
+  def vocab_size(self):
+    return len(self._id_to_token)
+
+  def _safe_id_to_token(self, idx):
+    return self._id_to_token.get(idx, 'ID_%d' % idx)
+
+  def _load_vocab_from_file(self, filename):
+    """Load vocab from a file."""
+    self._token_to_id = {}
+    self._id_to_token = {}
+
+    for idx, tok in enumerate(RESERVED_TOKENS):
+      self._token_to_id[tok] = idx
+      self._id_to_token[idx] = tok
+
+    token_start_idx = self._num_reserved_ids
+    with tf.gfile.Open(filename) as f:
+      for i, line in enumerate(f):
+        idx = token_start_idx + i
+        tok = line.strip()
+        self._token_to_id[tok] = idx
+        self._id_to_token[idx] = tok
+
+
+class SubwordTextEncoder(TextEncoder):
+  """Class for breaking tokens into subtokens.
+
+  Invertibly encodes a string as a sequence of subtokens from a limited
+  vocabulary.
+
+  A SubwordTextEncoder is built from a corpus (so it is tailored to the text in
+  the corpus), and stored to a file. See text_encoder_build_subword.py.
+
+  It can then be loaded and used to encode/decode any text.
+  """
+
+  def __init__(self, filename=None, num_reserved_ids=2):
+    """Read from a file."""
+    self._tokenizer = tokenizer.Tokenizer()
+    if filename is not None:
+      self._load_from_file(filename)
+
+    super(SubwordTextEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
+
+  def encode(self, raw_text):
+    """Converts a string to a list of subtoken ids.
+
+    Args:
+      raw_text: a string.
+    Returns:
+      a list of integers in the range [0, vocab_size)
+    """
+    return self._tokens_to_subtokens(self._tokenizer.encode(raw_text))
+
+  def decode(self, subtokens):
+    """Converts a sequence of subtoken ids to a string.
+
+    Args:
+      subtokens: a list of integers in the range [0, vocab_size)
+    Returns:
+      a string
+    """
+    return self._tokenizer.decode(self._subtokens_to_tokens(subtokens))
+
+  @property
+  def vocab_size(self):
+    """The subtoken vocabulary size."""
+    return len(self._all_subtoken_strings)
+
+  def _tokens_to_subtokens(self, tokens):
+    """Converts a list of tokens to a list of subtoken ids.
+
+    Args:
+      tokens: a list of strings.
+    Returns:
+      a list of integers in the range [0, vocab_size)
+    """
+    ret = []
+    for token in tokens:
+      ret.extend(self._escaped_token_to_subtokens(self._escape_token(token)))
+    return ret
+
+  def _subtokens_to_tokens(self, subtokens):
+    """Converts a list of subtoken ids to a list of tokens.
+ + Args: + subtokens: a list of integers in the range [0, vocab_size) + Returns: + a list of strings. + """ + concatenated = ''.join( + [self.subtoken_to_subtoken_string(s) for s in subtokens]) + split = concatenated.split('_') + return [self._unescape_token(t + '_') for t in split if t] + + def subtoken_to_subtoken_string(self, subtoken): + """Subtoken_String (string) corresponding to the given subtoken (id).""" + if (subtoken >= 0 and subtoken < self.vocab_size and + self._all_subtoken_strings[subtoken]): + return self._all_subtoken_strings[subtoken] + else: + return 'ID%d_' % subtoken + + def _escaped_token_to_subtokens(self, escaped_token): + """Converts an escaped token string to a list of subtokens. + + Args: + escaped_token: an escaped token + Returns: + a list of one or more integers. + """ + ret = [] + pos = 0 + while pos < len(escaped_token): + end = len(escaped_token) + while True: + subtoken = self._subtoken_string_to_id.get(escaped_token[pos:end], -1) + if subtoken != -1: + break + end -= 1 + ret.append(subtoken) + pos = end + return ret + + @classmethod + def build_to_target_size(cls, + target_size, + token_counts, + store_filename, + min_val, + max_val, + num_iterations=4): + """Builds a SubwordTextEncoder that has `vocab_size` near `target_size`. + + Uses simple recursive binary search to find a `min_count` value that most + closely matches the `target_size`. + + Args: + target_size: desired vocab_size to approximate. + token_counts: a dictionary of string to int. + store_filename: a string - where to write the vocabulary. + min_val: an integer - lower bound for `min_count`. + max_val: an integer - upper bound for `min_count`. + num_iterations: an integer. how many iterations of refinement. + + Returns: + a SubwordTextEncoder instance. + """ + present_count = (max_val + min_val) // 2 + tf.logging.info('Trying min_count %d' % present_count) + subtokenizer = cls() + subtokenizer.build_from_token_counts(token_counts, store_filename, + present_count, num_iterations) + + if min_val == max_val or subtokenizer.vocab_size == target_size: + return subtokenizer + elif subtokenizer.vocab_size > target_size: + other_subtokenizer = cls.build_to_target_size( + target_size, token_counts, store_filename, present_count + 1, max_val, + num_iterations) + if (abs(other_subtokenizer.vocab_size - target_size) < + abs(subtokenizer.vocab_size - target_size)): + return other_subtokenizer + else: + return subtokenizer + else: + other_subtokenizer = cls.build_to_target_size( + target_size, token_counts, store_filename, min_val, present_count - 1, + num_iterations) + if (abs(other_subtokenizer.vocab_size - target_size) < + abs(subtokenizer.vocab_size - target_size)): + return other_subtokenizer + else: + return subtokenizer + + def build_from_token_counts(self, + token_counts, + store_filename, + min_count, + num_iterations=4): + """Train a SubwordTextEncoder based on a dictionary of word counts. + + Args: + token_counts: a dictionary of string to int. + store_filename: a string - where to write the vocabulary. + min_count: an integer - discard subtokens with lower counts. + num_iterations: an integer. how many iterations of refinement. + """ + # We build iteratively. On each iteration, we segment all the words, + # then count the resulting potential subtokens, keeping the ones + # with high enough counts for our new vocabulary. 
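The segmentation step below relies on _escaped_token_to_subtokens above,
which is a greedy longest-match scan: from each position it tries the
longest remaining slice first and backs off one character at a time. Traced
on a toy vocabulary (hypothetical contents):

    # _subtoken_string_to_id == {"hell": 0, "he": 1, "o_": 2, "l": 3}
    # _escaped_token_to_subtokens("hello_"):
    #   pos=0: "hello_" and "hello" miss, "hell" matches -> id 0, pos -> 4
    #   pos=4: "o_" matches -> id 2
    # returns [0, 2]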
+ for i in xrange(num_iterations): + counts = {} + for token, count in six.iteritems(token_counts): + escaped_token = self._escape_token(token) + # we will count all tails of the escaped_token, starting from boundaries + # determined by our current segmentation. + if i == 0: + starts = list(range(len(escaped_token))) + else: + subtokens = self._escaped_token_to_subtokens(escaped_token) + pos = 0 + starts = [] + for subtoken in subtokens: + starts.append(pos) + pos += len(self.subtoken_to_subtoken_string(subtoken)) + for start in starts: + for end in xrange(start + 1, len(escaped_token) + 1): + subtoken_string = escaped_token[start:end] + counts[subtoken_string] = counts.get(subtoken_string, 0) + count + # array of lists of candidate subtoken strings, by length + len_to_subtoken_strings = [] + for subtoken_string, count in six.iteritems(counts): + if count < min_count or len(subtoken_string) <= 1: + continue + while len(len_to_subtoken_strings) <= len(subtoken_string): + len_to_subtoken_strings.append([]) + len_to_subtoken_strings[len(subtoken_string)].append(subtoken_string) + new_subtoken_strings = [] + # consider the candidates longest to shortest, so that if we accept + # a longer subtoken string, we can decrement the counts of its prefixes. + for subtoken_strings in len_to_subtoken_strings[::-1]: + for subtoken_string in subtoken_strings: + count = counts[subtoken_string] + if count < min_count: + continue + new_subtoken_strings.append((-count, subtoken_string)) + for l in xrange(1, len(subtoken_string)): + counts[subtoken_string[:l]] -= count + # make sure we have all single characters. + new_subtoken_strings.extend([(-counts.get(chr(i), 0), chr(i)) + for i in xrange(2**8)]) + new_subtoken_strings.sort() + self._init_from_list([''] * self._num_reserved_ids + + [p[1] for p in new_subtoken_strings]) + print('vocab_size = %d' % self.vocab_size) + + original = 'This sentence was encoded by the SubwordTextEncoder.' + encoded = self.encode(original) + print(encoded) + print([self.subtoken_to_subtoken_string(s) for s in encoded]) + decoded = self.decode(encoded) + print(decoded) + assert decoded == original + self._store_to_file(store_filename) + + def _init_from_list(self, subtoken_strings): + """Initialize from a list of subtoken strings.""" + self._all_subtoken_strings = subtoken_strings + self._subtoken_string_to_id = {} + for i in xrange(len(subtoken_strings)): + subtoken_string = subtoken_strings[i] + if subtoken_string: + self._subtoken_string_to_id[subtoken_string] = i + + def _load_from_file(self, filename): + """Load from a file.""" + subtoken_strings = [] + with tf.gfile.Open(filename) as f: + for line in f: + subtoken_strings.append(line.strip()[1:-1].decode('string-escape')) + self._init_from_list(subtoken_strings) + + def _store_to_file(self, filename): + with tf.gfile.Open(filename, 'w') as f: + for subtoken_string in self._all_subtoken_strings: + f.write('\'' + subtoken_string.encode('string-escape') + '\'\n') + + def _escape_token(self, token): + r"""Translate '\'->'\\' and '_'->'\u', then append '_'. + + Args: + token: a string + Returns: + escaped_token: a string + """ + return token.replace('\\', '\\\\').replace('_', '\\u') + '_' + + def _unescape_token(self, escaped_token): + r"""Remove '_' from end, then translate '\\'->'\' and '\u'->'_'. + + TODO(noam): There must be some better way to do this with regexps. 
+ + Args: + escaped_token: a string + Returns: + token: a string + """ + assert escaped_token[-1] == '_' + escaped_token = escaped_token[:-1] + if '\\' not in escaped_token: + return escaped_token + ret = '' + pos = 0 + while pos < len(escaped_token): + if escaped_token[pos] == '\\' and pos + 1 < len(escaped_token): + if escaped_token[pos + 1] == 'u': + ret += '_' + else: + ret += escaped_token[pos + 1] + pos += 1 + pos += 1 + return ret + + @classmethod + def get_token_counts(cls, text_filepattern, corpus_max_lines): + """Read the corpus and compute a dictionary of word counts.""" + tok = tokenizer.Tokenizer() + token_counts = {} + lines_read = 0 + filenames = tf.gfile.Glob(text_filepattern) + for text_filename in filenames: + with tf.gfile.Open(text_filename) as f: + for line in f: + tokens = tok.encode(line.strip()) + for t in tokens: + token_counts[t] = token_counts.get(t, 0) + 1 + lines_read += 1 + if corpus_max_lines > 0 and lines_read > corpus_max_lines: + return token_counts + return token_counts diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py new file mode 100644 index 000000000..ee71af9f6 --- /dev/null +++ b/tensor2tensor/data_generators/text_encoder_build_subword.py @@ -0,0 +1,67 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Program to build a SubwordTextEncoder. + +The flags --min_count and --corpus_max_lines will affect the size of the +vocabulary. Try changing these flags until you get a vocabulary +of the size you want. 
+ +Example usage: + +python data_generators/text_encoder_build_subword.py \ + --corpus_filepattern=$LM1B_DIR/train-unk-* \ + --corpus_max_lines=17500 \ + --output_fn=$DATA_DIR/lm1b16k.subword_text_encoder \ + --logtostderr + +python data_generators/text_encoder_build_subword.py \ + --corpus_filepattern=$LM1B_DIR/train-unk-* \ + --corpus_max_lines=270000 \ + --output_fn=$DATA_DIR/lm1b64k.subword_text_encoder \ + --logtostderr +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.data_generators import text_encoder + +import tensorflow as tf + +tf.app.flags.DEFINE_string('output_fn', '/tmp/my.subword_text_encoder', + 'where to store the SubwordTextEncoder') +tf.app.flags.DEFINE_string('corpus_filepattern', '', + 'Corpus of one or more text files') +tf.app.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus') +tf.app.flags.DEFINE_integer('corpus_max_lines', 10000, + 'How many lines of corpus to read') +tf.app.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations') +FLAGS = tf.app.flags.FLAGS + + +def main(unused_argv): + gs = text_encoder.SubwordTextEncoder() + if not FLAGS.corpus_filepattern: + raise ValueError('Must provide --corpus_filepattern') + token_counts = text_encoder.SubwordTextEncoder.get_token_counts( + FLAGS.corpus_filepattern, FLAGS.corpus_max_lines) + gs.build_from_token_counts(token_counts, FLAGS.output_fn, FLAGS.min_count, + FLAGS.num_iterations) + + +if __name__ == '__main__': + tf.app.run() diff --git a/tensor2tensor/data_generators/text_encoder_inspect_subword.py b/tensor2tensor/data_generators/text_encoder_inspect_subword.py new file mode 100644 index 000000000..0ad9a2701 --- /dev/null +++ b/tensor2tensor/data_generators/text_encoder_inspect_subword.py @@ -0,0 +1,64 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Inspect a TFRecord file of tensorflow.Example and show tokenizations. 
+
+python data_generators/text_encoder_inspect_subword.py \
+  --logtostderr \
+  --vocab_file=$DATA_DIR/tokens.vocab.8192 \
+  --in_file=$DATA_DIR/wmt_ende_tokens_8k-train-00000-of-00100
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from tensor2tensor.data_generators import text_encoder
+
+import tensorflow as tf
+
+tf.app.flags.DEFINE_string("vocab_file", "",
+                           "SubwordTextEncoder vocabulary file")
+
+tf.app.flags.DEFINE_string("in_file", "", "input filename")
+
+FLAGS = tf.app.flags.FLAGS
+
+
+def ShowSequence(subtokenizer, subtokens, label):
+  print("%s decoded = %s" % (label, subtokenizer.decode(subtokens)))
+  print("%s subtoken ids = %s" % (label, subtokens))
+  print("%s subtoken strings = %s" %
+        (label,
+         [subtokenizer.subtoken_to_subtoken_string(s) for s in subtokens]))
+  print("")
+
+
+def main(_):
+  """Inspect a TFRecord file of examples and show their tokenizations."""
+  subtokenizer = text_encoder.SubwordTextEncoder(FLAGS.vocab_file)
+  reader = tf.python_io.tf_record_iterator(FLAGS.in_file)
+  for record in reader:
+    x = tf.train.Example()
+    x.ParseFromString(record)
+    inputs = [int(i) for i in x.features.feature["inputs"].int64_list.value]
+    targets = [int(i) for i in x.features.feature["targets"].int64_list.value]
+    ShowSequence(subtokenizer, inputs, "inputs")
+    ShowSequence(subtokenizer, targets, "targets")
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py
new file mode 100644
index 000000000..15b199907
--- /dev/null
+++ b/tensor2tensor/data_generators/tokenizer.py
@@ -0,0 +1,117 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A simple invertible tokenizer.
+
+Converts from a raw string to a list of tokens (strings).
+
+This tokenizer has the following desirable properties:
+ - It is invertible.
+ - Punctuation is broken away from adjacent letters.
+ - A single space between words does not produce an extra token.
+
+The tokenization algorithm is as follows:
+
+0. We classify the 256 characters into "word characters" and
+   "separator characters".  Separator characters are defined as the union of
+   string.punctuation and string.whitespace.  All other characters are
+   "word characters".
+
+1. Split the text into a list of tokens, splitting at every boundary between
+   a "word character" and a "separator character".  This produces a list
+   which alternates between "word tokens" (strings of word characters) and
+   "separator tokens" (strings of separator characters).
+
+2. Remove every token consisting of a single space, unless it is
+   the very first or very last token in the list.  Such a token is
+   implied by the fact that there are two adjacent word tokens.
+
+e.g.  "Dude - that's so cool."
+  -> ["Dude", " - ", "that", "'", "s", "so", "cool", "."]
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import array
+import string
+
+# Dependency imports
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+
+class Tokenizer(object):
+  """Invertible tokenizer that splits text into word and separator tokens."""
+
+  def __init__(self):
+    self._separator_chars = string.punctuation + string.whitespace
+    self._separator_char_mask = array.array(
+        "l", [chr(i) in self._separator_chars for i in xrange(256)])
+    self.token_counts = dict()
+
+  def _increment_token_count(self, token):
+    if token in self.token_counts:
+      self.token_counts[token] += 1
+    else:
+      self.token_counts[token] = 1
+
+  def encode(self, raw_text):
+    """Encode a raw string as a list of tokens.
+
+    Args:
+      raw_text: a string
+    Returns:
+      a list of strings.
+    """
+    if not raw_text:
+      return []
+    ret = []
+    token_start = 0
+    for pos in xrange(1, len(raw_text)):
+      if (self._is_separator_char(raw_text[pos]) !=
+          self._is_separator_char(raw_text[pos - 1])):
+        token = raw_text[token_start:pos]
+        if token != " " or token_start == 0:
+          ret.append(token)
+          self._increment_token_count(token)
+        token_start = pos
+    final_token = raw_text[token_start:]
+    ret.append(final_token)
+    self._increment_token_count(final_token)
+    return ret
+
+  def decode(self, tokens):
+    """Decode a list of tokens to a string.
+
+    Args:
+      tokens: a list of strings
+    Returns:
+      a string.
+    """
+    ret = ""
+    for i, token in enumerate(tokens):
+      if (i > 0 and self._is_word_char(tokens[i - 1][0]) and
+          self._is_word_char(token[0])):
+        ret += " "
+      ret += token
+    return ret
+
+  def _is_separator_char(self, c):
+    return self._separator_char_mask[ord(c)]
+
+  def _is_word_char(self, c):
+    return not self._is_separator_char(c)
diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py
new file mode 100644
index 000000000..4102051e6
--- /dev/null
+++ b/tensor2tensor/data_generators/tokenizer_test.py
@@ -0,0 +1,64 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
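A quick illustration of the single-space rule from the tokenizer docstring, mirroring what the tests below verify (runnable sketch):

```python
from tensor2tensor.data_generators import tokenizer

t = tokenizer.Tokenizer()
assert t.encode("hello world") == ["hello", "world"]      # single space dropped
assert t.decode(["hello", "world"]) == "hello world"      # and restored
assert t.encode("double  space") == ["double", "  ", "space"]  # longer runs kept
```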
+ +# coding=utf-8 +"""Tests for tensor2tensor.data_generators.tokenizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import random + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +from tensor2tensor.data_generators import tokenizer + +import tensorflow as tf + + +class TokenizerTest(tf.test.TestCase): + + def testEncode(self): + t = tokenizer.Tokenizer() + self.assertEqual( + t.encode("Dude - that's so cool."), + ["Dude", " - ", "that", "'", "s", "so", "cool", "."]) + self.assertEqual( + t.encode("Łukasz est né en 1981."), + ["Łukasz", "est", "né", "en", "1981", "."]) + self.assertEqual( + t.encode(" Spaces at the ends "), + [" ", "Spaces", "at", "the", "ends", " "]) + self.assertEqual(t.encode("802.11b"), ["802", ".", "11b"]) + self.assertEqual(t.encode("two. \nlines"), ["two", ". \n", "lines"]) + + def testDecode(self): + t = tokenizer.Tokenizer() + self.assertEqual( + t.decode(["Dude", " - ", "that", "'", "s", "so", "cool", "."]), + "Dude - that's so cool.") + + def testInvertibilityOnRandomStrings(self): + t = tokenizer.Tokenizer() + random.seed(123) + for _ in xrange(10000): + s = "".join([six.int2byte(random.randint(0, 255)) for _ in xrange(10)]) + self.assertEqual(s, t.decode(t.encode(s))) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py new file mode 100644 index 000000000..4ac669f71 --- /dev/null +++ b/tensor2tensor/data_generators/wmt.py @@ -0,0 +1,269 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data generators for WMT data-sets.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tarfile + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import text_encoder + +import tensorflow as tf + + +def character_generator(source_path, target_path, eos=None): + """Generator for sequence-to-sequence tasks that just uses characters. + + This generator assumes the files at source_path and target_path have + the same number of lines and yields dictionaries of "inputs" and "targets" + where inputs are characters from the source lines converted to integers, + and targets are characters from the target lines, also converted to integers. + + Args: + source_path: path to the file with source sentences. + target_path: path to the file with target sentences. + eos: integer to append at the end of each sequence (default: None). + + Yields: + A dictionary {"inputs": source-line, "targets": target-line} where + the lines are integer lists converted from characters in the file lines. 
+  """
+  eos_list = [] if eos is None else [eos]
+  with tf.gfile.GFile(source_path, mode="r") as source_file:
+    with tf.gfile.GFile(target_path, mode="r") as target_file:
+      source, target = source_file.readline(), target_file.readline()
+      while source and target:
+        source_ints = [ord(c) for c in source.strip()] + eos_list
+        target_ints = [ord(c) for c in target.strip()] + eos_list
+        yield {"inputs": source_ints, "targets": target_ints}
+        source, target = source_file.readline(), target_file.readline()
+
+
+def token_generator(source_path, target_path, token_vocab, eos=None):
+  """Generator for sequence-to-sequence tasks that uses tokens.
+
+  This generator assumes the files at source_path and target_path have
+  the same number of lines and yields dictionaries of "inputs" and "targets"
+  where inputs are token ids from the " "-split source (and target, resp.)
+  lines converted to integers using the token_vocab.
+
+  Args:
+    source_path: path to the file with source sentences.
+    target_path: path to the file with target sentences.
+    token_vocab: text_encoder.TextEncoder object.
+    eos: integer to append at the end of each sequence (default: None).
+
+  Yields:
+    A dictionary {"inputs": source-line, "targets": target-line} where
+    the lines are integer lists converted from tokens in the file lines.
+  """
+  eos_list = [] if eos is None else [eos]
+  with tf.gfile.GFile(source_path, mode="r") as source_file:
+    with tf.gfile.GFile(target_path, mode="r") as target_file:
+      source, target = source_file.readline(), target_file.readline()
+      while source and target:
+        source_ints = token_vocab.encode(source.strip()) + eos_list
+        target_ints = token_vocab.encode(target.strip()) + eos_list
+        yield {"inputs": source_ints, "targets": target_ints}
+        source, target = source_file.readline(), target_file.readline()
+
+
+def _get_wmt_ende_dataset(directory, filename):
+  """Extract the WMT en-de corpus `filename` to directory unless it's there."""
+  train_path = os.path.join(directory, filename)
+  if not (tf.gfile.Exists(train_path + ".de") and
+          tf.gfile.Exists(train_path + ".en")):
+    # We expect that this file has been downloaded from:
+    # https://drive.google.com/open?id=0B_bZck-ksdkpM25jRUN2X2UxMm8 and placed
+    # in `directory`.
+    corpus_file = os.path.join(directory, "wmt16_en_de.tar.gz")
+    with tarfile.open(corpus_file, "r:gz") as corpus_tar:
+      corpus_tar.extractall(directory)
+  return train_path
+
+
+def ende_bpe_token_generator(tmp_dir, train):
+  """Instance of token generator for the WMT en->de task (train or dev set)."""
+  dataset_path = ("train.tok.clean.bpe.32000"
+                  if train else "newstest2013.tok.bpe.32000")
+  train_path = _get_wmt_ende_dataset(tmp_dir, dataset_path)
+  token_path = os.path.join(tmp_dir, "vocab.bpe.32000")
+  token_vocab = text_encoder.TokenTextEncoder(vocab_filename=token_path)
+  return token_generator(train_path + ".en", train_path + ".de", token_vocab, 1)
+
+
+_ENDE_TRAIN_DATASETS = [
+    [
+        "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz",  # pylint: disable=line-too-long
+        ("training-parallel-nc-v11/news-commentary-v11.de-en.en",
+         "training-parallel-nc-v11/news-commentary-v11.de-en.de")
+    ],
+    [
+        "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
+        ("commoncrawl.de-en.en", "commoncrawl.de-en.de")
+    ],
+    [
+        "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
+        ("training/europarl-v7.de-en.en", "training/europarl-v7.de-en.de")
+    ],
+]
+_ENDE_TEST_DATASETS = [
+    [
+        "http://data.statmt.org/wmt16/translation-task/dev.tgz",
+        ("dev/newstest2013.en", "dev/newstest2013.de")
+    ],
+]
+
+_ENFR_TRAIN_DATASETS = [
+    [
+        "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
+        ("commoncrawl.fr-en.en", "commoncrawl.fr-en.fr")
+    ],
+    [
+        "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
+        ("training/europarl-v7.fr-en.en", "training/europarl-v7.fr-en.fr")
+    ],
+    [
+        "http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz",
+        ("training/news-commentary-v9.fr-en.en",
+         "training/news-commentary-v9.fr-en.fr")
+    ],
+    [
+        "http://www.statmt.org/wmt10/training-giga-fren.tar",
+        ("giga-fren.release2.fixed.en.gz", "giga-fren.release2.fixed.fr.gz")
+    ],
+    [
+        "http://www.statmt.org/wmt13/training-parallel-un.tgz",
+        ("un/undoc.2000.fr-en.en", "un/undoc.2000.fr-en.fr")
+    ],
+]
+_ENFR_TEST_DATASETS = [
+    [
+        "http://data.statmt.org/wmt16/translation-task/dev.tgz",
+        ("dev/newstest2013.en", "dev/newstest2013.fr")
+    ],
+]
+
+
+def _compile_data(tmp_dir, datasets, filename):
+  """Concatenate all `datasets` and save to `filename`."""
+  filename = os.path.join(tmp_dir, filename)
+  lang1_lines, lang2_lines = [], []
+  for dataset in datasets:
+    url = dataset[0]
+    compressed_filename = os.path.basename(url)
+    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
+
+    lang1_filename, lang2_filename = dataset[1]
+    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
+    lang2_filepath = os.path.join(tmp_dir, lang2_filename)
+
+    if not os.path.exists(compressed_filepath):
+      generator_utils.maybe_download(tmp_dir, compressed_filename, url)
+    if not os.path.exists(lang1_filepath) or not os.path.exists(lang2_filepath):
+      mode = "r:gz" if "gz" in compressed_filepath else "r"
+      with tarfile.open(compressed_filepath, mode) as corpus_tar:
+        corpus_tar.extractall(tmp_dir)
+    # Note: str.strip(".gz") would remove *characters* rather than the suffix
+    # and mangles names like "giga-fren.release2.fixed.en.gz"; slice instead.
+    if lang1_filepath.endswith(".gz"):
+      new_filepath = lang1_filepath[:-len(".gz")]
+      generator_utils.gunzip_file(lang1_filepath, new_filepath)
+      lang1_filepath = new_filepath
+    if lang2_filepath.endswith(".gz"):
+      new_filepath = lang2_filepath[:-len(".gz")]
+      generator_utils.gunzip_file(lang2_filepath, new_filepath)
+      lang2_filepath = new_filepath
+    with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
+      with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
+        lang1_file_lines = lang1_file.readlines()
+        lang2_file_lines = lang2_file.readlines()
+        assert len(lang1_file_lines) == len(lang2_file_lines), lang1_filepath
+        lang1_lines.extend(lang1_file_lines)
+        lang2_lines.extend(lang2_file_lines)
+
+  write_chunk_size = 10000
+  assert len(lang1_lines) == len(lang2_lines)
+  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_file:
+    i = 0
+    while i * write_chunk_size < len(lang1_lines):
+      lang1_file.writelines(
+          lang1_lines[i * write_chunk_size:(i + 1) * write_chunk_size])
+      i += 1
+  with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_file:
+    i = 0
+    while i * write_chunk_size < len(lang2_lines):
+      lang2_file.writelines(
+          lang2_lines[i * write_chunk_size:(i + 1) * write_chunk_size])
+      i += 1
+  return filename
+
+
+def ende_wordpiece_token_generator(tmp_dir, train, vocab_size):
+  symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
+  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag)
+  return token_generator(data_path + ".lang1", data_path + ".lang2",
+                         symbolizer_vocab, 1)
+
+
+def ende_character_generator(tmp_dir, train):
+  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag)
+  return character_generator(data_path + ".lang1", data_path + ".lang2", 1)
+
+
+def enfr_wordpiece_token_generator(tmp_dir, train, vocab_size):
+  """Instance of token generator for the WMT en->fr task."""
+  symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
+  datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag)
+  return token_generator(data_path + ".lang1", data_path + ".lang2",
+                         symbolizer_vocab, 1)
+
+
+def enfr_character_generator(tmp_dir, train):
+  """Instance of character generator for the WMT en->fr task."""
+  datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag)
+  return character_generator(data_path + ".lang1", data_path + ".lang2", 1)
+
+
+def parsing_character_generator(tmp_dir, train):
+  filename = "parsing_%s" % ("train" if train else "dev")
+  text_filepath = os.path.join(tmp_dir, filename + ".text")
+  tags_filepath = os.path.join(tmp_dir, filename + ".tags")
+  return character_generator(text_filepath, tags_filepath, 1)
+
+
+def parsing_token_generator(tmp_dir, train, vocab_size):
+  symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
+  filename = "parsing_%s" % ("train" if train else "dev")
+  text_filepath = os.path.join(tmp_dir, filename + ".text")
+  tags_filepath = os.path.join(tmp_dir, filename + ".tags")
+  return token_generator(text_filepath, tags_filepath, symbolizer_vocab, 1)
diff --git a/tensor2tensor/data_generators/wmt_test.py b/tensor2tensor/data_generators/wmt_test.py
new file mode 100644
index 000000000..7121e3d8a
--- /dev/null
+++ b/tensor2tensor/data_generators/wmt_test.py
@@ -0,0 +1,72 @@
+# Copyright 2017 Google Inc.
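An aside on the `.gz` handling in `_compile_data` above: `str.strip(".gz")` removes characters, not a suffix, which is why the fixed code slices instead. A runnable sketch:

```python
def strip_gz_suffix(path):
  # Intended behavior: drop a literal ".gz" suffix only.
  return path[:-len(".gz")] if path.endswith(".gz") else path

assert strip_gz_suffix("giga-fren.release2.fixed.en.gz") == (
    "giga-fren.release2.fixed.en")
# str.strip removes leading/trailing characters from the set {".", "g", "z"}:
assert "giga-fren.release2.fixed.en.gz".strip(".gz") == (
    "iga-fren.release2.fixed.en")
```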
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""WMT generators test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import io +import os +import tempfile + +# Dependency imports + +import six +from tensor2tensor.data_generators import wmt + +import tensorflow as tf + + +class WMTTest(tf.test.TestCase): + + def testCharacterGenerator(self): + # Generate a trivial source and target file. + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + with io.open(tmp_file_path + ".src", "wb") as src_file: + src_file.write("source1\n") + src_file.write("source2\n") + with io.open(tmp_file_path + ".tgt", "wb") as tgt_file: + tgt_file.write("target1\n") + tgt_file.write("target2\n") + + # Call character generator on the generated files. + results_src, results_tgt = [], [] + for dictionary in wmt.character_generator(tmp_file_path + ".src", + tmp_file_path + ".tgt"): + self.assertEqual(sorted(list(dictionary)), ["inputs", "targets"]) + results_src.append(dictionary["inputs"]) + results_tgt.append(dictionary["targets"]) + + # Check that the results match the files. + self.assertEqual(len(results_src), 2) + self.assertEqual("".join([six.int2byte(i) + for i in results_src[0]]), "source1") + self.assertEqual("".join([six.int2byte(i) + for i in results_src[1]]), "source2") + self.assertEqual("".join([six.int2byte(i) + for i in results_tgt[0]]), "target1") + self.assertEqual("".join([six.int2byte(i) + for i in results_tgt[1]]), "target2") + + # Clean up. + os.remove(tmp_file_path + ".src") + os.remove(tmp_file_path + ".tgt") + os.remove(tmp_file_path) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/wsj_parsing.py b/tensor2tensor/data_generators/wsj_parsing.py new file mode 100644 index 000000000..a2dda4d9d --- /dev/null +++ b/tensor2tensor/data_generators/wsj_parsing.py @@ -0,0 +1,109 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data generators for parsing data-sets.""" + +import os + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils + +import tensorflow as tf + + +def words_and_tags_from_wsj_tree(tree_string): + """Generates linearized trees and tokens from the wsj tree format. + + It uses the linearized algorithm described in https://arxiv.org/abs/1412.7449. 
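A worked example of the linearization that `words_and_tags_from_wsj_tree` (continued below) performs; the toy tree here is illustrative only:

```python
from tensor2tensor.data_generators import wsj_parsing

tree = "(TOP (S (NP (DT the) (NN cat)) (VP (VBZ sleeps))))"
words, tags = wsj_parsing.words_and_tags_from_wsj_tree(tree)
assert words == "the cat sleeps"
assert tags == "S NP DT NN /NP VP VBZ /VP /S"  # "TOP" is stripped.
```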
+
+  Args:
+    tree_string: tree in wsj format
+
+  Returns:
+    tuple: (words, linearized tree)
+  """
+  stack, tags, words = [], [], []
+  for tok in tree_string.strip().split():
+    if tok[0] == "(":
+      symbol = tok[1:]
+      tags.append(symbol)
+      stack.append(symbol)
+    else:
+      assert tok[-1] == ")"
+      stack.pop()  # Pop the POS-tag.
+      while tok[-2] == ")":
+        tags.append("/" + stack.pop())
+        tok = tok[:-1]
+      words.append(tok[:-1])
+  return " ".join(words), " ".join(tags[1:-1])  # Strip "TOP" tag.
+
+
+def token_generator(tree_path, source_token_vocab, target_token_vocab,
+                    eos=None):
+  """Generator for parsing as a sequence-to-sequence task that uses tokens.
+
+  This generator assumes the file at tree_path contains one tree per line
+  and yields dictionaries of "inputs" and "targets", where the words form
+  the inputs and the linearized tags form the targets, each converted to
+  integer ids using the respective token vocabulary.
+
+  Args:
+    tree_path: path to the file with wsj format trees, one per line.
+    source_token_vocab: text_encoder.TextEncoder object for the source (word)
+      vocabulary.
+    target_token_vocab: text_encoder.TextEncoder object for the target (tag)
+      vocabulary.
+    eos: integer to append at the end of each sequence (default: None).
+
+  Yields:
+    A dictionary {"inputs": source-line, "targets": target-line} where
+    the lines are integer lists converted from tokens in the file lines.
+  """
+  eos_list = [] if eos is None else [eos]
+  with tf.gfile.GFile(tree_path, mode="r") as tree_file:
+    tree_line = tree_file.readline()
+    while tree_line:
+      source, target = words_and_tags_from_wsj_tree(tree_line)
+      source_ints = source_token_vocab.encode(source.strip()) + eos_list
+      target_ints = target_token_vocab.encode(target.strip()) + eos_list
+      yield {"inputs": source_ints, "targets": target_ints}
+      tree_line = tree_file.readline()
+
+
+def parsing_token_generator(tmp_dir, train, source_vocab_size,
+                            target_vocab_size):
+  """Generator for parsing as a sequence-to-sequence task that uses tokens.
+
+  This generator assumes the files parsing_{train,dev}.trees, which contain
+  trees in wsj format, and the vocabulary files
+  wsj_{source,target}.tokens.vocab.<vocab_size> exist in tmp_dir.
+
+  Args:
+    tmp_dir: path to the directory with the data files.
+    train: whether to use the training set (True) or the dev set (False).
+    source_vocab_size: source vocab size.
+    target_vocab_size: target vocab size.
+
+  Returns:
+    A generator to a dictionary of inputs and outputs.
+  """
+  source_symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "wsj_source.tokens.vocab.%d" % source_vocab_size,
+      source_vocab_size)
+  target_symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "wsj_target.tokens.vocab.%d" % target_vocab_size,
+      target_vocab_size)
+  filename = "parsing_%s.trees" % ("train" if train else "dev")
+  tree_filepath = os.path.join(tmp_dir, filename)
+  return token_generator(tree_filepath, source_symbolizer_vocab,
+                         target_symbolizer_vocab, 1)
diff --git a/tensor2tensor/models/README.md b/tensor2tensor/models/README.md
new file mode 100644
index 000000000..29b88484f
--- /dev/null
+++ b/tensor2tensor/models/README.md
@@ -0,0 +1,16 @@
+# Constructing T2T Models.
+
+This directory contains T2T models, their hyperparameters, and a number
+of common layers and hyperparameter settings to help construct new models.
+Common building blocks are in `common_layers.py` and `common_attention.py`.
+Common hyperparameters are in `common_hparams.py`.  Models are imported in
+`models.py`.
+
+## Adding a new model.
+
+To add a model to the built-in set, create a new file (see, e.g.,
+`neural_gpu.py`), write your model class there inheriting from `T2TModel`,
+and decorate it with `registry.register_model`.  Import it in `models.py`.
+
+It is then available to the trainer binary (`t2t-trainer`) via the
+`--model=model_name` flag.
diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py
new file mode 100644
index 000000000..27d533abc
--- /dev/null
+++ b/tensor2tensor/models/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py
new file mode 100644
index 000000000..581cd767f
--- /dev/null
+++ b/tensor2tensor/models/attention_lm.py
@@ -0,0 +1,169 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Self-attention based language model.
+
+Like transformer.py, but no encoder.
+
+decoder: [Self-Attention, Feed-forward] x n
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+# Dependency imports
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensor2tensor.models import common_attention
+from tensor2tensor.models import common_hparams
+from tensor2tensor.models import common_layers
+from tensor2tensor.utils import registry
+from tensor2tensor.utils import t2t_model
+
+import tensorflow as tf
+
+
+@registry.register_model
+class AttentionLM(t2t_model.T2TModel):
+  """Attention net.  See file docstring."""
+
+  def model_fn_body(self, features, train):
+    # Remove dropout if not training.
+    hparams = copy.copy(self._hparams)
+    if not train:
+      hparams.attention_dropout = 0.
+      hparams.relu_dropout = 0.
+      hparams.residual_dropout = 0.
+    targets = features["targets"]
+    targets = tf.squeeze(targets, 2)
+
+    (decoder_input, decoder_self_attention_bias) = attention_lm_prepare_decoder(
+        targets, hparams)
+
+    def residual_fn(x, y):
+      return common_layers.layer_norm(x + tf.nn.dropout(
+          y, 1.0 - hparams.residual_dropout))
+
+    decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout)
+    decoder_output = attention_lm_decoder(
+        decoder_input, residual_fn, decoder_self_attention_bias, hparams)
+    decoder_output = tf.expand_dims(decoder_output, 2)
+
+    return decoder_output
+
+
+def attention_lm_prepare_decoder(targets, hparams):
+  """Prepare one shard of the model for the decoder.
+
+  Args:
+    targets: a Tensor.
+    hparams: run hyperparameters
+
+  Returns:
+    decoder_input: a Tensor, bottom of decoder stack
+    decoder_self_attention_bias: a Tensor, containing large negative values
+      to implement masked attention and possibly biases for diagonal
+      alignments
+  """
+  decoder_self_attention_bias = (
+      common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
+  decoder_input = common_layers.shift_left_3d(targets)
+  if hparams.pos == "timing":
+    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
+  return (decoder_input, decoder_self_attention_bias)
+
+
+def attention_lm_decoder(decoder_input,
+                         residual_fn,
+                         decoder_self_attention_bias,
+                         hparams,
+                         name="decoder"):
+  """A stack of attention_lm layers.
+
+  Args:
+    decoder_input: a Tensor
+    residual_fn: a function from (layer_input, layer_output) -> combined_output
+    decoder_self_attention_bias: bias Tensor for self-attention
+      (see common_attention.attention_bias())
+    hparams: hyperparameters for model
+    name: a string
+
+  Returns:
+    y: a Tensor
+  """
+  x = decoder_input
+  # Summaries don't work in multi-problem setting yet.
+  summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
+  with tf.variable_scope(name):
+    for layer in xrange(hparams.num_hidden_layers):
+      with tf.variable_scope("layer_%d" % layer):
+        x = residual_fn(
+            x,
+            common_attention.multihead_attention(
+                x,
+                None,
+                decoder_self_attention_bias,
+                hparams.attention_key_channels or hparams.hidden_size,
+                hparams.attention_value_channels or hparams.hidden_size,
+                hparams.hidden_size,
+                hparams.num_heads,
+                hparams.attention_dropout,
+                summaries=summaries,
+                name="decoder_self_attention"))
+        x = residual_fn(x,
+                        common_layers.conv_hidden_relu(
+                            x,
+                            hparams.filter_size,
+                            hparams.hidden_size,
+                            dropout=hparams.relu_dropout))
+  return x
+
+
+@registry.register_hparams
+def attention_lm_base():
+  """Set of hyperparameters."""
+  hparams = common_hparams.basic_params1()
+  hparams.hidden_size = 1024
+  hparams.batch_size = 8192
+  hparams.max_length = 256
+  hparams.dropout = 0.0
+  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
+  hparams.optimizer_adam_epsilon = 1e-9
+  hparams.learning_rate_decay_scheme = "noam"
+  hparams.learning_rate = 1.0
+  hparams.learning_rate_warmup_steps = 1000
+  hparams.initializer_gain = 1.0
+  hparams.num_hidden_layers = 6
+  hparams.initializer = "uniform_unit_scaling"
+  hparams.weight_decay = 0.0
+  hparams.optimizer_adam_beta1 = 0.9
+  hparams.optimizer_adam_beta2 = 0.98
+  hparams.num_sampled_classes = 0
+  hparams.label_smoothing = 0.1
+  hparams.shared_embedding_and_softmax_weights = int(False)
+
+  hparams.add_hparam("filter_size", 4096)  # Add new ones like this.
+  # attention-related flags
+  hparams.add_hparam("num_heads", 8)
+  hparams.add_hparam("attention_key_channels", 0)
+  hparams.add_hparam("attention_value_channels", 0)
+  hparams.add_hparam("attention_dropout", 0.0)
+  hparams.add_hparam("relu_dropout", 0.0)
+  hparams.add_hparam("pos", "timing")  # timing, none
+  hparams.add_hparam("residual_dropout", 0.1)
+  return hparams
diff --git a/tensor2tensor/models/baseline.py b/tensor2tensor/models/baseline.py
new file mode 100644
index 000000000..78f79eed0
--- /dev/null
+++ b/tensor2tensor/models/baseline.py
@@ -0,0 +1,72 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Baseline models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def lstm(inputs, hparams, train, name, initial_state=None): + """Run LSTM cell on inputs, assuming they are [batch x time x size].""" + + def dropout_lstm_cell(): + return tf.contrib.rnn.DropoutWrapper( + tf.contrib.rnn.BasicLSTMCell(hparams.hidden_size), + input_keep_prob=1.0 - hparams.dropout * tf.to_float(train)) + + layers = [dropout_lstm_cell() for _ in range(hparams.num_hidden_layers)] + with tf.variable_scope(name): + return tf.nn.dynamic_rnn( + tf.contrib.rnn.MultiRNNCell(layers), + inputs, + initial_state=initial_state, + dtype=tf.float32, + time_major=False) + + +def lstm_seq2seq_internal(inputs, targets, hparams, train): + """The basic LSTM seq2seq model, main step used for training.""" + with tf.variable_scope("lstm_seq2seq"): + # Flatten inputs. + inputs = common_layers.flatten4d3d(inputs) + # LSTM encoder. + _, final_encoder_state = lstm( + tf.reverse(inputs, axis=[1]), hparams, train, "encoder") + # LSTM decoder. + shifted_targets = common_layers.shift_left(targets) + decoder_outputs, _ = lstm( + common_layers.flatten4d3d(shifted_targets), + hparams, + train, + "decoder", + initial_state=final_encoder_state) + return tf.expand_dims(decoder_outputs, axis=2) + + +@registry.register_model("baseline_lstm_seq2seq") +class LSTMSeq2Seq(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return lstm_seq2seq_internal(features["inputs"], features["targets"], + self._hparams, train) diff --git a/tensor2tensor/models/baseline_test.py b/tensor2tensor/models/baseline_test.py new file mode 100644 index 000000000..25e191d6f --- /dev/null +++ b/tensor2tensor/models/baseline_test.py @@ -0,0 +1,55 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
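The `register_model` pattern above (and in models/README.md) can be sketched with a deliberately trivial model; the class and its body below are hypothetical, not part of the patch:

```python
from tensor2tensor.models import common_layers
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model

import tensorflow as tf


@registry.register_model
class ToyIdentity(t2t_model.T2TModel):
  """Sketch: projects the embedded inputs and returns them as the body output."""

  def model_fn_body(self, features, train):
    del train  # Unused in this sketch.
    inputs = common_layers.flatten4d3d(features["inputs"])
    outputs = tf.layers.dense(inputs, self._hparams.hidden_size)
    return tf.expand_dims(outputs, axis=2)
```

Assuming the registry's default snake-casing of class names, this would be selectable as `--model=toy_identity`; baseline.py sidesteps the question by passing an explicit name to the decorator.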
+ +"""Baseline models tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import baseline +from tensor2tensor.models import common_hparams + +import tensorflow as tf + + +class BaselineTest(tf.test.TestCase): + + def testLSTMSeq2Seq(self): + vocab_size = 9 + x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1)) + y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1)) + hparams = common_hparams.basic_params1() + p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size, + vocab_size) + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + } + model = baseline.LSTMSeq2Seq(hparams, p_hparams) + sharded_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py new file mode 100644 index 000000000..42db05700 --- /dev/null +++ b/tensor2tensor/models/bytenet.py @@ -0,0 +1,112 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ByteNet.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def residual_dilated_conv(x, repeat, padding, name, hparams, train): + """A stack of convolution blocks with residual connections.""" + with tf.variable_scope(name): + k = (hparams.kernel_height, hparams.kernel_width) + dilations_and_kernels = [((2**i, 1), k) + for i in xrange(hparams.num_hidden_layers)] + for i in xrange(repeat): + with tf.variable_scope("repeat_%d" % i): + y = common_layers.conv_block( + x, + hparams.hidden_size, + dilations_and_kernels, + padding=padding, + name="residual_conv") + x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") + x = tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + return x + + +def bytenet_internal(inputs, targets, hparams, train): + """ByteNet, main step used for training.""" + with tf.variable_scope("bytenet"): + # Flatten inputs and extend length by 50%. 
+ inputs = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2) + extend_length = tf.to_int32(0.5 * tf.to_float(tf.shape(inputs)[1])) + inputs_shape = inputs.shape.as_list() + inputs = tf.pad(inputs, [[0, 0], [0, extend_length], [0, 0], [0, 0]]) + inputs_shape[1] = None + inputs.set_shape(inputs_shape) # Don't lose the other shapes when padding. + # Pad inputs and targets to be the same length, divisible by 50. + inputs, targets = common_layers.pad_to_same_length( + inputs, targets, final_length_divisible_by=50) + final_encoder = residual_dilated_conv( + inputs, hparams.num_block_repeat, "SAME", "encoder", hparams, train) + + shifted_targets = common_layers.shift_left(targets) + kernel = (hparams.kernel_height, hparams.kernel_width) + decoder_start = common_layers.conv_block( + tf.concat([final_encoder, shifted_targets], axis=3), + hparams.hidden_size, [((1, 1), kernel)], + padding="LEFT") + + return residual_dilated_conv( + decoder_start, hparams.num_block_repeat, + "LEFT", "decoder", hparams, train) + + +@registry.register_model +class ByteNet(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return bytenet_internal(features["inputs"], features["targets"], + self._hparams, train) + + +@registry.register_hparams +def bytenet_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.batch_size = 2048 + hparams.hidden_size = 768 + hparams.dropout = 0.2 + hparams.symbol_dropout = 0.2 + hparams.label_smoothing = 0.1 + hparams.clip_grad_norm = 2.0 + hparams.num_hidden_layers = 4 + hparams.kernel_height = 3 + hparams.kernel_width = 1 + hparams.learning_rate_decay_scheme = "exp50k" + hparams.learning_rate = 0.05 + hparams.learning_rate_warmup_steps = 3000 + hparams.initializer_gain = 1.0 + hparams.weight_decay = 3.0 + hparams.num_sampled_classes = 0 + hparams.sampling_method = "argmax" + hparams.optimizer_adam_epsilon = 1e-6 + hparams.optimizer_adam_beta1 = 0.85 + hparams.optimizer_adam_beta2 = 0.997 + hparams.add_hparam("num_block_repeat", 4) + return hparams diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py new file mode 100644 index 000000000..676220cc8 --- /dev/null +++ b/tensor2tensor/models/bytenet_test.py @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
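To make the dilation schedule in `residual_dilated_conv` concrete: with the default `num_hidden_layers=4` and a 3x1 kernel, each repeat stacks convolutions whose height dilation doubles per layer. A runnable sketch of the list it builds:

```python
# Dilation/kernel pairs as built by residual_dilated_conv above, assuming
# num_hidden_layers=4, kernel_height=3, kernel_width=1.
k = (3, 1)
dilations_and_kernels = [((2**i, 1), k) for i in range(4)]
assert dilations_and_kernels == [
    ((1, 1), (3, 1)), ((2, 1), (3, 1)), ((4, 1), (3, 1)), ((8, 1), (3, 1))]
```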
+
+"""ByteNet tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+import numpy as np
+
+from tensor2tensor.data_generators import problem_hparams
+from tensor2tensor.models import bytenet
+
+import tensorflow as tf
+
+
+class ByteNetTest(tf.test.TestCase):
+
+  def testByteNet(self):
+    vocab_size = 9
+    x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1))
+    y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1))
+    hparams = bytenet.bytenet_base()
+    p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size,
+                                                     vocab_size)
+    with self.test_session() as session:
+      features = {
+          "inputs": tf.constant(x, dtype=tf.int32),
+          "targets": tf.constant(y, dtype=tf.int32),
+      }
+      model = bytenet.ByteNet(hparams, p_hparams)
+      sharded_logits, _, _ = model.model_fn(features, True)
+      logits = tf.concat(sharded_logits, 0)
+      session.run(tf.global_variables_initializer())
+      res = session.run(logits)
+      self.assertEqual(res.shape, (3, 50, 1, 1, vocab_size))
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py
new file mode 100644
index 000000000..ccf288a09
--- /dev/null
+++ b/tensor2tensor/models/common_attention.py
@@ -0,0 +1,344 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for attention."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+# Dependency imports
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensor2tensor.models import common_layers
+
+import tensorflow as tf
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+  """Adds a bunch of sinusoids of different frequencies to a Tensor.
+
+  Each channel of the input Tensor is incremented by a sinusoid of a different
+  frequency and phase.
+
+  This allows attention to learn to use absolute and relative positions.
+  Timing signals should be added to some precursors of both the query and the
+  memory inputs to attention.
+
+  The use of relative position is possible because sin(x+y) and cos(x+y) can be
+  expressed in terms of y, sin(x) and cos(x).
+
+  In particular, we use a geometric sequence of timescales starting with
+  min_timescale and ending with max_timescale.  The number of different
+  timescales is equal to channels / 2.  For each timescale, we
+  generate the two sinusoidal signals sin(timestep/timescale) and
+  cos(timestep/timescale).  All of these sinusoids are concatenated in
+  the channels dimension.
+
+  Args:
+    x: a Tensor with shape [batch, length, channels]
+    min_timescale: a float
+    max_timescale: a float
+
+  Returns:
+    a Tensor the same shape as x.
+  """
+  length = tf.shape(x)[1]
+  channels = tf.shape(x)[2]
+  position = tf.to_float(tf.range(length))
+  num_timescales = channels // 2
+  log_timescale_increment = (
+      math.log(float(max_timescale) / float(min_timescale)) /
+      (tf.to_float(num_timescales) - 1))
+  inv_timescales = min_timescale * tf.exp(
+      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
+  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
+  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
+  signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
+  signal = tf.reshape(signal, [1, length, channels])
+  return x + signal
+
+
+def add_timing_signal_nd(x, min_timescale=1.0, max_timescale=1.0e4):
+  """Adds a bunch of sinusoids of different frequencies to a Tensor.
+
+  Each channel of the input Tensor is incremented by a sinusoid of a different
+  frequency and phase in one of the positional dimensions.
+
+  This allows attention to learn to use absolute and relative positions.
+  Timing signals should be added to some precursors of both the query and the
+  memory inputs to attention.
+
+  The use of relative position is possible because sin(a+b) and cos(a+b) can be
+  expressed in terms of b, sin(a) and cos(a).
+
+  x is a Tensor with n "positional" dimensions, e.g. one dimension for a
+  sequence or two dimensions for an image.
+
+  We use a geometric sequence of timescales starting with
+  min_timescale and ending with max_timescale.  The number of different
+  timescales is equal to channels // (n * 2).  For each timescale, we
+  generate the two sinusoidal signals sin(timestep/timescale) and
+  cos(timestep/timescale).  All of these sinusoids are concatenated in
+  the channels dimension.
+
+  Args:
+    x: a Tensor with shape [batch, d1 ... dn, channels]
+    min_timescale: a float
+    max_timescale: a float
+
+  Returns:
+    a Tensor the same shape as x.
+  """
+  static_shape = x.get_shape().as_list()
+  num_dims = len(static_shape) - 2
+  channels = tf.shape(x)[-1]
+  num_timescales = channels // (num_dims * 2)
+  log_timescale_increment = (
+      math.log(float(max_timescale) / float(min_timescale)) /
+      (tf.to_float(num_timescales) - 1))
+  inv_timescales = min_timescale * tf.exp(
+      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
+  for dim in xrange(num_dims):
+    length = tf.shape(x)[dim + 1]
+    position = tf.to_float(tf.range(length))
+    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
+        inv_timescales, 0)
+    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
+    prepad = dim * 2 * num_timescales
+    postpad = channels - (dim + 1) * 2 * num_timescales
+    signal = tf.pad(signal, [[0, 0], [prepad, postpad]])
+    for _ in xrange(1 + dim):
+      signal = tf.expand_dims(signal, 0)
+    for _ in xrange(num_dims - 1 - dim):
+      signal = tf.expand_dims(signal, -2)
+    x += signal
+  return x
+
+
+def embedding_to_padding(emb):
+  """Input embeddings -> is_padding.
+
+  We have hacked symbol_modality to return all-zero embeddings for padding.
+
+  Args:
+    emb: a Tensor with shape [..., depth].
+  Returns:
+    a boolean Tensor with shape [...].
+  """
+  emb_sum = tf.reduce_sum(tf.abs(emb), axis=-1)
+  return tf.equal(emb_sum, 0.0)
+
+
+def attention_bias_lower_triangle(length):
+  """Create a bias tensor to be added to attention logits.
+
+  Args:
+    length: a Scalar.
+
+  Returns:
+    a `Tensor` with shape [1, 1, length, length].
+  """
+  lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
+  ret = -1e9 * (1.0 - lower_triangle)
+  return tf.reshape(ret, [1, 1, length, length])
+
+
+def attention_bias_ignore_padding(memory_padding):
+  """Create a bias tensor to be added to attention logits.
+
+  Args:
+    memory_padding: a boolean `Tensor` with shape [batch, memory_length].
+
+  Returns:
+    a `Tensor` with shape [batch, 1, 1, memory_length].
+  """
+  ret = tf.to_float(memory_padding) * -1e9
+  return tf.expand_dims(tf.expand_dims(ret, 1), 1)
+
+
+def split_last_dimension(x, n):
+  """Reshape x so that the last dimension becomes two dimensions.
+
+  The first of these two dimensions is n.
+
+  Args:
+    x: a Tensor with shape [..., m]
+    n: an integer.
+
+  Returns:
+    a Tensor with shape [..., n, m/n]
+  """
+  old_shape = x.get_shape().dims
+  last = old_shape[-1]
+  new_shape = old_shape[:-1] + [n] + [last // n if last else None]
+  ret = tf.reshape(x, tf.concat([tf.shape(x)[:-1], [n, -1]], 0))
+  ret.set_shape(new_shape)
+  return ret
+
+
+def combine_last_two_dimensions(x):
+  """Reshape x so that the last two dimensions become one.
+
+  Args:
+    x: a Tensor with shape [..., a, b]
+
+  Returns:
+    a Tensor with shape [..., ab]
+  """
+  old_shape = x.get_shape().dims
+  a, b = old_shape[-2:]
+  new_shape = old_shape[:-2] + [a * b if a and b else None]
+  ret = tf.reshape(x, tf.concat([tf.shape(x)[:-2], [-1]], 0))
+  ret.set_shape(new_shape)
+  return ret
+
+
+def split_heads(x, num_heads):
+  """Split channels (dimension 3) into multiple heads (becomes dimension 1).
+
+  Args:
+    x: a Tensor with shape [batch, length, channels]
+    num_heads: an integer
+
+  Returns:
+    a Tensor with shape [batch, num_heads, length, channels / num_heads]
+  """
+  return tf.transpose(split_last_dimension(x, num_heads), [0, 2, 1, 3])
+
+
+def combine_heads(x):
+  """Inverse of split_heads.
+
+  Args:
+    x: a Tensor with shape [batch, num_heads, length, channels / num_heads]
+
+  Returns:
+    a Tensor with shape [batch, length, channels]
+  """
+  return combine_last_two_dimensions(tf.transpose(x, [0, 2, 1, 3]))
+
+
+def attention_image_summary(attn):
+  """Compute color image summary.
+
+  Args:
+    attn: a Tensor with shape [batch, num_heads, query_length, memory_length]
+  """
+  num_heads = attn.get_shape().as_list()[1]
+  # [batch, query_length, memory_length, num_heads]
+  image = tf.transpose(attn, [0, 2, 3, 1])
+  image = tf.pow(image, 0.2)  # for high-dynamic-range
+  # Each head will correspond to one of RGB.
+  # pad the heads to be a multiple of 3
+  image = tf.pad(image, [[0, 0], [0, 0], [0, 0], [0, -num_heads % 3]])
+  image = split_last_dimension(image, 3)
+  image = tf.reduce_max(image, 4)
+  tf.summary.image("attention", image, max_outputs=1)
+
+
+def dot_product_attention(q,
+                          k,
+                          v,
+                          bias,
+                          dropout_rate=0.0,
+                          summaries=False,
+                          name=None):
+  """Dot-product attention.
+
+  Args:
+    q: a Tensor with shape [batch, heads, length_q, depth_k]
+    k: a Tensor with shape [batch, heads, length_kv, depth_k]
+    v: a Tensor with shape [batch, heads, length_kv, depth_v]
+    bias: bias Tensor (see attention_bias())
+    dropout_rate: a floating point number
+    summaries: a boolean
+    name: an optional string
+
+  Returns:
+    A Tensor.
+ """ + with tf.variable_scope( + name, default_name="dot_product_attention", values=[q, k, v]): + # [batch, num_heads, query_length, memory_length] + logits = tf.matmul(q, k, transpose_b=True) + if bias is not None: + logits += bias + weights = tf.nn.softmax(logits, name="attention_weights") + # dropping out the attention links for each of the heads + weights = tf.nn.dropout(weights, 1.0 - dropout_rate) + if summaries and not tf.get_variable_scope().reuse: + attention_image_summary(weights) + return tf.matmul(weights, v) + + +def multihead_attention(query_antecedent, + memory_antecedent, + bias, + total_key_depth, + total_value_depth, + output_depth, + num_heads, + dropout_rate, + summaries=False, + name=None): + """Multihead scaled-dot-product attention with input/output transformations. + + Args: + query_antecedent: a Tensor with shape [batch, length_q, channels] + memory_antecedent: a Tensor with shape [batch, length_m, channels] + bias: bias Tensor (see attention_bias()) + total_key_depth: an integer + total_value_depth: an integer + output_depth: an integer + num_heads: an integer dividing total_key_depth and total_value_depth + dropout_rate: a floating point number + summaries: a boolean + name: an optional string + + Returns: + A Tensor. + """ + with tf.variable_scope( + name, + default_name="multihead_attention", + values=[query_antecedent, memory_antecedent]): + if memory_antecedent is None: + # self attention + combined = common_layers.conv1d( + query_antecedent, + total_key_depth * 2 + total_value_depth, + 1, + name="qkv_transform") + q, k, v = tf.split( + combined, [total_key_depth, total_key_depth, total_value_depth], + axis=2) + else: + q = common_layers.conv1d( + query_antecedent, total_key_depth, 1, name="q_transform") + combined = common_layers.conv1d( + memory_antecedent, + total_key_depth + total_value_depth, + 1, + name="kv_transform") + k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2) + q = split_heads(q, num_heads) + k = split_heads(k, num_heads) + v = split_heads(v, num_heads) + key_depth_per_head = total_key_depth // num_heads + q *= key_depth_per_head**-0.5 + x = dot_product_attention(q, k, v, bias, dropout_rate, summaries) + x = combine_heads(x) + x = common_layers.conv1d(x, output_depth, 1, name="output_transform") + return x diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py new file mode 100644 index 000000000..81c41dcc5 --- /dev/null +++ b/tensor2tensor/models/common_hparams.py @@ -0,0 +1,193 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
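As a shape sanity-check for the attention utilities above, a small masked self-attention sketch (sizes arbitrary, dropout disabled):

```python
import tensorflow as tf

from tensor2tensor.models import common_attention

# Masked self-attention over a [batch=2, length=7, channels=64] input.
x = tf.random_normal([2, 7, 64])
bias = common_attention.attention_bias_lower_triangle(7)
y = common_attention.multihead_attention(
    x, None, bias,
    total_key_depth=64, total_value_depth=64, output_depth=64,
    num_heads=4, dropout_rate=0.0)
print(y.get_shape())  # (2, 7, 64)
```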
+ +"""Hyperparameters and ranges common to multiple models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import six +from six.moves import zip # pylint: disable=redefined-builtin +from tensor2tensor.utils import registry + +import tensorflow as tf + + +@registry.register_hparams("basic1") +def basic_params1(): + """A set of basic hyperparameters.""" + return tf.contrib.training.HParams( + batch_size=4096, # in tokens per batch per gpu + # This flag controls the number of length buckets in the data reader. + # Too many buckets slows down data reading - this needs fixing. + # Too few buckets mean lots of wasted padding. + # If this value is 1, we have buckets with maximum lengths: + # [8, 12, 16, 24, 32, 48 ... (max_length or batch_size)] + # If this value is 2, we have buckets with maximum lengths: + # [8, 10, 12, 14, 16, 20, 24 ... (max_length or batch_size)] + batching_mantissa_bits=1, + num_hidden_layers=4, + kernel_height=3, + kernel_width=1, + hidden_size=64, + compress_steps=0, + dropout=0.2, + clip_grad_norm=2.0, + initializer="orthogonal", + initializer_gain=1.5, + label_smoothing=0.1, + optimizer="Adam", + optimizer_adam_epsilon=1e-6, + optimizer_adam_beta1=0.85, + optimizer_adam_beta2=0.997, + optimizer_momentum_momentum=0.9, + weight_decay=0.1, + weight_noise=0.0, + learning_rate_decay_scheme="none", + learning_rate_warmup_steps=100, + learning_rate=0.1, + sampling_method="argmax", # "argmax" or "random" + problem_choice="adaptive", # "uniform", "adaptive", "distributed" + multiply_embedding_mode="sqrt_depth", + symbol_modality_num_shards=16, + # setting the max length in a minibatch. 0 means default behavior, + # max_length = hparams.batch_size * length_multiplier + max_length=0, + # in SymbolModality, share the output embeddings and the softmax + # variables. + # You can also share the input embeddings with the output embeddings + # by using a problem_hparams that uses the same modality object for + # the input_modality and target_modality. 
+      shared_embedding_and_softmax_weights=int(False),)
+
+
+class RangedHParams(object):
+  """Defines parameter ranges for tuning."""
+
+  # From ParameterConfig proto
+  LINEAR_SCALE = 1
+  LOG_SCALE = 2
+  REVERSE_LOG_SCALE = 3
+
+  def __init__(self):
+    self._categorical_params = {}
+    self._discrete_params = {}
+    self._float_params = {}
+    self._int_params = {}
+
+  def _check_reset_and_type_change(self, name, orig_ctr):
+    """Check if name is in orig_ctr or in one of the other type containers."""
+    # Resetting a hyperparameter
+    if name in orig_ctr:
+      tf.logging.warning("Overwriting hparam %s", name)
+
+    ctr_names = [(self._categorical_params,
+                  "categorical"), (self._discrete_params, "discrete"),
+                 (self._float_params, "float"), (self._int_params, "int")]
+    ctrs, names = list(zip(*ctr_names))
+    orig_name = names[ctrs.index(orig_ctr)]
+
+    for ctr, ctr_name in ctr_names:
+      if ctr is orig_ctr:
+        continue
+
+      # Using a different type for the same hyperparameter name
+      if name in ctr:
+        raise ValueError("Setting hyperparameter %s as type %s, but a "
+                         "hyperparameter of the same name was originally "
+                         "registered as type %s" % (name, ctr_name, orig_name))
+
+  def set_categorical(self, name, categories, length=None):
+    self._check_reset_and_type_change(name, self._categorical_params)
+    self._categorical_params[name] = (name, categories, length)
+
+  def set_discrete(self, name, feasible_points, scale=None, length=None):
+    self._check_reset_and_type_change(name, self._discrete_params)
+    self._discrete_params[name] = (name, feasible_points, scale, length)
+
+  def set_float(self, name, min_val, max_val, scale=None, length=None):
+    self._check_reset_and_type_change(name, self._float_params)
+    self._float_params[name] = (name, min_val, max_val, scale, length)
+
+  def set_int(self, name, min_val, max_val, scale=None, length=None):
+    self._check_reset_and_type_change(name, self._int_params)
+    self._int_params[name] = (name, min_val, max_val, scale, length)
+
+
+def fill_ranged_hparams_from_hparams(hparams, ranged_hparams):
+  """Fill ranged_hparams with singleton values from hparams.
+
+  HParams are placed in RangedHParams with the following functions, according
+  to type:
+    * int: set_discrete
+    * float: set_float
+    * str: set_categorical
+
+  Args:
+    hparams: tf.contrib.training.HParams; contains the hyperparameters to copy
+      over to ranged_hparams.
+    ranged_hparams: RangedHParams; will have hparams values copied to it.
+
+  Raises:
+    ValueError: if hparams contains a hyperparameter not of type
+      {int, float, str, bool}.
+  """
+  for name, (hp_type, is_multivalent) in six.iteritems(hparams._hparam_types):  # pylint: disable=protected-access
+
+    if is_multivalent:
+      raise ValueError("Multivalent hparams not supported in RangedHParams. "
+                       "Hyperparameter %s is multivalent."
% name) + val = getattr(hparams, name) + if hp_type == int: + ranged_hparams.set_discrete(name, [val]) + elif hp_type == float: + ranged_hparams.set_float(name, val, val) + elif hp_type == str: + ranged_hparams.set_categorical(name, [val]) + else: + raise ValueError("Unsupported type %s for param %s" % (hp_type, name)) + + +@registry.register_ranged_hparams("basic1") +def basic_range1(ranged_hparams): + """A basic range of hyperparameters.""" + rhp = ranged_hparams + + hparams = basic_params1() + fill_ranged_hparams_from_hparams(hparams, rhp) + + rhp.set_discrete("batch_size", [1024, 2048, 4096]) + rhp.set_discrete("num_hidden_layers", [1, 2, 3, 4, 5, 6]) + rhp.set_discrete("hidden_size", [32, 64, 128, 256, 512], scale=rhp.LOG_SCALE) + rhp.set_discrete("kernel_height", [1, 3, 5, 7]) + rhp.set_discrete("kernel_width", [1, 3, 5, 7]) + rhp.set_discrete("compress_steps", [0, 1, 2]) + rhp.set_float("dropout", 0.0, 0.5) + rhp.set_float("weight_decay", 1e-4, 10.0, scale=rhp.LOG_SCALE) + rhp.set_float("label_smoothing", 0.0, 0.2) + rhp.set_float("clip_grad_norm", 0.01, 50.0, scale=rhp.LOG_SCALE) + rhp.set_float("learning_rate", 0.005, 2.0, scale=rhp.LOG_SCALE) + rhp.set_categorical("initializer", + ["uniform", "orthogonal", "uniform_unit_scaling"]) + rhp.set_float("initializer_gain", 0.5, 3.5) + rhp.set_categorical("learning_rate_decay_scheme", + ["none", "sqrt", "noam", "exp10k"]) + rhp.set_float("optimizer_adam_epsilon", 1e-7, 1e-2, scale=rhp.LOG_SCALE) + rhp.set_float("optimizer_adam_beta1", 0.8, 0.9) + rhp.set_float("optimizer_adam_beta2", 0.995, 0.999) + rhp.set_categorical("optimizer", + ["Adam", "Adagrad", "Momentum", "RMSProp", "SGD"]) diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py new file mode 100644 index 000000000..ef6559f9e --- /dev/null +++ b/tensor2tensor/models/common_layers.py @@ -0,0 +1,1340 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Layers common to multiple models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +# Dependency imports + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +from tensor2tensor.utils import expert_utils as eu + +import tensorflow as tf + +from tensorflow.python.framework import function + +# This is a global setting. When turned off, no @function.Defun is used. 
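+# Illustrative usage sketch (assumed, not shown elsewhere in this patch): a
+# caller could flip this module attribute before building a graph to make
+# layer_norm below use the pure-Python computation instead of the compiled
+# @function.Defun path, e.g.
+#   from tensor2tensor.models import common_layers
+#   common_layers.allow_defun = False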
+allow_defun = True + + +def saturating_sigmoid(x): + """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1].""" + with tf.name_scope("saturating_sigmoid", [x]): + y = tf.sigmoid(x) + return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1)) + + +def hard_sigmoid(x, saturation_limit=0.9): + saturation_cost = tf.reduce_mean(tf.nn.relu(tf.abs(x) - saturation_limit)) + x_shifted = 0.5 * x + 0.5 + return tf.minimum(1.0, tf.nn.relu(x_shifted)), saturation_cost + + +def hard_tanh(x, saturation_limit=0.9): + saturation_cost = tf.reduce_mean(tf.nn.relu(tf.abs(x) - saturation_limit)) + return tf.minimum(1.0, tf.maximum(x, -1.0)), saturation_cost + + +def inverse_exp_decay(max_step, min_value=0.01): + """Inverse-decay exponentially from 0.01 to 1.0 reached at max_step.""" + inv_base = tf.exp(tf.log(min_value) / float(max_step)) + step = tf.to_float(tf.contrib.framework.get_global_step()) + return inv_base**tf.maximum(float(max_step) - step, 0.0) + + +def standardize_images(x): + """Image standardization on batches (tf.image.per_image_standardization).""" + with tf.name_scope("standardize_images", [x]): + x = tf.to_float(x) + x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keep_dims=True) + x_variance = tf.reduce_mean( + tf.square(x - x_mean), axis=[1, 2, 3], keep_dims=True) + num_pixels = tf.to_float(tf.shape(x)[1] * tf.shape(x)[2] * 3) + x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels)) + # TODO(lukaszkaiser): remove hack below, needed for greedy decoding for now. + if x.shape and len(x.shape) == 4 and x.shape[3] == 1: + x = tf.concat([x, x, x], axis=3) # Not used, just a dead tf.cond branch. + x.set_shape([None, None, None, 3]) + return x + + +def image_augmentation(images, do_colors=False): + """Image augmentation: cropping, flipping, and color transforms.""" + images = tf.random_crop(images, [299, 299, 3]) + images = tf.image.random_flip_left_right(images) + if do_colors: # More augmentation, but might be slow. + images = tf.image.random_brightness(images, max_delta=32. / 255.) + images = tf.image.random_saturation(images, lower=0.5, upper=1.5) + images = tf.image.random_hue(images, max_delta=0.2) + images = tf.image.random_contrast(images, lower=0.5, upper=1.5) + return images + + +def flatten4d3d(x): + """Flatten a 4d-tensor into a 3d-tensor by joining width and height.""" + xshape = tf.shape(x) + result = tf.reshape(x, [xshape[0], xshape[1] * xshape[2], xshape[3]]) + # Preserve static shapes when available. + xshape_static = x.get_shape() + result.set_shape([xshape_static[0], None, xshape_static[3]]) + return result + + +def embedding(x, vocab_size, dense_size, name=None, reuse=None, multiplier=1.0): + """Embed x of type int64 into dense vectors, reducing to max 4 dimensions.""" + with tf.variable_scope( + name, default_name="embedding", values=[x], reuse=reuse): + embedding_var = tf.get_variable("kernel", [vocab_size, dense_size]) + # On the backwards pass, we want to convert the gradient from + # an indexed-slices to a regular tensor before sending it back to the + # parameter server. This avoids excess computation on the parameter server. + embedding_var = eu.ConvertGradientToTensor(embedding_var) + emb_x = tf.gather(embedding_var, x) + if multiplier != 1.0: + emb_x *= multiplier + shape, static_shape = tf.shape(emb_x), emb_x.shape.as_list() + if not static_shape or len(static_shape) < 5: + return emb_x + # If we had extra channel dimensions, assume it's 1, i.e. shape[3] == 1. 
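+    # Illustrative shape walk-through: for x of shape [batch, h, w, 1],
+    # tf.gather yields emb_x of rank 5, [batch, h, w, 1, dense_size], which
+    # the reshape below collapses back to rank 4, [batch, h, w, dense_size].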
+    assert len(static_shape) == 5
+    return tf.reshape(emb_x, [shape[0], shape[1], shape[2], static_shape[4]])
+
+
+def shift_left(x, pad_value=None):
+  """Shift the second dimension of x right by one (prepend padding, drop last)."""
+  if pad_value is None:
+    shifted_targets = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])[:, :-1, :, :]
+  else:
+    shifted_targets = tf.concat([pad_value, x], axis=1)[:, :-1, :, :]
+  return shifted_targets
+
+
+def shift_left_3d(x, pad_value=None):
+  """Shift the second dimension of x right by one (prepend padding, drop last)."""
+  if pad_value is None:
+    shifted_targets = tf.pad(x, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
+  else:
+    shifted_targets = tf.concat([pad_value, x], axis=1)[:, :-1, :]
+  return shifted_targets
+
+
+def conv_stride2_multistep(x, nbr_steps, output_filters, name=None, reuse=None):
+  """Use a strided convolution to downsample x by 2, `nbr_steps` times.
+
+  We use stride and filter size 2 to avoid the checkerboard problem of deconvs.
+  As detailed in http://distill.pub/2016/deconv-checkerboard/.
+
+  Args:
+    x: a `Tensor` with shape `[batch, spatial, depth]` or
+     `[batch, spatial_1, spatial_2, depth]`
+    nbr_steps: number of halving downsample rounds to apply
+    output_filters: an int specifying the filter count for the convolutions
+    name: a string
+    reuse: a boolean
+
+  Returns:
+    a pair (out, hidden_states): out is a `Tensor` with shape
+     `[batch, spatial / (2**nbr_steps), output_filters]` or
+     `[batch, spatial_1 / (2**nbr_steps), spatial_2 / (2**nbr_steps),
+       output_filters]`; hidden_states is the list of intermediate `Tensor`s.
+  """
+  with tf.variable_scope(
+      name, default_name="conv_stride2_multistep", values=[x], reuse=reuse):
+    if nbr_steps == 0:
+      out = conv(x, output_filters, (1, 1))
+      return out, [out]
+    hidden_layers = [x]
+    for i in xrange(nbr_steps):
+      hidden_layers.append(
+          conv(
+              hidden_layers[-1],
+              output_filters, (2, 2),
+              strides=2,
+              activation=tf.nn.relu,
+              name="conv" + str(i)))
+    return hidden_layers[-1], hidden_layers
+
+
+def deconv_stride2_multistep(x,
+                             nbr_steps,
+                             output_filters,
+                             name=None,
+                             reuse=None):
+  """Use a deconvolution to upsample x by 2**`nbr_steps`.
+
+  Args:
+    x: a `Tensor` with shape `[batch, spatial, depth]` or
+     `[batch, spatial_1, spatial_2, depth]`
+    nbr_steps: an int specifying the number of doubling upsample rounds to
+     apply.
+ output_filters: an int specifying the filter count for the deconvolutions + name: a string + reuse: a boolean + + Returns: + a `Tensor` with shape `[batch, spatial * (2**nbr_steps), output_filters]` or + `[batch, spatial_1 * (2**nbr_steps), spatial_2 * (2**nbr_steps), + output_filters]` + """ + with tf.variable_scope( + name, default_name="deconv_stride2_multistep", values=[x], reuse=reuse): + + def deconv1d(cur, i): + cur_shape = tf.shape(cur) + thicker = conv( + cur, + output_filters * 2, (1, 1), + padding="SAME", + activation=tf.nn.relu, + name="deconv1d" + str(i)) + return tf.reshape(thicker, + [cur_shape[0], cur_shape[1] * 2, 1, output_filters]) + + def deconv2d(cur, i): + thicker = conv( + cur, + output_filters * 4, (1, 1), + padding="SAME", + activation=tf.nn.relu, + name="deconv2d" + str(i)) + return tf.depth_to_space(thicker, 2) + + cur = x + for i in xrange(nbr_steps): + if cur.get_shape()[2] == 1: + cur = deconv1d(cur, i) + else: + cur = tf.cond( + tf.equal(tf.shape(cur)[2], 1), + lambda idx=i: deconv1d(cur, idx), + lambda idx=i: deconv2d(cur, idx)) + return cur + + +def conv_internal(conv_fn, inputs, filters, kernel_size, **kwargs): + """Conditional conv_fn making kernel 1d or 2d depending on inputs shape.""" + static_shape = inputs.get_shape() + if not static_shape or len(static_shape) != 4: + raise ValueError("Inputs to conv must have statically known rank 4.") + inputs.set_shape([static_shape[0], None, None, static_shape[3]]) + # Add support for left padding. + if "padding" in kwargs and kwargs["padding"] == "LEFT": + dilation_rate = (1, 1) + if "dilation_rate" in kwargs: + dilation_rate = kwargs["dilation_rate"] + assert kernel_size[0] % 2 == 1 and kernel_size[1] % 2 == 1 + height_padding = 2 * (kernel_size[0] // 2) * dilation_rate[0] + cond_padding = tf.cond( + tf.equal(tf.shape(inputs)[2], 1), lambda: tf.constant(0), + lambda: tf.constant(2 * (kernel_size[1] // 2) * dilation_rate[1])) + width_padding = 0 if static_shape[2] == 1 else cond_padding + padding = [[0, 0], [height_padding, 0], [width_padding, 0], [0, 0]] + inputs = tf.pad(inputs, padding) + kwargs["padding"] = "VALID" + force2d = False # Special argument we use to force 2d kernels (see below). + if "force2d" in kwargs: + force2d = kwargs["force2d"] + + def conv2d_kernel(kernel_size_arg, name_suffix): + """Call conv2d but add suffix to name.""" + if "name" in kwargs: + original_name = kwargs["name"] + name = kwargs.pop("name") + "_" + name_suffix + else: + original_name = None + name = "conv_" + name_suffix + original_force2d = None + if "force2d" in kwargs: + original_force2d = kwargs.pop("force2d") + result = conv_fn(inputs, filters, kernel_size_arg, name=name, **kwargs) + if original_name is not None: + kwargs["name"] = original_name # Restore for other calls. + if original_force2d is not None: + kwargs["force2d"] = original_force2d + return result + + # Manually setting the shape to be unknown in the middle two dimensions so + # that the `tf.cond` below won't throw an error based on the convolution + # kernels being too large for the data. + inputs._shape = tf.TensorShape([static_shape[0], None, None, static_shape[3]]) # pylint: disable=protected-access + if kernel_size[1] == 1 or force2d: + # Avoiding the cond below can speed up graph and gradient construction. 
+ return conv2d_kernel(kernel_size, "single") + return tf.cond( + tf.equal(tf.shape(inputs)[2], + 1), lambda: conv2d_kernel((kernel_size[0], 1), "small"), + lambda: conv2d_kernel(kernel_size, "std")) + + +def conv(inputs, filters, kernel_size, **kwargs): + return conv_internal(tf.layers.conv2d, inputs, filters, kernel_size, **kwargs) + + +def conv1d(inputs, filters, kernel_size, **kwargs): + return tf.squeeze( + conv(tf.expand_dims(inputs, 2), filters, (kernel_size, 1), **kwargs), 2) + + +def separable_conv(inputs, filters, kernel_size, **kwargs): + return conv_internal(tf.layers.separable_conv2d, inputs, filters, kernel_size, + **kwargs) + + +def subseparable_conv(inputs, filters, kernel_size, **kwargs): + """Sub-separable convolution. If separability == 0 it's a separable_conv.""" + + def conv_fn(inputs, filters, kernel_size, **kwargs): + """Sub-separable convolution, splits into separability-many blocks.""" + separability = None + if "separability" in kwargs: + separability = kwargs.pop("separability") + if separability: + parts = [] + abs_sep = separability if separability > 0 else -1 * separability + for split_idx, split in enumerate(tf.split(inputs, abs_sep, axis=3)): + with tf.variable_scope("part_%d" % split_idx): + if separability > 0: + parts.append( + tf.layers.conv2d(split, filters // separability, kernel_size, ** + kwargs)) + else: + parts.append( + tf.layers.separable_conv2d(split, filters // abs_sep, + kernel_size, **kwargs)) + if separability > 1: + result = tf.layers.conv2d(tf.concat(parts, axis=3), filters, (1, 1)) + elif abs_sep == 1: # If we have just one block, return it. + assert len(parts) == 1 + result = parts[0] + else: + result = tf.concat(parts, axis=3) + else: + result = tf.layers.separable_conv2d(inputs, filters, kernel_size, + **kwargs) + if separability is not None: + kwargs["separability"] = separability + return result + + return conv_internal(conv_fn, inputs, filters, kernel_size, **kwargs) + + +def layer_norm_compute_python(x, epsilon, scale, bias): + """Layer norm raw computation.""" + mean = tf.reduce_mean(x, axis=[-1], keep_dims=True) + variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True) + norm_x = (x - mean) * tf.rsqrt(variance + epsilon) + return norm_x * scale + bias + + +@function.Defun(compiled=True) +def layer_norm_compute_grad(x, epsilon, scale, bias, dy): + y = layer_norm_compute_python(x, epsilon, scale, bias) + dx = tf.gradients(ys=[y], xs=[x, epsilon, scale, bias], grad_ys=[dy]) + return dx + + +@function.Defun( + compiled=True, + separate_compiled_gradients=True, + grad_func=layer_norm_compute_grad) +def layer_norm_compute(x, epsilon, scale, bias): + return layer_norm_compute_python(x, epsilon, scale, bias) + + +def layer_norm(x, filters=None, epsilon=1e-6, name=None, reuse=None): + """Layer normalize the tensor x, averaging over the last dimension.""" + if filters is None: + filters = x.get_shape()[-1] + with tf.variable_scope( + name, default_name="layer_norm", values=[x], reuse=reuse): + scale = tf.get_variable( + "layer_norm_scale", [filters], initializer=tf.ones_initializer()) + bias = tf.get_variable( + "layer_norm_bias", [filters], initializer=tf.zeros_initializer()) + if allow_defun: + result = layer_norm_compute(x, tf.constant(epsilon), scale, bias) + result.set_shape(x.get_shape()) + else: + result = layer_norm_compute_python(x, epsilon, scale, bias) + return result + + +def noam_norm(x, name=None): + """One version of layer normalization.""" + with tf.name_scope(name, default_name="noam_norm", values=[x]): + 
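+    # Sketch of the computation below: l2-normalize along the depth axis, then
+    # rescale by sqrt(depth), so each position's vector has norm sqrt(depth),
+    # i.e. unit root-mean-square per channel (vectors with sum of squares
+    # below the epsilon=1.0 floor are left unscaled).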
shape = x.get_shape()
+    ndims = len(shape)
+    return (tf.nn.l2_normalize(x, ndims - 1, epsilon=1.0) *
+            tf.sqrt(tf.to_float(shape[-1])))
+
+
+def residual_function(hparams):
+  """Returns a function for combining layer input and layer output.
+
+  The returned function on x (layer input) and y (layer output) computes:
+    norm_function(x + dropout(y))
+
+  Args:
+    hparams: model hyperparameters
+
+  Returns:
+    a function from (x, y) to the combined, normalized output
+  """
+
+  def residual_fn(x, y):
+    return hparams.norm_function(x + tf.nn.dropout(
+        y, 1.0 - hparams.residual_dropout))
+
+  return residual_fn
+
+
+def conv_block_internal(conv_fn,
+                        inputs,
+                        filters,
+                        dilation_rates_and_kernel_sizes,
+                        first_relu=True,
+                        use_elu=False,
+                        separabilities=None,
+                        **kwargs):
+  """A block of convolutions.
+
+  Args:
+    conv_fn: convolution function, e.g. conv or separable_conv.
+    inputs: a Tensor
+    filters: an Integer
+    dilation_rates_and_kernel_sizes: a list of tuples (dilation, (k_w, k_h))
+    first_relu: whether to do a relu at start (defaults to True)
+    use_elu: whether to use ELUs instead of ReLUs (defaults to False)
+    separabilities: list of separability factors (per-layer).
+    **kwargs: additional arguments (e.g., pooling)
+
+  Returns:
+    a Tensor.
+  """
+  name = kwargs.pop("name") if "name" in kwargs else None
+  mask = kwargs.pop("mask") if "mask" in kwargs else None
+  norm = kwargs.pop("normalizer_fn") if "normalizer_fn" in kwargs else None
+  if norm is None and "normalizer_fn" not in kwargs:
+    norm = lambda x, name: layer_norm(x, filters, name=name)
+  with tf.variable_scope(name, "conv_block", [inputs]):
+    cur, counter = inputs, -1
+    for dilation_rate, kernel_size in dilation_rates_and_kernel_sizes:
+      counter += 1
+      if first_relu or counter > 0:
+        cur = tf.nn.elu(cur) if use_elu else tf.nn.relu(cur)
+      if mask is not None:
+        cur *= mask
+      if separabilities:
+        cur = conv_fn(
+            cur,
+            filters,
+            kernel_size,
+            dilation_rate=dilation_rate,
+            name="conv_block_%d" % counter,
+            use_bias=norm is None,
+            separability=separabilities[counter],
+            **kwargs)
+      else:
+        cur = conv_fn(
+            cur,
+            filters,
+            kernel_size,
+            dilation_rate=dilation_rate,
+            name="conv_block_%d" % counter,
+            use_bias=norm is None,
+            **kwargs)
+      if norm is not None:
+        cur = norm(cur, name="conv_block_norm_%d" % counter)
+    return cur
+
+
+def conv_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs):
+  """A block of standard convolutions."""
+  return conv_block_internal(conv, inputs, filters,
+                             dilation_rates_and_kernel_sizes, **kwargs)
+
+
+def separable_conv_block(inputs, filters, dilation_rates_and_kernel_sizes,
+                         **kwargs):
+  """A block of separable convolutions."""
+  return conv_block_internal(separable_conv, inputs, filters,
+                             dilation_rates_and_kernel_sizes, **kwargs)
+
+
+def subseparable_conv_block(inputs, filters, dilation_rates_and_kernel_sizes,
+                            **kwargs):
+  """A block of sub-separable convolutions."""
+  return conv_block_internal(subseparable_conv, inputs, filters,
+                             dilation_rates_and_kernel_sizes, **kwargs)
+
+
+def pool(inputs, window_size, pooling_type, padding, strides=(1, 1)):
+  """Pooling (supports "LEFT")."""
+  with tf.name_scope("pool", [inputs]):
+    static_shape = inputs.get_shape()
+    if not static_shape or len(static_shape) != 4:
+      raise ValueError("Inputs to pool must have statically known rank 4.")
+    # Add support for left padding.
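+    # Illustrative note: "LEFT" pads only before each spatial axis, so an
+    # output at position t depends only on inputs at positions <= t (causal
+    # pooling). E.g. a (3, 1) window prepends two rows along the time axis.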
+ if padding == "LEFT": + assert window_size[0] % 2 == 1 and window_size[1] % 2 == 1 + if len(static_shape) == 3: + width_padding = 2 * (window_size[1] // 2) + padding_ = [[0, 0], [width_padding, 0], [0, 0]] + else: + height_padding = 2 * (window_size[0] // 2) + cond_padding = tf.cond( + tf.equal(tf.shape(inputs)[2], 1), lambda: tf.constant(0), + lambda: tf.constant(2 * (window_size[1] // 2))) + width_padding = 0 if static_shape[2] == 1 else cond_padding + padding_ = [[0, 0], [height_padding, 0], [width_padding, 0], [0, 0]] + inputs = tf.pad(inputs, padding_) + inputs.set_shape([static_shape[0], None, None, static_shape[3]]) + padding = "VALID" + window_size_small = (window_size[0], 1) + strides_small = (strides[0], 1) + # Manually setting the shape to be unknown in the middle two dimensions so + # that the `tf.cond` below won't throw an error based on the convolution + # kernels being too large for the data. + inputs._shape = tf.TensorShape( # pylint: disable=protected-access + [static_shape[0], None, None, static_shape[3]]) + return tf.cond( + tf.equal(tf.shape(inputs)[2], 1), + lambda: tf.nn.pool( # pylint: disable=g-long-lambda + inputs, window_size_small, pooling_type, padding, + strides=strides_small), + lambda: tf.nn.pool( # pylint: disable=g-long-lambda + inputs, window_size, pooling_type, padding, strides=strides)) + + +def conv_block_downsample(x, + kernel, + strides, + padding, + separability=0, + name=None, + reuse=None): + """Implements a downwards-striding conv block, like Xception exit flow.""" + with tf.variable_scope( + name, default_name="conv_block_downsample", values=[x], reuse=reuse): + hidden_size = int(x.get_shape()[-1]) + res = conv_block( + x, + int(1.25 * hidden_size), [((1, 1), kernel)], + padding=padding, + strides=strides, + name="res_conv") + + x = subseparable_conv_block( + x, + hidden_size, [((1, 1), kernel)], + padding=padding, + separability=separability, + name="conv0") + x = subseparable_conv_block( + x, + int(1.25 * hidden_size), [((1, 1), kernel)], + padding=padding, + separability=separability, + name="conv1") + x = pool(x, kernel, "MAX", padding, strides=strides) + + x += res + + x = subseparable_conv_block( + x, + 2 * hidden_size, [((1, 1), kernel)], + first_relu=False, + padding=padding, + separability=separability, + name="conv2") + x = subseparable_conv_block( + x, + int(2.5 * hidden_size), [((1, 1), kernel)], + padding=padding, + separability=separability, + name="conv3") + return x + + +def decompress_seqcnn(x, + targets, + targets_vocab_size, + dilations_and_kernels, + block_size, + is_2d=False, + embedding_var=None, + name=None, + reuse=None): + """Decompress x into targets size using a Sequence CNN at every element.""" + with tf.variable_scope( + name, + default_name="decompress_batch_seqcnn", + values=[x, targets], + reuse=reuse): + # We assume targets are [batch x block_size * N x block_size * N x C] if + # is_2d=True or [batch, block_size * N, 1, C] otherwise, and C is static. + # Let's shift targets to depth and embed. 
+    targets_shape, targets_shape_static = tf.shape(targets), targets.get_shape()
+    channels = int(targets_shape_static[-1])
+    hidden_size = int(x.get_shape()[-1])
+    if is_2d:
+      depth_targets = tf.space_to_depth(targets, block_size)
+      factor = channels * block_size * block_size
+    else:
+      depth_targets = tf.reshape(targets, [
+          targets_shape[0], targets_shape[1] // block_size, 1,
+          channels * block_size
+      ])
+      factor = channels * block_size
+    if embedding_var is None:
+      embedding_var = tf.get_variable("targets_embedding",
+                                      [targets_vocab_size, hidden_size])
+    targets_emb = tf.gather(embedding_var, depth_targets)
+    # Flatten x and embedded targets. Flat targets are factor* larger on axis=1.
+    flat_x = tf.reshape(x, [-1, 1, 1, hidden_size])
+    flat_targets = tf.reshape(targets_emb, [-1, factor, 1, hidden_size])
+    shifted_targets = shift_left(flat_targets)
+    # Run a SeqCNN large-batch to produce factor outputs out of every target.
+    flat_x += tf.zeros_like(shifted_targets)  # Broadcast on axis=1.
+    flat_outputs = conv_block(
+        tf.concat([flat_x, shifted_targets], axis=3),
+        hidden_size,
+        dilations_and_kernels,
+        padding="LEFT")
+    # Reshape back to embedded targets shape.
+    outputs = tf.reshape(flat_outputs, [
+        tf.shape(targets_emb)[0],
+        tf.shape(targets_emb)[1],
+        tf.shape(targets_emb)[2], factor * hidden_size
+    ])
+    # Move depth back to target space.
+    if is_2d:
+      outputs = tf.depth_to_space(outputs, 2)
+    else:
+      outputs = tf.reshape(outputs, [
+          tf.shape(outputs)[0], block_size * tf.shape(outputs)[1], 1,
+          hidden_size
+      ])
+    # Final reshape before prediction to ensure target size.
+    outputs = tf.reshape(outputs, [
+        targets_shape[0], targets_shape[1], targets_shape[2], channels,
+        hidden_size
+    ])
+    return tf.layers.dense(outputs, targets_vocab_size)
+
+
+def moe_layer(data_parallelism,
+              ps_devices,
+              xs,
+              train,
+              model_hidden_size,
+              expert_hidden_size,
+              n1,
+              n2,
+              loss_coef,
+              autoscale=True,
+              name=None):
+  """A mixture-of-experts layer.
+
+  Args:
+    data_parallelism: an expert_utils.Parallelism object.
+    ps_devices: a list of strings
+    xs: a list of input tensors.
+    train: a boolean scalar.
+    model_hidden_size: an integer (input/output size for this layer)
+    expert_hidden_size: an integer (size of each expert's hidden layer)
+    n1: an integer - number of experts (or # of groups for hierarchical MoE)
+    n2: optional integer - size of each group of experts for hierarchical MoE
+    loss_coef: a scalar - multiplier on load-balancing losses
+    autoscale: a boolean
+    name: a string
+
+  Returns:
+    ys: a list of tensors
+    extra_training_loss: a scalar
+  """
+  dp = data_parallelism
+  with tf.variable_scope(name, default_name="moe"):
+    # Set up the hyperparameters for the gating networks.
+    primary_gating_hp = eu.NoisyTopKGatingParams()
+    primary_gating_hp.num_experts = n1
+    if n2:
+      # Hierarchical MoE containing moe_n1 groups of moe_n2 experts.
+      assert n2 > 1
+      secondary_gating_hp = eu.NoisyTopKGatingParams()
+      secondary_gating_hp.num_experts = n2
+    else:
+      # Flat mixture of moe_n1 experts.
+      secondary_gating_hp = None
+    # Set up the hyperparameters for the expert networks.
+    # Each expert contains a hidden RELU layer of size filter_size.
+    expert_hp = eu.FeedForwardExpertParams()
+    expert_hp.autoscale = autoscale
+    expert_hp.hidden_layer_sizes = [expert_hidden_size]
+    # Create the mixture of experts.
+    moe = eu.DistributedMixtureOfExperts(primary_gating_hp, secondary_gating_hp,
+                                         expert_hp, model_hidden_size,
+                                         model_hidden_size, ps_devices, "moe")
+    # MoE expects input tensors to be 2d.
+    # Flatten out spatial dimensions.
+    xs_2d = dp(tf.reshape, xs, [[-1, model_hidden_size]] * dp.n)
+    # Call the MoE
+    moe_out_2d, importance, load, _, _ = moe.Eval(
+        dp.devices, xs_2d, train, identifiers=None, summaries=True)
+    # Reshape the output to the original shape.
+    moe_out = dp(tf.reshape, moe_out_2d, dp(tf.shape, xs))
+    # These losses encourage equal load on the different experts.
+    loss = loss_coef * (eu.CVSquared(importance) + eu.CVSquared(load))
+    return moe_out, loss
+
+
+def simple_attention(target, source, bias=None, summaries=True):
+  """A simple attention function.
+
+  Args:
+    target: a `Tensor` with shape `[batch, target_timesteps, depth]` or
+     `[batch, target_timesteps_1, target_timesteps_2, depth]`
+    source: a `Tensor` with shape `[batch, source_timesteps, depth]` or
+     `[batch, source_timesteps_1, source_timesteps_2, depth]`
+    bias: an optional `Tensor` with shape `[batch, timesteps, 1, 1]` used
+     to mask the attention to not attend to padding of input.
+    summaries: Boolean, whether to output summaries.
+
+  Returns:
+    a `Tensor` with same shape as `target`
+  """
+  with tf.name_scope("simple_attention", [target, source]):
+    target_shape = tf.shape(target)
+    source_shape = tf.shape(source)
+    target = tf.reshape(target, [
+        target_shape[0], target_shape[1] * target_shape[2], target_shape[3]
+    ])
+    source = tf.reshape(source, [
+        source_shape[0], source_shape[1] * source_shape[2], source_shape[3]
+    ])
+    attention = tf.matmul(target, source, transpose_b=True)
+    attention *= tf.rsqrt(tf.to_float(tf.shape(target)[2]))
+    if bias is not None:
+      attention += tf.expand_dims(tf.squeeze(bias, axis=[2, 3]), axis=1)
+    attention = tf.nn.softmax(attention)
+    if summaries and not tf.get_variable_scope().reuse:
+      tf.summary.image("attention", tf.expand_dims(attention, 3), max_outputs=5)
+    attended = tf.matmul(attention, source)
+    return tf.reshape(attended, target_shape)
+
+
+def multiscale_conv_sum(inputs, output_size, dilation_rates_and_kernel_sizes,
+                        pooling_type, **kwargs):
+  """Sum of several dilated convolutions.
+
+  For all convolutions with dilation_rate > 1, we first pool the input with
+  width dilation_rate.
+
+  Args:
+    inputs: a Tensor
+    output_size: an Integer
+    dilation_rates_and_kernel_sizes: a list of pairs (dilation, kernel_size)
+    pooling_type: "AVG" or "MAX"
+    **kwargs: additional keyword args for conv
+
+  Returns:
+    a Tensor.
+  """
+  name = kwargs.pop("name") if "name" in kwargs else None
+  with tf.variable_scope(name, "multiscale_conv_sum", [inputs]):
+    padding = kwargs["padding"]
+    results, counter = [], -1
+    for dilation_rate, kernel_size in dilation_rates_and_kernel_sizes:
+      counter += 1
+      if dilation_rate > 1:
+        pooled = pool(inputs, kernel_size, pooling_type, padding)
+      else:
+        pooled = inputs
+      results.append(
+          conv(
+              pooled,
+              output_size,
+              kernel_size,
+              dilation_rate=dilation_rate,
+              name="conv_layer%d" % counter,
+              **kwargs))
+    return tf.add_n(results) * (len(results)**-0.5)
+
+
+def multiscale_conv_and_attention(x,
+                                  padding,
+                                  hparams,
+                                  source=None,
+                                  summaries=True):
+  """A common part of t2t layers.
+
+  First, do a linear multiscale convolution.
+  Second, do attention (if source is not None).
+
+  Applies residuals and normalization on both steps.
+
+  Args:
+    x: a Tensor.
+    padding: a padding type
+    hparams: hyperparameters for model
+    source: optional source tensor for attention. (encoder output)
+    summaries: Boolean, whether to output summaries.
+
+  Returns:
+    a Tensor.
+  """
+  # TODO(noam): The number of different scales should be a hyperparameter.
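+  # For example (illustrative): with kernel_height=3 and kernel_width=1, the
+  # call below uses three scales with dilation rates (1, 1), (3, 1) and (9, 1),
+  # all sharing the same (3, 1) kernel, combined inside multiscale_conv_sum
+  # with a 1/sqrt(3) weighting.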
+  conv_sum = multiscale_conv_sum(
+      x,
+      hparams.hidden_size,
+      [((hparams.kernel_height**i, hparams.kernel_width**i),
+        (hparams.kernel_height, hparams.kernel_width)) for i in xrange(3)],
+      "AVG",
+      padding=padding)
+  # For the residual connection, rescale x if the channel counts differ.
+  if x.get_shape().as_list()[-1] != conv_sum.get_shape().as_list()[-1]:
+    x = conv(x, hparams.hidden_size, (1, 1))
+  x = noam_norm(x + conv_sum)
+  if source is not None:
+    x = noam_norm(x + simple_attention(x, source, summaries=summaries))
+  return x
+
+
+def conv_with_pools(inputs, output_size, kernel_size, pool_sizes, pooling_type,
+                    **kwargs):
+  """Convolution plus 1x1 convolution applied to specified pools.
+
+  For example we might do a regular convolution with kernel size (3, 1),
+  and pools of sizes [(9, 1), (27, 1)].
+
+  Args:
+    inputs: a Tensor
+    output_size: an Integer
+    kernel_size: a tuple of integers
+    pool_sizes: a list of tuples of integers.
+    pooling_type: "AVG" or "MAX"
+    **kwargs: additional keyword args for conv
+
+  Returns:
+    a Tensor.
+  """
+  name = kwargs.pop("name") if "name" in kwargs else None
+  with tf.variable_scope(name, "conv_with_pools", [inputs]):
+    padding = kwargs["padding"]
+    results = []
+    results.append(conv(inputs, output_size, kernel_size, **kwargs))
+    for i, pool_size in enumerate(pool_sizes):
+      pooled = pool(inputs, pool_size, pooling_type, padding)
+      results.append(
+          conv(pooled, output_size, (1, 1), name="pool_%d" % i, **kwargs))
+    return tf.add_n(results) * (len(results)**-0.5)
+
+
+def conv_with_pools_and_attention(x,
+                                  padding,
+                                  hparams,
+                                  source=None,
+                                  summaries=True):
+  """A common part of t2t layers.
+
+  First, do conv_with_pools.
+  Second, do attention (if source is not None).
+
+  Applies residuals and normalization on both steps.
+
+  Args:
+    x: a Tensor.
+    padding: a padding type
+    hparams: hyperparameters for model
+    source: optional source tensor for attention. (encoder output)
+    summaries: Boolean, whether to output summaries.
+
+  Returns:
+    a Tensor.
+  """
+  conv_sum = conv_with_pools(
+      x,
+      hparams.hidden_size, (hparams.kernel_height, hparams.kernel_width),
+      hparams.pool_sizes,
+      "AVG",
+      padding=padding)
+  if x.get_shape().as_list()[-1] == conv_sum.get_shape().as_list()[-1]:
+    conv_sum += x
+  x = noam_norm(conv_sum)
+  if source is not None:
+    x = noam_norm(x + simple_attention(x, source, summaries=summaries))
+  return x
+
+
+def get_timing_signal(length,
+                      min_timescale=1,
+                      max_timescale=1e4,
+                      num_timescales=16):
+  """Create Tensor of sinusoids of different frequencies.
+
+  Args:
+    length: length of the Tensor to create, i.e. number of steps.
+    min_timescale: a float
+    max_timescale: a float
+    num_timescales: an int
+
+  Returns:
+    Tensor of shape (length, 2 * num_timescales)
+  """
+  positions = tf.to_float(tf.range(length))
+  log_timescale_increment = (math.log(max_timescale / min_timescale) /
+                             (num_timescales - 1))
+  inv_timescales = min_timescale * tf.exp(
+      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
+  scaled_time = tf.expand_dims(positions, 1) * tf.expand_dims(inv_timescales, 0)
+  return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
+
+
+def add_timing_signal(x, min_timescale=1, max_timescale=1e4, num_timescales=16):
+  """Adds a bunch of sinusoids of different frequencies to a Tensor.
+
+  This allows attention to learn to use absolute and relative positions.
+  The timing signal should be added to some precursor of both the source
+  and the target of the attention.
+
+  The use of relative position is possible because sin(x+y) and cos(x+y) can
+  be expressed in terms of y, sin(x) and cos(x), e.g.
+  sin(x+y) = sin(x)cos(y) + cos(x)sin(y).
+
+  In particular, we use a geometric sequence of timescales starting with
+  min_timescale and ending with max_timescale. For each timescale, we
+  generate the two sinusoidal signals sin(timestep/timescale) and
+  cos(timestep/timescale). All of these sinusoids are concatenated in
+  the depth dimension, padded with zeros to be the same depth as the input,
+  and added into input.
+
+  Args:
+    x: a Tensor with shape [?, length, ?, depth]
+    min_timescale: a float
+    max_timescale: a float
+    num_timescales: an int <= depth/2
+
+  Returns:
+    a Tensor the same shape as x.
+  """
+  length = tf.shape(x)[1]
+  depth = tf.shape(x)[3]
+  signal = get_timing_signal(length, min_timescale, max_timescale,
+                             num_timescales)
+  padded_signal = tf.pad(signal, [[0, 0], [0, depth - 2 * num_timescales]])
+  return x + tf.reshape(padded_signal, [1, length, 1, depth])
+
+
+def mask_from_embedding(emb):
+  """Input embeddings -> padding mask.
+
+  We have hacked symbol_modality to return all-zero embeddings for padding.
+  Returns a mask with 0.0 in the padding positions and 1.0 elsewhere.
+
+  Args:
+    emb: a Tensor with shape [batch, width, height, depth].
+  Returns:
+    a 0.0/1.0 Tensor with shape [batch, width, height, 1].
+  """
+  return weights_nonzero(tf.reduce_sum(tf.abs(emb), axis=3, keep_dims=True))
+
+
+def mask_leq(target_length, source_length):
+  """A mask with 1.0 wherever source_pos <= target_pos and 0.0 elsewhere.
+
+  Args:
+    target_length: an integer
+    source_length: an integer
+  Returns:
+    a Tensor with shape [1, target_length, source_length]
+  """
+  return tf.expand_dims(
+      tf.matrix_band_part(tf.ones([target_length, source_length]), -1, 0), 0)
+
+
+def attention_1d_v0(source,
+                    target,
+                    attention_size,
+                    output_size,
+                    num_heads,
+                    mask=None,
+                    transform_source=True,
+                    transform_target=True,
+                    transform_output=True,
+                    summaries=True,
+                    name=None):
+  """Multi-headed attention.
+
+  TODO(noam): this could probably be extended to 2d.
+
+  Args:
+    source: a Tensor of shape [batch, source_length, source_depth]
+    target: a Tensor of shape [batch, target_length, target_depth]
+    attention_size: an integer
+    output_size: an integer
+    num_heads: an integer divisor of attention_size
+    mask: a float32 Tensor of shape [batch, target_length, source_length]
+     1.0 means can-see; 0.0 means can't-see.
+     Any dimension can be 1 (supports broadcasting).
+    transform_source: a boolean
+    transform_target: a boolean
+    transform_output: a boolean
+    summaries: a boolean
+    name: an optional string
+
+  Returns:
+    a Tensor of shape [batch, length, output_size]
+  """
+  with tf.variable_scope(name, default_name="attention", values=[target]):
+    source_length = tf.shape(source)[1]
+    target_length = tf.shape(target)[1]
+    batch = tf.shape(source)[0]
+
+    def _maybe_transform(t, size, should_transform, name):
+      if should_transform:
+        return conv1d(t, size, 1, name=name)
+      else:
+        assert t.get_shape()[-1] == size
+        return t
+
+    source_attention = _maybe_transform(source, attention_size,
+                                        transform_source, "source_attention")
+    target_attention = _maybe_transform(target, attention_size,
+                                        transform_target, "target_attention")
+    assert attention_size % num_heads == 0
+    size_per_head = attention_size // num_heads
+    source_attention = tf.reshape(
+        source_attention, [batch, source_length, num_heads, size_per_head])
+    target_attention = tf.reshape(
+        target_attention, [batch, target_length, num_heads, size_per_head])
+    # [batch, num_heads, length, size_per_head]
+    source_attention = tf.transpose(source_attention, [0, 2, 1, 3])
+    target_attention = tf.transpose(target_attention, [0, 2, 1, 3])
+
+    # [batch, num_heads, target_length, source_length]
+    attention = tf.matmul(target_attention, source_attention, transpose_b=True)
+    attention *= size_per_head**-0.5
+
+    if mask is not None:
+      mask = tf.expand_dims(mask, 1)
+      mask = (1.0 - mask) * -1e9
+      attention += mask
+    attention = tf.nn.softmax(attention)
+    if summaries and not tf.get_variable_scope().reuse:
+      # Compute a color image summary.
+      image = tf.reshape(attention,
+                         [batch, num_heads, target_length, source_length])
+      image = tf.transpose(image, [0, 2, 3, 1])
+      image = tf.pow(image, 0.2)  # for high-dynamic-range
+      # Each head will correspond to one of RGB.
+      # Pad the heads to be a multiple of 3.
+      extra_heads = -num_heads % 3
+      image = tf.pad(image, [[0, 0], [0, 0], [0, 0], [0, extra_heads]])
+      image = tf.reshape(image, [
+          batch, target_length, source_length, 3, (num_heads + extra_heads) // 3
+      ])
+      image = tf.reduce_max(image, 4)
+      tf.summary.image("local_attention", image, max_outputs=1)
+    # output: [batch, num_heads, target_length, size_per_head]
+    output = tf.matmul(attention, source_attention)
+    output = tf.transpose(output, [0, 2, 1, 3])
+    output = tf.reshape(output, [batch, target_length, attention_size])
+    output = _maybe_transform(output, output_size, transform_output,
+                              "attention_output")
+    return output
+
+
+def relu_density_logit(x, reduce_dims):
+  """logit(density(x)).
+
+  Useful for histograms.
+
+  Args:
+    x: a Tensor, typically the output of tf.nn.relu
+    reduce_dims: a list of dimensions
+
+  Returns:
+    a Tensor
+  """
+  frac = tf.reduce_mean(tf.to_float(x > 0.0), reduce_dims)
+  scaled = tf.log(frac + math.exp(-10)) - tf.log((1.0 - frac) + math.exp(-10))
+  return scaled
+
+
+def conv_hidden_relu(inputs,
+                     hidden_size,
+                     output_size,
+                     kernel_size=(1, 1),
+                     summaries=True,
+                     dropout=0.0,
+                     **kwargs):
+  """Hidden layer with RELU activation followed by linear projection."""
+  name = kwargs.pop("name") if "name" in kwargs else None
+  with tf.variable_scope(name, "conv_hidden_relu", [inputs]):
+    if inputs.get_shape().ndims == 3:
+      is_3d = True
+      inputs = tf.expand_dims(inputs, 2)
+    else:
+      is_3d = False
+    h = conv(
+        inputs,
+        hidden_size,
+        kernel_size,
+        activation=tf.nn.relu,
+        name="conv1",
+        **kwargs)
+    if dropout != 0.0:
+      h = tf.nn.dropout(h, 1.0 - dropout)
+    if summaries and not tf.get_variable_scope().reuse:
+      tf.summary.histogram("hidden_density_logit",
+                           relu_density_logit(
+                               h, list(range(inputs.shape.ndims - 1))))
+    ret = conv(h, output_size, (1, 1), name="conv2", **kwargs)
+    if is_3d:
+      ret = tf.squeeze(ret, 2)
+    return ret
+
+
+def conv_gru(x,
+             kernel_size,
+             filters,
+             padding="SAME",
+             dilation_rate=(1, 1),
+             name=None,
+             reuse=None):
+  """Convolutional GRU in 1 dimension."""
+
+  # Let's make a shorthand for conv call first.
+  def do_conv(args, name, bias_start, padding):
+    return conv(
+        args,
+        filters,
+        kernel_size,
+        padding=padding,
+        dilation_rate=dilation_rate,
+        bias_initializer=tf.constant_initializer(bias_start),
+        name=name)
+
+  # Here comes the GRU gate.
+  with tf.variable_scope(
+      name, default_name="conv_gru", values=[x], reuse=reuse):
+    reset = saturating_sigmoid(do_conv(x, "reset", 1.0, padding))
+    gate = saturating_sigmoid(do_conv(x, "gate", 1.0, padding))
+    candidate = tf.tanh(do_conv(reset * x, "candidate", 0.0, padding))
+    return gate * x + (1 - gate) * candidate
+
+
+def conv_lstm(x,
+              kernel_size,
+              filters,
+              padding="SAME",
+              dilation_rate=(1, 1),
+              name=None,
+              reuse=None):
+  """Convolutional LSTM in 1 dimension."""
+  with tf.variable_scope(
+      name, default_name="conv_lstm", values=[x], reuse=reuse):
+    gates = conv(
+        x,
+        4 * filters,
+        kernel_size,
+        padding=padding,
+        dilation_rate=dilation_rate)
+    g = tf.split(layer_norm(gates, 4 * filters), 4, axis=3)
+    new_cell = tf.sigmoid(g[0]) * x + tf.sigmoid(g[1]) * tf.tanh(g[3])
+    return tf.sigmoid(g[2]) * tf.tanh(new_cell)
+
+
+def diagonal_conv_gru(x,
+                      kernel_size,
+                      filters,
+                      train,
+                      dropout=0.0,
+                      name=None,
+                      reuse=None):
+  """Diagonal Convolutional GRU as in https://arxiv.org/abs/1702.08727."""
+
+  # Let's make a shorthand for conv call first.
+  def do_conv(args, name, bias_start):
+    return conv(
+        args,
+        filters,
+        kernel_size,
+        padding="SAME",
+        bias_initializer=tf.constant_initializer(bias_start),
+        name=name)
+
+  # Here comes the GRU gate.
+  with tf.variable_scope(
+      name, default_name="diagonal_conv_gru", values=[x], reuse=reuse):
+    reset, reset_cost = hard_sigmoid(do_conv(x, "reset", 0.5))
+    gate, gate_cost = hard_sigmoid(do_conv(x, "gate", 0.7))
+    candidate = tf.tanh(do_conv(reset * x, "candidate", 0.0))
+
+    # Dropout if training.
+    if dropout > 0.0 and train:
+      candidate = tf.nn.dropout(candidate, 1.0 - dropout)
+
+    # Diagonal shift.
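+    # Illustrative example: with filters=6 (so shift_filters=2), base_filter
+    # below has two identity rows [0, 1, 0] (those channels stay at position
+    # w), two rows [1, 0, 0] (channels read position w - 1) and two rows
+    # [0, 0, 1] (channels read position w + 1) - hence the "diagonal" shift.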
+ shift_filters = filters // 3 + base_filter = ([[0, 1, 0]] * (filters - 2 * shift_filters) + + [[1, 0, 0]] * shift_filters + [[0, 0, 1]] * shift_filters) + shift_filter = tf.constant(np.transpose(base_filter), dtype=tf.float32) + shift_filter = tf.expand_dims(tf.expand_dims(shift_filter, 0), 3) + x_shifted = tf.nn.depthwise_conv2d( + x, shift_filter, [1, 1, 1, 1], padding="SAME") + + # Return the gated result and cost. + total_cost_avg = 0.5 * (reset_cost + gate_cost) + return gate * x_shifted + (1 - gate) * candidate, total_cost_avg + + +def pad_to_same_length(x, y, final_length_divisible_by=1, axis=1): + """Pad tensors x and y on axis 1 so that they have the same length.""" + if axis not in [1, 2]: + raise ValueError("Only axis=1 and axis=2 supported for now.") + with tf.name_scope("pad_to_same_length", [x, y]): + x_length = tf.shape(x)[axis] + y_length = tf.shape(y)[axis] + max_length = tf.maximum(x_length, y_length) + if final_length_divisible_by > 1: + # Find the nearest larger-or-equal integer divisible by given number. + max_length += final_length_divisible_by - 1 + max_length //= final_length_divisible_by + max_length *= final_length_divisible_by + length_diff1 = max_length - x_length + length_diff2 = max_length - y_length + + def padding_list(length_diff, arg): + if axis == 1: + return [[[0, 0], [0, length_diff]], + tf.zeros([tf.rank(arg) - 2, 2], dtype=tf.int32)] + return [[[0, 0], [0, 0], [0, length_diff]], + tf.zeros([tf.rank(arg) - 3, 2], dtype=tf.int32)] + + paddings1 = tf.concat(padding_list(length_diff1, x), axis=0) + paddings2 = tf.concat(padding_list(length_diff2, y), axis=0) + res_x = tf.pad(x, paddings1) + res_y = tf.pad(y, paddings2) + # Static shapes are the same except for axis=1. + x_shape = x.shape.as_list() + x_shape[axis] = None + res_x.set_shape(x_shape) + y_shape = y.shape.as_list() + y_shape[axis] = None + res_y.set_shape(y_shape) + return res_x, res_y + + +def pad_with_zeros(logits, labels): + """Pad labels on the length dimension to match logits length.""" + with tf.name_scope("pad_with_zeros", [logits, labels]): + logits, labels = pad_to_same_length(logits, labels) + if len(labels.shape.as_list()) == 3: # 2-d labels. + logits, labels = pad_to_same_length(logits, labels, axis=2) + return labels + + +def weights_nonzero(labels): + """Assign weight 1.0 to all labels except for padding (id=0).""" + return tf.to_float(tf.not_equal(labels, 0)) + + +def weights_all(labels): + """Assign weight 1.0 to all labels.""" + return tf.ones_like(labels, dtype=tf.float32) + + +def weights_concatenated(labels): + """Assign weight 1.0 to the "target" part of the concatenated labels. + + The labels look like: + source English I love you . ID1 target French Je t'aime . ID1 source + English the cat ID1 target French le chat ID1 source English ... + + We want to assign weight 1.0 to all words in the target text (including the + ID1 end symbol), but not to the source text or the boilerplate. In the + above example, the target words that get positive weight are: + Je t'aime . ID1 le chat ID1 + + Args: + labels: a Tensor + Returns: + a Tensor + """ + eos_mask = tf.to_int32(tf.equal(labels, 1)) + sentence_num = tf.cumsum(eos_mask, axis=1, exclusive=True) + in_target = tf.equal(tf.mod(sentence_num, 2), 1) + # first two tokens of each sentence are boilerplate. 
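+  # Sketch of the trick below: comparing sentence_num + 1 with a copy of
+  # itself shifted right by two positions is unequal exactly at the first two
+  # tokens of each sentence (where the sentence id just changed), which zeroes
+  # out the "target French"-style boilerplate tokens.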
+  sentence_num_plus_one = sentence_num + 1
+  shifted = tf.pad(sentence_num_plus_one,
+                   [[0, 0], [2, 0], [0, 0], [0, 0]])[:, :-2, :, :]
+  nonboilerplate = tf.equal(sentence_num_plus_one, shifted)
+  ret = tf.to_float(tf.logical_and(nonboilerplate, in_target))
+  return ret
+
+
+def padded_cross_entropy(logits,
+                         labels,
+                         label_smoothing,
+                         weights_fn=weights_nonzero,
+                         reduce_sum=True):
+  """Compute cross-entropy assuming 0s are padding.
+
+  Computes a loss numerator (the sum of losses), and loss denominator
+  (the number of non-padding tokens).
+
+  Args:
+    logits: a `Tensor` with shape `[batch, timesteps, vocab_size]`.
+    labels: an integer `Tensor` with shape `[batch, timesteps]`.
+    label_smoothing: a floating point `Scalar`.
+    weights_fn: A function from labels to weights.
+    reduce_sum: a Boolean, whether to sum at the end or not.
+
+  Returns:
+    loss_numerator: a `Scalar`. Sum of losses.
+    loss_denominator: a `Scalar`. The number of non-padding target tokens.
+  """
+  confidence = 1.0 - label_smoothing
+  vocab_size = tf.shape(logits)[-1]
+  with tf.name_scope("padded_cross_entropy", [logits, labels]):
+    pad_labels = pad_with_zeros(logits, labels)
+    xent = smoothing_cross_entropy(logits, pad_labels, vocab_size, confidence)
+    weights = weights_fn(pad_labels)
+    if not reduce_sum:
+      return xent * weights, weights
+    return tf.reduce_sum(xent * weights), tf.reduce_sum(weights)
+
+
+def smoothing_cross_entropy(logits, labels, vocab_size, confidence):
+  """Cross entropy with label smoothing to limit over-confidence."""
+  with tf.name_scope("smoothing_cross_entropy", [logits, labels]):
+    # Low confidence is given to all non-true labels, uniformly.
+    low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1)
+    # Normalizing constant is the best cross-entropy value with soft targets.
+    # We subtract it just for readability; it makes no difference on learning.
+    normalizing = -(confidence * tf.log(confidence) + tf.to_float(
+        vocab_size - 1) * low_confidence * tf.log(low_confidence + 1e-20))
+    # Soft targets.
+    soft_targets = tf.one_hot(
+        tf.cast(labels, tf.int32),
+        depth=vocab_size,
+        on_value=confidence,
+        off_value=low_confidence)
+    xentropy = tf.nn.softmax_cross_entropy_with_logits(
+        logits=logits, labels=soft_targets)
+    return xentropy - normalizing
diff --git a/tensor2tensor/models/common_layers_test.py b/tensor2tensor/models/common_layers_test.py
new file mode 100644
index 000000000..2bd6a53ad
--- /dev/null
+++ b/tensor2tensor/models/common_layers_test.py
@@ -0,0 +1,290 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
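+# These tests can be run directly, e.g. (illustrative invocation):
+#   python tensor2tensor/models/common_layers_test.py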
+ +"""Tests for common layers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +from tensor2tensor.models import common_layers + +import tensorflow as tf + + +class CommonLayersTest(tf.test.TestCase): + + def testStandardizeImages(self): + x = np.random.rand(5, 7, 7, 3) + with self.test_session() as session: + y = common_layers.standardize_images(tf.constant(x)) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 7, 3)) + + def testImageAugmentation(self): + x = np.random.rand(500, 500, 3) + with self.test_session() as session: + y = common_layers.image_augmentation(tf.constant(x)) + res = session.run(y) + self.assertEqual(res.shape, (299, 299, 3)) + + def testSaturatingSigmoid(self): + x = np.array([-120.0, -100.0, 0.0, 100.0, 120.0], dtype=np.float32) + with self.test_session() as session: + y = common_layers.saturating_sigmoid(tf.constant(x)) + res = session.run(y) + self.assertAllClose(res, [0.0, 0.0, 0.5, 1.0, 1.0]) + + def testFlatten4D3D(self): + x = np.random.random_integers(1, high=8, size=(3, 5, 2)) + with self.test_session() as session: + y = common_layers.flatten4d3d(common_layers.embedding(x, 10, 7)) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (3, 5 * 2, 7)) + + def testEmbedding(self): + x = np.random.random_integers(1, high=8, size=(3, 5)) + with self.test_session() as session: + y = common_layers.embedding(x, 10, 16) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (3, 5, 16)) + + def testConv(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.conv(tf.constant(x, dtype=tf.float32), 13, (3, 3)) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 5, 1, 13)) + + def testSeparableConv(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.separable_conv( + tf.constant(x, dtype=tf.float32), 13, (3, 3)) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 5, 1, 13)) + + def testSubSeparableConv(self): + for sep in [0, 1, 2, 4]: + x = np.random.rand(5, 7, 1, 12) + with self.test_session() as session: + with tf.variable_scope("sep_%d" % sep): + y = common_layers.subseparable_conv( + tf.constant(x, dtype=tf.float32), 16, (3, 3), separability=sep) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 5, 1, 16)) + + def testConvBlock(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.conv_block( + tf.constant(x, dtype=tf.float32), + 13, [(1, (3, 3)), (1, (3, 3))], + padding="SAME", + normalizer_fn=common_layers.noam_norm) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 1, 13)) + + def testSeparableConvBlock(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.separable_conv_block( + tf.constant(x, dtype=tf.float32), + 13, [(1, (3, 3)), (1, (3, 3))], + padding="SAME") + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 1, 13)) + + def testSubSeparableConvBlock(self): + for sep in [0, 1, 2, 4]: + x = np.random.rand(5, 7, 1, 12) + with self.test_session() as session: + with tf.variable_scope("sep_%d" 
% sep): + y = common_layers.subseparable_conv_block( + tf.constant(x, dtype=tf.float32), + 16, [(1, (3, 3)), (1, (3, 3))], + padding="SAME", + separability=sep) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 1, 16)) + + def testPool(self): + x = np.random.rand(5, 8, 1, 11) + with self.test_session() as session: + y = common_layers.pool( + tf.constant(x, dtype=tf.float32), (2, 2), "AVG", "SAME") + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 8, 1, 11)) + + def testConvBlockDownsample(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.conv_block_downsample( + tf.constant(x, dtype=tf.float32), (3, 1), (2, 1), "SAME") + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 4, 1, 27)) + + def testSimpleAttention(self): + x = np.random.rand(5, 7, 1, 11) + y = np.random.rand(5, 9, 1, 11) + with self.test_session() as session: + a = common_layers.simple_attention( + tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32)) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (5, 7, 1, 11)) + + def testGetTimingSignal(self): + length = 7 + num_timescales = 10 + with self.test_session() as session: + a = common_layers.get_timing_signal(length, num_timescales=num_timescales) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (length, 2 * num_timescales)) + + def testAddTimingSignal(self): + batch = 5 + length = 7 + height = 3 + depth = 35 + x = np.random.rand(batch, length, height, depth) + with self.test_session() as session: + a = common_layers.add_timing_signal(tf.constant(x, dtype=tf.float32)) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (batch, length, height, depth)) + + def testAttention1D(self): + batch = 5 + target_length = 7 + source_length = 13 + source_depth = 9 + target_depth = 11 + attention_size = 21 + output_size = 15 + num_heads = 7 + source = np.random.rand(batch, source_length, source_depth) + target = np.random.rand(batch, target_length, target_depth) + mask = np.random.rand(batch, target_length, source_length) + with self.test_session() as session: + a = common_layers.attention_1d_v0( + tf.constant(source, dtype=tf.float32), + tf.constant(target, dtype=tf.float32), attention_size, output_size, + num_heads, tf.constant(mask, dtype=tf.float32)) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (batch, target_length, output_size)) + + def testMultiscaleConvSum(self): + x = np.random.rand(5, 9, 1, 11) + with self.test_session() as session: + y = common_layers.multiscale_conv_sum( + tf.constant(x, dtype=tf.float32), + 13, [((1, 1), (5, 5)), ((2, 2), (3, 3))], + "AVG", + padding="SAME") + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 9, 1, 13)) + + def testConvGRU(self): + x = np.random.rand(5, 7, 3, 11) + with self.test_session() as session: + y = common_layers.conv_gru(tf.constant(x, dtype=tf.float32), (1, 3), 11) + z = common_layers.conv_gru( + tf.constant(x, dtype=tf.float32), (1, 3), 11, padding="LEFT") + session.run(tf.global_variables_initializer()) + res1 = session.run(y) + res2 = session.run(z) + self.assertEqual(res1.shape, (5, 7, 3, 11)) + self.assertEqual(res2.shape, (5, 7, 3, 11)) + + 
def testLayerNorm(self): + x = np.random.rand(5, 7, 11) + with self.test_session() as session: + y = common_layers.layer_norm(tf.constant(x, dtype=tf.float32), 11) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 11)) + + def testConvLSTM(self): + x = np.random.rand(5, 7, 11, 13) + with self.test_session() as session: + y = common_layers.conv_lstm(tf.constant(x, dtype=tf.float32), (1, 3), 13) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 11, 13)) + + def testPadToSameLength(self): + x1 = np.random.rand(5, 7, 11) + x2 = np.random.rand(5, 9, 11) + with self.test_session() as session: + a, b = common_layers.pad_to_same_length( + tf.constant(x1, dtype=tf.float32), tf.constant(x2, dtype=tf.float32)) + c, d = common_layers.pad_to_same_length( + tf.constant(x1, dtype=tf.float32), + tf.constant(x2, dtype=tf.float32), + final_length_divisible_by=4) + res1, res2 = session.run([a, b]) + res1a, res2a = session.run([c, d]) + self.assertEqual(res1.shape, (5, 9, 11)) + self.assertEqual(res2.shape, (5, 9, 11)) + self.assertEqual(res1a.shape, (5, 12, 11)) + self.assertEqual(res2a.shape, (5, 12, 11)) + + def testShiftLeft(self): + x1 = np.zeros((5, 7, 1, 11)) + x1[:, 0, :] = np.ones_like(x1[:, 0, :]) + expected = np.zeros((5, 7, 1, 11)) + expected[:, 1, :] = np.ones_like(expected[:, 1, :]) + with self.test_session() as session: + a = common_layers.shift_left(tf.constant(x1, dtype=tf.float32)) + actual = session.run(a) + self.assertAllEqual(actual, expected) + + def testConvStride2MultiStep(self): + x1 = np.random.rand(5, 32, 1, 11) + with self.test_session() as session: + a = common_layers.conv_stride2_multistep( + tf.constant(x1, dtype=tf.float32), 4, 16) + session.run(tf.global_variables_initializer()) + actual = session.run(a[0]) + self.assertEqual(actual.shape, (5, 2, 1, 16)) + + def testDeconvStride2MultiStep(self): + x1 = np.random.rand(5, 2, 1, 11) + with self.test_session() as session: + a = common_layers.deconv_stride2_multistep( + tf.constant(x1, dtype=tf.float32), 4, 16) + session.run(tf.global_variables_initializer()) + actual = session.run(a) + self.assertEqual(actual.shape, (5, 32, 1, 16)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py new file mode 100644 index 000000000..bf19a307b --- /dev/null +++ b/tensor2tensor/models/models.py @@ -0,0 +1,32 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Models defined in T2T. 
Imports here force registration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +# pylint: disable=unused-import +from tensor2tensor.models import attention_lm +from tensor2tensor.models import baseline +from tensor2tensor.models import bytenet +from tensor2tensor.models import multimodel +from tensor2tensor.models import neural_gpu +from tensor2tensor.models import slicenet +from tensor2tensor.models import transformer +from tensor2tensor.models import xception +# pylint: enable=unused-import diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py new file mode 100644 index 000000000..bcbf16995 --- /dev/null +++ b/tensor2tensor/models/multimodel.py @@ -0,0 +1,159 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MultiModel.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.models import common_layers +from tensor2tensor.models import slicenet +from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def experts(xs, moe_n1, moe_n2, hidden_size, filter_size, dp, ps, train): + """Mixture-of-Experts layer.""" + # Set up the hyperparameters for the gating networks. + primary_gating_hp = eu.NoisyTopKGatingParams() + primary_gating_hp.num_experts = moe_n1 + if moe_n2: + # Hierarchical MoE containing moe_n1 groups of moe_n2 experts. + assert moe_n2 > 1 + secondary_gating_hp = eu.NoisyTopKGatingParams() + secondary_gating_hp.num_experts = moe_n2 + else: + # Flat mixture of moe_n1 experts. + secondary_gating_hp = None + # Set up the hyperparameters for the expert networks. + # Each expert contains a hidden RELU layer of size filter_size + expert_hp = eu.FeedForwardExpertParams() + expert_hp.hidden_layer_sizes = [filter_size] + # Create the mixture of experts. + moe = eu.DistributedMixtureOfExperts(primary_gating_hp, secondary_gating_hp, + expert_hp, hidden_size, hidden_size, ps, + "moe") + # MoE expects input tensors to be 2d. Flatten out spatial dimensions. + xs_2d = dp(tf.reshape, xs, [[-1, hidden_size]] * dp.n) + # Call the MoE + moe_out_2d, importance, load, _, _ = moe.Eval( + dp.devices, xs_2d, train, summaries=False, identifiers=None) + # Reshape the output to the original shape. + moe_out = dp(tf.reshape, moe_out_2d, dp(tf.shape, xs)) + # These losses encourage equal load on the different experts. + loss = eu.CVSquared(importance) + eu.CVSquared(load) + + # Apply residual and normalize. 
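+ # CVSquared above is the squared coefficient of variation, Var(x) / mean(x)**2, + # a scale-invariant penalty that is minimized when every expert receives the + # same total importance and load. The helper below applies the usual residual + # pattern: add the MoE output back to its input, then layer-normalize the sum.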
+ def add_and_normalize(x, y): + return common_layers.layer_norm(x + y, hidden_size, name="moe_norm") + + return dp(add_and_normalize, xs, moe_out), loss + + +@registry.register_model +class MultiModel(t2t_model.T2TModel): + + def model_fn_body_sharded(self, sharded_features, train): + dp = self._data_parallelism + hparams = self._hparams + targets = sharded_features["targets"] + + def flatten(inputs): + return tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2) + + inputs = dp(flatten, sharded_features["inputs"]) + + # Encode inputs. + def encode_half(inputs, inputs_mask, hparams): + # Add timing and encode. + inputs = common_layers.add_timing_signal(inputs) + return slicenet.multi_conv_res(inputs, "SAME", "encoder1", + hparams.num_hidden_layers // 2, + hparams, train, mask=inputs_mask) + + target_space_emb = dp(slicenet.embed_target_space, + sharded_features["target_space_id"], + hparams.hidden_size) + inputs_pad = dp(slicenet.embedding_to_padding, inputs) + inputs_mask = dp(lambda x: 1.0 - x, inputs_pad) + inputs_encoded = dp(encode_half, inputs, inputs_mask, hparams) + with tf.variable_scope("experts_enc"): + inputs_encoded, expert_loss = experts( + inputs_encoded, hparams.moe_n1, hparams.moe_n2, hparams.hidden_size, + hparams.hidden_size, dp, self._ps_devices, train) + expert_loss *= hparams.moe_loss_coef + inputs_encoded = dp( + slicenet.multi_conv_res, inputs_encoded, "SAME", + "encoder2", hparams.num_hidden_layers, hparams, train, + mask=inputs_mask) + + # If we're just predicting a class, there is no use for a decoder; return. + target_modality = hparams.problems[self._problem_idx].target_modality + if "class_label_modality" in target_modality.name: + return inputs_encoded, tf.reduce_mean(expert_loss) + + # Do the middle part. + decoder_start, similarity_loss = dp( + slicenet.slicenet_middle, inputs_encoded, targets, + target_space_emb, inputs_mask, hparams, train) + + # Decode. + decoder_half = dp( + slicenet.multi_conv_res, + decoder_start, + "LEFT", + "decoder1", + hparams.num_hidden_layers // 2, + hparams, + train, + mask=inputs_mask, + source=inputs_encoded) + with tf.variable_scope("experts_dec"): + decoder_half, expert_dec_loss = experts( + decoder_half, hparams.moe_n1, hparams.moe_n2, hparams.hidden_size, + hparams.hidden_size, dp, self._ps_devices, train) + expert_loss += expert_dec_loss * hparams.moe_loss_coef + decoder_final = dp( + slicenet.multi_conv_res, + decoder_half, + "LEFT", + "decoder2", + hparams.num_hidden_layers // 2, + hparams, + train, + mask=inputs_mask, + source=inputs_encoded) + + total_loss = tf.reduce_mean(expert_loss) + tf.reduce_mean(similarity_loss) + return decoder_final, total_loss + + +@registry.register_hparams("multimodel1p8") +def multimodel_params1_p8(): + """Version for eight-problem runs.""" + hparams = slicenet.slicenet_params1() + hparams.problem_choice = "distributed" + hparams.attention_type = "simple" # TODO(lukaszkaiser): add transformer. + hparams.hidden_size = 1536 + hparams.moe_n1 = 120 + hparams.shared_embedding_and_softmax_weights = int(False) + hparams.dropout = 0.1 + hparams.attention_dropout = 0.1 + hparams.learning_rate_decay_scheme = "exp500k" + return hparams diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py new file mode 100644 index 000000000..8df682c5c --- /dev/null +++ b/tensor2tensor/models/multimodel_test.py @@ -0,0 +1,55 @@ +# Copyright 2017 Google Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for MultiModel.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import multimodel +from tensor2tensor.models import slicenet + +import tensorflow as tf + + +class MultiModelTest(tf.test.TestCase): + + def testMultiModel(self): + x = np.random.random_integers(0, high=255, size=(3, 5, 4, 3)) + y = np.random.random_integers(0, high=9, size=(3, 5, 1, 1)) + hparams = slicenet.slicenet_params1_tiny() + p_hparams = problem_hparams.image_cifar10(hparams) + hparams.problems = [p_hparams] + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = multimodel.MultiModel(hparams, p_hparams) + sharded_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 1, 1, 1, 10)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py new file mode 100644 index 000000000..39aa735e1 --- /dev/null +++ b/tensor2tensor/models/neural_gpu.py @@ -0,0 +1,123 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""The Neural GPU model and its variants.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def neural_gpu(inputs, hparams, train, name=None): + """The core Neural GPU.""" + with tf.variable_scope(name, "neural_gpu"): + + def step(state, inp): # pylint: disable=missing-docstring + x = tf.nn.dropout(state, 1.0 - hparams.dropout * tf.to_float(train)) + for layer in xrange(hparams.num_hidden_layers): + x = common_layers.conv_gru( + x, (hparams.kernel_height, hparams.kernel_width), + hparams.hidden_size, + name="cgru_%d" % layer) + # Padding input is zeroed-out in the modality; we detect it here by summing.
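+ # An all-zero (padded) input position has absolute sum ~0, so the + # comparison below flags, per batch entry, whether this step's input is + # padding; tf.where then carries the previous state through unchanged + # wherever that is the case.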
+ padding_inp = tf.less(tf.reduce_sum(tf.abs(inp), axis=[1, 2]), 0.00001) + new_state = tf.where(padding_inp, state, x) # No-op where inp is padding. + return new_state + + return tf.foldl( + step, + tf.transpose(inputs, [1, 0, 2, 3]), + initializer=inputs, + parallel_iterations=1, + swap_memory=True) + + +@registry.register_model +class NeuralGPU(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return neural_gpu(features["inputs"], self._hparams, train) + + +def diagonal_neural_gpu(inputs, hparams, train, name=None): + """Improved Neural GPU as in https://arxiv.org/abs/1702.08727.""" + with tf.variable_scope(name, "diagonal_neural_gpu"): + + def step(state_tup, inp): + """Single step of the improved Neural GPU.""" + state, _ = state_tup + x = state + for layer in xrange(hparams.num_hidden_layers): + x, new_loss = common_layers.diagonal_conv_gru( + x, (hparams.kernel_height, hparams.kernel_width), + hparams.hidden_size, + train, + dropout=hparams.dropout, + name="dcgru_%d" % layer) + # Padding input is zeroed-out in the modality; we detect it here by summing. + padding_inp = tf.less(tf.reduce_sum(tf.abs(inp), axis=[1, 2]), 0.00001) + new_state = tf.where(padding_inp, state, x) # No-op where inp is padding. + return new_state, new_loss + + final_state, losses = tf.scan( + step, + tf.transpose(inputs, [1, 0, 2, 3]), + initializer=(inputs, tf.constant(0.0)), + parallel_iterations=1, + swap_memory=True) + return final_state[0, :, :, :, :], 2.0 * tf.reduce_mean(losses) + + +@registry.register_model +class DiagonalNeuralGPU(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return diagonal_neural_gpu(features["inputs"], self._hparams, train) + + +@registry.register_hparams("neural_gpu1") +def neural_gpu_params1(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.batch_size = 1024 + hparams.num_hidden_layers = 1 + hparams.hidden_size = 256 + hparams.dropout = 0.1 + hparams.label_smoothing = 0.0 + hparams.clip_grad_norm = 10.0 + hparams.kernel_height = 3 + hparams.kernel_width = 1 + hparams.learning_rate_decay_scheme = "exp50k" + hparams.learning_rate = 0.02 + hparams.learning_rate_warmup_steps = 3000 + hparams.initializer_gain = 1.0 + hparams.weight_decay = 0.0 + hparams.num_sampled_classes = 0 + hparams.sampling_method = "argmax" + hparams.optimizer_adam_epsilon = 1e-6 + hparams.optimizer_adam_beta1 = 0.85 + hparams.optimizer_adam_beta2 = 0.997 + return hparams diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py new file mode 100644 index 000000000..0d4937a5d --- /dev/null +++ b/tensor2tensor/models/neural_gpu_test.py @@ -0,0 +1,62 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +"""Tests for Neural GPU.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import common_hparams +from tensor2tensor.models import neural_gpu + +import tensorflow as tf + + +class NeuralGPUTest(tf.test.TestCase): + + def testNeuralGPU(self): + hparams = common_hparams.basic_params1() + batch_size = 3 + input_length = 5 + target_length = input_length + input_vocab_size = 9 + target_vocab_size = 11 + p_hparams = problem_hparams.test_problem_hparams(hparams, input_vocab_size, + target_vocab_size) + inputs = -1 + np.random.random_integers( + input_vocab_size, size=(batch_size, input_length, 1, 1)) + targets = -1 + np.random.random_integers( + target_vocab_size, size=(batch_size, target_length, 1, 1)) + with self.test_session() as session: + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.int32) + } + model = neural_gpu.NeuralGPU(hparams, p_hparams) + shadred_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(shadred_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (batch_size, target_length, 1, 1, + target_vocab_size)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py new file mode 100644 index 000000000..a7e2623cc --- /dev/null +++ b/tensor2tensor/models/slicenet.py @@ -0,0 +1,391 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""SliceNet.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import zip # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_attention +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def get_norm(hparams): + """Get the normalizer function.""" + if hparams.normalizer_fn == "layer": + return lambda x, name: common_layers.layer_norm( # pylint: disable=g-long-lambda + x, hparams.hidden_size, name=name) + if hparams.normalizer_fn == "batch": + return tf.layers.batch_normalization + if hparams.normalizer_fn == "noam": + return common_layers.noam_norm + if hparams.normalizer_fn == "none": + return lambda x, name: x + raise ValueError("Parameter normalizer_fn must be one of: 'layer', 'batch'," + "'noam', 'none'.") + + +def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, + bias=None): + """Complete attention layer with preprocessing.""" + separabilities = [hparams.separability, hparams.separability] + if hparams.separability < 0: + separabilities = [hparams.separability - 1, hparams.separability] + targets_timed = common_layers.subseparable_conv_block( + common_layers.add_timing_signal(targets_shifted), + hparams.hidden_size, [((1, 1), (5, 1)), ((4, 1), (5, 1))], + normalizer_fn=norm_fn, + padding="LEFT", + separabilities=separabilities, + name="targets_time") + if hparams.attention_type == "transformer": + targets_timed = tf.squeeze(targets_timed, 2) + target_shape = tf.shape(targets_timed) + targets_segment = tf.zeros([target_shape[0], target_shape[1]]) + target_attention_bias = common_attention.attention_bias( + targets_segment, targets_segment, lower_triangular=True) + inputs_attention_bias = tf.zeros([ + tf.shape(inputs_encoded)[0], hparams.num_heads, + tf.shape(targets_segment)[1], + tf.shape(inputs_encoded)[1] + ]) + + attention_dropout = hparams.attention_dropout * tf.to_float(train) + qv = common_attention.multihead_attention( + targets_timed, + None, + target_attention_bias, + hparams.hidden_size, + hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + attention_dropout, + name="self_attention", + summaries=False) + qv = common_attention.multihead_attention( + qv, + inputs_encoded, + inputs_attention_bias, + hparams.hidden_size, + hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + attention_dropout, + name="encdec_attention", + summaries=False) + return tf.expand_dims(qv, 2) + elif hparams.attention_type == "simple": + targets_with_attention = common_layers.simple_attention( + targets_timed, inputs_encoded, bias=bias, summaries=False) + return norm_fn(targets_shifted + targets_with_attention, name="attn_norm") + + +def multi_conv_res(x, padding, name, layers, hparams, train, + mask=None, source=None): + """A stack of separable convolution blocks with residual connections.""" + with tf.variable_scope(name): + padding_bias = None + if mask is not None: + padding_bias = (1.0 - mask) * -1e9 # Bias to not attend to padding. + if padding == "LEFT": # Do not mask anything when left-padding. 
+ mask = None + if (hparams.kernel_scheme in _KERNEL_SCHEMES and + hparams.dilation_scheme in _DILATION_SCHEMES): + kernels = _KERNEL_SCHEMES[hparams.kernel_scheme] + dilations = _DILATION_SCHEMES[hparams.dilation_scheme] + dilations_and_kernels = list(zip(dilations, kernels)) + dilations_and_kernels1 = dilations_and_kernels[:2] + dilations_and_kernels2 = dilations_and_kernels[2:] + else: + k = (hparams.kernel_height, hparams.kernel_width) + k2 = (hparams.large_kernel_size, 1) + dilations_and_kernels1 = [((1, 1), k), ((1, 1), k)] + dilations_and_kernels2 = [((1, 1), k2), ((4, 4), k2)] + separabilities1 = [hparams.separability, hparams.separability] + separabilities2 = [hparams.separability] * len(dilations_and_kernels2) + if hparams.separability < 0: + separabilities1 = [hparams.separability - 1, hparams.separability] + separabilities2 = [ + hparams.separability - i + for i in reversed(range(len(dilations_and_kernels2))) + ] + norm_fn = get_norm(hparams) + for layer in xrange(layers): + with tf.variable_scope("layer_%d" % layer): + y = common_layers.subseparable_conv_block( + x, + hparams.hidden_size, + dilations_and_kernels1, + normalizer_fn=norm_fn, + padding=padding, + mask=mask, + separabilities=separabilities1, + name="residual1") + x += common_layers.subseparable_conv_block( + x + y, + hparams.hidden_size, + dilations_and_kernels2, + normalizer_fn=norm_fn, + padding=padding, + mask=mask, + separabilities=separabilities2, + name="residual2") + y + if source is not None and hparams.attention_type != "none": + x += attention(x, source, norm_fn, hparams, train, bias=padding_bias) + if mask is not None: + x *= mask + return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + + +def rank_loss(sentence_emb, image_emb, margin=0.2): + """Experimental rank loss, thanks to kkurach@ for the code.""" + with tf.name_scope("rank_loss"): + # Normalize first as this is assumed in cosine similarity later. + sentence_emb = tf.nn.l2_normalize(sentence_emb, 1) + image_emb = tf.nn.l2_normalize(image_emb, 1) + # Both sentence_emb and image_emb have size [batch, depth]. + scores = tf.matmul(image_emb, tf.transpose(sentence_emb)) # [batch, batch] + diagonal = tf.diag_part(scores) # [batch] + cost_s = tf.maximum(0.0, margin - diagonal + scores) # [batch, batch] + cost_im = tf.maximum( + 0.0, margin - tf.reshape(diagonal, [-1, 1]) + scores) # [batch, batch] + # Clear diagonals. + batch_size = tf.shape(sentence_emb)[0] + empty_diagonal_mat = tf.ones_like(cost_s) - tf.eye(batch_size) + cost_s *= empty_diagonal_mat + cost_im *= empty_diagonal_mat + return tf.reduce_mean(cost_s) + tf.reduce_mean(cost_im) + + +def similarity_cost(inputs_encoded, targets_encoded): + """Loss encouraging inputs to be more similar to their own targets than to others.""" + # This is a first, very simple version: handle variable length by padding + # to the same length and putting everything into the batch. A better way + # is needed. + x, y = common_layers.pad_to_same_length(inputs_encoded, targets_encoded) + depth = tf.shape(inputs_encoded)[3] + x, y = tf.reshape(x, [-1, depth]), tf.reshape(y, [-1, depth]) + return rank_loss(x, y) + + +def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, + hparams, train): + """Middle part of slicenet, connecting encoder and decoder.""" + norm_fn = get_norm(hparams) + + # Flatten targets and embed target_space_id.
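+ # targets_flat below is [batch, length, 1, hidden_size]; the target-space + # embedding is tiled across the batch so it can later serve as the pad + # value for the first position when the targets are shifted (shift_left + # below).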
+ targets_flat = tf.expand_dims(common_layers.flatten4d3d(targets), axis=2) + target_space_emb = tf.tile(target_space_emb, + [tf.shape(targets_flat)[0], 1, 1, 1]) + + # Calculate similarity loss (but don't run if not needed). + if len(hparams.problems) > 1 and hparams.sim_loss_mult > 0.00001: + targets_timed = common_layers.add_timing_signal(targets_flat) + extra_layers = int(hparams.num_hidden_layers * 1.5) + with tf.variable_scope(tf.get_variable_scope(), reuse=True): + targets_encoded = multi_conv_res(targets_timed, "SAME", "encoder", + extra_layers, hparams, train) + with tf.variable_scope("similarity_loss"): + similarity_loss = similarity_cost(inputs_encoded, targets_encoded) + similarity_loss *= hparams.sim_loss_mult + else: + similarity_loss = 0.0 + + # Use attention from each target to look at input and retrieve. + targets_shifted = common_layers.shift_left( + targets_flat, pad_value=target_space_emb) + if hparams.attention_type == "none": + targets_with_attention = tf.zeros_like(targets_shifted) + else: + inputs_padding_bias = (1.0 - mask) * -1e9 # Bias to not attend to padding. + targets_with_attention = attention( + targets_shifted, inputs_encoded, norm_fn, hparams, train, + bias=inputs_padding_bias) + + # Positional targets: merge attention and raw. + kernel = (hparams.kernel_height, hparams.kernel_width) + targets_merged = common_layers.subseparable_conv_block( + tf.concat([targets_with_attention, targets_shifted], axis=3), + hparams.hidden_size, [((1, 1), kernel)], + normalizer_fn=norm_fn, + padding="LEFT", + separability=4, + name="targets_merge") + + return targets_merged, similarity_loss + + +def embed_target_space(target_space_id, hidden_size): + target_space_emb = common_layers.embedding( + target_space_id, 32, hidden_size, name="target_space_embedding") + return tf.reshape(target_space_emb, [1, 1, 1, -1]) + + +def embedding_to_padding(emb): + """Input embeddings -> is_padding.""" + emb_sum = tf.reduce_sum(tf.abs(emb), axis=-1, keep_dims=True) + return tf.to_float(tf.equal(emb_sum, 0.0)) + + +def slicenet_internal(inputs, targets, target_space, + problem_idx, hparams, train): + """The slicenet model, main step used for training.""" + with tf.variable_scope("slicenet"): + # Flatten inputs and encode. + inputs = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2) + inputs_mask = 1.0 - embedding_to_padding(inputs) + inputs = common_layers.add_timing_signal(inputs) # Add position info. + target_space_emb = embed_target_space(target_space, hparams.hidden_size) + extra_layers = int(hparams.num_hidden_layers * 1.5) + inputs_encoded = multi_conv_res(inputs, "SAME", "encoder", extra_layers, + hparams, train, mask=inputs_mask) + target_modality_name = hparams.problems[problem_idx].target_modality.name + if "class_label_modality" in target_modality_name: + # If we're just predicting a class, there is no use for a decoder. + return inputs_encoded + # Do the middle part. + decoder_start, similarity_loss = slicenet_middle( + inputs_encoded, targets, target_space_emb, inputs_mask, hparams, train) + # Decode.
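+ # "LEFT" padding makes the decoder convolutions causal, so each position + # sees only earlier target positions; the encoder output is attended to + # through the `source` argument of multi_conv_res.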
+ decoder_final = multi_conv_res( + decoder_start, + "LEFT", + "decoder", + hparams.num_hidden_layers, + hparams, + train, + mask=inputs_mask, + source=inputs_encoded) + return decoder_final, tf.reduce_mean(similarity_loss) + + +@registry.register_model +class SliceNet(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return slicenet_internal(features["inputs"], features["targets"], + features["target_space_id"], self._problem_idx, + self._hparams, train) + +_KERNEL_SCHEMES = { + "3.3.3.3": [(3, 1), (3, 1), (3, 1), (3, 1)], + "3.7.7.7": [(3, 1), (7, 1), (7, 1), (7, 1)], + "3.7.15.15": [(3, 1), (7, 1), (15, 1), (15, 1)], + "3.7.15.31": [(3, 1), (7, 1), (15, 1), (31, 1)], + "3.7.15.31.63": [(3, 1), (7, 1), (15, 1), (31, 1), (63, 1)], +} +_DILATION_SCHEMES = { + "1.1.1.1.1": [(1, 1), (1, 1), (1, 1), (1, 1), (1, 1)], + "1.1.1.1": [(1, 1), (1, 1), (1, 1), (1, 1)], + "1.1.1.2": [(1, 1), (1, 1), (1, 1), (2, 1)], + "1.1.2.4": [(1, 1), (1, 1), (2, 1), (4, 1)], + "1.2.4.8": [(1, 1), (2, 1), (4, 1), (8, 1)], +} + + +@registry.register_hparams("slicenet1") +def slicenet_params1(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.batch_size = 1024 + hparams.hidden_size = 768 + hparams.dropout = 0.5 + hparams.symbol_dropout = 0.2 + hparams.label_smoothing = 0.1 + hparams.clip_grad_norm = 2.0 + hparams.num_hidden_layers = 4 + hparams.kernel_height = 3 + hparams.kernel_width = 1 + hparams.add_hparam("normalizer_fn", "layer") # New ones are added like this. + hparams.learning_rate_decay_scheme = "exp50k" + hparams.learning_rate = 0.05 + hparams.learning_rate_warmup_steps = 3000 + hparams.initializer_gain = 1.0 + hparams.weight_decay = 3.0 + hparams.num_sampled_classes = 0 + hparams.sampling_method = "argmax" + hparams.optimizer_adam_epsilon = 1e-6 + hparams.optimizer_adam_beta1 = 0.85 + hparams.optimizer_adam_beta2 = 0.997 + hparams.add_hparam("large_kernel_size", 15) + hparams.add_hparam("separability", -2) + # A dilation scheme, one of _DILATION_SCHEMES. + hparams.add_hparam("dilation_scheme", "1.1.1.1") + # A kernel scheme, one of _KERNEL_SCHEMES; overrides large_kernel_size. + hparams.add_hparam("kernel_scheme", "3.7.15.31") + hparams.add_hparam("audio_compression", 8) + hparams.add_hparam("moe_n1", 32) + hparams.add_hparam("moe_n2", 0) + hparams.add_hparam("moe_loss_coef", 1e-2) + hparams.add_hparam("imagenet_use_2d", int(True)) + # attention-related flags + hparams.add_hparam("attention_type", "simple") + hparams.add_hparam("num_heads", 8) + hparams.add_hparam("attention_key_channels", 0) + hparams.add_hparam("attention_value_channels", 0) + hparams.add_hparam("sim_loss_mult", 0.0) # Try 10.0 for experiments. 
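+ # sim_loss_mult scales the similarity (rank) loss computed in + # slicenet_middle; it only takes effect when training on more than one + # problem (see the len(hparams.problems) > 1 check there).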
+ hparams.add_hparam("attention_dropout", 0.2) + hparams.shared_embedding_and_softmax_weights = int(True) + return hparams + + +@registry.register_hparams("slicenet1noam") +def slicenet_params1_noam(): + """Version with Noam's decay scheme.""" + hparams = slicenet_params1() + hparams.learning_rate_decay_scheme = "noam" + hparams.learning_rate = 1.0 + hparams.learning_rate_warmup_steps = 4000 + hparams.initializer = "uniform_unit_scaling" + hparams.optimizer_adam_epsilon = 1e-9 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.98 + return hparams + + +@registry.register_hparams("slicenet1tiny") +def slicenet_params1_tiny(): + """Version for fast local runs.""" + hparams = slicenet_params1() + hparams.attention_type = "simple" + hparams.separability = 0 + hparams.hidden_size = 128 + hparams.num_hidden_layers = 2 + hparams.moe_n1 = 2 + hparams.batch_size = 512 + hparams.learning_rate_warmup_steps = 200 + return hparams + + +@registry.register_ranged_hparams("slicenet1") +def slicenet_range1(ranged_hparams): + """Small range of hyperparameters.""" + rhp = ranged_hparams + + hparams = slicenet_params1() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + + rhp.set_float("clip_grad_norm", 1.0, 10.0, scale=rhp.LOG_SCALE) + rhp.set_float("learning_rate", 0.02, 1.0, scale=rhp.LOG_SCALE) + rhp.set_float("optimizer_adam_beta2", 0.995, 0.998) + rhp.set_float("weight_decay", 1.0, 5.0) diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py new file mode 100644 index 000000000..bbeb3a284 --- /dev/null +++ b/tensor2tensor/models/slicenet_test.py @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for SliceNet.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import slicenet + +import tensorflow as tf + + +class SliceNetTest(tf.test.TestCase): + + def testSliceNet(self): + x = np.random.random_integers(0, high=255, size=(3, 5, 4, 3)) + y = np.random.random_integers(0, high=9, size=(3, 5, 1, 1)) + hparams = slicenet.slicenet_params1_tiny() + p_hparams = problem_hparams.image_cifar10(hparams) + hparams.problems = [p_hparams] + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = slicenet.SliceNet(hparams, p_hparams) + sharded_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 1, 1, 1, 10)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py new file mode 100644 index 000000000..379210d67 --- /dev/null +++ b/tensor2tensor/models/transformer.py @@ -0,0 +1,495 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""transformer (attention). + +encoder: [Self-Attention, Feed-forward] x n +decoder: [Self-Attention, Source-Target-Attention, Feed-forward] x n + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_attention +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class Transformer(t2t_model.T2TModel): + """Attention net. See file docstring.""" + + def model_fn_body(self, features, train): + # Remove dropout if not training + hparams = copy.copy(self._hparams) + if not train: + hparams.attention_dropout = 0. + hparams.relu_dropout = 0. + hparams.residual_dropout = 0. 
+ targets = features["targets"] + inputs = features.get("inputs") + target_space = features.get("target_space_id") + + inputs = tf.squeeze(inputs, 2) + targets = tf.squeeze(targets, 2) + + (encoder_input, encoder_attention_bias, _) = (transformer_prepare_encoder( + inputs, target_space, hparams)) + (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( + targets, hparams) + + def residual_fn(x, y): + return common_layers.layer_norm(x + tf.nn.dropout( + y, 1.0 - hparams.residual_dropout)) + + # encoder_input = tf.squeeze(encoder_input, 2) + # decoder_input = tf.squeeze(decoder_input, 2) + encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) + decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) + encoder_output = transformer_encoder(encoder_input, residual_fn, + encoder_attention_bias, hparams) + + decoder_output = transformer_decoder( + decoder_input, encoder_output, residual_fn, decoder_self_attention_bias, + encoder_attention_bias, hparams) + decoder_output = tf.expand_dims(decoder_output, 2) + + return decoder_output + + +def transformer_prepare_encoder(inputs, target_space, hparams): + """Prepare one shard of the model for the encoder. + + Args: + inputs: a Tensor. + target_space: a Tensor. + hparams: run hyperparameters + + Returns: + encoder_input: a Tensor, bottom of encoder stack + encoder_self_attention_bias: a Tensor, containing large negative values + to implement masked attention and possibly biases for diagonal + alignments + encoder_padding: a Tensor + """ + # Flatten inputs. + ishape_static = inputs.shape.as_list() + encoder_input = inputs + encoder_padding = common_attention.embedding_to_padding(encoder_input) + encoder_self_attention_bias = common_attention.attention_bias_ignore_padding( + encoder_padding) + # Append target_space_id embedding to inputs. + emb_target_space = common_layers.embedding( + target_space, 32, ishape_static[-1], name="target_space_embedding") + emb_target_space = tf.reshape(emb_target_space, [1, 1, -1]) + encoder_input += emb_target_space + if hparams.pos == "timing": + encoder_input = common_attention.add_timing_signal_1d(encoder_input) + return (encoder_input, encoder_self_attention_bias, encoder_padding) + + +def transformer_prepare_decoder(targets, hparams): + """Prepare one shard of the model for the decoder. + + Args: + targets: a Tensor. + hparams: run hyperparameters + + Returns: + decoder_input: a Tensor, bottom of decoder stack + decoder_self_attention_bias: a Tensor, containing large negative values + to implement masked attention and possibly biases for diagonal alignments + """ + decoder_self_attention_bias = ( + common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) + decoder_input = common_layers.shift_left_3d(targets) + if hparams.pos == "timing": + decoder_input = common_attention.add_timing_signal_1d(decoder_input) + return (decoder_input, decoder_self_attention_bias) + + +def transformer_encoder(encoder_input, + residual_fn, + encoder_self_attention_bias, + hparams, + name="encoder"): + """A stack of transformer layers. + + Args: + encoder_input: a Tensor + residual_fn: a function from (layer_input, layer_output) -> combined_output + encoder_self_attention_bias: bias Tensor for self-attention + (see common_attention.attention_bias()) + hparams: hyperparameters for model + name: a string + + Returns: + y: a Tensor + """ + x = encoder_input + # Summaries don't work in multi-problem setting yet.
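+ # so attention summaries are enabled only when a single problem is trained. + # Each layer below applies multi-head self-attention and then a + # position-wise feed-forward block (conv_hidden_relu), both wrapped in + # residual_fn.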
+ summaries = "problems" not in hparams.values() or len(hparams.problems) == 1 + with tf.variable_scope(name): + for layer in xrange(hparams.num_hidden_layers): + with tf.variable_scope("layer_%d" % layer): + x = residual_fn( + x, + common_attention.multihead_attention( + x, + None, + encoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + summaries=summaries, + name="encoder_self_attention")) + x = residual_fn(x, + common_layers.conv_hidden_relu( + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout)) + return x + + +def transformer_decoder(decoder_input, + encoder_output, + residual_fn, + decoder_self_attention_bias, + encoder_decoder_attention_bias, + hparams, + name="decoder"): + """A stack of transformer layers. + + Args: + decoder_input: a Tensor + encoder_output: a Tensor + residual_fn: a function from (layer_input, layer_output) -> combined_output + decoder_self_attention_bias: bias Tensor for self-attention + (see common_attention.attention_bias()) + encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention + (see common_attention.attention_bias()) + hparams: hyperparameters for model + name: a string + + Returns: + y: a Tensor + """ + x = decoder_input + # Summaries don't work in multi-problem setting yet. + summaries = "problems" not in hparams.values() or len(hparams.problems) == 1 + with tf.variable_scope(name): + for layer in xrange(hparams.num_hidden_layers): + with tf.variable_scope("layer_%d" % layer): + x = residual_fn( + x, + common_attention.multihead_attention( + x, + None, + decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + summaries=summaries, + name="decoder_self_attention")) + x = residual_fn( + x, + common_attention.multihead_attention( + x, + encoder_output, + encoder_decoder_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + summaries=summaries, + name="encdec_attention")) + x = residual_fn(x, + common_layers.conv_hidden_relu( + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout)) + return x + + +@registry.register_hparams +def transformer_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.hidden_size = 512 + hparams.batch_size = 4096 + hparams.max_length = 256 + hparams.dropout = 0.0 + hparams.clip_grad_norm = 0. # i.e. no gradient clipping + hparams.optimizer_adam_epsilon = 1e-9 + hparams.learning_rate_decay_scheme = "noam" + hparams.learning_rate = 0.1 + hparams.learning_rate_warmup_steps = 4000 + hparams.initializer_gain = 1.0 + hparams.num_hidden_layers = 6 + hparams.initializer = "uniform_unit_scaling" + hparams.weight_decay = 0.0 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.98 + hparams.num_sampled_classes = 0 + hparams.label_smoothing = 0.1 + hparams.shared_embedding_and_softmax_weights = int(True) + + hparams.add_hparam("filter_size", 2048) # Add new ones like this.
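+ # These defaults correspond to the base model of "Attention Is All You + # Need": 6 layers, hidden size 512, filter size 2048, 8 heads (set just + # below), Adam, and the noam learning-rate schedule.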
+ # attention-related flags + hparams.add_hparam("num_heads", 8) + hparams.add_hparam("attention_key_channels", 0) + hparams.add_hparam("attention_value_channels", 0) + hparams.add_hparam("attention_dropout", 0.0) + hparams.add_hparam("relu_dropout", 0.0) + hparams.add_hparam("pos", "timing") # timing, none + hparams.add_hparam("residual_dropout", 0.1) + hparams.add_hparam("nbr_decoder_problems", 1) + return hparams + + +@registry.register_hparams +def transformer_single_gpu(): + hparams = transformer_base() + hparams.batch_size = 8192 + hparams.learning_rate_warmup_steps = 16000 + hparams.batching_mantissa_bits = 2 + return hparams + + +@registry.register_hparams +def transformer_tiny(): + hparams = transformer_base() + hparams.hidden_size = 64 + hparams.filter_size = 128 + hparams.num_heads = 4 + return hparams + + +@registry.register_hparams +def transformer_l2(): + hparams = transformer_base() + hparams.num_hidden_layers = 2 + return hparams + + +@registry.register_hparams +def transformer_l4(): + hparams = transformer_base() + hparams.num_hidden_layers = 4 + return hparams + + +@registry.register_hparams +def transformer_l8(): + hparams = transformer_base() + hparams.num_hidden_layers = 8 + return hparams + + +@registry.register_hparams +def transformer_h1(): + hparams = transformer_base() + hparams.num_heads = 1 + return hparams + + +@registry.register_hparams +def transformer_h4(): + hparams = transformer_base() + hparams.num_heads = 4 + return hparams + + +@registry.register_hparams +def transformer_h16(): + hparams = transformer_base() + hparams.num_heads = 16 + return hparams + + +@registry.register_hparams +def transformer_h32(): + hparams = transformer_base() + hparams.num_heads = 32 + return hparams + + +@registry.register_hparams +def transformer_k128(): + hparams = transformer_base() + hparams.attention_key_channels = 128 + return hparams + + +@registry.register_hparams +def transformer_k256(): + hparams = transformer_base() + hparams.attention_key_channels = 256 + return hparams + + +@registry.register_hparams +def transformer_ff1024(): + hparams = transformer_base() + hparams.filter_size = 1024 + return hparams + + +@registry.register_hparams +def transformer_ff4096(): + hparams = transformer_base() + hparams.filter_size = 4096 + return hparams + + +@registry.register_hparams +def transformer_dr0(): + hparams = transformer_base() + hparams.residual_dropout = 0.0 + return hparams + + +@registry.register_hparams +def transformer_dr2(): + hparams = transformer_base() + hparams.residual_dropout = 0.2 + return hparams + + +@registry.register_hparams +def transformer_ls0(): + hparams = transformer_base() + hparams.label_smoothing = 0.0 + return hparams + + +@registry.register_hparams +def transformer_ls2(): + hparams = transformer_base() + hparams.label_smoothing = 0.2 + return hparams + + +@registry.register_hparams +def transformer_hs256(): + hparams = transformer_base() + hparams.hidden_size = 256 + return hparams + + +@registry.register_hparams +def transformer_hs1024(): + hparams = transformer_base() + hparams.hidden_size = 1024 + return hparams + + +@registry.register_hparams +def transformer_big_dr1(): + hparams = transformer_base() + hparams.hidden_size = 1024 + hparams.filter_size = 4096 + hparams.num_heads = 16 + hparams.residual_dropout = 0.1 + hparams.batching_mantissa_bits = 2 + return hparams + + +@registry.register_hparams +def transformer_big_enfr(): + hparams = transformer_big_dr1() + hparams.shared_embedding_and_softmax_weights = int(False) + 
hparams.filter_size = 8192 + hparams.residual_dropout = 0.1 + return hparams + + +@registry.register_hparams +def transformer_big_dr2(): + hparams = transformer_big_dr1() + hparams.residual_dropout = 0.2 + return hparams + + +@registry.register_hparams +def transformer_big_dr3(): + hparams = transformer_big_dr1() + hparams.residual_dropout = 0.3 + return hparams + + +@registry.register_hparams +def transformer_big_single_gpu(): + hparams = transformer_big_dr1() + hparams.learning_rate_warmup_steps = 16000 + hparams.optimizer_adam_beta2 = 0.998 + hparams.batching_mantissa_bits = 3 + return hparams + + +@registry.register_hparams +def transformer_parsing_base_dr6(): + """HParams for parsing on WSJ only.""" + hparams = transformer_base() + hparams.attention_dropout = 0.2 + hparams.residual_dropout = 0.2 + hparams.max_length = 512 + hparams.learning_rate_warmup_steps = 16000 + hparams.hidden_size = 1024 + hparams.learning_rate = 0.5 + hparams.shared_embedding_and_softmax_weights = int(False) + return hparams + + +@registry.register_hparams +def transformer_parsing_big(): + """HParams for parsing on WSJ semi-supervised.""" + hparams = transformer_big_dr1() + hparams.max_length = 512 + hparams.shared_source_target_embedding = int(False) + hparams.learning_rate_warmup_steps = 4000 + hparams.batch_size = 2048 + hparams.learning_rate = 0.5 + return hparams + + +@registry.register_ranged_hparams("transformer_big_single_gpu") +def transformer_range1(rhp): + """Small range of hyperparameters.""" + hparams = transformer_big_single_gpu() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + + rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) + rhp.set_float("initializer_gain", 0.5, 2.0) + rhp.set_float("optimizer_adam_beta2", 0.97, 0.99) + rhp.set_float("weight_decay", 0.0, 2.0) diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py new file mode 100644 index 000000000..1b43ce625 --- /dev/null +++ b/tensor2tensor/models/transformer_test.py @@ -0,0 +1,63 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +"""Tests for Transformer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import transformer + +import tensorflow as tf + + +class TransformerTest(tf.test.TestCase): + + def _testTransformer(self, net): + batch_size = 3 + input_length = 5 + target_length = 7 + vocab_size = 9 + hparams = transformer.transformer_tiny() + p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size, + vocab_size) + inputs = -1 + np.random.random_integers( + vocab_size, size=(batch_size, input_length, 1, 1)) + targets = -1 + np.random.random_integers( + vocab_size, size=(batch_size, target_length, 1, 1)) + with self.test_session() as session: + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = net(hparams, p_hparams) + shadred_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(shadred_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (batch_size, target_length, 1, 1, vocab_size)) + + def testTransformer(self): + self._testTransformer(transformer.Transformer) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py new file mode 100644 index 000000000..b6e271c36 --- /dev/null +++ b/tensor2tensor/models/xception.py @@ -0,0 +1,89 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xception.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def residual_block(x, hparams, train): + """A stack of convolution blocks with residual connection.""" + k = (hparams.kernel_height, hparams.kernel_width) + dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] + y = common_layers.subseparable_conv_block( + x, + hparams.hidden_size, + dilations_and_kernels, + padding="SAME", + separability=0, + name="residual_block") + x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") + return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + + +def xception_internal(inputs, hparams, train): + """Xception body.""" + with tf.variable_scope("xception"): + cur = inputs + for i in xrange(hparams.num_hidden_layers): + with tf.variable_scope("layer_%d" % i): + cur = residual_block(cur, hparams, train) + return cur + + +@registry.register_model +class Xception(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return xception_internal(features["inputs"], self._hparams, train) + + +@registry.register_hparams +def xception_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.batch_size = 4096 + hparams.hidden_size = 768 + hparams.dropout = 0.2 + hparams.symbol_dropout = 0.2 + hparams.label_smoothing = 0.1 + hparams.clip_grad_norm = 2.0 + hparams.num_hidden_layers = 8 + hparams.kernel_height = 3 + hparams.kernel_width = 3 + hparams.learning_rate_decay_scheme = "exp50k" + hparams.learning_rate = 0.05 + hparams.learning_rate_warmup_steps = 3000 + hparams.initializer_gain = 1.0 + hparams.weight_decay = 3.0 + hparams.num_sampled_classes = 0 + hparams.sampling_method = "argmax" + hparams.optimizer_adam_epsilon = 1e-6 + hparams.optimizer_adam_beta1 = 0.85 + hparams.optimizer_adam_beta2 = 0.997 + hparams.add_hparam("imagenet_use_2d", True) + return hparams diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py new file mode 100644 index 000000000..106604659 --- /dev/null +++ b/tensor2tensor/models/xception_test.py @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xception tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import xception + +import tensorflow as tf + + +class XceptionTest(tf.test.TestCase): + + def testXception(self): + vocab_size = 9 + x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1)) + y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 1, 1, 1)) + hparams = xception.xception_base() + p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size, + vocab_size) + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + } + model = xception.Xception(hparams, p_hparams) + sharded_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 5, 1, 1, vocab_size)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/__init__.py b/tensor2tensor/utils/__init__.py new file mode 100644 index 000000000..27d533abc --- /dev/null +++ b/tensor2tensor/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/tensor2tensor/utils/avg_checkpoints.py b/tensor2tensor/utils/avg_checkpoints.py new file mode 100644 index 000000000..4e5286f62 --- /dev/null +++ b/tensor2tensor/utils/avg_checkpoints.py @@ -0,0 +1,98 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Script to average values of variables in a list of checkpoint files.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +import six +from six.moves import zip # pylint: disable=redefined-builtin +import tensorflow as tf + +flags = tf.flags +FLAGS = flags.FLAGS + +flags.DEFINE_string("checkpoints", "", + "Comma-separated list of checkpoints to average.") +flags.DEFINE_string("prefix", "", + "Prefix (e.g., directory) to append to each checkpoint.") +flags.DEFINE_string("output_path", "/tmp/averaged.ckpt", + "Path to output the averaged checkpoint to.") + + +def checkpoint_exists(path): + return (tf.gfile.Exists(path) or tf.gfile.Exists(path + ".meta") or + tf.gfile.Exists(path + ".index")) + + +def main(unused_argv): + # Get the checkpoints list from flags and run some basic checks. + checkpoints = [c.strip() for c in FLAGS.checkpoints.split(",")] + checkpoints = [c for c in checkpoints if c] + if not checkpoints: + raise ValueError("No checkpoints provided for averaging.") + if flags.FLAGS.prefix: + checkpoints = [FLAGS.prefix + c for c in checkpoints] + checkpoints = [c for c in checkpoints if checkpoint_exists(c)] + if not checkpoints: + raise ValueError( + "None of the provided checkpoints exist. %s" % FLAGS.checkpoints) + + # Read variables from all checkpoints and average them. + tf.logging.info("Reading variables and averaging checkpoints:") + for c in checkpoints: + tf.logging.info("%s ", c) + var_list = tf.contrib.framework.list_variables(checkpoints[0]) + var_values, var_dtypes = {}, {} + for (name, shape) in var_list: + if not name.startswith("global_step"): + var_values[name] = np.zeros(shape) + for checkpoint in checkpoints: + reader = tf.contrib.framework.load_checkpoint(checkpoint) + for name in var_values: + tensor = reader.get_tensor(name) + var_dtypes[name] = tensor.dtype + var_values[name] += tensor + tf.logging.info("Read from checkpoint %s", checkpoint) + for name in var_values: # Average. + var_values[name] /= len(checkpoints) + + tf_vars = [ + tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[name]) + for v in var_values + ] + placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars] + assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] + global_step = tf.Variable( + 0, name="global_step", trainable=False, dtype=tf.int64) + saver = tf.train.Saver(tf.all_variables()) + + # Build a model consisting only of variables, set them to the average values. + with tf.Session() as sess: + sess.run(tf.initialize_all_variables()) + for p, assign_op, (name, value) in zip(placeholders, assign_ops, + six.iteritems(var_values)): + sess.run(assign_op, {p: value}) + # Use the built saver to save the averaged checkpoint. + saver.save(sess, flags.FLAGS.output_path, global_step=global_step) + + tf.logging.info("Averaged checkpoints saved in %s", flags.FLAGS.output_path) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py new file mode 100644 index 000000000..eacbf467f --- /dev/null +++ b/tensor2tensor/utils/beam_search.py @@ -0,0 +1,419 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implementation of beam search with penalties."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+import tensorflow as tf
+
+# Assuming EOS_ID is 1
+EOS_ID = 1
+# Default value for INF
+INF = 1. * 1e7
+
+
+def log_prob_from_logits(logits):
+  return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
+
+
+def compute_batch_indices(batch_size, beam_size):
+  """Computes the i'th coordinate that contains the batch index for gathers.
+
+  batch_pos is a tensor like [[0,0,0,0],[1,1,1,1],...]. It says which
+  batch the beam item is in. This will create the i of the i,j coordinate
+  needed for the gather.
+
+  Args:
+    batch_size: Batch size
+    beam_size: Size of the beam.
+  Returns:
+    batch_pos: [batch_size, beam_size] tensor of ids
+  """
+  batch_pos = tf.range(batch_size * beam_size) // beam_size
+  batch_pos = tf.reshape(batch_pos, [batch_size, beam_size])
+  return batch_pos
+
+
+def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags,
+                                beam_size, batch_size):
+  """Given sequences and scores, will gather the top k=beam_size sequences.
+
+  This function is used to grow alive and finished. It takes sequences,
+  scores, and flags, and returns the top k from sequences, scores_to_gather,
+  and flags based on the values in scores.
+
+  Args:
+    sequences: Tensor of sequences that we need to gather from.
+      [batch_size, beam_size, seq_length]
+    scores: Tensor of scores for each sequence in sequences.
+      [batch_size, beam_size]. We will use these to compute the topk.
+    scores_to_gather: Tensor of scores for each sequence in sequences.
+      [batch_size, beam_size]. We will return the gathered scores from here.
+      scores_to_gather is different from scores because for grow_alive we
+      need to return log_probs, while for grow_finished we need to return
+      the length-penalized scores.
+    flags: Tensor of bools for sequences that say whether a sequence has
+      reached EOS or not.
+    beam_size: int
+    batch_size: int
+  Returns:
+    Tuple of
+    (topk_seq [batch_size, beam_size, decode_length],
+     topk_gathered_scores [batch_size, beam_size],
+     topk_finished_flags [batch_size, beam_size])
+  """
+  _, topk_indexes = tf.nn.top_k(scores, k=beam_size)
+  # The next three steps are to create coordinates for tf.gather_nd to pull
+  # out the topk sequences from sequences based on scores.
+  # batch_pos is a tensor like [[0,0,0,0],[1,1,1,1],...]. It says which
+  # batch the beam item is in. This will create the i of the i,j coordinate
+  # needed for the gather.
+  batch_pos = compute_batch_indices(batch_size, beam_size)
+
+  # top_coordinates will give us the actual coordinates to do the gather.
+  # stacking will create a tensor of dimension batch * beam * 2, where the
+  # last dimension contains the i,j gathering coordinates.
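+  # For example, with batch_size=2 and beam_size=3, batch_pos is
+  # [[0, 0, 0], [1, 1, 1]]. If topk_indexes is [[2, 0, 1], [1, 2, 0]],
+  # stacking gives coordinates [[[0, 2], [0, 0], [0, 1]],
+  # [[1, 1], [1, 2], [1, 0]]], so the gather below selects
+  # sequences[i, topk_indexes[i, j]] for each position (i, j).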
+  top_coordinates = tf.stack([batch_pos, topk_indexes], axis=2)
+
+  # Gather up the highest scoring sequences
+  topk_seq = tf.gather_nd(sequences, top_coordinates)
+  topk_flags = tf.gather_nd(flags, top_coordinates)
+  topk_gathered_scores = tf.gather_nd(scores_to_gather, top_coordinates)
+  return topk_seq, topk_gathered_scores, topk_flags
+
+
+def beam_search(symbols_to_logits_fn,
+                initial_ids,
+                beam_size,
+                decode_length,
+                vocab_size,
+                alpha,
+                eos_id=EOS_ID):
+  """Beam search with length penalties.
+
+  Uses an interface specific to the sequence CNN models; requires a function
+  that can take the currently decoded symbols and return the logits for the
+  next symbol. The implementation is inspired by
+  https://arxiv.org/abs/1609.08144.
+
+  Args:
+    symbols_to_logits_fn: Interface to the model, to provide logits.
+      Should take [batch_size, decoded_ids] and return
+      [batch_size, vocab_size].
+    initial_ids: Ids to start off the decoding, this will be the first thing
+      handed to symbols_to_logits_fn (after expanding to beam size)
+      [batch_size]
+    beam_size: Size of the beam.
+    decode_length: Number of steps to decode for.
+    vocab_size: Size of the vocab, must equal the size of the logits
+      returned by symbols_to_logits_fn.
+    alpha: alpha for length penalty.
+    eos_id: ID for end of sentence.
+  Returns:
+    Tuple of
+    (decoded beams [batch_size, beam_size, decode_length],
+     decoding probabilities [batch_size, beam_size])
+  """
+  batch_size = tf.shape(initial_ids)[0]
+
+  # Assume initial_ids are prob 1.0
+  initial_log_probs = tf.constant([[0.] + [-float("inf")] * (beam_size - 1)])
+  # Expand to beam_size (batch_size, beam_size)
+  alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])
+
+  # Expand each batch to beam_size
+  alive_seq = tf.tile(tf.expand_dims(initial_ids, 1), [1, beam_size])
+  alive_seq = tf.expand_dims(alive_seq, 2)  # (batch_size, beam_size, 1)
+
+  # Finished will keep track of all the sequences that have finished so far
+  # Finished log probs will be negative infinity in the beginning
+  # finished_flags will keep track of booleans
+  finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
+  # Setting the scores of the initial to negative infinity.
+  finished_scores = tf.ones([batch_size, beam_size]) * -INF
+  finished_flags = tf.zeros([batch_size, beam_size], tf.bool)
+
+  def grow_finished(finished_seq, finished_scores, finished_flags, curr_seq,
+                    curr_scores, curr_finished):
+    """Given sequences and scores, will gather the top k=beam_size sequences.
+
+    Args:
+      finished_seq: Current finished sequences.
+        [batch_size, beam_size, current_decoded_length]
+      finished_scores: scores for each of these sequences.
+        [batch_size, beam_size]
+      finished_flags: finished bools for each of these sequences.
+        [batch_size, beam_size]
+      curr_seq: current topk sequence that has been grown by one position.
+        [batch_size, beam_size, current_decoded_length]
+      curr_scores: scores for each of these sequences. [batch_size, beam_size]
+      curr_finished: Finished flags for each of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Tuple of
+      (Topk sequences based on scores,
+       log probs of these sequences,
+       Finished flags of these sequences)
+    """
+    # First append a column of 0-ids to finished_seq so it matches curr_seq
+    # in length.
+    finished_seq = tf.concat(
+        [finished_seq,
+         tf.zeros([batch_size, beam_size, 1], tf.int32)], axis=2)
+
+    # Set the scores of the unfinished seq in curr_seq to large negative
+    # values
+    curr_scores += (1. - tf.to_float(curr_finished)) * -INF
+    # Concatenate the sequences and scores along the beam axis.
+    curr_finished_seq = tf.concat([finished_seq, curr_seq], axis=1)
+    curr_finished_scores = tf.concat([finished_scores, curr_scores], axis=1)
+    curr_finished_flags = tf.concat([finished_flags, curr_finished], axis=1)
+    return compute_topk_scores_and_seq(
+        curr_finished_seq, curr_finished_scores, curr_finished_scores,
+        curr_finished_flags, beam_size, batch_size)
+
+  def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished):
+    """Given sequences and scores, will gather the top k=beam_size sequences.
+
+    Args:
+      curr_seq: current topk sequence that has been grown by one position.
+        [batch_size, beam_size, i+1]
+      curr_scores: scores for each of these sequences. [batch_size, beam_size]
+      curr_log_probs: log probs for each of these sequences.
+        [batch_size, beam_size]
+      curr_finished: Finished flags for each of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Tuple of
+      (Topk sequences based on scores,
+       log probs of these sequences,
+       Finished flags of these sequences)
+    """
+    # Set the scores of the finished seq in curr_seq to large negative
+    # values
+    curr_scores += tf.to_float(curr_finished) * -INF
+    return compute_topk_scores_and_seq(curr_seq, curr_scores, curr_log_probs,
+                                       curr_finished, beam_size, batch_size)
+
+  def grow_topk(i, alive_seq, alive_log_probs):
+    r"""Inner beam search loop.
+
+    This function takes the current alive sequences, and grows them to topk
+    sequences where k = 2*beam_size. We use 2*beam_size because we could
+    have beam_size sequences that hit EOS, which would leave no alive
+    sequences to continue. With 2*beam_size, this will not happen. This
+    relies on the assumption that vocab_size > beam_size. If this is true,
+    we'll have at least beam_size non-EOS extensions if we extract the next
+    top 2*beam_size words.
+    The length penalty is ((5 + len(decode)) / 6) ** alpha, and scores are
+    log probs divided by it; please refer to
+    https://arxiv.org/abs/1609.08144.
+
+    Args:
+      i: loop index
+      alive_seq: Topk sequences decoded so far [batch_size, beam_size, i+1]
+      alive_log_probs: probabilities of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Tuple of
+      (Topk sequences extended by the next word,
+       The log probs of these sequences,
+       The scores with length penalty of these sequences,
+       Flags indicating which of these sequences have finished decoding)
+    """
+    # Get the logits for all the possible next symbols
+    flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])
+
+    # (batch_size * beam_size, decoded_length)
+    flat_logits = symbols_to_logits_fn(flat_ids)
+    logits = tf.reshape(flat_logits, (batch_size, beam_size, -1))
+
+    # Convert logits to normalized log probs
+    candidate_log_probs = log_prob_from_logits(logits)
+
+    # Multiply the probabilities by the current probabilities of the beam,
+    # i.e. add their log probs.
+    # (batch_size, beam_size, vocab_size) + (batch_size, beam_size, 1)
+    log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
+
+    length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)
+
+    curr_scores = log_probs / length_penalty
+    # Flatten out (beam_size, vocab_size) probs into a list of possibilities
+    flat_curr_scores = tf.reshape(curr_scores, [-1, beam_size * vocab_size])
+
+    topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores, k=beam_size * 2)
+
+    # Recovering the log probs because we will need to send them back
+    topk_log_probs = topk_scores * length_penalty
+
+    # Work out what beam the top probs are in.
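+    # For example, with beam_size=2 and vocab_size=3, the flattened id 5 in
+    # flat_curr_scores came from beam 5 // 3 = 1 and corresponds to vocab
+    # symbol 5 % 3 = 2.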
+    topk_beam_index = topk_ids // vocab_size
+    topk_ids %= vocab_size  # Unflatten the ids
+
+    # The next three steps are to create coordinates for tf.gather_nd to
+    # pull out the correct sequences from the ids that we need to grow.
+    # We will also use the coordinates to gather the booleans of the beam
+    # items that survived.
+    batch_pos = compute_batch_indices(batch_size, beam_size * 2)
+
+    # top beams will give us the actual coordinates to do the gather.
+    # stacking will create a tensor of dimension batch * beam * 2, where the
+    # last dimension contains the i,j gathering coordinates.
+    topk_coordinates = tf.stack([batch_pos, topk_beam_index], axis=2)
+
+    # Gather up the most probable 2*beam_size candidates, both the ids and
+    # the finished_in_alive bools.
+    topk_seq = tf.gather_nd(alive_seq, topk_coordinates)
+
+    # Append the most probable alive
+    topk_seq = tf.concat([topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
+
+    topk_finished = tf.equal(topk_ids, eos_id)
+
+    return topk_seq, topk_log_probs, topk_scores, topk_finished
+
+  def inner_loop(i, alive_seq, alive_log_probs, finished_seq, finished_scores,
+                 finished_flags):
+    """Inner beam search loop.
+
+    There are three groups of tensors: alive, finished, and topk.
+    The alive group contains information about the current alive sequences.
+    The topk group contains information about alive + topk current decoded
+    words. The finished group contains information about finished sentences,
+    that is, the ones that have decoded to EOS. These are what we return.
+    The general beam search algorithm is as follows:
+    While we haven't terminated (see the termination condition below):
+      1. Grow the current alive to get beam*2 topk sequences
+      2. Among the topk, keep the top beam_size ones that haven't reached
+         EOS into alive
+      3. Among the topk, keep the top beam_size ones that have reached EOS
+         into finished
+    Repeat
+    To make things simple with using fixed size tensors, we will end
+    up inserting unfinished sequences into finished in the beginning. To
+    stop that we add -INF to the scores of the unfinished sequences so that
+    when a true finished sequence does appear, it will have a higher score
+    than all the unfinished ones.
+
+    Args:
+      i: loop index
+      alive_seq: Topk sequences decoded so far [batch_size, beam_size, i+1]
+      alive_log_probs: probabilities of the beams. [batch_size, beam_size]
+      finished_seq: Current finished sequences.
+        [batch_size, beam_size, i+1]
+      finished_scores: scores for each of these sequences.
+        [batch_size, beam_size]
+      finished_flags: finished bools for each of these sequences.
+        [batch_size, beam_size]
+
+    Returns:
+      Tuple of
+      (Incremented loop index,
+       New alive sequences,
+       Log probs of the alive sequences,
+       New finished sequences,
+       Scores of the new finished sequences,
+       Flags indicating which sequences in finished have reached EOS)
+    """
+
+    # Each inner loop, we carry out three steps:
+    # 1. Get the current topk items.
+    # 2. Separate the ones that have finished from the ones that haven't.
+    # 3. Recompute the contents of finished based on scores.
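+    # For example, with beam_size=2, grow_topk proposes 4 candidates;
+    # grow_alive keeps the best 2 that have not finished (finished
+    # candidates are masked with -INF), while grow_finished merges any newly
+    # finished candidates into the running finished set and keeps the best 2
+    # overall.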
+    topk_seq, topk_log_probs, topk_scores, topk_finished = grow_topk(
+        i, alive_seq, alive_log_probs)
+    alive_seq, alive_log_probs, _ = grow_alive(topk_seq, topk_scores,
+                                               topk_log_probs, topk_finished)
+    finished_seq, finished_scores, finished_flags = grow_finished(
+        finished_seq, finished_scores, finished_flags, topk_seq, topk_scores,
+        topk_finished)
+
+    return (i + 1, alive_seq, alive_log_probs, finished_seq, finished_scores,
+            finished_flags)
+
+  def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
+                   finished_scores, finished_in_finished):
+    """Checking termination condition.
+
+    We terminate when we decoded up to decode_length, or when the lowest
+    scoring item in finished has a greater score than the highest-probability
+    item in alive divided by the maximum length penalty.
+
+    Args:
+      i: loop index
+      alive_log_probs: probabilities of the beams. [batch_size, beam_size]
+      finished_scores: scores for each of these sequences.
+        [batch_size, beam_size]
+      finished_in_finished: finished bools for each of these sequences.
+        [batch_size, beam_size]
+
+    Returns:
+      Bool.
+    """
+    max_length_penalty = tf.pow(((5. + tf.to_float(decode_length)) / 6.),
+                                alpha)
+    # The best possible score of the most likely alive sequence
+    lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty
+
+    # Now to compute the lowest score of a finished sequence in finished.
+    # If the sequence isn't finished, we multiply its score by 0. Since
+    # scores are all negative, taking the min will give us the score of the
+    # lowest finished item.
+    lowest_score_of_finished_in_finished = tf.reduce_min(
+        finished_scores * tf.to_float(finished_in_finished), axis=1)
+    # If none of the sequences have finished, then the min will be 0 and
+    # we have to replace it by -INF. The score of any seq in alive will be
+    # much higher than -INF and the termination condition will not be met.
+    lowest_score_of_finished_in_finished += (
+        (1. - tf.to_float(tf.reduce_any(finished_in_finished, 1))) * -INF)
+
+    bound_is_met = tf.reduce_all(
+        tf.greater(lowest_score_of_finished_in_finished,
+                   lower_bound_alive_scores))
+
+    return tf.logical_and(
+        tf.less(i, decode_length), tf.logical_not(bound_is_met))
+
+  (_, alive_seq, alive_log_probs, finished_seq, finished_scores,
+   finished_flags) = tf.while_loop(
+       _is_finished,
+       inner_loop, [
+           tf.constant(0), alive_seq, alive_log_probs, finished_seq,
+           finished_scores, finished_flags
+       ],
+       shape_invariants=[
+           tf.TensorShape([]),
+           tf.TensorShape([None, None, None]),
+           alive_log_probs.get_shape(),
+           tf.TensorShape([None, None, None]),
+           finished_scores.get_shape(),
+           finished_flags.get_shape()
+       ],
+       parallel_iterations=1,
+       back_prop=False)
+
+  alive_seq.set_shape((None, beam_size, None))
+  finished_seq.set_shape((None, beam_size, None))
+
+  # Accounting for corner case: It's possible that no sequence in alive for
+  # a particular batch item ever reached EOS. If tf.reduce_any(
+  # finished_flags, 1) is False for a batch index, no sequence for that
+  # batch index reached EOS, so we should just copy the contents of alive
+  # for that batch item. We need to do the same for the scores as well.
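+  # For example, if finished_flags is [False, False] for some batch item,
+  # tf.reduce_any is False there, and tf.where below falls back to the
+  # alive sequences and alive log probs for that item.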
+  finished_seq = tf.where(
+      tf.reduce_any(finished_flags, 1), finished_seq, alive_seq)
+  finished_scores = tf.where(
+      tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
+  return finished_seq, finished_scores
diff --git a/tensor2tensor/utils/beam_search_test.py b/tensor2tensor/utils/beam_search_test.py
new file mode 100644
index 000000000..33439b41f
--- /dev/null
+++ b/tensor2tensor/utils/beam_search_test.py
@@ -0,0 +1,281 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for tensor2tensor.beam_search."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+import numpy as np
+from tensor2tensor.utils import beam_search
+
+import tensorflow as tf
+
+
+class BeamSearchTest(tf.test.TestCase):
+
+  def testShapes(self):
+    batch_size = 2
+    beam_size = 3
+    vocab_size = 4
+    decode_length = 10
+
+    initial_ids = tf.constant([0, 0])  # GO
+
+    def symbols_to_logits(_):
+      # Just return random logits
+      return tf.random_uniform((batch_size * beam_size, vocab_size))
+
+    final_ids, final_probs = beam_search.beam_search(
+        symbols_to_logits, initial_ids, beam_size, decode_length, vocab_size,
+        0.)
+
+    self.assertEqual(final_ids.get_shape().as_list(), [None, beam_size, None])
+
+    self.assertEqual(final_probs.get_shape().as_list(), [None, beam_size])
+
+  def testComputeTopkScoresAndSeq(self):
+    batch_size = 2
+    beam_size = 3
+
+    sequences = tf.constant([[[2, 3], [4, 5], [6, 7], [19, 20]],
+                             [[8, 9], [10, 11], [12, 13], [80, 17]]])
+
+    scores = tf.constant([[-0.1, -2.5, 0., -1.5],
+                          [-100., -5., -0.00789, -1.34]])
+    flags = tf.constant([[True, False, False, True],
+                         [False, False, False, True]])
+
+    topk_seq, topk_scores, topk_flags = beam_search.compute_topk_scores_and_seq(
+        sequences, scores, scores, flags, beam_size, batch_size)
+
+    with self.test_session():
+      topk_seq = topk_seq.eval()
+      topk_scores = topk_scores.eval()
+      topk_flags = topk_flags.eval()
+
+    exp_seq = [[[6, 7], [2, 3], [19, 20]], [[12, 13], [80, 17], [10, 11]]]
+    exp_scores = [[0., -0.1, -1.5], [-0.00789, -1.34, -5.]]
+
+    exp_flags = [[False, True, True], [False, True, False]]
+    self.assertAllEqual(exp_seq, topk_seq)
+    self.assertAllClose(exp_scores, topk_scores)
+    self.assertAllEqual(exp_flags, topk_flags)
+
+  def testGreedyBatchOne(self):
+    batch_size = 1
+    beam_size = 1
+    vocab_size = 2
+    decode_length = 3
+
+    initial_ids = tf.constant([0] * batch_size)  # GO
+
+    # Test that beam search finds the most probable sequence.
+    # These probabilities represent the following search
+    #
+    #               G0 (0)
+    #                 / \
+    #                /   \
+    #               /     \
+    #              /       \
+    #           0(0.7)    1(0.3)
+    #             / \
+    #            /   \
+    #           /     \
+    #        0(0.4)  1(0.6)
+    #          /\
+    #         /  \
+    #        /    \
+    #     0(0.5) 1(0.5)
+    # and the following decoding probabilities
+    # 0000 - 0.7 * 0.4 * 0.5
+    # 0001 - 0.7 * 0.4 * 0.5
+    # 001 - 0.7 * 0.6 (Best)
+    # 01 - 0.3
+    #
+    # 001 is the most likely sequence under these probabilities.
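+    # With eos_id=1 and alpha=0 (no length penalty), "001" scores
+    # log(0.7 * 0.6) = log(0.42), which beats the immediate "01" (0.3) and
+    # any continuation of "000" (at most 0.7 * 0.4 * 0.5 = 0.14), so the
+    # search must return [0, 0, 1].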
+ probabilities = tf.constant([[[0.7, 0.3]], [[0.4, 0.6]], [[0.5, 0.5]]]) + + def symbols_to_logits(ids): + pos = tf.shape(ids)[1] + logits = tf.to_float(tf.log(probabilities[pos - 1, :])) + return logits + + final_ids, final_probs = beam_search.beam_search( + symbols_to_logits, + initial_ids, + beam_size, + decode_length, + vocab_size, + 0.0, + eos_id=1) + + with self.test_session(): + ids = final_ids.eval() + probs = final_probs.eval() + self.assertAllEqual([[[0, 0, 1]]], ids) + self.assertAllClose([[0.7 * 0.6]], np.exp(probs)) + + def testNotGreedyBeamTwo(self): + batch_size = 1 + beam_size = 2 + vocab_size = 3 + decode_length = 3 + + initial_ids = tf.constant([0] * batch_size) # GO + probabilities = tf.constant([[[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]], + [[0.4, 0.5, 0.1], [0.2, 0.4, 0.4]], + [[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]]]) + + def symbols_to_logits(ids): + pos = tf.shape(ids)[1] + logits = tf.to_float(tf.log(probabilities[pos - 1, :])) + return logits + + final_ids, final_probs = beam_search.beam_search( + symbols_to_logits, + initial_ids, + beam_size, + decode_length, + vocab_size, + 0.0, + eos_id=1) + + with self.test_session(): + ids = final_ids.eval() + probs = final_probs.eval() + self.assertAllEqual([[[0, 2, 1, 0], [0, 2, 0, 1]]], ids) + self.assertAllClose([[0.8 * 0.5, 0.8 * 0.4 * 0.9]], np.exp(probs)) + + def testGreedyWithCornerCase(self): + batch_size = 1 + beam_size = 1 + vocab_size = 3 + decode_length = 2 + + initial_ids = tf.constant([0] * batch_size) # GO + probabilities = tf.constant([[0.2, 0.1, 0.7], [0.4, 0.1, 0.5]]) + + def symbols_to_logits(ids): + pos = tf.shape(ids)[1] + logits = tf.to_float(tf.log(probabilities[pos - 1, :])) + return logits + + final_ids, final_probs = beam_search.beam_search( + symbols_to_logits, + initial_ids, + beam_size, + decode_length, + vocab_size, + 0.0, + eos_id=1) + + with self.test_session(): + ids = final_ids.eval() + probs = final_probs.eval() + self.assertAllEqual([[[0, 2, 2]]], ids) + self.assertAllClose([[0.7 * 0.5]], np.exp(probs)) + + def testNotGreedyBatchTwoBeamTwoWithAlpha(self): + batch_size = 2 + beam_size = 2 + vocab_size = 3 + decode_length = 3 + + initial_ids = tf.constant([0] * batch_size) # GO + # Probabilities for position * batch * beam * vocab + # Probabilities have been set such that with alpha = 3.5, the less probable + # but longer sequence will have a better score than the shorter sequence + # with higher log prob in batch 1, and the order will be reverse in batch + # 2. That is, the shorter sequence will still have a higher score in spite + # of the length penalty + probabilities = tf.constant([[[[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]], + [[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]], + [[[0.4, 0.5, 0.1], [0.2, 0.4, 0.4]], + [[0.3, 0.6, 0.1], [0.2, 0.4, 0.4]]], + [[[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]], + [[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]]]]) + + def symbols_to_logits(ids): + pos = tf.shape(ids)[1] + logits = tf.to_float(tf.log(probabilities[pos - 1, :])) + return logits + + final_ids, final_scores = beam_search.beam_search( + symbols_to_logits, + initial_ids, + beam_size, + decode_length, + vocab_size, + 3.5, + eos_id=1) + + with self.test_session(): + ids = final_ids.eval() + scores = final_scores.eval() + self.assertAllEqual([[[0, 2, 0, 1], [0, 2, 1, 0]], [[0, 2, 1, 0], + [0, 2, 0, 1]]], ids) + self.assertAllClose([[ + np.log(0.8 * 0.4 * 0.9) / (8. / 6.)**3.5, + np.log(0.8 * 0.5) / (7. / 6.)**3.5 + ], [ + np.log(0.8 * 0.6) / (7. / 6.)**3.5, + np.log(0.8 * 0.3 * 0.9) / (8. 
/ 6.)**3.5
+      ]], scores)
+
+  def testNotGreedyBeamTwoWithAlpha(self):
+    batch_size = 1
+    beam_size = 2
+    vocab_size = 3
+    decode_length = 3
+
+    initial_ids = tf.constant([0] * batch_size)  # GO
+    # Probabilities for position * batch * beam * vocab
+    # Probabilities have been set such that with alpha = 3.5, the less
+    # probable but longer sequence will have a better score than the shorter
+    # sequence with higher log prob.
+    probabilities = tf.constant([[[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],
+                                 [[0.4, 0.5, 0.1], [0.2, 0.4, 0.4]],
+                                 [[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]]])
+
+    def symbols_to_logits(ids):
+      pos = tf.shape(ids)[1]
+      logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
+      return logits
+
+    final_ids, final_scores = beam_search.beam_search(
+        symbols_to_logits,
+        initial_ids,
+        beam_size,
+        decode_length,
+        vocab_size,
+        3.5,
+        eos_id=1)
+
+    with self.test_session():
+      ids = final_ids.eval()
+      scores = final_scores.eval()
+      self.assertAllClose([[
+          np.log(0.8 * 0.4 * 0.9) / (8. / 6.)**3.5,
+          np.log(0.8 * 0.5) / (7. / 6.)**3.5
+      ]], scores)
+      self.assertAllEqual([[[0, 2, 0, 1], [0, 2, 1, 0]]], ids)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py
new file mode 100644
index 000000000..eb8749b3f
--- /dev/null
+++ b/tensor2tensor/utils/bleu_hook.py
@@ -0,0 +1,123 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""BLEU metric util used during eval for MT."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+
+# Dependency imports
+
+import numpy as np
+# pylint: disable=redefined-builtin
+from six.moves import xrange
+from six.moves import zip
+# pylint: enable=redefined-builtin
+
+import tensorflow as tf
+
+
+def _get_ngrams(segment, max_order):
+  """Extracts all n-grams up to a given maximum order from an input segment.
+
+  Args:
+    segment: text segment from which n-grams will be extracted.
+    max_order: maximum length in tokens of the n-grams returned by this
+        method.
+
+  Returns:
+    The Counter containing all n-grams up to max_order in segment
+    with a count of how many times each n-gram occurred.
+  """
+  ngram_counts = collections.Counter()
+  for order in xrange(1, max_order + 1):
+    for i in xrange(0, len(segment) - order + 1):
+      ngram = tuple(segment[i:i + order])
+      ngram_counts[ngram] += 1
+  return ngram_counts
+
+
+def compute_bleu(reference_corpus,
+                 translation_corpus,
+                 max_order=4,
+                 use_bp=True):
+  """Computes BLEU score of translated segments against one or more references.
+
+  Args:
+    reference_corpus: list of references for each translation. Each
+        reference should be tokenized into a list of tokens.
+    translation_corpus: list of translations to score. Each translation
+        should be tokenized into a list of tokens.
+    max_order: Maximum n-gram order to use when computing BLEU score.
+    use_bp: boolean, whether to apply brevity penalty.
+ + Returns: + BLEU score. + """ + reference_length = 0 + translation_length = 0 + bp = 1.0 + geo_mean = 0 + + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + precisions = [] + + for (references, translations) in zip(reference_corpus, translation_corpus): + reference_length += len(references) + translation_length += len(translations) + ref_ngram_counts = _get_ngrams(references, max_order) + translation_ngram_counts = _get_ngrams(translations, max_order) + + overlap = dict((ngram, + min(count, translation_ngram_counts[ngram])) + for ngram, count in ref_ngram_counts.items()) + + for ngram in overlap: + matches_by_order[len(ngram) - 1] += overlap[ngram] + for ngram in translation_ngram_counts: + possible_matches_by_order[len(ngram)-1] += translation_ngram_counts[ngram] + + precisions = [0] * max_order + for i in xrange(0, max_order): + if possible_matches_by_order[i] > 0: + precisions[i] = matches_by_order[i] / possible_matches_by_order[i] + else: + precisions[i] = 0.0 + + if max(precisions) > 0: + p_log_sum = sum(math.log(p) for p in precisions if p) + geo_mean = math.exp(p_log_sum/max_order) + + if use_bp: + ratio = translation_length / reference_length + bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0 + + bleu = geo_mean * bp + return np.float32(bleu) + + +def padded_bleu_score(predictions, + labels, **unused_kwargs): + """Bleu score computation between labels and predictions on non-0s.""" + outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) + # Convert the outputs and labels to a [batch_size, input_length] tensor. + outputs = tf.squeeze(outputs) + labels = tf.squeeze(labels) + + bleu = tf.py_func(compute_bleu, (labels, outputs), tf.float32) + return bleu, tf.constant(1.0) diff --git a/tensor2tensor/utils/bleu_hook_test.py b/tensor2tensor/utils/bleu_hook_test.py new file mode 100644 index 000000000..1838affd6 --- /dev/null +++ b/tensor2tensor/utils/bleu_hook_test.py @@ -0,0 +1,59 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for tensor2tensor.utils.bleu_hook.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.utils import bleu_hook + +import tensorflow as tf + + +class BleuHookTest(tf.test.TestCase): + + def testComputeBleuEqual(self): + translation_corpus = [[1, 2, 3]] + reference_corpus = [[1, 2, 3]] + bleu = bleu_hook.compute_bleu(reference_corpus, translation_corpus) + actual_bleu = 1.0 + self.assertEqual(bleu, actual_bleu) + + def testComputeNotEqual(self): + translation_corpus = [[1, 2, 3, 4]] + reference_corpus = [[5, 6, 7, 8]] + bleu = bleu_hook.compute_bleu(reference_corpus, translation_corpus) + actual_bleu = 0.0 + self.assertEqual(bleu, actual_bleu) + + def testComputeMultipleBatch(self): + translation_corpus = [[1, 2, 3, 4], [5, 6, 7, 0]] + reference_corpus = [[1, 2, 3, 4], [5, 6, 7, 10]] + bleu = bleu_hook.compute_bleu(reference_corpus, translation_corpus) + actual_bleu = 0.7231 + self.assertAllClose(bleu, actual_bleu, atol=1e-03) + + def testComputeMultipleNgrams(self): + reference_corpus = [[1, 2, 1, 13], [12, 6, 7, 4, 8, 9, 10]] + translation_corpus = [[1, 2, 1, 3], [5, 6, 7, 4]] + bleu = bleu_hook.compute_bleu(reference_corpus, translation_corpus) + actual_bleu = 0.486 + self.assertAllClose(bleu, actual_bleu, atol=1e-03) + +if __name__ == '__main__': + tf.test.main() diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py new file mode 100644 index 000000000..0080ecaa6 --- /dev/null +++ b/tensor2tensor/utils/data_reader.py @@ -0,0 +1,346 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data reader module.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os + +# Dependency imports + +import six +from six.moves import zip # pylint: disable=redefined-builtin + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import common_layers + +import tensorflow as tf + + +def examples_queue(data_sources, + data_fields_to_features, + training, + capacity=32, + data_items_to_decoders=None, + data_items_to_decode=None): + """Contruct a queue of training or evaluation examples. + + This function will create a reader from files given by data_sources, + then enqueue the tf.Examples from these files, shuffling if training + is true, and finally parse these tf.Examples to tensors. 
+ + The dictionary data_fields_to_features for an image dataset can be this: + + data_fields_to_features = { + 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), + 'image/format': tf.FixedLenFeature((), tf.string, default_value='raw'), + 'image/class/label': tf.FixedLenFeature( + [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)), + } + + and for a simple algorithmic dataset with variable-length data it is this: + + data_fields_to_features = { + 'inputs': tf.VarLenFeature(tf.int64), + 'targets': tf.VarLenFeature(tf.int64), + } + + The data_items_to_decoders dictionary argument can be left as None if there + is no decoding to be performed. But, e.g. for images, it should be set so that + the images are decoded from the features, e.g., like this for MNIST: + + data_items_to_decoders = { + 'image': tfexample_decoder.Image( + image_key = 'image/encoded', + format_key = 'image/format', + shape=[28, 28], + channels=1), + 'label': tfexample_decoder.Tensor('image/class/label'), + } + + These arguments are compatible with the use of tf.contrib.slim.data module, + see there for more documentation. + + Args: + data_sources: a list or tuple of sources from which the data will be read, + for example [/path/to/train@128, /path/to/train2*, /tmp/.../train3*] + data_fields_to_features: a dictionary from data fields in the data sources + to features, such as tf.VarLenFeature(tf.int64), see above for examples. + training: a Boolean, whether to read for training or evaluation. + capacity: integer, queue capacity; set to 2 * max_batch_size or more. + data_items_to_decoders: a dictionary mapping data items (that will be + in the returned result) to decoders that will decode them using features + defined in data_fields_to_features; see above for examples. By default + (if this is None), we grab the tensor from every feature. + data_items_to_decode: a subset of data items that will be decoded; + by default (if this is None), we decode all items. + + Returns: + A dictionary mapping each data_field to a corresponding 1D int64 tensor + read from the created queue. + + Raises: + ValueError: if no files are found with the provided data_prefix or no data + fields were provided. + """ + with tf.name_scope("examples_queue"): + # Read serialized examples using slim parallel_reader. + num_epochs = None if training else 1 + _, example_serialized = tf.contrib.slim.parallel_reader.parallel_read( + data_sources, + tf.TFRecordReader, + num_epochs=num_epochs, + shuffle=training, + capacity=2 * capacity, + min_after_dequeue=capacity, + num_readers=4 if training else 1) + + if data_items_to_decoders is None: + data_items_to_decoders = { + field: tf.contrib.slim.tfexample_decoder.Tensor(field) + for field in data_fields_to_features + } + + decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder( + data_fields_to_features, data_items_to_decoders) + + if data_items_to_decode is None: + data_items_to_decode = list(data_items_to_decoders) + + decoded = decoder.decode(example_serialized, items=data_items_to_decode) + return { + field: tensor + for (field, tensor) in zip(data_items_to_decode, decoded) + } + + +def input_pipeline(data_file_pattern, capacity, mode): + """Input pipeline, returns a dictionary of tensors from queues.""" + # Read from image TFRecords if the file has "image" in its name. 
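+  # For example, a pattern like ".../image_mnist-train*" takes the image
+  # branch below (with channels=1 because of "mnist"), a pattern containing
+  # "audio" takes the audio branch, and anything else falls through to the
+  # plain inputs/targets VarLenFeature branch.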
+ if data_file_pattern and "image" in data_file_pattern: + data_fields = { + "image/encoded": tf.FixedLenFeature((), tf.string), + "image/format": tf.FixedLenFeature((), tf.string), + "image/class/label": tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = { + "inputs": + tf.contrib.slim.tfexample_decoder.Image( + image_key="image/encoded", + format_key="image/format", + channels=1 if "mnist" in data_file_pattern else 3), + "targets": + tf.contrib.slim.tfexample_decoder.Tensor("image/class/label"), + } + elif data_file_pattern and "audio" in data_file_pattern: + data_type = tf.int64 if "timit" in data_file_pattern else tf.float32 + data_fields = { + "inputs": tf.VarLenFeature(data_type), + "audio/sample_count": tf.FixedLenFeature((), tf.int64), + "audio/sample_width": tf.FixedLenFeature((), tf.int64), + "targets": tf.VarLenFeature(tf.int64), + } + data_items_to_decoders = None + else: + data_fields = { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = None + + # Create placeholders for input, rather than reading data from disk. + if data_file_pattern is None: + feature_map = {} + for (field, tp) in data_fields: + if field != "targets": + feature_map[field] = tf.placeholder( + dtype=tp, shape=[None] * 4, name=field) + return feature_map + + # Now the non-trivial case construction. + examples = examples_queue( + [data_file_pattern], + data_fields, + training=(mode == tf.contrib.learn.ModeKeys.TRAIN), + capacity=capacity, + data_items_to_decoders=data_items_to_decoders) + + if "image" in data_file_pattern: + # Small single-example pre-processing for images. + examples["inputs"] = tf.cast(examples["inputs"], tf.int64) + if ("image_imagenet" in data_file_pattern or + "image_mscoco" in data_file_pattern): + # For imagnet/coco, resize images to 299x299 as is standard. + def resize(img): + return tf.to_int64(tf.image.resize_images(img, [299, 299])) + + def preprocess(img): + img = tf.image.resize_images(img, [360, 360]) + img = common_layers.image_augmentation(tf.to_float(img) / 255.) + return tf.to_int64(img * 255.) + + inputs = examples["inputs"] + if mode == tf.contrib.learn.ModeKeys.TRAIN: + examples["inputs"] = tf.cond( # Preprocess 80% of the time. + tf.less(tf.random_uniform([]), 0.8), + lambda img=inputs: preprocess(img), + lambda img=inputs: resize(img)) + else: + examples["inputs"] = tf.to_int64(resize(inputs)) + elif "audio" in data_file_pattern: + # Reshape audio to proper shape + sample_count = tf.to_int32(examples.pop("audio/sample_count")) + sample_width = tf.to_int32(examples.pop("audio/sample_width")) + channel_count = 1 + examples["inputs"] = tf.reshape(examples["inputs"], + [sample_count, sample_width, channel_count]) + if "wsj" in data_file_pattern: + examples["inputs"] = tf.bitcast(examples["inputs"], tf.int32) + elif "a2q_20161229" in data_file_pattern: + # we forgot the EOS when we preprocessed this data. + examples["targets"] = tf.concat([examples["targets"], [1]], 0) + + # We do not want int64s as they do are not supported on GPUs. + return {k: tf.to_int32(v) for (k, v) in six.iteritems(examples)} + + +def batch_examples(examples, batching_scheme): + """Given a queue of examples, create batches of examples with similar lengths. + + We assume that examples is a dictionary with string keys and tensor values, + possibly coming from a queue, e.g., constructed by examples_queue above. + Each tensor in examples is assumed to be 1D. We will put tensors of similar + length into batches togeter. 
+  We return a dictionary with the same keys as examples, and with values
+  being batches of size batch_size. If elements have different lengths,
+  they are padded with 0s. This function is based on
+  tf.contrib.training.bucket_by_sequence_length so see there for details.
+
+  For example, if examples is a queue containing [1, 2, 3] and [4], then
+  this function with batch_size=2 will return a batch [[1, 2, 3], [4, 0, 0]].
+
+  Args:
+    examples: a dictionary with string keys and 1D tensor values.
+    batching_scheme: a dictionary containing
+      "boundaries": a list of integers for the boundaries that will be
+        used for bucketing; see tf.contrib.training.bucket_by_sequence_length
+        for more details.
+      "batch_sizes": a list of batch sizes corresponding to the buckets
+      "max_length": an integer; we drop sequences which are longer.
+
+  Returns:
+    A dictionary with the same keys as examples and with values being
+    batches of examples padded with 0s, i.e., [batch_size x length] tensors.
+  """
+  with tf.name_scope("batch_examples"):
+    # The queue to bucket on will be chosen based on maximum length.
+    max_length = 0
+    for v in examples.values():
+      # For images the sequence length is the size of the spatial dimensions.
+      sequence_length = (tf.shape(v)[0] if len(v.get_shape()) < 3 else
+                         tf.shape(v)[0] * tf.shape(v)[1])
+      max_length = tf.maximum(max_length, sequence_length)
+    (_, outputs) = tf.contrib.training.bucket_by_sequence_length(
+        max_length,
+        examples,
+        batching_scheme["batch_sizes"],
+        [b + 1 for b in batching_scheme["boundaries"]],
+        capacity=2,  # Number of full batches to store, we don't need many.
+        bucket_capacities=[2 * b for b in batching_scheme["batch_sizes"]],
+        dynamic_pad=True,
+        keep_input=(max_length <= batching_scheme["max_length"]))
+    return outputs
+
+
+def bucket_boundaries(max_length, min_length=8, mantissa_bits=2):
+  """A default set of length-bucket boundaries."""
+  x = min_length
+  boundaries = []
+  while x < max_length:
+    boundaries.append(x)
+    x += 2**max(0, int(math.log(x, 2)) - mantissa_bits)
+  return boundaries
+
+
+def hparams_to_batching_scheme(hparams,
+                               drop_long_sequences=False,
+                               shard_multiplier=1,
+                               length_multiplier=1):
+  """A batching scheme based on model hyperparameters.
+
+  Every batch contains a number of sequences divisible by `shard_multiplier`.
+
+  If `drop_long_sequences` is True, then sequences longer than
+  `hparams.batch_size` are dropped. This prevents generating batches with
+  more than the usual number of tokens, which can cause out-of-memory errors.
+
+  Args:
+    hparams: an HParams object.
+    drop_long_sequences: a boolean.
+    shard_multiplier: an integer increasing the batch_size to suit splitting
+      across datashards.
+    length_multiplier: an integer multiplier that is used to increase the
+      batch sizes and sequence length tolerance.
+
+  Returns:
+    a dictionary
+  """
+  max_length = hparams.max_length or hparams.batch_size
+  boundaries = bucket_boundaries(
+      max_length, mantissa_bits=hparams.batching_mantissa_bits)
+  batch_sizes = [
+      max(1, hparams.batch_size // length)
+      for length in boundaries + [max_length]
+  ]
+  batch_sizes = [b * shard_multiplier for b in batch_sizes]
+  max_length *= length_multiplier
+  boundaries = [boundary * length_multiplier for boundary in boundaries]
+  return {
+      "boundaries": boundaries,
+      "batch_sizes": batch_sizes,
+      "max_length": (max_length if drop_long_sequences else 10**9)
+  }
+
+
+def constant_batching_scheme(constant_batch_size_in_sequences):
+  """A batching scheme with constant batch size.
+ + Args: + constant_batch_size_in_sequences: an integer + + Returns: + a dictionary + """ + boundaries = bucket_boundaries(1024) + batch_sizes = [constant_batch_size_in_sequences] * (1 + len(boundaries)) + return { + "boundaries": boundaries, + "batch_sizes": batch_sizes, + "max_length": 10**9 + } + + +def get_datasets(problems, data_dir, mode): + """Return the location of a dataset for a given mode.""" + datasets = [] + for problem in problems.split("-"): + problem, _, _ = problem_hparams.parse_problem_name(problem) + path = os.path.join(data_dir, problem) + if mode == tf.contrib.learn.ModeKeys.TRAIN: + datasets.append("%s-train*" % path) + else: + datasets.append("%s-dev*" % path) + return datasets diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py new file mode 100644 index 000000000..883a3673a --- /dev/null +++ b/tensor2tensor/utils/data_reader_test.py @@ -0,0 +1,147 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data reader test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tempfile + +# Dependency imports + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.utils import data_reader + +import tensorflow as tf + + +class DataReaderTest(tf.test.TestCase): + + def testExamplesQueue(self): + tf.set_random_seed(1) + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + tmp_file_name = os.path.basename(tmp_file_path) + + # Generate a file with 100 examples. + def test_generator(): + for i in xrange(100): + yield {"inputs": [i], "targets": [i], "floats": [i + 0.5]} + + generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir) + self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001")) + + examples_train = data_reader.examples_queue( + [tmp_file_path + "*"], { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + }, + training=True) + examples_eval = data_reader.examples_queue( + [tmp_file_path + "*"], { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64), + "floats": tf.VarLenFeature(tf.float32) + }, + training=False) + with tf.train.MonitoredSession() as session: + # Evaluation data comes in the same order as in the file, check 10. + for i in xrange(10): + examples = session.run(examples_eval) + self.assertEqual(len(examples["inputs"]), 1) + self.assertEqual(len(examples["targets"]), 1) + self.assertEqual(examples["inputs"][0], i) + self.assertEqual(examples["targets"][0], i) + self.assertEqual(examples["floats"][0], i + 0.5) + # Training data is shuffled. 
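+      # Reading 10 examples is enough here: with training=True the queue
+      # dequeues from a shuffle buffer (min_after_dequeue=capacity, 32 by
+      # default), so an exactly in-order prefix 0..9 is vanishingly unlikely.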
+ is_shuffled = False + for i in xrange(10): + examples = session.run(examples_train) + self.assertEqual(len(examples["inputs"]), 1) + self.assertEqual(len(examples["targets"]), 1) + self.assertEqual(examples["inputs"][0], examples["targets"][0]) + if examples["inputs"][0] != i: + is_shuffled = True + self.assertTrue(is_shuffled) + + # Clean up. + os.remove(tmp_file_path + "-00000-of-00001") + os.remove(tmp_file_path) + + def testBatchExamples(self): + tf.set_random_seed(1) + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + tmp_file_name = os.path.basename(tmp_file_path) + + # Generate a file with 100 examples, n-th example of length n + 1. + def test_generator(): + for i in xrange(100): + yield {"inputs": [i + 1 for _ in xrange(i + 1)], "targets": [i + 1]} + + generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir) + self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001")) + + examples_train = data_reader.examples_queue([tmp_file_path + "*"], { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + }, True) + batch_train = data_reader.batch_examples(examples_train, 4) + examples_eval = data_reader.examples_queue([tmp_file_path + "*"], { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + }, False) + batch_eval = data_reader.batch_examples(examples_eval, 2) + session, coord = tf.Session(), tf.train.Coordinator() + with session.as_default(): + tf.train.start_queue_runners(coord=coord) + + # Evaluation data comes in the same order as in the file. + # The first batch will be inputs=[[1, 0], [2, 2]], targets=[[1], [2]]. + examples = session.run(batch_eval) + self.assertAllClose(examples["inputs"], np.array([[1, 0], [2, 2]])) + self.assertAllClose(examples["targets"], np.array([[1], [2]])) + # Check the second batch too. + examples = session.run(batch_eval) + self.assertAllClose(examples["inputs"], + np.array([[3, 3, 3, 0], [4, 4, 4, 4]])) + self.assertAllClose(examples["targets"], np.array([[3], [4]])) + + # Training data is shuffled but shouldn't have too many pads. + for _ in xrange(10): + examples = session.run(batch_train) + inputs = examples["inputs"] + # Only 3 out of 4 examples in a batch have padding zeros at all. + pad_per_example = (inputs.size - np.count_nonzero(inputs)) // 3 + # Default bucketing is in steps of 8 until 64 and 32 later. + if int(max(examples["targets"])) < 64: + self.assertLess(pad_per_example, 8) + else: + self.assertLess(pad_per_example, 32) + + # Clean up. + coord.request_stop() + coord.join() + os.remove(tmp_file_path + "-00000-of-00001") + os.remove(tmp_file_path) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py new file mode 100644 index 000000000..8d3d1d50c --- /dev/null +++ b/tensor2tensor/utils/expert_utils.py @@ -0,0 +1,1284 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for creating Sparsely-Gated Mixture-of-Experts Layers. + +See the most recent draft of our ICLR paper: +https://openreview.net/pdf?id=B1ckMDqlg +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import zip # pylint: disable=redefined-builtin +import tensorflow as tf + +from tensorflow.python.framework import function + + +def NoisyTopKGatingParams(): + """Hyperparams defining NoisyTopK Gating Network. + + Returns: + a tf.contrib.training.HParams object + """ + return tf.contrib.training.HParams( + gating_class=NoisyTopKGating, + num_experts=16, # The number of experts + k=2, # 'The number of experts to use per example + input_size=None, # size of input to MoE. Set by MoE class + dtype=tf.float32, # floating point data type + initializer=tf.zeros_initializer(), # initializer for weight matrices + noisy_gating=True, # Add tunable noise (necessary for load-balancing) + noise_epsilon=1e-2, # Added to noise stddev for numerical stability + ) + + +def FeedForwardExpertParams(): + """Hyperparameters defining feed-forward expert networks. + + Returns: + a tf.contrib.training.HParams object + """ + return tf.contrib.training.HParams( + # The class that implements the expert network + expert_class=FeedForwardExpert, + input_size=None, # Size of input to MoE. Set by MoE class. + # List of hidden layer sizes, or None for no hidden layers. + # The length of this list determines the number of hidden layers + hidden_layer_sizes=None, + output_size=None, # Size of output from MoE. Set by MoE class. + dtype=tf.float32, # Floating point data type) + # Activation function applied at each hidden layer) + hidden_activation=tf.nn.relu, + initializer=None, # Optional initializer for weight matrices.) + # If autoscale=True, At each hidden/output layer, multiply by + # rsqrt(prev_layer_size / input_size). This scaling happens + # before application of hidden_activation) + autoscale=True,) + + +def _SetInputOutputSizes(hp, input_size, output_size): + """Fill in the input_size and output_size hyperparameters. + + This is used by LocalMixtureOfExperts and DistributedMixtureOfExperts to + fill in the input_size and output_size on the gating parameters and expert + parameters so that the user does not have to set them in multiple places. + + Args: + hp: a hyperparameters + input_size: an integer + output_size: an integer + """ + if hp.input_size is None: + hp.input_size = input_size + else: + assert hp.input_size == input_size + if output_size is not None: + if hp.output_size is None: + hp.output_size = output_size + else: + assert hp.output_size == output_size + + +class FeedForwardExpert(object): + """An object representing a feed forward network (used as an expert). + """ + + def __init__(self, hp, name): + """Creates a FeedForwardExpert. + + Args: + hp: hyperparameters. Call FeedForwardExpertParams() to create these. + name: a string. 
+ """ + self._hp = hp + hidden_layer_sizes = hp.hidden_layer_sizes or [] + num_layers = 1 + len(hidden_layer_sizes) + layer_sizes = [hp.input_size] + hidden_layer_sizes + [hp.output_size] + self._layer_sizes = layer_sizes + self._w = [] + for layer in range(num_layers): + shape = layer_sizes[layer:layer + 2] + self._w.append( + tf.get_variable('%s_layer_%d' % (name, layer), shape, hp.dtype, + hp.initializer)) + + def Eval(self, x): + """Evaluate the FeedForwardExpert on the given input. + + Args: + x: a `Tensor` of shape `[batch_size, hp.input_size]` + + Returns: + a `Tensor` of shape `[batch_size, hp.output_size]` + """ + hp = self._hp + num_layers = len(self._w) + for i in xrange(num_layers): + x = tf.matmul(x, self._w[i]) + if hp.autoscale and self._layer_sizes[i] != hp.input_size: + x *= (self._layer_sizes[i] / hp.input_size)**-0.5 + if i + 1 < num_layers and hp.hidden_activation: + x = hp.hidden_activation(x) + return x + + @property + def vars(self): + return self._w + + +@function.Defun( + python_grad_func=lambda x, dy: tf.convert_to_tensor(dy), + shape_func=lambda op: [op.inputs[0].get_shape()]) +def ConvertGradientToTensor(x): + """Identity operation whose gradient is converted to a `Tensor`. + + Currently, the gradient to `tf.concat` is particularly expensive to + compute if dy is an `IndexedSlices` (a lack of GPU implementation + forces the gradient operation onto CPU). This situation occurs when + the output of the `tf.concat` is eventually passed to `tf.gather`. + It is sometimes faster to convert the gradient to a `Tensor`, so as + to get the cheaper gradient for `tf.concat`. To do this, replace + `tf.concat(x)` with `ConvertGradientToTensor(tf.concat(x))`. + + Args: + x: A `Tensor`. + + Returns: + The input `Tensor`. + """ + return x + + +class Parallelism(object): + """Helper class for creating sets of parallel function calls. + + The purpose of this class is to replace this code: + + e = [] + f = [] + for i in xrange(len(devices)): + with tf.device(devices[i]): + e_, f_ = func(a[i], b[i], c) + e.append(e_) + f.append(f_) + + with this code: + + e, f = expert_utils.Parallelism(devices)(func, a, b, c) + """ + + def __init__(self, + device_names_or_functions, + reuse=None, + caching_devices=None, + daisy_chain_variables=False): + """Create a Parallelism. + + Args: + device_names_or_functions: A list of of length n, containing device names + or device functions (see `tf.device`) + reuse: True or None. Whether to reuse variables created in the first + replica in the subsequent replicas. + caching_devices: Either `None`, or a list of length n containing device + names. + daisy_chain_variables: a boolean - if true, then copies variables in a + daisy chain between devices. + + Returns: + a Parallelism. + """ + assert device_names_or_functions + self._devices = device_names_or_functions + self._n = len(device_names_or_functions) + self._reuse = reuse + self._caching_devices = self._MaybeRepeat(caching_devices) + self._daisy_chain_variables = daisy_chain_variables + + def __call__(self, fn, *args, **kwargs): + """A parallel set of function calls (using the specified devices). + + Args: + fn: a function or a list of n functions. + *args: additional args. Each arg should either be not a list, or a list + of length n. + **kwargs: additional keyword args. Each arg should either be not a + list, or a list of length n. + + Returns: + either a single list of length n (if fn does not return a tuple), or a + tuple of lists of length n (if fn returns a tuple). 
+ """ + # Construct lists or args and kwargs for each function. + if args: + my_args = TransposeListOfLists([self._MaybeRepeat(arg) for arg in args]) + else: + my_args = [[] for _ in xrange(self.n)] + my_kwargs = [{} for _ in xrange(self.n)] + for k, v in six.iteritems(kwargs): + vals = self._MaybeRepeat(v) + for i in xrange(self.n): + my_kwargs[i][k] = vals[i] + + # Construct lists of functions. + fns = self._MaybeRepeat(fn) + + # Now make the parallel call. + outputs = [] + cache = {} + for i in xrange(self.n): + + def DaisyChainGetter(getter, name, *args, **kwargs): + """Get a variable and cache in a daisy chain.""" + device_var_key = (self._devices[i], name) + if device_var_key in cache: + # if we have the variable on the correct device, return it. + return cache[device_var_key] + if name in cache: + # if we have it on a different device, copy it from the last device + v = tf.identity(cache[name]) + else: + var = getter(name, *args, **kwargs) + v = tf.identity(var._ref()) # pylint: disable=protected-access + # update the cache + cache[name] = v + cache[device_var_key] = v + return v + + # Variable scope will not reset caching_device on reused variables, + # so we make a custom getter that uses identity to cache the variable. + # pylint: disable=cell-var-from-loop + def CachingGetter(getter, name, *args, **kwargs): + v = getter(name, *args, **kwargs) + key = (self._caching_devices[i], name) + if key in cache: + return cache[key] + with tf.device(self._caching_devices[i]): + ret = tf.identity(v._ref()) # pylint: disable=protected-access + cache[key] = ret + return ret + + if self._daisy_chain_variables: + custom_getter = DaisyChainGetter + elif self._caching_devices: + custom_getter = CachingGetter + else: + custom_getter = None + # pylint: enable=cell-var-from-loop + with tf.name_scope('parallel_%d' % i): + with tf.variable_scope( + tf.get_variable_scope(), + reuse=True if i > 0 and self._reuse else None, + caching_device=self._caching_devices[i], + custom_getter=custom_getter): + with tf.device(self._devices[i]): + outputs.append(fns[i](*my_args[i], **my_kwargs[i])) + if isinstance(outputs[0], tuple): + outputs = list(zip(*outputs)) + outputs = tuple([list(o) for o in outputs]) + return outputs + + @property + def n(self): + return self._n + + @property + def devices(self): + return self._devices + + def _MaybeRepeat(self, x): + """Utility function for processing arguments that are singletons or lists. + + Args: + x: either a list of self.n elements, or not a list. + + Returns: + a list of self.n elements. + """ + if isinstance(x, list): + assert len(x) == self.n + return x + else: + return [x] * self.n + + +def Parallel(device_names_or_functions, fn, *args): + """Deprecated interface. + + Use `Parallelism(device_names_or_functions)(fn, *args)` instead. + + Args: + device_names_or_functions: A list of length n. + fn: a function or a list of n functions. + *args: additional args. Each arg should either be not a list, or a list + of length n. + + Returns: + either a single list of length n (if fn does not return a tuple), or a + tuple of lists of length n (if fn returns a tuple). + """ + return Parallelism(device_names_or_functions)(fn, *args) + + +def _RowwiseUnsortedSegmentSum(values, indices, n): + """UnsortedSegmentSum on each row. + + Args: + values: a `Tensor` with shape `[batch_size, k]`. + indices: an integer `Tensor` with shape `[batch_size, k]`. + n: an integer. + Returns: + A `Tensor` with the same type as `values` and shape `[batch_size, n]`. 
+ """ + batch, k = tf.unstack(tf.shape(indices), num=2) + indices_flat = tf.reshape(indices, [-1]) + tf.div(tf.range(batch * k), k) * n + ret_flat = tf.unsorted_segment_sum( + tf.reshape(values, [-1]), indices_flat, batch * n) + return tf.reshape(ret_flat, [batch, n]) + + +def _NormalDistributionCDF(x, stddev): + """Evaluates the CDF of the normal distribution. + + Normal distribution with mean 0 and standard deviation stddev, + evaluated at x=x. + + input and output `Tensor`s have matching shapes. + + Args: + x: a `Tensor` + stddev: a `Tensor` with the same shape as `x`. + + Returns: + a `Tensor` with the same shape as `x`. + + """ + return 0.5 * (1.0 + tf.erf(x / (math.sqrt(2) * stddev + 1e-20))) + + +def _ProbInTopK(clean_values, noisy_values, noise_stddev, noisy_top_values, k): + """Helper function to NoisyTopKGating. + + Computes the probability that value is in top k, given different random noise. + + This gives us a way of backpropagating from a loss that balances the number + of times each expert is in the top k experts per example. + + In the case of no noise, pass in None for noise_stddev, and the result will + not be differentiable. + + Args: + clean_values: a `Tensor` of shape [batch, n]. + noisy_values: a `Tensor` of shape [batch, n]. Equal to clean values plus + normally distributed noise with standard deviation noise_stddev. + noise_stddev: a `Tensor` of shape [batch, n], or None + noisy_top_values: a `Tensor` of shape [batch, m]. + 'values' Output of tf.top_k(noisy_top_values, m). m >= k+1 + k: an integer. + + Returns: + a `Tensor` of shape [batch, n]. + """ + batch = tf.shape(clean_values)[0] + m = tf.shape(noisy_top_values)[1] + top_values_flat = tf.reshape(noisy_top_values, [-1]) + # we want to compute the threshold that a particular value would have to + # exceed in order to make the top k. This computation differs depending + # on whether the value is already in the top k. + threshold_positions_if_in = tf.range(batch) * m + k + threshold_if_in = tf.expand_dims( + tf.gather(top_values_flat, threshold_positions_if_in), 1) + is_in = tf.greater(noisy_values, threshold_if_in) + if noise_stddev is None: + return tf.to_float(is_in) + threshold_positions_if_out = threshold_positions_if_in - 1 + threshold_if_out = tf.expand_dims( + tf.gather(top_values_flat, threshold_positions_if_out), 1) + # is each value currently in the top k. + prob_if_in = _NormalDistributionCDF(clean_values - threshold_if_in, + noise_stddev) + prob_if_out = _NormalDistributionCDF(clean_values - threshold_if_out, + noise_stddev) + prob = tf.where(is_in, prob_if_in, prob_if_out) + return prob + + +def CVSquared(x): + """The squared coefficient of variation of a sample. + + Useful as a loss to encourage a positive distribution to be more uniform. + Epsilons added for numerical stability. + Returns 0 for an empty Tensor. + + Args: + x: a `Tensor`. + + Returns: + a `Scalar`. + """ + epsilon = 1e-10 + float_size = tf.to_float(tf.size(x)) + epsilon + mean = tf.reduce_sum(x) / float_size + variance = tf.reduce_sum(tf.square(x - mean)) / float_size + return variance / (tf.square(mean) + epsilon) + + +def MaxOverload(load): + """The load of the hardest-hit device relative to average. + + This is useful for monitoring the performance of MoEs. + + The load of an expert is the number of examples assigned to that expert. + The load of a device is the sum of the loads of all experts on that device. 
+
+  The input to this function is generally the 'load' output of
+  DistributedMixtureOfExperts.Eval(), which is either a 1d or 2d `Tensor` of
+  per-expert loads. In either case, the first dimension corresponds to
+  devices.
+
+  This function sums over all dimensions other than dimension zero, then
+  computes the ratio of the maximum value to the mean value.
+
+  Args:
+    load: a 1d or 2d `Tensor`.
+
+  Returns:
+    a `Scalar`.
+  """
+  per_device_load = tf.reduce_sum(tf.reshape(load, [tf.shape(load)[0], -1]), 1)
+  return (tf.reduce_max(per_device_load) /
+          (tf.reduce_mean(per_device_load) + 1e-10))
+
+
+def _GatesToLoad(gates):
+  """Compute the true load per expert, given the gates.
+
+  The load is the number of examples for which the corresponding gate is >0.
+
+  Args:
+    gates: a `Tensor` of shape [batch_size, n]
+  Returns:
+    a float32 `Tensor` of shape [n]
+  """
+  return tf.reduce_sum(tf.to_float(gates > 0), 0)
+
+
+def _MyTopK(x, k):
+  """GPU-compatible version of top-k that works for very small constant k.
+
+  Calls argmax repeatedly.
+
+  Args:
+    x: a 2d Tensor.
+    k: a small integer.
+
+  Returns:
+    values: a Tensor of shape [batch_size, k]
+    indices: an int32 Tensor of shape [batch_size, k]
+  """
+  if k > 10:
+    return tf.nn.top_k(x, k)
+  values = []
+  indices = []
+  depth = tf.shape(x)[1]
+  for i in xrange(k):
+    values.append(tf.reduce_max(x, 1))
+    argmax = tf.argmax(x, 1)
+    indices.append(argmax)
+    if i + 1 < k:
+      x += tf.one_hot(argmax, depth, -1e9)
+  return tf.stack(values, axis=1), tf.to_int32(tf.stack(indices, axis=1))
+
+
+class NoisyTopKGating(object):
+  """Noisy top-k gating network.
+
+  See paper: https://arxiv.org/abs/1701.06538.
+  """
+
+  def __init__(self, hp, name):
+    """Create a NoisyTopKGating network.
+
+    Args:
+      hp: a hyperparameters created by NoisyTopKGatingParams()
+      name: a string
+    """
+    self._vars = []
+    self._hp = hp
+    self._w_gate = tf.get_variable('%s_gate' % name,
+                                   [hp.input_size, hp.num_experts], hp.dtype,
+                                   hp.initializer)
+    self._vars.append(self._w_gate)
+    if hp.noisy_gating:
+      self._w_noise = tf.get_variable('%s_noise' % name,
+                                      [hp.input_size, hp.num_experts], hp.dtype,
+                                      hp.initializer)
+      self._vars.append(self._w_noise)
+
+  def Eval(self, x, train=True, summaries=False):
+    """Compute noisy top-k gating.
+
+    Args:
+      x: a `Tensor` of shape `[batch_size, input_size]`.
+      train: a boolean `Scalar`. Setting this to false turns off noise.
+      summaries: a boolean. Whether to add summaries.
+    Returns:
+      gates: a `Tensor` of shape `[batch_size, n]`
+      load: a `Tensor` of shape `[n]`.
+        If we are using noise, this is a smooth approximation of the load,
+        and you can define a loss in terms of it to help with load-balancing.
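+
+    Example use (an illustrative sketch; `hp` is assumed to carry the fields
+    read by this class, e.g. as built by NoisyTopKGatingParams() with
+    input_size filled in):
+
+      gating = NoisyTopKGating(hp, 'gating')
+      gates, load = gating.Eval(x, train=True)
+      # Differentiable auxiliary loss encouraging balanced expert usage.
+      aux_loss = CVSquared(tf.reduce_sum(gates, 0)) + CVSquared(load)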
+ """ + with tf.variable_scope('NoisyTopKGating'): + hp = self._hp + clean_logits = tf.matmul(x, self._w_gate) + if hp.noisy_gating: + raw_noise_stddev = tf.matmul(x, self._w_noise) + noise_stddev = ((tf.nn.softplus(raw_noise_stddev) + hp.noise_epsilon) * + (tf.to_float(train))) + noisy_logits = clean_logits + ( + tf.random_normal(tf.shape(clean_logits)) * noise_stddev) + logits = noisy_logits + if summaries: + tf.summary.histogram('noisy_logits', noisy_logits) + tf.summary.histogram('noise_stddev', noise_stddev) + else: + logits = clean_logits + top_logits, top_indices = _MyTopK(logits, min(hp.k + 1, hp.num_experts)) + top_k_logits = tf.slice(top_logits, [0, 0], [-1, hp.k]) + top_k_indices = tf.slice(top_indices, [0, 0], [-1, hp.k]) + top_k_gates = tf.nn.softmax(top_k_logits) + # This will be a `Tensor` of shape `[batch_size, n]`, with zeros in the + # positions corresponding to all but the top k experts per example. + gates = _RowwiseUnsortedSegmentSum(top_k_gates, top_k_indices, + hp.num_experts) + if hp.noisy_gating and hp.k < hp.num_experts: + load = tf.reduce_sum( + _ProbInTopK(clean_logits, noisy_logits, noise_stddev, top_logits, + hp.k), 0) + else: + load = _GatesToLoad(gates) + if summaries: + tf.summary.histogram('importance', tf.reduce_sum(gates, 0)) + tf.summary.histogram('load', load) + return gates, load + + @property + def vars(self): + return self._vars + + +class LocalMixtureOfExperts(object): + """A MoE on a single device. + """ + + def __init__(self, gating_hp, expert_hp, input_size, output_size, name): + """Create a LocalMixtureOfExperts. + + Args: + gating_hp: hyperparameters for the gating network. + e.g. NoisyTopKGatingParams() + expert_hp: hyperparameters for the expert networks. + e.g. FeedForwardExpertParams() + input_size: an integer. + output_size: an integer. + name: a string. + """ + self._name = name + _SetInputOutputSizes(gating_hp, input_size, None) + _SetInputOutputSizes(expert_hp, input_size, output_size) + self._gating_hp = gating_hp + self._gating = gating_hp.gating_class(gating_hp, name + '_gating') + self._expert_hp = expert_hp + self._experts = [ + expert_hp.expert_class(expert_hp, name + '_%d' % i) + for i in xrange(gating_hp.num_experts) + ] + + def Eval(self, + x, + train=True, + per_example_multiplier=None, + summaries=False, + identifiers=None): + """Evaluate mixture of experts. + + We provide a convenient debugging tool for determining the set of examples + that we passed to each expert. The caller may provide a `Tensor` of + "identifiers", of any type whose first dimension matches the number of + input examples. The function will then return a list + "expert_to_identifiers", with one `Tensor` for each expert containing the + identifiers for all examples assigned to that expert. A parallel list of + `Tensor`s, "expert_to_gates", is also returned, containing the + corresponding gate values. + + Args: + x: a `Tensor` of shape `[batch_size, input_size]` + train: a boolean Scalar. Are we in training mode? + per_example_multiplier: an optional `Tensor` of shape `[batch_size]` which + gets multiplied into the gate values. If this LocalMixtureOfExperts + represents one secondary MoE in a hierarchical MoE, then we pass in + in the gate values from the primary gating function here. This causes + the computed values (`y`, `importance` and `expert_to_gates`) to also + reflect the primary gate values. + summaries: an boolean. Enable summaries. + identifiers: an optional `Tensor` whose first dimension is equal to + batch_size. 
+
+    Returns:
+      y: a `Tensor` of shape `[batch_size, output_size]`. Output of the MoE.
+      importance: a `Tensor` of shape `[n]`. Batchwise sum of gates.
+      load: a `Tensor` of shape `[n]`. Smooth estimator of the number of
+        examples passed to each expert. This is useful for load-balancing,
+        as any gradient on this `Tensor` will back-propagate to the gating
+        network.
+      expert_to_identifiers: if `identifiers` was passed in, a list of
+        length `num_experts`. Each element is a `Tensor` whose shape matches
+        that of `identifiers` in all but the first dimension. Contains the
+        slices of `identifiers` corresponding to the batch elements that were
+        dispatched to that expert.
+      expert_to_gates: A list of length `num_experts`. Each element contains
+        a 1-dimensional tensor.
+    """
+    gating_hp = self._gating_hp
+    gates, load = self._gating.Eval(x, train, summaries)
+    if per_example_multiplier is not None:
+      gates *= tf.expand_dims(per_example_multiplier, 1)
+    dispatcher = SparseDispatcher(gating_hp.num_experts, gates)
+    expert_input = dispatcher.Dispatch(x)
+    expert_output = [
+        self._experts[i].Eval(expert_input[i])
+        for i in xrange(gating_hp.num_experts)
+    ]
+    y = dispatcher.Combine(expert_output)
+    if identifiers is not None:
+      expert_to_identifiers = dispatcher.Dispatch(identifiers)
+    else:
+      expert_to_identifiers = None
+    return (y, tf.reduce_sum(gates, 0), load, expert_to_identifiers,
+            dispatcher.ExpertToGates())
+
+  @property
+  def vars(self):
+    ret = []
+    for x in self._experts:
+      ret.extend(x.vars)
+    ret.extend(self._gating.vars)
+    return ret
+
+
+class DistributedMixtureOfExperts(object):
+  """Distributed (optionally Hierarchical) Mixture of Experts.
+
+  This class implements the scheme described in our paper.
+  See link at the top of this file.
+
+  The model is trained synchronously using one large TF graph using
+  multiple devices.
+
+  The conventional (non-MoE) layers use data-parallelism, with each device
+  processing a subset of the training batch. We call these datashards.
+
+  The MoE layer (this object) uses model parallelism. Each expert is assigned
+  to a particular device, which hosts the expert parameters and performs the
+  expert computation for all examples assigned to that expert. In the case
+  of a hierarchical MoE, each second-level MoE is assigned to a device.
+  """
+
+  def __init__(self, primary_gating_hp, secondary_gating_hp, expert_hp,
+               input_size, output_size, expert_devices, name):
+    """Create a DistributedMixtureOfExperts.
+
+    If `secondary_gating_hp` is `None`, then this is a flat MoE with
+    `primary_gating_hp.num_experts` experts. Otherwise, this is a hierarchical
+    MoE with `primary_gating_hp.num_experts` groups of
+    `secondary_gating_hp.num_experts` experts.
+
+    The assignment of experts (or groups of experts) to devices is by
+    round-robin. So to make equal use of all the devices, one should set
+    `primary_gating_hp.num_experts` to the number of devices or a multiple
+    thereof.
+
+    Args:
+      primary_gating_hp: hyperparameters for the primary gating network.
+        e.g. NoisyTopKGatingParams().
+      secondary_gating_hp: hyperparameters for the secondary gating network.
+        e.g. NoisyTopKGatingParams(). None indicates a flat MoE.
+      expert_hp: hyperparameters for the expert networks.
+        e.g. FeedForwardExpertParams()
+      input_size: an integer.
+      output_size: an integer.
+      expert_devices: a list of device strings. The devices to be used for
+        the experts.
+      name: a string.
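+
+    Example (an illustrative sketch; sizes and devices are placeholders, and
+    the two *Params() helpers are the ones referenced above):
+
+      moe = DistributedMixtureOfExperts(
+          primary_gating_hp=NoisyTopKGatingParams(),
+          secondary_gating_hp=None,  # None -> flat MoE
+          expert_hp=FeedForwardExpertParams(),
+          input_size=512, output_size=512,
+          expert_devices=['/gpu:0', '/gpu:1'],
+          name='moe')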
+ """ + self._name = name + # fill in the missing values in the hyperparameters + _SetInputOutputSizes(primary_gating_hp, input_size, None) + _SetInputOutputSizes(expert_hp, input_size, output_size) + self._is_hierarchical = secondary_gating_hp is not None + self._primary_gating_hp = primary_gating_hp + self._primary_gating = primary_gating_hp.gating_class( + primary_gating_hp, name + '_primary_gating') + n1 = self._primary_gating_hp.num_experts + # round robin assignment of experts to devices. + expert_devices = [ + expert_devices[i % len(expert_devices)] for i in xrange(n1) + ] + self._expert_devices = expert_devices + self._all_vars = [] + self._all_vars.extend(self._primary_gating.vars) + if self._is_hierarchical: + # hierarchical MoE + self._secondary_moe = [] + for i in xrange(n1): + with tf.device(expert_devices[i]): + secondary_moe = LocalMixtureOfExperts(secondary_gating_hp, expert_hp, + input_size, output_size, + '%s_secondary_%d' % (name, i)) + self._secondary_moe.append(secondary_moe) + self._all_vars.extend(secondary_moe.vars) + else: + # flat MoE + self._experts = [] + for i in xrange(n1): + with tf.device(expert_devices[i]): + expert = expert_hp.expert_class(expert_hp, name + '_%d' % i) + self._experts.append(expert) + self._all_vars.extend(expert.vars) + + def Eval(self, + datashard_devices, + xs, + train=True, + summaries=False, + identifiers=None, + shadow_xs=None): + """Evaluate MoE on given inputs. + + This class is designed for the case where the rest of the model is using + data parallelism. We receive an array of input `Tensor`s, one per + datashard, and we produce a list of output Tensors, one per datashard. + + We provide a convenient debugging tool for determining the set of examples + that we passed to each expert. The caller may provide a `Tensor` of + "identifiers", of any type whose first dimension matches the number of + input examples. The function will then return a list + "expert_to_identifiers", with one `Tensor` for each expert containing the + identifiers for all examples assigned to that expert. A parallel list of + `Tensor`s, "expert_to_gates", is also returned, containing the + corresponding gate values. + + Args: + datashard_devices: a `list` of device strings of length `num_datashards`. + Which devices to use for the output tensors. + xs: A `list` of `Tensor`s of length `num_datashards`. Each has shape + `[batch_size[d], input_size]. + train: a boolean `Scalar`. When train=`True`, noise is added to the + gating function. + summaries: a boolean. Whether to write summaries. + identifiers: an optional list of tensors. + Each tensor has shape [, extra_dims] + shadow_xs: Optional `list` of `Tensor`s of length `num_datashards`. Each + has shape `[batch_size[d], input_size]. Shadow_xs is useful if you want + to dispatch a transformed version of xs to the experts, but you want + untransformed xs for the gating network. + + Returns: + ys: the output (a list of one tensor per datashard). Each has shape + `[batch_size[d], output_size]. + importance: a `Tensor` of shape `[n]` for a flat MoE or `[n1, n2]` for a + hierarchical MoE. Batchwise sum of gates. + load: a `Tensor` of shape `[n]` for a flat MoE or `[n1, n2]` for a + hierarchical MoE. Smooth estimator of the number of + examples passed to each expert. This is useful for load-balancing, + as any gradient on this `Tensor` will back-propagate to the gating + network. + expert_to_identifiers: if `identifiers` was passed in, a list of + length `num_experts`. 
Each element is a `Tensor` whose shape matches + that of `identifiers` in all but the first dimension. Contains the + slices of `identifiers` corresponding to the batch elements that were + dispatched to that expert. + expert_to_gates: a list of one tensor per expert. + Each tensor has shape [] + + """ + n1 = self._primary_gating_hp.num_experts + epsilon = 1e-10 + assert len(datashard_devices) == len(xs) + num_datashards = len(xs) + expert_devices = self._expert_devices + has_identifiers = identifiers is not None + # pylint: disable=unbalanced-tuple-unpacking + primary_gates, primary_smooth_load = Parallel( + datashard_devices, self._primary_gating.Eval, xs, train, + [summaries] + [False] * (num_datashards - 1)) + primary_importance = tf.add_n( + Parallel(datashard_devices, tf.reduce_sum, primary_gates, 0)) + primary_smooth_load = tf.add_n(primary_smooth_load) + primary_true_load = tf.add_n( + Parallel(datashard_devices, _GatesToLoad, primary_gates)) + primary_dispatcher = DistributedSparseDispatcher( + datashard_devices, expert_devices, primary_gates) + + if shadow_xs is None: + secondary_input = primary_dispatcher.Dispatch(xs) + else: + secondary_input = primary_dispatcher.Dispatch(shadow_xs) + + primary_expert_to_identifiers = (primary_dispatcher.Dispatch(identifiers) + if has_identifiers else None) + primary_expert_to_gates = primary_dispatcher.ExpertToGates() + if not self._is_hierarchical: + # one-level distributed mixture of experts + secondary_output = Parallel(expert_devices, lambda a, b: a.Eval(b), + self._experts, secondary_input) + ys = primary_dispatcher.Combine(secondary_output) + return (ys, primary_importance, primary_smooth_load, + primary_expert_to_identifiers, primary_expert_to_gates) + # two-level hierarchical MoE + (secondary_output, secondary_importance, secondary_load, + secondary_expert_to_identifiers, secondary_expert_to_gates) = (Parallel( + expert_devices, [m.Eval for m in self._secondary_moe], secondary_input, + train, primary_expert_to_gates, [summaries] + [False] * (n1 - 1), + primary_expert_to_identifiers)) + # pylint: enable=unbalanced-tuple-unpacking + ys = primary_dispatcher.Combine(secondary_output, multiply_by_gates=False) + importance = tf.stack(secondary_importance) + load = tf.stack(secondary_load) * tf.expand_dims(primary_smooth_load / ( + primary_true_load + epsilon), 1) + expert_to_identifiers = [] + if identifiers is not None: + for el in secondary_expert_to_identifiers: + expert_to_identifiers.extend(el) + expert_to_gates = [] + for el in secondary_expert_to_gates: + expert_to_gates.extend(el) + return (ys, importance, load, expert_to_identifiers, expert_to_gates) + + @property + def vars(self): + return self._all_vars + + +class SparseDispatcher(object): + """Helper for implementing a mixture of experts. + + Example use: + + gates: a float32 `Tensor` with shape `[batch_size, num_experts]` + inputs: a float32 `Tensor` with shape `[batch_size, input_size]` + experts: a list of length `num_experts` containing sub-networks. + + dispatcher = SparseDispatcher(num_experts, gates) + expert_inputs = dispatcher.Dispatch(inputs) + expert_outputs = [experts[i](expert_inputs[i]) for i in range(num_experts)] + outputs = dispatcher.Combine(expert_outputs) + + The preceding code sets the output for a particular example b to: + output[b] = Sum_i(gates[b, i] * experts[i](inputs[b])) + + This class takes advantage of sparsity in the gate matrix by including in the + `Tensor`s for expert i only the batch elements for which `gates[b, i] > 0`. 
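+
+  Concretely (an illustrative sketch with batch_size=3 and num_experts=2):
+
+    gates = [[0.7, 0.0],
+             [0.0, 1.0],
+             [0.5, 0.5]]
+
+  Dispatch sends examples 0 and 2 to expert 0 and examples 1 and 2 to
+  expert 1. If inputs = [[1.], [2.], [3.]] and each expert multiplies its
+  input by 10, then Combine returns [[7.], [20.], [30.]], since
+  output[2] = 0.5 * 30 + 0.5 * 30.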
+ """ + + def __init__(self, num_experts, gates): + """Create a SparseDispatcher. + + Args: + num_experts: an integer. + gates: a `Tensor` of shape `[batch_size, num_experts]`. + + Returns: + a SparseDispatcher + """ + self._gates = gates + self._num_experts = num_experts + + where = tf.to_int32(tf.where(tf.transpose(gates) > 0)) + self._expert_index, self._batch_index = tf.unstack(where, num=2, axis=1) + self._part_sizes_tensor = tf.reduce_sum(tf.to_int32(gates > 0), [0]) + self._nonzero_gates = tf.gather( + tf.reshape(self._gates, [-1]), + self._batch_index * num_experts + self._expert_index) + + def Dispatch(self, inp): + """Create one input Tensor for each expert. + + The `Tensor` for a expert `i` contains the slices of `inp` corresponding + to the batch elements `b` where `gates[b, i] > 0`. + + Args: + inp: a `Tensor` of shape '[batch_size, ]` + Returns: + a list of `num_experts` `Tensor`s with shapes + `[expert_batch_size_i, ]`. + """ + inp = tf.gather(inp, self._batch_index) + return tf.split(inp, self._part_sizes_tensor, 0) + + def Combine(self, expert_out, multiply_by_gates=True): + """Sum together the expert output, weighted by the gates. + + The slice corresponding to a particular batch element `b` is computed + as the sum over all experts `i` of the expert output, weighted by the + corresponding gate values. If `multiply_by_gates` is set to False, the + gate values are ignored. + + Args: + expert_out: a list of `num_experts` `Tensor`s, each with shape + `[expert_batch_size_i, ]`. + multiply_by_gates: a boolean + + Returns: + a `Tensor` with shape `[batch_size, ]`. + """ + # see comments on ConvertGradientToTensor + stitched = ConvertGradientToTensor(tf.concat(expert_out, 0)) + if multiply_by_gates: + stitched *= tf.expand_dims(self._nonzero_gates, 1) + combined = tf.unsorted_segment_sum(stitched, self._batch_index, + tf.shape(self._gates)[0]) + return combined + + def ExpertToGates(self): + """Gate values corresponding to the examples in the per-expert `Tensor`s. + + Returns: + a list of `num_experts` one-dimensional `Tensor`s with type `tf.float32` + and shapes `[expert_batch_size_i]` + """ + return tf.split(self._nonzero_gates, self._part_sizes_tensor, 0) + + @property + def part_sizes(self): + return self._part_sizes_tensor + + +class DistributedSparseDispatcher(object): + """A distributed version of SparseDispatcher. + + Instead of one batch of input examples, we simultaneously process + num_datashards batches of input examples. The per-expert `Tensor`s contain + a combination of examples from the different datashards. + + Each datashard is associated with a particular device and each expert is + associated with a particular device. All per-datashard and per-expert + `Tensor`s are created on those devices. There is no single-device bottleneck. + """ + + def __init__(self, datashard_devices, expert_devices, gates): + """Create a DistributedSparseDispatcher. + + Args: + datashard_devices: a list of num_datashards device strings. + expert_devices: a list of num_experts device strings. + gates: a list of num_datashards `Tensor`s of shapes + `[batch_size[d], num_experts]`. 
+
+    Returns:
+      a DistributedSparseDispatcher
+    """
+    self._gates = gates
+    self._num_experts = len(expert_devices)
+    assert len(gates) == len(datashard_devices)
+    self._num_datashards = len(gates)
+    self._datashard_devices = datashard_devices
+    self._expert_devices = expert_devices
+    self._dispatchers = Parallel(self._datashard_devices, SparseDispatcher,
+                                 self._num_experts, gates)
+
+  def Dispatch(self, inp):
+    """Create one input Tensor for each expert.
+
+    Args:
+      inp: a list of num_datashards `Tensor`s with shapes
+        `[batch_size[d], <extra_input_dims>]`.
+    Returns:
+      a list of `num_experts` `Tensor`s with shapes
+        `[num_examples[i], <extra_input_dims>]`.
+    """
+    dispatched = Parallel(self._datashard_devices, lambda a, b: a.Dispatch(b),
+                          self._dispatchers, inp)
+    ret = Parallel(self._expert_devices, tf.concat,
+                   TransposeListOfLists(dispatched), 0)
+    if ret[0].dtype == tf.float32:
+      # see comments on ConvertGradientToTensor
+      ret = Parallel(self._expert_devices, ConvertGradientToTensor, ret)
+    return ret
+
+  def Combine(self, expert_out, multiply_by_gates=True):
+    """Sum together the expert output, multiplied by the corresponding gates.
+
+    Args:
+      expert_out: a list of `num_experts` `Tensor`s, each with shape
+        `[expert_batch_size_i, <extra_output_dims>]`.
+      multiply_by_gates: a boolean.
+
+    Returns:
+      a list of num_datashards `Tensor`s with shapes
+        `[batch_size[d], <extra_output_dims>]`.
+    """
+    expert_part_sizes = tf.unstack(
+        tf.stack([
+            self._dispatchers[d].part_sizes
+            for d in xrange(self._num_datashards)
+        ]),
+        num=self._num_experts,
+        axis=1)
+    # list of lists of shape [num_experts][num_datashards]
+    expert_output_parts = Parallel(self._expert_devices, tf.split, expert_out,
+                                   expert_part_sizes)
+    expert_output_parts_t = TransposeListOfLists(expert_output_parts)
+    ret = []
+    for d in xrange(self._num_datashards):
+      with tf.device(self._datashard_devices[d]):
+        ret.append(self._dispatchers[d].Combine(
+            # see comments on ConvertGradientToTensor
+            ConvertGradientToTensor(tf.concat(expert_output_parts_t[d], 0)),
+            multiply_by_gates=multiply_by_gates))
+    return ret
+
+  def ExpertToGates(self):
+    """Gate values corresponding to the examples in the per-expert `Tensor`s.
+
+    Returns:
+      a list of `num_experts` one-dimensional `Tensor`s of type `tf.float32`.
+    """
+    return Parallel(self._expert_devices, tf.concat,
+                    TransposeListOfLists(
+                        Parallel(self._datashard_devices, [
+                            self._dispatchers[d].ExpertToGates
+                            for d in xrange(self._num_datashards)
+                        ])), 0)
+
+
+def TransposeListOfLists(lol):
+  """Transpose a list of equally-sized python lists.
+
+  Args:
+    lol: a list of lists
+  Returns:
+    a list of lists
+  """
+  assert lol, 'cannot pass the empty list'
+  return [list(x) for x in zip(*lol)]
+
+
+class DistributedSingleDispatcher(object):
+  """Dispatches to experts according to gates.
+
+  Each example goes to one expert.
+
+  Unlike SparseDispatcher, the gates are one-dimensional `Tensor`s of integer
+  expert ids. There are no weights.
+  """
+
+  def __init__(self, data_parallelism, model_parallelism, gates):
+    """Constructs a Dispatcher.
+
+    Args:
+      data_parallelism: a Parallelism object.
+      model_parallelism: a Parallelism object.
+      gates: a list of 1d integer `Tensor`s, one per datashard.
+        Says which expert to use for each batch element.
+
+    Returns:
+      a DistributedSingleDispatcher
+    """
+    gates = data_parallelism(tf.to_int32, gates)
+    self._gates = gates
+    self._data_parallelism = data_parallelism
+    self._model_parallelism = model_parallelism
+
+    # Compute the number of examples going from each datashard to each
+    # expert.
+ def _PartSizes(gates): + return tf.unsorted_segment_sum( + tf.ones_like(gates), gates, model_parallelism.n) + + part_sizes_by_datashard = data_parallelism(_PartSizes, gates) + self._part_sizes_by_expert = tf.unstack( + tf.stack(part_sizes_by_datashard), num=model_parallelism.n, axis=1) + + # These indices will be used to combine the output on the datashards. + def _StitchIndices(gates): + return tf.dynamic_partition( + tf.range(tf.size(gates)), gates, model_parallelism.n) + + self._stitch_indices = data_parallelism(_StitchIndices, gates) + + def Dispatch(self, d_tensors): + """Reshuffles input `Tensor`s to produce output `Tensor`s. + + The dimensions of all input and output `Tensor`s match, except for + dimension 0. In dimension 0, the input `Tensor`s match the corresponding + `gates` `Tensor`s which were passed to the constructor. + + Args: + d_tensors: a list of `Tensor`s, one per datashard. + + Returns: + a list of `Tensor`s, one per expert. + + """ + parts = self._data_parallelism(tf.dynamic_partition, d_tensors, self._gates, + self._model_parallelism.n) + parts_by_expert = TransposeListOfLists(parts) + x_tensors = self._model_parallelism(tf.concat, parts_by_expert, 0) + return x_tensors + + def Combine(self, x_tensors): + """Reshuffles per-expert `Tensor`s to produce per-datashard `Tensor`s. + + Dispatch must have been called at least once first. + + The dimensions of all input and output `Tensor`s match, except for + dimension 0. In dimension 0, the input `Tensor`s match the corresponding + outputs of `Dispatch`, and the output `Tensor`s match the corresponding + `gates` `Tensor`s which were passed to the constructor. + + Args: + x_tensors: a list of `Tensor`s, one per expert. + + Returns: + a list of `Tensor`s, one per datashard. + """ + parts = self._model_parallelism(tf.split, x_tensors, + self._part_sizes_by_expert) + d_tensors = self._data_parallelism(tf.dynamic_stitch, self._stitch_indices, + TransposeListOfLists(parts)) + return d_tensors + + +def ParallelEmbeddingLookup(params, ids, data_parallelism): + """Mod-sharded embedding lookup with multiple datashards. + + TODO(noam): does this work when vocab_size is not a multiple of `num_shards`? + + Args: + params: A list of `num_shards` `Tensors`, each with shapes + `[vocab_size / num_params, depth]`. + ids: A list of `num_datashards` one-dimensional ineger `Tensors`, + with shapes `[batch_size[i]]` + data_parallelism: A Parallelism object. + + Returns: + a list of `num_datashards` `Tensors`, each with shape + `[batch_size[i], depth]`. + """ + param_devices = [x.device for x in params] + model_parallelism = Parallelism(param_devices) + num_shards = len(param_devices) + # pylint: disable=unbalanced-tuple-unpacking + ids, unique_idx = data_parallelism(tf.unique, ids) + # pylint: enable=unbalanced-tuple-unpacking + gates = data_parallelism(tf.mod, ids, num_shards) + ids_div = data_parallelism(tf.div, ids, num_shards) + dispatcher = DistributedSingleDispatcher(data_parallelism, model_parallelism, + gates) + x_ids_div = dispatcher.Dispatch(ids_div) + params = model_parallelism(ConvertGradientToTensor, params) + x_emb = model_parallelism(tf.gather, params, x_ids_div) + r_emb = dispatcher.Combine(x_emb) + r_emb = data_parallelism(tf.gather, r_emb, unique_idx) + return r_emb + + +def SampledSoftmaxLoss(features, sampler, num_classes, target_classes, + target_params, sampled_classes, sampled_params): + """Loss for training softmax classifiers on large label vocabulary. 
+ + This function assumes that we have already chosen the sampled classes and + fetched the parameters for the target classes and the sampled classes. + + Args: + features: a Tensor with shape [batch_size, hidden_size] + sampler: a candidate sampler object + (see learning/brain/google/python/ops/candidate_sampling.py) + num_classes: an integer + target_classes: an integer Tensor with shape [batch_size] + target_params: a Tensor with shape [batch_size, hidden_size] + The parameters corresponding to the target classes. + sampled_classes: an integer tensor with shape [num_sampled_classes] + sampled_params: a Tensor with shape [num_sampled_classes, hidden_size] + The parameters corresponding to the sampled classes. + + Returns: + a Tensor with shape [batch_size] + """ + sampled_logits = (tf.matmul(features, sampled_params, transpose_b=True) - + sampler.log_expected_count(sampled_classes)) + target_logits = (tf.reduce_sum(target_params * features, 1) - + sampler.log_expected_count(target_classes)) + sampled_log_denominator = tf.reduce_logsumexp( + sampled_logits, [1], name='SampledLogDenominator') + sampled_classes_mask = tf.unsorted_segment_sum( + tf.fill(tf.shape(sampled_classes), float('-inf')), sampled_classes, + num_classes) + target_log_denominator = ( + target_logits + tf.gather(sampled_classes_mask, target_classes)) + combined_log_denominator = tf.reduce_logsumexp( + tf.stack([sampled_log_denominator, target_log_denominator]), [0]) + loss = combined_log_denominator - target_logits + return loss + + +def ParallelSampledSoftmaxLoss(params, + features, + target_classes, + sampler, + num_classes, + data_parallelism, + target_weights=None): + """Computes sampled softmax loss across many datashards. + + This is used during training to efficiently train a softmax classifier layer. + + Args: + params: A list of num_param_shards Tensors, each with shape + [num_classes / num_param_shards, num_features]. + The parameters are assumed to be mod-sharded by class. + features: a list of num_datashards Tensors, each with shape + [batch_size_i, num_features] + target_classes: A list of num_datashards integer Tensors each with shape + [batch_size_i] + sampler: a candidate sampler object + (see learning/brain/google/python/ops/candidate_sampling.py) + num_classes: an Integer + data_parallelism: a Parallelism object + target_weights: an optional list of num_datashards Tensors each with + shape [batch_size_i] + Returns: + a Scalar. + """ + sampled_classes = data_parallelism(sampler.sample) + sampled_params = ParallelEmbeddingLookup(params, sampled_classes, + data_parallelism) + target_params = ParallelEmbeddingLookup(params, target_classes, + data_parallelism) + ret = data_parallelism(SampledSoftmaxLoss, features, sampler, num_classes, + target_classes, target_params, sampled_classes, + sampled_params) + if target_weights is not None: + ret = data_parallelism(tf.multiply, ret, target_weights) + ret = data_parallelism(tf.reduce_sum, ret) + ret = tf.add_n(ret) + return ret diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py new file mode 100644 index 000000000..4dc952a08 --- /dev/null +++ b/tensor2tensor/utils/metrics.py @@ -0,0 +1,155 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utils for metrics used in eval."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+# Dependency imports
+
+import six
+
+from tensor2tensor.models import common_layers
+from tensor2tensor.utils import bleu_hook
+
+import tensorflow as tf
+
+flags = tf.flags
+FLAGS = flags.FLAGS
+
+
+def padded_accuracy_topk(predictions,
+                         labels,
+                         k,
+                         weights_fn=common_layers.weights_nonzero):
+  """Percentage of times that top-k predictions match labels on non-0s."""
+  with tf.variable_scope("padded_accuracy_topk", values=[predictions, labels]):
+    padded_labels = common_layers.pad_with_zeros(predictions, labels)
+    weights = weights_fn(padded_labels)
+    effective_k = tf.minimum(k, tf.shape(predictions)[-1])
+    _, outputs = tf.nn.top_k(predictions, k=effective_k)
+    outputs = tf.to_int32(outputs)
+    padded_labels = tf.expand_dims(padded_labels, axis=-1)
+    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
+    same = tf.to_float(tf.equal(outputs, padded_labels))
+    same_topk = tf.reduce_sum(same, axis=-1)
+    return same_topk, weights
+
+
+def padded_accuracy_top5(predictions,
+                         labels,
+                         weights_fn=common_layers.weights_nonzero):
+  return padded_accuracy_topk(predictions, labels, 5, weights_fn)
+
+
+def padded_sequence_accuracy(predictions,
+                             labels,
+                             weights_fn=common_layers.weights_nonzero):
+  """Percentage of times that predictions match labels everywhere (non-0)."""
+  with tf.variable_scope(
+      "padded_sequence_accuracy", values=[predictions, labels]):
+    padded_labels = common_layers.pad_with_zeros(predictions, labels)
+    weights = weights_fn(padded_labels)
+    outputs = tf.to_int32(tf.argmax(predictions, axis=-1))
+    not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights
+    axis = list(range(1, len(outputs.get_shape())))
+    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
+    return correct_seq, tf.constant(1.0)
+
+
+def padded_neg_log_perplexity(predictions,
+                              labels,
+                              weights_fn=common_layers.weights_nonzero):
+  """Average log-perplexity excluding padding 0s. No smoothing."""
+  num, den = common_layers.padded_cross_entropy(
+      predictions, labels, 0.0, weights_fn=weights_fn, reduce_sum=False)
+  return (-num, den)
+
+
+def padded_accuracy(predictions,
+                    labels,
+                    weights_fn=common_layers.weights_nonzero):
+  """Percentage of times that predictions match labels on non-0s."""
+  with tf.variable_scope("padded_accuracy", values=[predictions, labels]):
+    padded_labels = common_layers.pad_with_zeros(predictions, labels)
+    weights = weights_fn(padded_labels)
+    outputs = tf.to_int32(tf.argmax(predictions, axis=-1))
+    return tf.to_float(tf.equal(outputs, padded_labels)), weights
+
+
+def create_evaluation_metrics(problems):
+  """Creates the evaluation metrics for the model.
+
+  Args:
+    problems: List of strings containing the names of the problems.
+ + Returns: + A dictionary with keys that are strings naming the evaluation + metrics and values that are functions taking arguments of + (predictions, targets), returning a tuple of a tensor of the + metric's value together with an op to update the metric's value. + """ + + def append_metric_fns(metric_tup, eval_metrics): + """Append problem-specific and global metrics to eval_metrics.""" + metric_name, metric_function = metric_tup + def fn(predictions, labels, weights, idx, weights_fn): + # The 'weights' argument represents problem-choice here, + # we need to keep this name because MetricSpecs checks it. + problem_choice = weights + (scores, weights) = tf.cond( + tf.equal(idx, problem_choice), # pylint: disable=cell-var-from-loop + lambda: metric_function(predictions, labels, weights_fn=weights_fn), + lambda: (tf.constant(0.0), tf.constant(0.0))) + # The tf.metrics.mean function assures correct aggregation. + return tf.metrics.mean(scores, weights) + + for i, problem in enumerate(problems): + name = "metrics-%s/%s" % (problem, metric_name) + weights_fn = (common_layers.weights_concatenated + if "concat" in problem else common_layers.weights_nonzero) + eval_metrics[name] = functools.partial(fn, idx=i, weights_fn=weights_fn) + + def global_fn(predictions, labels, weights): + (scores, weights) = metric_function(predictions, labels) + return tf.metrics.mean(scores, weights) + + eval_metrics["metrics/%s" % metric_name] = global_fn + + eval_metrics = dict() + + # Metrics are functions that take predictions and labels and return + # a tensor of metrics and a tensor of weights. + # The results are passed to tf.metrics.mean to accumulate properly. + metrics_list = [("accuracy", padded_accuracy), ("accuracy_top5", + padded_accuracy_top5), + ("accuracy_per_sequence", padded_sequence_accuracy), + ("neg_log_perplexity", padded_neg_log_perplexity)] + + # TODO(nikip): Extend this to support use of custom metrics for problems. + for problem in problems: + if "wmt" in problem: + metrics_list.append(("bleu_score", bleu_hook.padded_bleu_score)) + + for metric in metrics_list: + append_metric_fns(metric, eval_metrics) + + return { + k: tf.contrib.learn.MetricSpec( + v, prediction_key="predictions", weight_key="problem_choice") + for (k, v) in six.iteritems(eval_metrics) + } diff --git a/tensor2tensor/utils/metrics_test.py b/tensor2tensor/utils/metrics_test.py new file mode 100644 index 000000000..0472d4f21 --- /dev/null +++ b/tensor2tensor/utils/metrics_test.py @@ -0,0 +1,88 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for tensor2tensor.utils.metrics.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +from tensor2tensor.utils import metrics + +import tensorflow as tf + + +class CommonLayersTest(tf.test.TestCase): + + def testAccuracyMetric(self): + predictions = np.random.randint(1, 5, size=(12, 12, 12, 1)) + targets = np.random.randint(1, 5, size=(12, 12, 12, 1)) + expected = np.mean((predictions == targets).astype(float)) + with self.test_session() as session: + scores, _ = metrics.padded_accuracy( + tf.one_hot(predictions, depth=5, dtype=tf.float32), + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + actual = session.run(a) + self.assertAlmostEqual(actual, expected) + + def testAccuracyTopKMetric(self): + predictions = np.random.randint(1, 5, size=(12, 12, 12, 1)) + targets = np.random.randint(1, 5, size=(12, 12, 12, 1)) + expected = np.mean((predictions == targets).astype(float)) + with self.test_session() as session: + predicted = tf.one_hot(predictions, depth=5, dtype=tf.float32) + scores1, _ = metrics.padded_accuracy_topk( + predicted, tf.constant(targets, dtype=tf.int32), k=1) + scores2, _ = metrics.padded_accuracy_topk( + predicted, tf.constant(targets, dtype=tf.int32), k=7) + a1 = tf.reduce_mean(scores1) + a2 = tf.reduce_mean(scores2) + session.run(tf.global_variables_initializer()) + actual1, actual2 = session.run([a1, a2]) + self.assertAlmostEqual(actual1, expected) + self.assertAlmostEqual(actual2, 1.0) + + def testSequenceAccuracyMetric(self): + predictions = np.random.randint(4, size=(12, 12, 12, 1)) + targets = np.random.randint(4, size=(12, 12, 12, 1)) + expected = np.mean( + np.prod((predictions == targets).astype(float), axis=(1, 2))) + with self.test_session() as session: + scores, _ = metrics.padded_sequence_accuracy( + tf.one_hot(predictions, depth=4, dtype=tf.float32), + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + actual = session.run(a) + self.assertEqual(actual, expected) + + def testNegativeLogPerplexity(self): + predictions = np.random.randint(4, size=(12, 12, 12, 1)) + targets = np.random.randint(4, size=(12, 12, 12, 1)) + with self.test_session() as session: + scores, _ = metrics.padded_neg_log_perplexity( + tf.one_hot(predictions, depth=4, dtype=tf.float32), + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + actual = session.run(a) + self.assertEqual(actual.shape, ()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py new file mode 100644 index 000000000..e6b1c9994 --- /dev/null +++ b/tensor2tensor/utils/modality.py @@ -0,0 +1,564 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Modalities define the bottom and top of the model (not the body).""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_layers +from tensor2tensor.utils import expert_utils as eu +import tensorflow as tf + + +class Modality(object): + """Abstract Modality class for data transformations. + + An abstract class representing modalities for transforming data to a space + interpretable by sequence models. It has 3 functions: + * inputs_bottom: called on inputs entering the model. + * targets_bottom: called on targets entering the model (e.g., the decoder). + * targets_top : called on targets to generate predictions. + + For example, think about a modality for images. The inputs_bottom function + represents the part of the model applied to an incoming image, e.g., an entry + flow of a convolutional network. The targets_top function represents the top + part of a model that is generating images, e.g., a PixelCNN network. The final + function targets_bottom represents the auto-regressive part of the network. + It is applied to the already-generated part of an image, which is given to + the decoder to generate the next part. In some cases, e.g., for text, it is + the same as the inputs_bottom function, as that is the default we use. But, + e.g., for images, a different function might be needed to regress properly. + + All 3 functions have simple and sharded versions. A sub-class only needs + to implement the simple version, the default sharding will be used then. + """ + + def __init__(self, model_hparams): + self._model_hparams = model_hparams + + @property + def name(self): + camelcase_name = type(self).__name__ # DeCamelCase for TF readability. + return re.sub("([A-Z]+)", r"_\1", camelcase_name).lower()[1:] + + @property + def targets_dimensionality(self): + """Integer, the last dimension of the predictions (vocab size).""" + raise NotImplementedError("Abstract Method") + + @property + def _body_input_depth(self): + return self._model_hparams.hidden_size + + def inputs_bottom_simple(self, x): + """Transform one shard of input. + + Args: + x: An int32 Tensor with shape [batch, p0, p1, input_channels] + Returns: + A float32 Tensor with shape [batch, p0, p1, body_input_depth] + """ + raise NotImplementedError("Abstract Method") + + def inputs_bottom_sharded(self, xs, data_parallelism): + """Transform the inputs. + + Args: + xs: A list of num_datashards Tensors (one per shard) + each with shape [batch, p0, p1, depth] + data_parallelism: a expert_utils.Parallelism object + Returns: + shaded_body_input: A list of num_datashards Tensors, each with shape + [batch, p0, p1, body_input_depth]. + """ + return data_parallelism(self.inputs_bottom_simple, xs) + + def targets_bottom_simple(self, x): + """Transform one shard of targets. + + Args: + x: An int32 Tensor with shape [batch, p0, p1, target_channels] + Returns: + A float32 Tensor with shape [batch, p0, p1, body_input_depth] + """ + with tf.variable_scope("targets_bottom_simple"): + return self.inputs_bottom_simple(x) + + def targets_bottom_sharded(self, xs, data_parallelism): + """Transform the targets. 
+ + Args: + xs: A list of num_datashards Tensors (one per shard) + each with shape [batch, p0, p1, target_channels] + data_parallelism: a expert_utils.Parallelism object + Returns: + shaded_body_input: A list of num_datashards Tensors, each with shape + [batch, p0, p1, body_input_depth]. + """ + return data_parallelism(self.targets_bottom_simple, xs) + + def targets_top_simple(self, body_output, targets): + """Transform one shard of output. + + Most classes will override this function. + + Args: + body_output: A Tensor with shape [batch, p0, p1, body_output_depth] + targets: A Tensor with shape [batch, p0, p1, targets_channels, + targets_dimensionality] + Returns: + A Tensor of class logits. + """ + raise NotImplementedError("Abstract Method") + + def targets_top_sharded(self, + sharded_body_output, + sharded_targets, + data_parallelism, + weights_fn=common_layers.weights_nonzero): + """Transform all shards of targets. + + Classes with cross-shard interaction will override this function. + + Args: + sharded_body_output: A list of Tensors. + sharded_targets: A list of Tensors. + data_parallelism: a expert_utils.Parallelism object. + weights_fn: function from targets to target weights. + Returns: + shaded_logits: A list of Tensors. + training_loss: a Scalar. + """ + sharded_logits = data_parallelism(self.targets_top_simple, + sharded_body_output, sharded_targets) + loss_num, loss_den = data_parallelism( + common_layers.padded_cross_entropy, + sharded_logits, + sharded_targets, + self._model_hparams.label_smoothing, + weights_fn=weights_fn) + loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den)) + return sharded_logits, loss + + +class SymbolModality(Modality): + """Modality for sets of discrete symbols. + + Input: + Embedding. + + Output: + Linear transformation + softmax. + """ + + def __init__(self, model_hparams, vocab_size): + super(SymbolModality, self).__init__(model_hparams) + self._vocab_size = vocab_size + self._datashard_device_to_embedding = None + self._datashard_device_to_softmax_weights = None + + @property + def name(self): + return "symbol_modality_%d_%d" % (self._vocab_size, self._body_input_depth) + + @property + def targets_dimensionality(self): + return self._vocab_size + + def _get_weights(self): + """Create or get concatenated embedding or softmax variable. + + Returns: + a list of self._num_shards Tensors. + """ + num_shards = self._model_hparams.symbol_modality_num_shards + shards = [] + for i in xrange(num_shards): + shard_size = (self._vocab_size // num_shards) + ( + 1 if i < self._vocab_size % num_shards else 0) + var_name = "weights_%d" % i + shards.append( + tf.get_variable( + var_name, [shard_size, self._body_input_depth], + initializer=tf.random_normal_initializer( + 0.0, self._body_input_depth**-0.5))) + if num_shards == 1: + ret = shards[0] + else: + ret = tf.concat(shards, 0) + ret = eu.ConvertGradientToTensor(ret) + return ret + + def bottom_simple(self, x, name, reuse): + with tf.variable_scope(name, reuse=reuse): + # Squeeze out the channels dimension. 
+ x = tf.squeeze(x, axis=3) + var = self._get_weights() + ret = tf.gather(var, x) + if self._model_hparams.multiply_embedding_mode == "sqrt_depth": + ret *= self._body_input_depth**0.5 + ret *= tf.expand_dims(tf.to_float(tf.not_equal(x, 0)), -1) + return ret + + def inputs_bottom_simple(self, x): + if self._model_hparams.shared_embedding_and_softmax_weights: + return self.bottom_simple(x, "shared", reuse=None) + else: + return self.bottom_simple(x, "input_emb", reuse=None) + + def targets_bottom_simple(self, x): + if self._model_hparams.shared_embedding_and_softmax_weights: + return self.bottom_simple(x, "shared", reuse=True) + else: + return self.bottom_simple(x, "target_emb", reuse=None) + + def targets_top_simple(self, body_output, targets): + """Generate logits. + + Args: + body_output: A Tensor with shape [batch, p0, p1, body_input_depth] + targets: A Tensor with shape [batch, p0, p1, 1] + Returns: + logits: A Tensor with shape [batch, p0, p1, ?, vocab_size]. + """ + if self._model_hparams.shared_embedding_and_softmax_weights: + scope_name = "shared" + reuse = True + else: + scope_name = "softmax" + reuse = False + with tf.variable_scope(scope_name, reuse=reuse): + var = self._get_weights() + shape = tf.shape(body_output)[:-1] + body_output = tf.reshape(body_output, [-1, self._body_input_depth]) + logits = tf.matmul(body_output, var, transpose_b=True) + logits = tf.reshape(logits, tf.concat([shape, [self._vocab_size]], 0)) + # insert a channels dimension + return tf.expand_dims(logits, 3) + + +class SmallImageModality(Modality): + """Performs strided conv compressions for small image data.""" + + def __init__(self, model_hparams): + super(SmallImageModality, self).__init__(model_hparams) + + @property + def targets_dimensionality(self): + return 256 + + def inputs_bottom_simple(self, inputs): + with tf.variable_scope(self.name): + inputs = common_layers.standardize_images(inputs) + # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet. + # tf.summary.image("inputs", inputs, max_outputs=2) + if self._model_hparams.compress_steps > 0: + strides = (2, 2) + else: + strides = (1, 1) + return common_layers.conv_block( + inputs, + self._body_input_depth, [((1, 1), (3, 3))], + first_relu=False, + strides=strides, + padding="SAME", + force2d=True, + name="small_image_conv") + + def targets_bottom_simple(self, inputs): + with tf.variable_scope(self.name): + inputs = common_layers.standardize_images(inputs) + if self._model_hparams.compress_steps > 0: + kernel, strides = (2, 2), (2, 2) # Crucial to not leak! + else: + kernel, strides = (1, 1), (1, 1) + return common_layers.conv_block( + inputs, + self._body_input_depth, [((1, 1), kernel)], + first_relu=False, + strides=strides, + force2d=True, + name="small_image_conv") + + def targets_top_simple(self, body_output, targets): + with tf.variable_scope(self.name): + if self._model_hparams.compress_steps == 0: + targets_shape = tf.shape(targets) + channels = targets.shape.as_list()[-1] + outputs = tf.layers.dense(body_output, 256 * channels) + return tf.reshape(outputs, [ + targets_shape[0], targets_shape[1], targets_shape[2], 3, 256 + ]) + dilations_kernels = [((1, 1), (3, 1)), ((2, 1), (3, 1)), ((4, 1), (3, 1))] + return common_layers.decompress_seqcnn( + body_output, targets, 256, dilations_kernels, 2, is_2d=True) + + def targets_top_sharded(self, + sharded_body_output, + sharded_targets, + data_parallelism, + weights_fn=common_layers.weights_all): + # Call the default implementation, but weight 1.0 on 0s by default. 
+ # (Since we're processing images and so have no padding and some pixel 0s.) + return super(SmallImageModality, self).targets_top_sharded( + sharded_body_output, + sharded_targets, + data_parallelism, + weights_fn=weights_fn) + + +class ImageModality(Modality): + """Performs embedding and strided conv compressions for large image data.""" + + def __init__(self, model_hparams): + super(ImageModality, self).__init__(model_hparams) + + @property + def targets_dimensionality(self): + return 256 + + def inputs_bottom_simple(self, inputs): + """Transform input from data space to model space. + + Perform the Xception "Entry flow", which consists of two convolutional + filter upscalings followed by three residually connected separable + convolution blocks. + + Args: + inputs: A Tensor with shape [batch, ...] + Returns: + body_input: A Tensor with shape [batch, ?, ?, body_input_depth]. + """ + with tf.variable_scope(self.name): + + def xnet_resblock(x, filters, res_relu, name): + with tf.variable_scope(name): + y = common_layers.separable_conv_block( + x, + filters, [((1, 1), (3, 3)), ((1, 1), (3, 3))], + first_relu=True, + padding="SAME", + force2d=True, + name="sep_conv_block") + y = common_layers.pool(y, (3, 3), "MAX", "SAME", strides=(2, 2)) + return y + common_layers.conv_block( + x, + filters, [((1, 1), (1, 1))], + padding="SAME", + strides=(2, 2), + first_relu=res_relu, + force2d=True, + name="res_conv0") + + inputs = common_layers.standardize_images(inputs) + # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet. + # tf.summary.image("inputs", inputs, max_outputs=2) + x = common_layers.conv_block( + inputs, + 32, [((1, 1), (3, 3))], + first_relu=False, + padding="SAME", + strides=(2, 2), + force2d=True, + name="conv0") + x = common_layers.conv_block( + x, 64, [((1, 1), (3, 3))], padding="SAME", force2d=True, name="conv1") + x = xnet_resblock(x, min(128, self._body_input_depth), True, "block0") + x = xnet_resblock(x, min(256, self._body_input_depth), False, "block1") + return xnet_resblock(x, self._body_input_depth, False, "block2") + + def targets_top_simple(self, body_output, _): + # TODO(lukaszkaiser): work on a better way to generate large images. + with tf.variable_scope(self.name): + decompressed_inputs = common_layers.deconv_stride2_multistep( + body_output, + self._model_hparams.compress_steps, + body_output.get_shape()[-1], + name="deconv") + return common_layers.conv( + decompressed_inputs, self._vocab_size, (1, 1), padding="SAME") + + +class AudioModality(Modality): + """Performs strided conv compressions for audio data.""" + + def __init__(self, model_hparams): + super(AudioModality, self).__init__(model_hparams) + + def inputs_bottom_simple(self, inputs): + """Transform input from data space to model space. + + Args: + inputs: A Tensor with shape [batch, ...] + Returns: + body_input: A Tensor with shape [batch, ?, ?, body_input_depth]. + """ + with tf.variable_scope(self.name): + # TODO(aidangomez): Will need to sort out a better audio pipeline + def xnet_resblock(x, filters, res_relu, name): + with tf.variable_scope(name): + # Typically audio samples are >100k samples in length and have a width + # of 2 or 4. Mono audio has a single channel while stereo has 2. 
+ y = common_layers.separable_conv_block( + x, + filters, [((1, 1), (3, 3)), ((1, 1), (3, 3))], + first_relu=True, + padding="SAME", + force2d=True, + name="sep_conv_block") + y = common_layers.pool(y, (3, 3), "MAX", "SAME", strides=(2, 2)) + return y + common_layers.conv_block( + x, + filters, [((1, 1), (1, 1))], + padding="SAME", + strides=(2, 2), + first_relu=res_relu, + force2d=True, + name="res_conv0") + + x = tf.to_float(inputs) / 255. + x.set_shape([None, None, None, 1]) + for i in xrange(self._model_hparams.audio_compression): + x = xnet_resblock(x, 2**(i + 1), True, "compress_block_%d" % i) + return xnet_resblock(x, self._body_input_depth, False, + "compress_block_final") + + +class AudioSpectralModality(Modality): + """Performs strided conv compressions for audio spectral data.""" + + def __init__(self, model_hparams): + super(AudioSpectralModality, self).__init__(model_hparams) + + def inputs_bottom_simple(self, inputs): + """Transform input from data space to model space. + + Args: + inputs: A Tensor with shape [batch, ...] + Returns: + body_input: A Tensor with shape [batch, ?, ?, body_input_depth]. + """ + with tf.variable_scope(self.name): + # TODO(aidangomez): Will need to sort out a better audio pipeline + def xnet_resblock(x, filters, res_relu, name): + with tf.variable_scope(name): + # We only stride along the length dimension to preserve the spectral + # bins (which are tiny in dimensionality relative to length) + y = common_layers.separable_conv_block( + x, + filters, [((1, 1), (3, 3)), ((1, 1), (3, 3))], + first_relu=True, + padding="SAME", + force2d=True, + name="sep_conv_block") + y = common_layers.pool(y, (3, 3), "MAX", "SAME", strides=(2, 1)) + return y + common_layers.conv_block( + x, + filters, [((1, 1), (1, 1))], + padding="SAME", + strides=(2, 1), + first_relu=res_relu, + force2d=True, + name="res_conv0") + + # Bitcast back from int32 + x = tf.bitcast(inputs, tf.float32) + x.set_shape([None, None, None, 1]) + for i in xrange(self._model_hparams.audio_compression): + x = xnet_resblock(x, 2**(i + 1), True, "compress_block_%d" % i) + return xnet_resblock(x, self._body_input_depth, False, + "compress_block_final") + + +class ClassLabelModality(Modality): + """Used for label data.""" + + def __init__(self, model_hparams, vocab_size, is2d=False): + super(ClassLabelModality, self).__init__(model_hparams) + self._vocab_size = vocab_size + self._is_2d = is2d + self._kernel = (3, 3) if is2d else (5, 1) + self._strides = (2, 2) if is2d else (4, 1) + self._padding = "SAME" if is2d else "LEFT" + + @property + def name(self): + return "class_label_modality_%d_%d" % (self._vocab_size, + self._body_input_depth) + + @property + def targets_dimensionality(self): + return self._vocab_size + + def inputs_bottom_simple(self, x): + with tf.variable_scope(self.name): + return common_layers.embedding( + x, + self._vocab_size, + self._body_input_depth, + multiplier=self._body_input_depth**0.5 if + self._model_hparams.multiply_embedding_mode == "sqrt_depth" else 1.0) + + def targets_bottom_simple(self, x): + with tf.variable_scope(self.name): + return tf.zeros([tf.shape(x)[0], 1, 1, self._body_input_depth]) + + def targets_top_simple(self, body_output, _): + """Transform inputs from model space to target space. + + Perform the Xception "Exit flow", consisting of a single residual block and + two separable convolutional upscalings followed by global spatial average + pooling. + + Args: + body_output: A Tensor with shape [batch, ?, ?, body_output_size]. 
+    Returns:
+      A Tensor with shape [batch_size, ?, ?, vocab_size]
+    """
+    with tf.variable_scope(self.name):
+      x = body_output
+
+      # Assume input is a square with self._body_input_depth channels.
+      if self._is_2d:
+        length_float = tf.to_float(tf.shape(x)[1])
+        spatial_dim_float = tf.sqrt(length_float)
+        spatial_dim = tf.to_int32(spatial_dim_float)
+        x = tf.reshape(x, [-1, spatial_dim, spatial_dim,
+                           self._body_input_depth])
+      x = common_layers.conv_block_downsample(x, self._kernel, self._strides,
+                                              self._padding)
+      x = tf.nn.relu(x)
+      x = tf.reduce_mean(x, axis=[1, 2], keep_dims=True)
+      res = common_layers.conv(x, self._vocab_size, (1, 1))
+      return tf.expand_dims(res, 3)
+
+  def targets_top_sharded(self,
+                          sharded_body_output,
+                          sharded_targets,
+                          data_parallelism,
+                          weights_fn=common_layers.weights_all):
+    # Call the default implementation, but weight 1.0 on 0s by default.
+    # (Since we're classifying, there is no padding, and some labels are 0.)
+    return super(ClassLabelModality, self).targets_top_sharded(
+        sharded_body_output,
+        sharded_targets,
+        data_parallelism,
+        weights_fn=weights_fn)
diff --git a/tensor2tensor/utils/modality_test.py b/tensor2tensor/utils/modality_test.py
new file mode 100644
index 000000000..0b22b4eff
--- /dev/null
+++ b/tensor2tensor/utils/modality_test.py
@@ -0,0 +1,88 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
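+
+# These tests exercise the sharded Modality API: inputs_bottom_sharded embeds
+# each datashard's integer ids into [batch, length, 1, hidden_size] tensors,
+# and targets_top_sharded maps body outputs to per-shard logits plus a scalar
+# training loss.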
+ +"""Tests for Modalities.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.utils import expert_utils +from tensor2tensor.utils import modality + +import tensorflow as tf + + +class ModalityTest(tf.test.TestCase): + + def testSymbolModalityInputs(self): + batch_size = 10 + num_datashards = 5 + length = 5 + vocab_size = 5000 + hidden_size = 9 + model_hparams = tf.contrib.training.HParams( + symbol_modality_num_shards=4, + hidden_size=hidden_size, + multiply_embedding_mode="sqrt_depth", + shared_embedding_and_softmax_weights=0) + x = -1 + np.random.random_integers(vocab_size, size=( + batch_size, length, 1, 1)) + m = modality.SymbolModality(model_hparams, vocab_size) + data_parallelism = expert_utils.Parallelism( + ["/device:CPU:0"] * num_datashards, reuse=True) + with self.test_session() as session: + xs = tf.split(x, num_datashards) + sharded_output = m.inputs_bottom_sharded(xs, data_parallelism) + output = tf.concat(sharded_output, 0) + session.run(tf.global_variables_initializer()) + res = session.run(output) + self.assertEqual(res.shape, (batch_size, length, 1, hidden_size)) + + def testSymbolModalityTargets(self): + batch_size = 10 + num_datashards = 5 + length = 6 + height = 7 + hidden_size = 9 + vocab_size = 11 + model_hparams = tf.contrib.training.HParams( + symbol_modality_num_shards=4, + hidden_size=hidden_size, + label_smoothing=0.2, + shared_embedding_and_softmax_weights=0) + body_output = -1 + np.random.random_integers( + 100, size=(batch_size, length, height, hidden_size)) + targets = -1 + np.random.random_integers( + vocab_size, size=(batch_size, length, height, 1)) + m = modality.SymbolModality(model_hparams, vocab_size) + data_parallelism = expert_utils.Parallelism( + ["/device:CPU:0"] * num_datashards, reuse=True) + with self.test_session() as session: + sharded_body_output = tf.split(tf.to_float(body_output), num_datashards) + sharded_targets = tf.split(targets, num_datashards) + sharded_logits, train_loss = m.targets_top_sharded( + sharded_body_output, sharded_targets, data_parallelism) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res1, res2 = session.run((logits, train_loss)) + self.assertEqual(res1.shape, (batch_size, length, height, 1, vocab_size)) + self.assertEqual(res2.shape, ()) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py new file mode 100644 index 000000000..7be75b919 --- /dev/null +++ b/tensor2tensor/utils/registry.py @@ -0,0 +1,184 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Registry for models, hyperparameter settings, problem types, and datasets. + +Define a new model by subclassing T2TModel and register it: + +``` +@registry.register_model +class MyModel(T2TModel): + ... +``` + +Access by snake-cased name: `registry.model("my_model")`. 
If you're using
+`trainer.py`, you can pass on the command-line: `--model=my_model`.
+
+See all the models registered: `registry.list_models()`.
+
+For hyperparameter sets:
+  * Register: `registry.register_hparams`
+  * List: `registry.list_hparams`
+  * Retrieve by name: `registry.hparams`
+  * Command-line flag in `trainer.py`: `--hparams_set=name`
+
+For hyperparameter ranges:
+  * Register: `registry.register_ranged_hparams`
+  * List: `registry.list_ranged_hparams`
+  * Retrieve by name: `registry.ranged_hparams`
+  * Command-line flag in `trainer.py`: `--hparams_range=name`
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect
+import re
+
+# Dependency imports
+
+from tensor2tensor.utils import t2t_model
+
+import tensorflow as tf
+
+_MODELS = {}
+_HPARAMS = {}
+_RANGED_HPARAMS = {}
+
+# Camel case to snake case utils
+_first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)")
+_all_cap_re = re.compile("([a-z])([A-Z])")
+
+
+def _convert_camel_to_snake(name):
+  s1 = _first_cap_re.sub(r"\1_\2", name)
+  return _all_cap_re.sub(r"\1_\2", s1).lower()
+
+
+def _reset():
+  for ctr in [_MODELS, _HPARAMS, _RANGED_HPARAMS]:
+    ctr.clear()
+
+
+def _default_name(obj):
+  return _convert_camel_to_snake(obj.__name__)
+
+
+def register_model(name=None):
+  """Register a model. name defaults to class name snake-cased."""
+
+  def decorator(model_cls, registration_name=None):
+    """Registers & returns model_cls with registration_name or default name."""
+    model_name = registration_name or _default_name(model_cls)
+    if model_name in _MODELS:
+      raise ValueError("Model %s already registered." % model_name)
+    if (not inspect.isclass(model_cls) or
+        not issubclass(model_cls, t2t_model.T2TModel)):
+      tf.logging.warning("Model %s does not subclass T2TModel. "
+                         "Object is expected to abide by its API.", model_name)
+    _MODELS[model_name] = model_cls
+    return model_cls
+
+  # Handle if decorator was used without parens
+  if callable(name):
+    model_cls = name
+    return decorator(model_cls, registration_name=_default_name(model_cls))
+
+  return lambda model_cls: decorator(model_cls, name)
+
+
+def model(name):
+  if name not in _MODELS:
+    raise ValueError("Model %s never registered." % name)
+  return _MODELS[name]
+
+
+def list_models():
+  return list(_MODELS)
+
+
+def register_hparams(name=None):
+  """Register an HParams set. name defaults to function name snake-cased."""
+
+  def decorator(hp_fn, registration_name=None):
+    """Registers & returns hp_fn with registration_name or default name."""
+    hp_name = registration_name or _default_name(hp_fn)
+    if hp_name in _HPARAMS:
+      raise ValueError("HParams set %s already registered." % hp_name)
+    _HPARAMS[hp_name] = hp_fn
+    return hp_fn
+
+  # Handle if decorator was used without parens
+  if callable(name):
+    hp_fn = name
+    return decorator(hp_fn, registration_name=_default_name(hp_fn))
+
+  return lambda hp_fn: decorator(hp_fn, name)
+
+
+def hparams(name):
+  if name not in _HPARAMS:
+    raise ValueError("HParams set %s never registered." % name)
+  return _HPARAMS[name]
+
+
+def list_hparams():
+  return list(_HPARAMS)
+
+
+def register_ranged_hparams(name=None):
+  """Register a RangedHParams set. name defaults to fn name snake-cased."""
+
+  def decorator(rhp_fn, registration_name=None):
+    """Registers & returns rhp_fn with registration_name or default name."""
+    rhp_name = registration_name or _default_name(rhp_fn)
+    if rhp_name in _RANGED_HPARAMS:
+      raise ValueError("RangedHParams set %s already registered."
% rhp_name) + # Check that the fn takes a single argument + args, varargs, keywords, _ = inspect.getargspec(rhp_fn) + if len(args) != 1 or varargs is not None or keywords is not None: + raise ValueError("RangedHParams set function must take a single " + "argument, the RangedHParams object.") + + _RANGED_HPARAMS[rhp_name] = rhp_fn + return rhp_fn + + # Handle if decorator was used without parens + if callable(name): + rhp_fn = name + return decorator(rhp_fn, registration_name=_default_name(rhp_fn)) + + return lambda rhp_fn: decorator(rhp_fn, name) + + +def ranged_hparams(name): + if name not in _RANGED_HPARAMS: + raise ValueError("RangedHParams set %s never registered." % name) + return _RANGED_HPARAMS[name] + + +def list_ranged_hparams(): + return list(_RANGED_HPARAMS) + + +def help_string(): + help_str = """Registry contents: + + Models: %s + + HParams: %s + + RangedHParams: %s + """ + return help_str % (list_models(), list_hparams(), list_ranged_hparams()) diff --git a/tensor2tensor/utils/registry_test.py b/tensor2tensor/utils/registry_test.py new file mode 100644 index 000000000..54ccca749 --- /dev/null +++ b/tensor2tensor/utils/registry_test.py @@ -0,0 +1,202 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for tensor2tensor.registry.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + +# pylint: disable=unused-variable + + +class ModelRegistryTest(tf.test.TestCase): + + def setUp(self): + registry._reset() + + def testT2TModelRegistration(self): + + @registry.register_model + class MyModel1(t2t_model.T2TModel): + pass + + model = registry.model("my_model1") + self.assertTrue(model is MyModel1) + + def testNamedRegistration(self): + + @registry.register_model("model2") + class MyModel1(t2t_model.T2TModel): + pass + + model = registry.model("model2") + self.assertTrue(model is MyModel1) + + def testNonT2TModelRegistration(self): + + @registry.register_model + def model_fn(): + pass + + model = registry.model("model_fn") + self.assertTrue(model is model_fn) + + def testUnknownModel(self): + with self.assertRaisesRegexp(ValueError, "never registered"): + registry.model("not_registered") + + def testDuplicateRegistration(self): + + @registry.register_model + def m1(): + pass + + with self.assertRaisesRegexp(ValueError, "already registered"): + + @registry.register_model("m1") + def m2(): + pass + + def testListModels(self): + + @registry.register_model + def m1(): + pass + + @registry.register_model + def m2(): + pass + + self.assertSetEqual(set(["m1", "m2"]), set(registry.list_models())) + + def testSnakeCase(self): + convert = registry._convert_camel_to_snake + + self.assertEqual("typical_camel_case", convert("TypicalCamelCase")) + self.assertEqual("numbers_fuse2gether", convert("NumbersFuse2Gether")) + self.assertEqual("lstm_seq2seq", 
convert("LSTMSeq2Seq")) + self.assertEqual("starts_lower", convert("startsLower")) + self.assertEqual("starts_lower_caps", convert("startsLowerCAPS")) + self.assertEqual("caps_fuse_together", convert("CapsFUSETogether")) + self.assertEqual("startscap", convert("Startscap")) + self.assertEqual("s_tartscap", convert("STartscap")) + + +class HParamRegistryTest(tf.test.TestCase): + + def setUp(self): + registry._reset() + + def testHParamSet(self): + + @registry.register_hparams + def my_hparams_set(): + pass + + @registry.register_ranged_hparams + def my_hparams_range(_): + pass + + self.assertTrue(registry.hparams("my_hparams_set") is my_hparams_set) + self.assertTrue( + registry.ranged_hparams("my_hparams_range") is my_hparams_range) + + def testNamedRegistration(self): + + @registry.register_hparams("a") + def my_hparams_set(): + pass + + @registry.register_ranged_hparams("a") + def my_hparams_range(_): + pass + + self.assertTrue(registry.hparams("a") is my_hparams_set) + self.assertTrue(registry.ranged_hparams("a") is my_hparams_range) + + def testUnknownHparams(self): + with self.assertRaisesRegexp(ValueError, "never registered"): + registry.hparams("not_registered") + with self.assertRaisesRegexp(ValueError, "never registered"): + registry.ranged_hparams("not_registered") + + def testDuplicateRegistration(self): + + @registry.register_hparams + def hp1(): + pass + + with self.assertRaisesRegexp(ValueError, "already registered"): + + @registry.register_hparams("hp1") + def hp2(): + pass + + @registry.register_ranged_hparams + def rhp1(_): + pass + + with self.assertRaisesRegexp(ValueError, "already registered"): + + @registry.register_ranged_hparams("rhp1") + def rhp2(_): + pass + + def testListHparams(self): + + @registry.register_hparams + def hp1(): + pass + + @registry.register_hparams("hp2_named") + def hp2(): + pass + + @registry.register_ranged_hparams + def rhp1(_): + pass + + @registry.register_ranged_hparams("rhp2_named") + def rhp2(_): + pass + + self.assertSetEqual(set(["hp1", "hp2_named"]), set(registry.list_hparams())) + self.assertSetEqual( + set(["rhp1", "rhp2_named"]), set(registry.list_ranged_hparams())) + + def testRangeSignatureCheck(self): + + with self.assertRaisesRegexp(ValueError, "must take a single argument"): + + @registry.register_ranged_hparams + def rhp_bad(): + pass + + with self.assertRaisesRegexp(ValueError, "must take a single argument"): + + @registry.register_ranged_hparams + def rhp_bad2(a, b): # pylint: disable=unused-argument + pass + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py new file mode 100644 index 000000000..80c06e347 --- /dev/null +++ b/tensor2tensor/utils/t2t_model.py @@ -0,0 +1,429 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""T2TModel Base Class.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.utils import beam_search +from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import modality + +import tensorflow as tf + + +def _with_timing(fn, msg): + + def fn_with_timing(*args, **kwargs): + start_time = time.time() + res = fn(*args, **kwargs) + tf.logging.info("Doing %s took %.3f sec." % (msg, time.time() - start_time)) + return res + + return fn_with_timing + + +class T2TModel(object): + """Abstract base class for models. + + Subclassess generally only need to override `build_model`. + """ + + def __init__(self, + hparams, + problem_hparams, + problem_idx=0, + data_parallelism=None, + ps_devices=None): + """Create a T2TModel. + + Args: + hparams: a hyperparameters object. + problem_hparams: a hyperparameters object. + problem_idx: an integer. + data_parallelism: a expert_utils.parallelism + (specifies devices for data parallelism). + ps_devices: a list of devices to be used for experts + + Returns: + a T2TModel + """ + if data_parallelism is None: + data_parallelism = eu.Parallelism([""]) + if ps_devices is None: + ps_devices = [""] + self._hparams = hparams + self._data_parallelism = data_parallelism + self._num_datashards = data_parallelism.n + self._ps_devices = ps_devices + self._problem_hparams = problem_hparams + self._problem_idx = problem_idx + + @property + def has_input(self): + return self._input_modality + + def infer(self, + features=None, + decode_length=50, + beam_size=1, + top_beams=1, + last_position_only=False, + alpha=0.0): + """A inference method. + + Quadratic time in decode_length. + + Args: + features: an map of string to `Tensor` + decode_length: an integer. How many additional timesteps to decode. + beam_size: number of beams. + top_beams: an integer. How many of the beams to return. + last_position_only: a boolean, speed-up by computing last position only. + alpha: Float that controls the length penalty. larger the alpha, stronger + the preference for slonger translations. + + Returns: + samples: an integer `Tensor`. + """ + if beam_size == 1: + tf.logging.info("Greedy Decoding") + return self._greedy_infer(features, decode_length, last_position_only) + else: + tf.logging.info("Beam Decoding with beam size %d" % beam_size) + return self._beam_decode(features, decode_length, beam_size, top_beams, + last_position_only, alpha) + + def _beam_decode(self, features, decode_length, beam_size, top_beams, + last_position_only, alpha): + """Beam search decoding. + + Args: + features: an map of string to `Tensor` + decode_length: an integer. How many additional timesteps to decode. + beam_size: number of beams. + top_beams: an integer. How many of the beams to return. + last_position_only: a boolean, speed-up by computing last position only. + alpha: Float that controls the length penalty. larger the alpha, stronger + the preference for slonger translations. + + Returns: + samples: an integer `Tensor`. 
Top samples from the beam search + """ + + def symbols_to_logits_fn(ids): + """Go from ids to logits.""" + ids = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3) + ids = tf.pad(ids[:, 1:], [[0, 0], [0, 1], [0, 0], [0, 0]]) + + features["targets"] = ids + self._coverage = None + sharded_logits, _, _ = self.model_fn( + features, False, last_position_only=last_position_only) + # now self._coverage is a coverage tensor for the first datashard. + # it has shape [batch_size] and contains floats between 0 and + # source_length. + logits = sharded_logits[0] # Assuming we have one shard. + if last_position_only: + return tf.squeeze(logits, axis=[1, 2, 3]) + current_output_position = tf.shape(ids)[1] - 1 # -1 due to the pad above. + logits = logits[:, current_output_position, :, :] + return tf.squeeze(logits, axis=[1, 2]) + + batch_size = tf.shape(features["inputs"])[0] + initial_ids = tf.zeros([batch_size], dtype=tf.int32) + + inputs_old = features["inputs"] + features["inputs"] = tf.expand_dims(features["inputs"], 1) + if len(features["inputs"].shape) < 5: + features["inputs"] = tf.expand_dims(features["inputs"], 4) + # Expand the inputs in to the beam size. + features["inputs"] = tf.tile(features["inputs"], [1, beam_size, 1, 1, 1]) + s = tf.shape(features["inputs"]) + features["inputs"] = tf.reshape(features["inputs"], + [s[0] * s[1], s[2], s[3], s[4]]) + + target_modality = self._hparams.problems[self._problem_idx].target_modality + vocab_size = target_modality.targets_dimensionality + # Setting decode length to input length + decode_length + decode_length = tf.shape(features["inputs"])[1] + tf.constant(decode_length) + ids, scores = beam_search.beam_search(symbols_to_logits_fn, initial_ids, + beam_size, decode_length, vocab_size, + alpha) + + # Set inputs back to the unexpanded inputs to not to confuse the Estimator! + features["inputs"] = inputs_old + + # Return `top_beams` decodings (also remove initial id from the beam search) + return_scores = False # TODO(lukaszkaiser): make it work multi-problem. + if top_beams == 1: + if return_scores: + return {"outputs": ids[:, 0, 1:], "scores": scores} + return ids[:, 0, 1:] + else: + if return_scores: + return {"outputs": ids[:, :top_beams, 1:], "scores": scores} + return ids[:, :top_beams, 1:] + + def _greedy_infer(self, features, decode_length, last_position_only): + """A slow greedy inference method. + + Quadratic time in decode_length. + + Args: + features: an map of string to `Tensor` + decode_length: an integer. How many additional timesteps to decode. + last_position_only: a boolean, speed-up by computing last position only. + + Returns: + samples: an integer `Tensor`. + """ + if not features: + features = {} + inputs_old = None + if "inputs" in features and len(features["inputs"].shape) < 4: + inputs_old = features["inputs"] + features["inputs"] = tf.expand_dims(features["inputs"], 2) + + def infer_step(recent_output, _): + """Inference step.""" + recent_output.set_shape([None, None, None, 1]) + padded = tf.pad(recent_output, [[0, 0], [0, 1], [0, 0], [0, 0]]) + features["targets"] = padded + # This is inefficient in that it generates samples at all timesteps, + # not just the last one, except if last_position_only is set (dangerous). + samples = self.sample(features, last_position_only=last_position_only) + # Concatenate the already-generated recent_output with last timestep + # of the newly-generated samples. 
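+      # With last_position_only the model only returned the newest position,
+      # so we take index -1; otherwise we index position
+      # tf.shape(recent_output)[1], the first timestep beyond the prefix.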
+      if last_position_only:
+        cur_sample = samples[:, -1, :, :]
+      else:
+        cur_sample = samples[:, tf.shape(recent_output)[1], :, :]
+      cur_sample = tf.to_int64(tf.expand_dims(cur_sample, axis=1))
+      samples = tf.concat([recent_output, cur_sample], axis=1)
+      samples.set_shape([None, None, None, 1])
+      return samples
+
+    # Create an initial output tensor. This will be passed
+    # to the infer_step, which adds one timestep at every iteration.
+    if "partial_targets" in features:
+      initial_output = tf.convert_to_tensor(features["partial_targets"])
+    else:
+      batch_size = tf.shape(features["inputs"])[0]
+      initial_output = tf.zeros((batch_size, 0, 1, 1), dtype=tf.int64)
+    # Hack: foldl complains when the output shape is less specified than the
+    # input shape, so we confuse it about the input shape.
+    initial_output = tf.slice(initial_output, [0, 0, 0, 0],
+                              tf.shape(initial_output))
+    if isinstance(self._hparams.problems[self._problem_idx].target_modality,
+                  modality.ClassLabelModality):
+      decode_length = 1
+    else:
+      decode_length = tf.shape(features["inputs"])[1] + decode_length
+    result = tf.foldl(
+        infer_step,
+        tf.range(decode_length),
+        initializer=initial_output,
+        back_prop=False,
+        parallel_iterations=1)
+    if inputs_old is not None:  # Restore to not confuse Estimator.
+      features["inputs"] = inputs_old
+    return result
+
+  def sample(self, features, last_position_only=False):
+    """Run the model and extract samples.
+
+    Args:
+      features: a map of string to `Tensor`.
+      last_position_only: a boolean, speed-up by computing last position only.
+
+    Returns:
+      samples: an integer `Tensor`.
+    """
+    sharded_logits, _, _ = self.model_fn(
+        features, False, last_position_only=last_position_only)
+    if self._hparams.sampling_method == "argmax":
+      sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4)
+    else:
+      assert self._hparams.sampling_method == "random"
+
+      def _multinomial_squeeze(logits):
+        reshaped_logits = tf.reshape(logits, [-1, tf.shape(logits)[-1]])
+        choices = tf.multinomial(reshaped_logits, 1)
+        choices = tf.reshape(choices,
+                             tf.shape(logits)[:logits.get_shape().ndims - 1])
+        return choices
+
+      sharded_samples = self._data_parallelism(_multinomial_squeeze,
+                                               sharded_logits)
+    return tf.concat(sharded_samples, 0)
+
+  def _shard_features(self, features):  # pylint: disable=missing-docstring
+    sharded_features = dict()
+    for k, v in six.iteritems(features):
+      v = tf.convert_to_tensor(v)
+      if not v.shape.as_list():
+        v = tf.expand_dims(v, axis=-1)
+        v = tf.tile(v, [self._num_datashards])
+      sharded_features[k] = self._data_parallelism(tf.identity,
+                                                   tf.split(
+                                                       v, self._num_datashards,
+                                                       0))
+    return sharded_features
+
+  def model_fn(self, features, train, skip=False, last_position_only=False):
+    """Computes the entire model and produces sharded logits and training loss.
+
+    Args:
+      features: A dictionary of feature name to tensor.
+      train: a boolean `Scalar` (whether we are in training mode).
+      skip: a boolean, if we're just dummy-calling and actually skip this model
+        (but we need to create variables to not confuse distributed training).
+      last_position_only: a boolean, compute logits for only the last position.
+
+    Returns:
+      sharded_logits: a list of `Tensor`s, one per datashard.
+      training_loss: a floating point `Scalar`.
+    """
+    start_time = time.time()
+    dp = self._data_parallelism
+
+    sharded_features = self._shard_features(features)
+
+    # Construct the model bottom for inputs.
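+    # Each input feature is transformed by its problem's input modality; a
+    # modality's variable scope is reused if an earlier problem already
+    # created variables under the same name.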
+ transformed_features = {} + all_previous_modalities = [] + + for key, input_modality in six.iteritems( + self._problem_hparams.input_modality): + previous_modalities = [ + self._hparams.problems[i].input_modality[key].name + for i in xrange(self._problem_idx) + ] + all_previous_modalities.extend(previous_modalities) + do_reuse = input_modality.name in all_previous_modalities + with tf.variable_scope(input_modality.name, reuse=do_reuse): + transformed_features[key] = input_modality.inputs_bottom_sharded( + sharded_features[key], dp) + all_previous_modalities.append(input_modality.name) + + # Target space id just gets copied to every shard. + if "target_space_id" in features: + transformed_features["target_space_id"] = [features["target_space_id"] + ] * self._num_datashards + + # Targets are transformed by the autoregressive part of the modality + previous_tgt_modalities = [ + self._hparams.problems[i].target_modality.name + for i in xrange(self._problem_idx) + ] + all_previous_modalities.extend(previous_tgt_modalities) + + target_modality = self._problem_hparams.target_modality + target_reuse = target_modality.name in previous_tgt_modalities + with tf.variable_scope(target_modality.name, reuse=target_reuse): + transformed_features["targets"] = target_modality.targets_bottom_sharded( + sharded_features["targets"], dp) + + # Construct the model body. + with tf.variable_scope("body", reuse=self._problem_idx > 0): + if skip: + body_outputs, extra_loss = transformed_features["targets"], 0.0 + else: + body_outputs, extra_loss = self.model_fn_body_sharded( + transformed_features, train) + + with tf.variable_scope(target_modality.name, reuse=target_reuse): + if not last_position_only: + sharded_logits, training_loss = (target_modality.targets_top_sharded( + body_outputs, sharded_features["targets"], self._data_parallelism)) + + training_loss *= self._problem_hparams.loss_multiplier + else: + # Take body outputs for the last position only, and targets too. + # TODO(lukaszkaiser): warning, this doesn't work for all modalities! + last_position_body_outputs = [ + tf.expand_dims(body_shard[:, -1, :, :], axis=[1]) + for body_shard in body_outputs + ] + last_position_targets = [ + tf.expand_dims(target_shard[:, -1:, :, :], axis=[1]) + for target_shard in sharded_features["targets"] + ] + sharded_logits, training_loss = (target_modality.targets_top_sharded( + last_position_body_outputs, last_position_targets, + self._data_parallelism)) + + training_loss = None + + tf.logging.info("This model_fn took %.3f sec." % (time.time() - start_time)) + return sharded_logits, training_loss, extra_loss + + def model_fn_body_sharded(self, sharded_features, train): + """Mixture-of-experts models will override this function. + + Compute model body on all datashards. + + Args: + sharded_features: map from string to list of Tensors each with shape + [batch, ?, ?, body_input_size] + train: A boolean `Scalar` (whether we are in training mode). + + Returns: + sharded_body_output: + a list of Tensors, each with shape [batch, O, P, body_output_size] + extra_loss: a Scalar. 
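+
+    For example, a subclass might override `model_fn_body` (which this method
+    runs on each datashard) along these lines (an illustrative sketch only;
+    `my_network` is hypothetical):
+
+      def model_fn_body(self, features, train):
+        return my_network(features["targets"], self._hparams)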
+ """ + with tf.name_scope("model"): + datashard_to_features = [{ + k: v[d] + for k, v in six.iteritems(sharded_features) + } for d in xrange(self._num_datashards)] + output = self._data_parallelism( + _with_timing(self.model_fn_body, "model_fn_body"), + datashard_to_features, train) + if isinstance(output, tuple): + loss = tf.reduce_mean(output[1]) + output = output[0] + else: + loss = 0.0 + return output, loss + + def model_fn_body(self, features, train): + """Most models will override this function. + + Compute label logits for one shard as a function of the transformed + features. + + Args: + features: A dictionary of key to Tensor. Each Tensor has shape + `[batch_size, ?, ?, hidden_size]`. + train: A boolean `Scalar` (whether we are in training mode). + + Returns: + a `Tensor` of logits with shape `[batch_size, O, P, body_output_size]`. + """ + raise NotImplementedError("Abstract Method") + + @property + def hparams(self): + return self._hparams diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py new file mode 100644 index 000000000..87f56f76c --- /dev/null +++ b/tensor2tensor/utils/trainer_utils.py @@ -0,0 +1,1302 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for trainer binary.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import operator +import os +import sys + +# Dependency imports + +import numpy as np +import six +# pylint: disable=redefined-builtin +from six.moves import input +from six.moves import xrange +from six.moves import zip +# pylint: enable=redefined-builtin + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import models # pylint: disable=unused-import +from tensor2tensor.utils import data_reader +from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import metrics +from tensor2tensor.utils import registry + +import tensorflow as tf +from tensorflow.contrib.learn.python.learn import learn_runner +from tensorflow.python.ops import init_ops + + +# Number of samples to draw for an image input (in such cases as captioning) +IMAGE_DECODE_LENGTH = 100 + +flags = tf.flags +FLAGS = flags.FLAGS + +flags.DEFINE_bool("registry_help", False, + "If True, logs the contents of the registry and exits.") +flags.DEFINE_string("master", "", "Address of TensorFlow master.") +flags.DEFINE_string("schedule", "local_run", + "Method of tf.contrib.learn.Experiment to run.") +flags.DEFINE_string("output_dir", "", "Base output directory for run.") +flags.DEFINE_string("model", "", "Which model to use.") +flags.DEFINE_string("hparams_set", "", "Which parameters to use.") +flags.DEFINE_string("hparams_range", "", "Parameters range.") +flags.DEFINE_string( + "hparams", "", + """A comma-separated list of `name=value` hyperparameter values. This flag + is used to override hyperparameter settings either when manually selecting + hyperparameters or when using Vizier. 
If a hyperparameter setting is
+    specified by this flag then it must be a valid hyperparameter name for
+    the model.""")
+flags.DEFINE_string("problems", "", "Dash-separated list of problems to "
+                    "solve.")
+flags.DEFINE_string("data_dir", "/tmp/data", "Directory with training data.")
+flags.DEFINE_string("worker_job", "/job:worker", "name of worker job")
+flags.DEFINE_integer("worker_gpu", 1, "How many GPUs to use.")
+flags.DEFINE_integer("worker_replicas", 1, "How many workers to use.")
+flags.DEFINE_integer("worker_id", 0, "Which worker task are we.")
+flags.DEFINE_integer("ps_gpu", 0, "How many GPUs to use per ps.")
+flags.DEFINE_string("gpu_order", "", "Optional order for daisy-chaining gpus."
+                    " e.g. \"1 3 2 4\"")
+flags.DEFINE_string("ps_job", "/job:ps", "name of ps job")
+flags.DEFINE_integer("ps_replicas", 0, "How many ps replicas.")
+flags.DEFINE_bool("experimental_optimize_placement", False,
+                  "Optimize ops placement with experimental session options.")
+flags.DEFINE_bool("sync", False, "Sync compute on PS.")
+flags.DEFINE_bool("infer_use_last_position_only", False,
+                  "In inference, use last position only for speedup.")
+flags.DEFINE_integer("train_steps", 250000,
+                     "The number of steps to run training for.")
+flags.DEFINE_integer("eval_steps", 10, "Number of steps in evaluation.")
+flags.DEFINE_integer("keep_checkpoint_max", 20,
+                     "How many recent checkpoints to keep.")
+flags.DEFINE_bool("interactive", False, "Interactive local inference mode.")
+flags.DEFINE_bool("endless_dec", False, "Run decoding endlessly. Temporary.")
+flags.DEFINE_bool("save_images", False, "Save inference input images.")
+flags.DEFINE_string("decode_from_file", None, "Path to decode file")
+flags.DEFINE_string("decode_to_file", None, "Path to inference output file")
+flags.DEFINE_integer("decode_shards", 1, "How many shards to decode.")
+flags.DEFINE_integer("decode_problem_id", 0, "Which problem to decode.")
+flags.DEFINE_integer("decode_extra_length", 50, "Added decode length.")
+flags.DEFINE_integer("decode_batch_size", 32, "Batch size for decoding. "
+                     "The decodes will be written to <filename>.decodes in "
+                     "format result\tinput")
+flags.DEFINE_integer("beam_size", 4, "The beam size for beam decoding")
+flags.DEFINE_float("alpha", 0.6, "Alpha for length penalty")
+flags.DEFINE_bool("return_beams", False,
+                  "Whether to return 1 (False) or all (True) beams. The \n "
+                  "output file will have the format "
+                  "<beam1_text>\t<beam2_text>..\t<beam_n_text>")
+flags.DEFINE_bool("daisy_chain_variables", True,
+                  "copy variables around in a daisy chain")
+
+
+def make_experiment_fn(data_dir, model_name, train_steps, eval_steps):
+  """Returns experiment_fn for learn_runner. Wraps create_experiment."""
+
+  def experiment_fn(output_dir):
+    return create_experiment(
+        output_dir=output_dir,
+        data_dir=data_dir,
+        model_name=model_name,
+        train_steps=train_steps,
+        eval_steps=eval_steps)
+
+  return experiment_fn
+
+
+def create_experiment(output_dir, data_dir, model_name, train_steps,
+                      eval_steps):
+  hparams = create_hparams(FLAGS.hparams_set, FLAGS.data_dir)
+  estimator, input_fns = create_experiment_components(
+      hparams=hparams,
+      output_dir=output_dir,
+      data_dir=data_dir,
+      model_name=model_name)
+  return tf.contrib.learn.Experiment(
+      estimator=estimator,
+      train_input_fn=input_fns["train"],
+      eval_input_fn=input_fns["eval"],
+      eval_metrics=metrics.create_evaluation_metrics(FLAGS.problems.split("-")),
+      train_steps=train_steps,
+      eval_steps=eval_steps,
+      train_monitors=[])
+
+
+def create_experiment_components(hparams, output_dir, data_dir, model_name):
+  """Constructs and returns Estimator and train/eval input functions."""
+  hparams.problems = [
+      problem_hparams.problem_hparams(problem, hparams)
+      for problem in FLAGS.problems.split("-")
+  ]
+
+  num_datashards = data_parallelism().n
+
+  tf.logging.info("Creating experiment, storing model files in %s", output_dir)
+
+  train_problems_data = get_datasets_for_mode(data_dir,
+                                              tf.contrib.learn.ModeKeys.TRAIN)
+  train_input_fn = get_input_fn(
+      mode=tf.contrib.learn.ModeKeys.TRAIN,
+      hparams=hparams,
+      data_file_patterns=train_problems_data,
+      num_datashards=num_datashards)
+
+  eval_problems_data = get_datasets_for_mode(data_dir,
+                                             tf.contrib.learn.ModeKeys.EVAL)
+  eval_input_fn = get_input_fn(
+      mode=tf.contrib.learn.ModeKeys.EVAL,
+      hparams=hparams,
+      data_file_patterns=eval_problems_data,
+      num_datashards=num_datashards)
+  estimator = tf.contrib.learn.Estimator(
+      model_fn=model_builder(model_name, hparams=hparams),
+      model_dir=output_dir,
+      config=tf.contrib.learn.RunConfig(
+          master=FLAGS.master,
+          model_dir=output_dir,
+          session_config=session_config(),
+          keep_checkpoint_max=20))
+  return estimator, {"train": train_input_fn, "eval": eval_input_fn}
+
+
+def log_registry():
+  tf.logging.info(registry.help_string())
+  if FLAGS.registry_help:
+    sys.exit(0)
+
+
+def create_hparams(params_id, data_dir):
+  """Returns hyperparameters, including any flag value overrides.
+
+  If the hparams FLAG is set, then it will use any values specified in
+  hparams to override any individually-set hyperparameter. This logic
+  allows tuners to override hyperparameter settings to find optimal values.
+
+  Args:
+    params_id: which set of parameters to choose (must be registered).
+    data_dir: the directory containing the training data.
+
+  Returns:
+    The hyperparameters as a tf.contrib.training.HParams object.
+  """
+  hparams = registry.hparams(params_id)()
+  hparams.add_hparam("data_dir", data_dir)
+  # Command line flags override any of the preceding hyperparameter values.
+  if FLAGS.hparams:
+    hparams = hparams.parse(FLAGS.hparams)
+  return hparams
+
+
+def run(data_dir, model, output_dir, train_steps, eval_steps, schedule):
+  """Runs an Estimator locally or distributed.
+
+  This function chooses one of two paths to execute:
+
+  1. Running locally if schedule=="local_run".
+  2. Distributed training/evaluation otherwise.
+
+  Args:
+    data_dir: The directory the data can be found in.
+    model: The name of the model to use.
+    output_dir: The directory to store outputs in.
+    train_steps: The number of steps to run training for.
+    eval_steps: The number of steps to run evaluation for.
+    schedule: (str) The schedule to run.
The value here must + be the name of one of Experiment's methods. + """ + if schedule == "local_run": + # Run the local demo. + run_locally( + data_dir=data_dir, + model=model, + output_dir=output_dir, + train_steps=train_steps, + eval_steps=eval_steps) + else: + # Perform distributed training/evaluation. + learn_runner.run( + experiment_fn=make_experiment_fn( + data_dir=data_dir, + model_name=model, + train_steps=train_steps, + eval_steps=eval_steps), + schedule=schedule, + output_dir=FLAGS.output_dir) + + +def validate_flags(): + if not FLAGS.model: + raise ValueError("Must specify a model with --model.") + if not FLAGS.problems: + raise ValueError("Must specify a set of problems with --problems.") + if not (FLAGS.hparams_set or FLAGS.hparams_range): + raise ValueError("Must specify either --hparams_set or --hparams_range.") + if not FLAGS.schedule: + raise ValueError("Must specify --schedule.") + if not FLAGS.output_dir: + FLAGS.output_dir = "/tmp/tensor2tensor" + tf.logging.warning("It is strongly recommended to specify --output_dir. " + "Using default output_dir=%s.", FLAGS.output_dir) + + +def session_config(): + """The TensorFlow Session config to use.""" + graph_options = tf.GraphOptions(optimizer_options=tf.OptimizerOptions( + opt_level=tf.OptimizerOptions.L1, do_function_inlining=False)) + if FLAGS.experimental_optimize_placement: + rewrite_options = tf.RewriterConfig(optimize_tensor_layout=True) + rewrite_options.optimizers.append("pruning") + rewrite_options.optimizers.append("constfold") + rewrite_options.optimizers.append("layout") + graph_options = tf.GraphOptions( + rewrite_options=rewrite_options, infer_shapes=True) + config = tf.ConfigProto( + allow_soft_placement=True, graph_options=graph_options) + + return config + + +def model_builder(model, hparams): + """Returns a function to build the model. + + Args: + model: The name of the model to use. + hparams: The hyperparameters. + + Returns: + A function to build the model's graph. This function is called by + the Estimator object to construct the graph. 
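+
+  For example, mirroring create_experiment_components elsewhere in this file
+  (the model name "my_model" is a stand-in):
+
+    estimator = tf.contrib.learn.Estimator(
+        model_fn=model_builder("my_model", hparams=hparams),
+        model_dir=output_dir)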
+ """ + + def initializer(): + if hparams.initializer == "orthogonal": + return tf.orthogonal_initializer(gain=hparams.initializer_gain) + elif hparams.initializer == "uniform": + max_val = 0.1 * hparams.initializer_gain + return tf.random_uniform_initializer(-max_val, max_val) + elif hparams.initializer == "normal_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="normal") + elif hparams.initializer == "uniform_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="uniform") + else: + raise ValueError("Unrecognized initializer: %s" % hparams.initializer) + + def learning_rate_decay(): + """Inverse-decay learning rate until warmup_steps, then decay.""" + warmup_steps = tf.to_float( + hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) + step = tf.to_float(tf.contrib.framework.get_global_step()) + if hparams.learning_rate_decay_scheme == "noam": + return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( + (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) + elif hparams.learning_rate_decay_scheme == "exp100k": + return 0.94**(step // 100000) + + inv_base = tf.exp(tf.log(0.01) / warmup_steps) + inv_decay = inv_base**(warmup_steps - step) + if hparams.learning_rate_decay_scheme == "sqrt": + decay = _sqrt_decay(step - warmup_steps) + elif hparams.learning_rate_decay_scheme == "exp10k": + decay = _exp_decay_after(step - warmup_steps, 0.9995, + FLAGS.train_steps - warmup_steps - 10000) + elif hparams.learning_rate_decay_scheme == "exp50k": + decay = _exp_decay_after(step - warmup_steps, 0.99995, + FLAGS.train_steps - warmup_steps - 50000) + elif hparams.learning_rate_decay_scheme == "exp500k": + decay = _exp_decay_after(step - warmup_steps, 0.9999955, + FLAGS.train_steps - warmup_steps - 500000) + elif hparams.learning_rate_decay_scheme == "none": + decay = tf.constant(1.0) + else: + raise ValueError("Unrecognized learning rate decay scheme: %s" % + hparams.learning_rate_decay_scheme) + return tf.cond( + step < warmup_steps, + lambda: inv_decay, + lambda: decay, + name="learning_rate_decay_warump_cond") + + def model_fn(features, targets, mode): + """Creates the prediction, loss, and train ops. + + Args: + features: A dictionary of tensors keyed by the feature name. + targets: A tensor representing the labels (targets). + mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + + Returns: + A tuple consisting of the prediction, loss, and train_op. + """ + if mode == tf.contrib.learn.ModeKeys.INFER and FLAGS.interactive: + features = _interactive_input_tensor_to_features_dict(features, hparams) + if mode == tf.contrib.learn.ModeKeys.INFER and FLAGS.decode_from_file: + features = _decode_input_tensor_to_features_dict(features, hparams) + # A dictionary containing: + # - problem_choice: A Tensor containing an integer indicating which problem + # was selected for this run. + # - predictions: A Tensor containing the model's output predictions. + run_info = dict() + run_info["problem_choice"] = features["problem_choice"] + + if targets is not None: + features["targets"] = targets + + dp = data_parallelism() + + # Add input statistics for incoming features. 
+    with tf.name_scope("input_stats"):
+      for (k, v) in six.iteritems(features):
+        if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
+          tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
+          tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
+          nonpadding = tf.to_float(tf.not_equal(v, 0))
+          tf.summary.scalar("%s_nonpadding_tokens" % k,
+                            tf.reduce_sum(nonpadding))
+          tf.summary.scalar("%s_nonpadding_fraction" % k,
+                            tf.reduce_mean(nonpadding))
+
+    tf.get_variable_scope().set_initializer(initializer())
+    train = mode == tf.contrib.learn.ModeKeys.TRAIN
+
+    # Get multi-problem logits and loss based on features["problem_choice"].
+    def nth_model(n):
+      """Build the model for the n-th problem, plus some added variables."""
+      model_class = registry.model(model)(
+          hparams, hparams.problems[n], n, dp, _ps_devices(all_workers=True))
+      if mode == tf.contrib.learn.ModeKeys.INFER:
+        return model_class.infer(
+            features,
+            beam_size=FLAGS.beam_size,
+            top_beams=FLAGS.beam_size if FLAGS.return_beams else 1,
+            last_position_only=FLAGS.infer_use_last_position_only,
+            alpha=FLAGS.alpha,
+            decode_length=FLAGS.decode_extra_length)
+      # In distributed mode, we build graph for problem=0 and problem=worker_id.
+      skipping_is_on = hparams.problem_choice == "distributed" and train
+      problem_worker_id = FLAGS.worker_id % len(hparams.problems)
+      skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id
+      # On worker 0 also build graph for problems <= 1.
+      # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
+      skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1)
+      sharded_logits, training_loss, extra_loss = model_class.model_fn(
+          features, train, skip=(skipping_is_on and skip_this_one))
+      with tf.variable_scope("losses_avg", reuse=True):
+        loss_moving_avg = tf.get_variable("problem_%d/training_loss" % n)
+        o1 = loss_moving_avg.assign(loss_moving_avg * 0.9 + training_loss * 0.1)
+        loss_moving_avg = tf.get_variable("problem_%d/extra_loss" % n)
+        o2 = loss_moving_avg.assign(loss_moving_avg * 0.9 + extra_loss * 0.1)
+        loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
+        total_loss = training_loss + extra_loss
+        o3 = loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1)
+      with tf.variable_scope("train_stats"):  # Count steps for this problem.
+        problem_steps = tf.get_variable(
+            "problem_%d_steps" % n, initializer=0, trainable=False)
+        o4 = problem_steps.assign_add(1)
+      with tf.control_dependencies([o1, o2, o3, o4]):  # Make sure the ops run.
+        total_loss = tf.identity(total_loss)
+      return [total_loss] + sharded_logits  # Need to flatten for cond later.
+
+    result_list = _cond_on_index(nth_model, features["problem_choice"], 0,
+                                 len(hparams.problems) - 1)
+
+    if mode == tf.contrib.learn.ModeKeys.INFER:
+      # Beam search in the sequence model returns both decodes with the key
+      # "outputs" and scores with the key "scores". If the return value is a
+      # dict, we expect that it will have keys "outputs", a tensor of int32,
+      # and "scores", a tensor of floats. 
This is useful if we want to return scores from + # estimator.predict + if not isinstance(result_list, dict): + ret = {"outputs": result_list}, None, None + else: + ret = { + "outputs": result_list["outputs"], + "scores": result_list["scores"] + }, None, None + if "inputs" in features: + ret[0]["inputs"] = features["inputs"] + if "infer_targets" in features: + ret[0]["targets"] = features["infer_targets"] + return ret + + sharded_logits, total_loss = result_list[1:], result_list[0] + if mode == tf.contrib.learn.ModeKeys.EVAL: + logits = tf.concat(sharded_logits, 0) + # For evaluation, return the logits layer as our predictions. + run_info["predictions"] = logits + train_op = None + return run_info, total_loss, None + + assert mode == tf.contrib.learn.ModeKeys.TRAIN + + # Some training statistics. + with tf.name_scope("training_stats"): + learning_rate = hparams.learning_rate * learning_rate_decay() + learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) + tf.summary.scalar("learning_rate", learning_rate) + global_step = tf.to_float(tf.contrib.framework.get_global_step()) + for n in xrange(len(hparams.problems)): + with tf.variable_scope("losses_avg", reuse=True): + total_loss_var = tf.get_variable("problem_%d/total_loss" % n) + training_loss_var = tf.get_variable("problem_%d/training_loss" % n) + extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) + tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) + tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) + tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) + with tf.variable_scope("train_stats", reuse=True): + nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) + tf.summary.scalar("problem_%d_frequency" % n, + tf.to_float(nth_steps) / (global_step + 1.0)) + + # Log trainable weights and add decay. + total_size, total_embedding, weight_decay_loss = 0, 0, 0.0 + all_weights = {v.name: v for v in tf.trainable_variables()} + for v_name in sorted(list(all_weights)): + v = all_weights[v_name] + v_size = int(np.prod(np.array(v.shape.as_list()))) + tf.logging.info("Weight %s\tshape %s\tsize %d", + v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) + if "embedding" in v_name: + total_embedding += v_size + total_size += v_size + if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1: + # Add weight regularization if set and the weight is not a bias (dim>1). + with tf.device(v._ref().device): # pylint: disable=protected-access + v_loss = tf.nn.l2_loss(v) / v_size + weight_decay_loss += v_loss + is_body = len(v_name) > 5 and v_name[:5] == "body/" + if hparams.weight_noise > 0.0 and is_body: + # Add weight noise if set in hparams. + with tf.device(v._ref().device): # pylint: disable=protected-access + scale = learning_rate * 0.001 + noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale + noise_op = v.assign_add(noise) + with tf.control_dependencies([noise_op]): + total_loss = tf.identity(total_loss) + tf.logging.info("Total trainable variables size: %d", total_size) + tf.logging.info("Total embedding variables size: %d", total_embedding) + tf.logging.info("Total non-embedding variables size: %d", + total_size - total_embedding) + total_loss += weight_decay_loss * hparams.weight_decay + + # Define the train_op for the TRAIN mode. 
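+    # Gradients are computed once for the whole multi-problem graph, clipped
+    # to hparams.clip_grad_norm, and colocated with their forward ops.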
+    opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams)
+    tf.logging.info("Computing gradients for global model_fn.")
+    train_op = tf.contrib.layers.optimize_loss(
+        name="training",
+        loss=total_loss,
+        global_step=tf.contrib.framework.get_global_step(),
+        learning_rate=learning_rate,
+        clip_gradients=hparams.clip_grad_norm or None,
+        optimizer=opt,
+        colocate_gradients_with_ops=True)
+
+    tf.logging.info("Global model_fn finished.")
+    return run_info, total_loss, train_op
+
+  return model_fn
+
+
+def run_locally(data_dir, model, output_dir, train_steps, eval_steps):
+  """Runs an Estimator locally.
+
+  This function demonstrates model training, evaluation, and inference locally.
+
+  Args:
+    data_dir: The directory the data can be found in.
+    model: The name of the model to use.
+    output_dir: The directory to store outputs in.
+    train_steps: The number of steps to run training for.
+    eval_steps: The number of steps to run evaluation for.
+  """
+  train_problems_data = get_datasets_for_mode(data_dir,
+                                              tf.contrib.learn.ModeKeys.TRAIN)
+
+  # For a local run, we can train, evaluate, and predict.
+  hparams = create_hparams(FLAGS.hparams_set, FLAGS.data_dir)
+  hparams.problems = [
+      problem_hparams.problem_hparams(problem, hparams)
+      for problem in FLAGS.problems.split("-")
+  ]
+
+  estimator = tf.contrib.learn.Estimator(
+      model_fn=model_builder(model, hparams=hparams),
+      model_dir=output_dir,
+      config=tf.contrib.learn.RunConfig(
+          session_config=session_config(),
+          keep_checkpoint_max=FLAGS.keep_checkpoint_max))
+
+  num_datashards = data_parallelism().n
+
+  if train_steps > 0:
+    # Train.
+    tf.logging.info("Performing local training.")
+    estimator.fit(
+        input_fn=get_input_fn(
+            mode=tf.contrib.learn.ModeKeys.TRAIN,
+            hparams=hparams,
+            data_file_patterns=train_problems_data,
+            num_datashards=num_datashards),
+        steps=train_steps,
+        monitors=[])
+
+  if eval_steps > 0:
+    # Evaluate.
+    tf.logging.info("Performing local evaluation.")
+    eval_problems_data = get_datasets_for_mode(data_dir,
+                                               tf.contrib.learn.ModeKeys.EVAL)
+    eval_input_fn = get_input_fn(
+        mode=tf.contrib.learn.ModeKeys.EVAL,
+        hparams=hparams,
+        data_file_patterns=eval_problems_data,
+        num_datashards=num_datashards)
+    unused_metrics = estimator.evaluate(
+        input_fn=eval_input_fn,
+        steps=eval_steps,
+        metrics=metrics.create_evaluation_metrics(FLAGS.problems.split("-")))
+
+  # Predict.
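+  # Inference runs in one of three modes below: an interactive read-eval
+  # loop, decoding a file of inputs, or decoding the dataset for each problem.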
+  if FLAGS.interactive:
+    infer_input_fn = _interactive_input_fn(hparams)
+    for problem_idx, example in infer_input_fn:
+      targets_vocab = hparams.problems[problem_idx].vocabulary["targets"]
+      result_iter = estimator.predict(input_fn=lambda e=example: e)
+      for result in result_iter:
+        if FLAGS.return_beams:
+          beams = np.split(result["outputs"], FLAGS.beam_size, axis=0)
+          scores = None
+          if "scores" in result:
+            scores = np.split(result["scores"], FLAGS.beam_size, axis=0)
+          for k, beam in enumerate(beams):
+            tf.logging.info("BEAM %d:" % k)
+            if scores is not None:
+              tf.logging.info("%s\tScore:%f" %
+                              (targets_vocab.decode(beam.flatten()), scores[k]))
+            else:
+              tf.logging.info(targets_vocab.decode(beam.flatten()))
+        else:
+          tf.logging.info(targets_vocab.decode(result["outputs"].flatten()))
+  # Predict from file
+  elif FLAGS.decode_from_file is not None:
+    problem_id = FLAGS.decode_problem_id
+    inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"]
+    targets_vocab = hparams.problems[problem_id].vocabulary["targets"]
+    tf.logging.info("Performing decoding from a file.")
+    sorted_inputs, sorted_keys = _get_sorted_inputs()
+    num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1
+    input_fn = _decode_batch_input_fn(problem_id, num_decode_batches,
+                                      sorted_inputs, inputs_vocab)
+
+    # Strips everything after the first <EOS> id, which is assumed to be 1.
+    def _save_until_eos(hyp):  # pylint: disable=missing-docstring
+      ret = []
+      index = 0
+      # Until you reach the <EOS> id.
+      while index < len(hyp) and hyp[index] != 1:
+        ret.append(hyp[index])
+        index += 1
+      return np.array(ret)
+
+    decodes = []
+    for _ in range(num_decode_batches):
+      result_iter = estimator.predict(input_fn=input_fn.next, as_iterable=True)
+      for result in result_iter:
+
+        def log_fn(inputs, outputs):
+          decoded_inputs = inputs_vocab.decode(
+              _save_until_eos(inputs.flatten()))
+          tf.logging.info("Inference results INPUT: %s" % decoded_inputs)
+
+          decoded_outputs = targets_vocab.decode(
+              _save_until_eos(outputs.flatten()))
+          tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs)
+          return decoded_outputs
+
+        if FLAGS.return_beams:
+          beam_decodes = []
+          output_beams = np.split(result["outputs"], FLAGS.beam_size, axis=0)
+          for k, beam in enumerate(output_beams):
+            tf.logging.info("BEAM %d:" % k)
+            beam_decodes.append(log_fn(result["inputs"], beam))
+          decodes.append(str.join("\t", beam_decodes))
+
+        else:
+          decodes.append(log_fn(result["inputs"], result["outputs"]))
+
+    # Reversing the decoded inputs and outputs because they were reversed in
+    # _decode_batch_input_fn
+    sorted_inputs.reverse()
+    decodes.reverse()
+    # Dumping inputs and outputs to file FLAGS.decode_from_file.decodes in
+    # format result\tinput in the same order as original inputs
+    if FLAGS.decode_shards > 1:
+      base_filename = FLAGS.decode_from_file + ("%.2d" % FLAGS.worker_id)
+    else:
+      base_filename = FLAGS.decode_from_file
+    decode_filename = (
+        base_filename + "." + FLAGS.model + "." + FLAGS.hparams_set + ".beam" +
+        str(FLAGS.beam_size) + ".alpha" + str(FLAGS.alpha) + ".decodes")
+    tf.logging.info("Writing decodes into %s" % decode_filename)
+    outfile = tf.gfile.Open(decode_filename, "w")
+    for index in range(len(sorted_inputs)):
+      outfile.write("%s\t%s\n" % (decodes[sorted_keys[index]],
+                                  sorted_inputs[sorted_keys[index]]))
+  else:
+    for i, problem in enumerate(FLAGS.problems.split("-")):
+      inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None)
+      targets_vocab = hparams.problems[i].vocabulary["targets"]
+      tf.logging.info("Performing local inference.")
+      infer_problems_data = get_datasets_for_mode(
+          data_dir, tf.contrib.learn.ModeKeys.INFER)
+      infer_input_fn = get_input_fn(
+          mode=tf.contrib.learn.ModeKeys.INFER,
+          hparams=hparams,
+          data_file_patterns=infer_problems_data,
+          num_datashards=num_datashards,
+          fixed_problem=i)
+      result_iter = estimator.predict(
+          input_fn=infer_input_fn, as_iterable=FLAGS.endless_dec)
+
+      def log_fn(inputs, targets, outputs, problem, j):
+        """Log inference results."""
+        if "image" in problem and FLAGS.save_images:
+          save_path = os.path.join(FLAGS.output_dir,
+                                   "%s_prediction_%d.jpg" % (problem, j))
+          show_and_save_image(inputs / 255., save_path)
+        elif inputs_vocab:
+          decoded_inputs = inputs_vocab.decode(inputs.flatten())
+          tf.logging.info("Inference results INPUT: %s" % decoded_inputs)
+
+        decoded_outputs = targets_vocab.decode(outputs.flatten())
+        decoded_targets = targets_vocab.decode(targets.flatten())
+        tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs)
+        if FLAGS.decode_to_file:
+          output_filepath = FLAGS.decode_to_file + ".outputs." + problem
+          output_file = tf.gfile.Open(output_filepath, "a")
+          output_file.write(decoded_outputs + "\n")
+          target_filepath = FLAGS.decode_to_file + ".targets." + problem
+          target_file = tf.gfile.Open(target_filepath, "a")
+          target_file.write(decoded_targets + "\n")
+
+      # The function predict() returns an iterable over the network's
+      # predictions from the test input. If FLAGS.endless_dec is set, it will
+      # decode over the dev set endlessly, looping over it. We use the returned
+      # iterator to log inputs and decodes.
+
+      # The function predict() returns an iterable over the network's
+      # predictions from the test input. If FLAGS.endless_dec is set, it will
+      # decode over the dev set endlessly, looping over it. We use the
+      # returned iterator to log inputs and decodes.
+      if FLAGS.endless_dec:
+        tf.logging.info("Warning: Decoding endlessly")
+        for j, result in enumerate(result_iter):
+          inputs, targets, outputs = (result["inputs"], result["targets"],
+                                      result["outputs"])
+          if FLAGS.return_beams:
+            output_beams = np.split(outputs, FLAGS.beam_size, axis=0)
+            for k, beam in enumerate(output_beams):
+              tf.logging.info("BEAM %d:" % k)
+              log_fn(inputs, targets, beam, problem, j)
+          else:
+            log_fn(inputs, targets, outputs, problem, j)
+      else:
+        for j, (inputs, targets, outputs) in enumerate(
+            zip(result_iter["inputs"], result_iter["targets"],
+                result_iter["outputs"])):
+          if FLAGS.return_beams:
+            output_beams = np.split(outputs, FLAGS.beam_size, axis=0)
+            for k, beam in enumerate(output_beams):
+              tf.logging.info("BEAM %d:" % k)
+              log_fn(inputs, targets, beam, problem, j)
+          else:
+            log_fn(inputs, targets, outputs, problem, j)
+
+
+def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs,
+                           vocabulary):
+  tf.logging.info("Decoding %d batches" % num_decode_batches)
+  # First reverse all the input sentences so that if you're going to get OOMs,
+  # you'll see it in the first batch.
+  sorted_inputs.reverse()
+  for b in range(num_decode_batches):
+    tf.logging.info("Decoding batch %d" % b)
+    batch_length = 0
+    batch_inputs = []
+    for inputs in sorted_inputs[b * FLAGS.decode_batch_size:(
+        b + 1) * FLAGS.decode_batch_size]:
+      input_ids = vocabulary.encode(inputs)
+      input_ids.append(1)  # Assuming EOS=1.
+      batch_inputs.append(input_ids)
+      if len(input_ids) > batch_length:
+        batch_length = len(input_ids)
+    final_batch_inputs = []
+    for input_ids in batch_inputs:
+      assert len(input_ids) <= batch_length
+      x = input_ids + [0] * (batch_length - len(input_ids))
+      final_batch_inputs.append(x)
+    yield {
+        "inputs": np.array(final_batch_inputs),
+        "problem_choice": np.array(problem_id)
+    }
+
+
+def get_datasets_for_mode(data_dir, mode):
+  return data_reader.get_datasets(FLAGS.problems, data_dir, mode)
+
+
+def _cond_on_index(fn, index_tensor, cur_idx, max_idx):
+  """Call fn(index_tensor) using tf.cond for indices in [cur_idx, max_idx]."""
+  if cur_idx == max_idx:
+    return fn(cur_idx)
+  return tf.cond(
+      tf.equal(index_tensor, cur_idx), lambda: fn(cur_idx),
+      lambda: _cond_on_index(fn, index_tensor, cur_idx + 1, max_idx))
+
+
+def _interactive_input_fn(hparams):
+  """Generator that reads from the terminal and yields "interactive inputs".
+
+  Due to temporary limitations in tf.learn, if we don't want to reload the
+  whole graph, then we are stuck encoding all of the input as one fixed-size
+  numpy array.
+
+  We yield int64 arrays with shape [const_array_size]. The format is:
+  [num_samples, decode_length, len(input ids), <input ids>, <padding zeros>]
+
+  Args:
+    hparams: model hparams
+  Yields:
+    numpy arrays
+
+  Raises:
+    Exception: when `input_type` is invalid.
+  """
+  num_samples = 3
+  decode_length = 100
+  input_type = "text"
+  problem_id = 0
+  p_hparams = hparams.problems[problem_id]
+  has_input = "inputs" in p_hparams.input_modality
+  vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"]
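+  # For example (illustrative ids): encoding "hello" to [42, 43] with
+  # num_samples=3 and decode_length=100 (and EOS appended as id 1) yields
+  # [3, 100, 3, 42, 43, 1, 0, 0, ...], zero-padded up to const_array_size.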
+  # This should be longer than the longest input.
+  const_array_size = 10000
+  while True:
+    prompt = ("INTERACTIVE MODE  num_samples=%d  decode_length=%d\n"
+              "  it=<input_type>     ('text' or 'image')\n"
+              "  pr=<problem_num>    (set the problem number)\n"
+              "  in=<input_problem>  (set the input problem number)\n"
+              "  ou=<output_problem> (set the output problem number)\n"
+              "  ns=<num_samples>    (changes number of samples)\n"
+              "  dl=<decode_length>  (changes decode length)\n"
+              "  <%s>                (decode)\n"
+              "  q                   (quit)\n"
+              ">" % (num_samples, decode_length,
+                     "source_string" if has_input else "target_prefix"))
+    input_string = input(prompt)
+    if input_string == "q":
+      return
+    elif input_string[:3] == "pr=":
+      problem_id = int(input_string[3:])
+      p_hparams = hparams.problems[problem_id]
+      has_input = "inputs" in p_hparams.input_modality
+      vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"]
+    elif input_string[:3] == "in=":
+      problem = int(input_string[3:])
+      p_hparams.input_modality = hparams.problems[problem].input_modality
+      p_hparams.input_space_id = hparams.problems[problem].input_space_id
+    elif input_string[:3] == "ou=":
+      problem = int(input_string[3:])
+      p_hparams.target_modality = hparams.problems[problem].target_modality
+      p_hparams.target_space_id = hparams.problems[problem].target_space_id
+    elif input_string[:3] == "ns=":
+      num_samples = int(input_string[3:])
+    elif input_string[:3] == "dl=":
+      decode_length = int(input_string[3:])
+    elif input_string[:3] == "it=":
+      input_type = input_string[3:]
+    else:
+      if input_type == "text":
+        input_ids = vocabulary.encode(input_string)
+        if has_input:
+          input_ids.append(1)  # Assume 1 means end-of-source.
+        x = [num_samples, decode_length, len(input_ids)] + input_ids
+        assert len(x) < const_array_size
+        x += [0] * (const_array_size - len(x))
+        yield problem_id, {
+            "inputs": np.array(x),
+            "problem_choice": np.array(problem_id)
+        }
+      elif input_type == "image":
+        input_path = input_string
+        img = read_image(input_path)
+        yield problem_id, {
+            "inputs": img,
+            "problem_choice": np.array(problem_id)
+        }
+      else:
+        raise Exception("Unsupported input type.")
+
+
+def read_image(path):
+  try:
+    import matplotlib.image as im  # pylint: disable=g-import-not-at-top
+  except ImportError as e:
+    tf.logging.warning(
+        "Reading an image requires matplotlib to be installed: %s", e)
+    raise NotImplementedError("Image reading not implemented.")
+  return im.imread(path)
+
+
+def show_and_save_image(img, save_path):
+  try:
+    import matplotlib.pyplot as plt  # pylint: disable=g-import-not-at-top
+  except ImportError as e:
+    tf.logging.warning("Showing and saving an image requires matplotlib to be "
+                       "installed: %s", e)
+    raise NotImplementedError("Image display and save not implemented.")
+  plt.imshow(img)
+  plt.savefig(save_path)
+
+
+def _get_sorted_inputs():
+  """Returns inputs sorted according to length.
+
+  Returns:
+    a list of inputs sorted by length, and a dict mapping each input's
+    original index to its position in the sorted list
+  """
+  tf.logging.info("Getting sorted inputs")
+  # Read the file and sort the inputs by token length.
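+  # For example (illustrative): inputs ["a b c", "a", "a b"] sort to
+  # ["a", "a b", "a b c"] with sorted_keys == {0: 2, 1: 0, 2: 1}, so the
+  # decode for original input i is later found at position sorted_keys[i].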
+  if FLAGS.decode_shards > 1:
+    decode_filename = FLAGS.decode_from_file + ("%.2d" % FLAGS.worker_id)
+  else:
+    decode_filename = FLAGS.decode_from_file
+  inputs = [line.strip() for line in tf.gfile.Open(decode_filename)]
+  input_lens = [(i, len(line.strip().split())) for i, line in enumerate(inputs)]
+  sorted_input_lens = sorted(input_lens, key=operator.itemgetter(1))
+  # We'll need the keys to rearrange the inputs back into their original order.
+  sorted_keys = {}
+  sorted_inputs = []
+  for i, (index, _) in enumerate(sorted_input_lens):
+    sorted_inputs.append(inputs[index])
+    sorted_keys[index] = i
+  return sorted_inputs, sorted_keys
+
+
+def _interactive_input_tensor_to_features_dict(feature_map, hparams):
+  """Convert the interactive input format (see above) to a dictionary.
+
+  Args:
+    feature_map: a dictionary with keys `problem_choice` and `inputs`
+      containing Tensors.
+    hparams: model hyperparameters
+
+  Returns:
+    a features dictionary, as expected by the decoder.
+  """
+  inputs = tf.constant(feature_map["inputs"])
+  input_is_image = len(inputs.shape) >= 3
+
+  def input_fn(problem_choice, x=inputs):  # pylint: disable=missing-docstring
+    p_hparams = hparams.problems[problem_choice]
+    if not input_is_image:
+      # Remove the batch dimension.
+      num_samples = x[0]
+      length = x[2]
+      x = tf.slice(x, [3], tf.to_int32([length]))
+      x = tf.reshape(x, [1, -1, 1, 1])
+      # Transform into a batch of size num_samples to get that many random
+      # decodes.
+      x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1]))
+    else:
+      x = tf.image.resize_images(x, [299, 299])
+      x = tf.reshape(x, [1, 299, 299, -1])
+      x = tf.to_int32(x)
+    return (tf.constant(p_hparams.input_space_id),
+            tf.constant(p_hparams.target_space_id), x)
+
+  input_space_id, target_space_id, x = _cond_on_index(
+      input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1)
+
+  features = {}
+  features["problem_choice"] = tf.constant(feature_map["problem_choice"])
+  features["input_space_id"] = input_space_id
+  features["target_space_id"] = target_space_id
+  features["decode_length"] = (IMAGE_DECODE_LENGTH
+                               if input_is_image else inputs[1])
+  features["inputs"] = x
+  return features
+
+
+def _decode_input_tensor_to_features_dict(feature_map, hparams):
+  """Convert the file-decode input format to a features dictionary.
+
+  Args:
+    feature_map: a dictionary with keys `problem_choice` and `inputs`
+      containing Tensors.
+    hparams: model hyperparameters
+
+  Returns:
+    a features dictionary, as expected by the decoder.
+  """
+  inputs = tf.constant(feature_map["inputs"])
+  input_is_image = False
+
+  def input_fn(problem_choice, x=inputs):  # pylint: disable=missing-docstring
+    p_hparams = hparams.problems[problem_choice]
+    # Add a third, empty dimension.
+    x = tf.expand_dims(x, axis=2)
+    x = tf.to_int32(x)
+    return (tf.constant(p_hparams.input_space_id),
+            tf.constant(p_hparams.target_space_id), x)
+
+  input_space_id, target_space_id, x = _cond_on_index(
+      input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1)
+
+  features = {}
+  features["problem_choice"] = feature_map["problem_choice"]
+  features["input_space_id"] = input_space_id
+  features["target_space_id"] = target_space_id
+  features["decode_length"] = (IMAGE_DECODE_LENGTH
+                               if input_is_image else tf.shape(x)[1] + 50)
+  features["inputs"] = x
+  return features
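+
+
+# Editor's illustrative sketch (hypothetical helper, not part of the original
+# patch): demonstrates how _cond_on_index, used by both feature-dict builders
+# above, unrolls a scalar tensor index into a chain of tf.cond ops. It is
+# defined for documentation only and never called.
+def _example_cond_on_index():
+  index_tensor = tf.placeholder(tf.int32, [], name="example_index")
+  # Each branch must return tensors with matching dtypes and structure.
+  out = _cond_on_index(lambda i: tf.constant(float(i) * 10.0),
+                       index_tensor, 0, 2)
+  with tf.Session() as sess:
+    # Feeding index 1 selects the middle branch and returns 10.0.
+    return sess.run(out, feed_dict={index_tensor: 1})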
+
+
+def get_input_fn(mode,
+                 hparams,
+                 data_file_patterns=None,
+                 num_datashards=None,
+                 fixed_problem=None):
+  """Provides input to the graph, either from disk or via a placeholder.
+
+  This function produces an input function that will feed data into
+  the network. There are two modes of operation:
+
+  1. If data_file_patterns and all subsequent arguments are None, then
+     it creates a placeholder for a serialized tf.Example proto.
+  2. If data_file_patterns is defined, it will read the data from the
+     files at the given location. Use this mode for training,
+     evaluation, and prediction.
+
+  Args:
+    mode: The execution mode, as defined in tf.contrib.learn.ModeKeys.
+    hparams: HParams object.
+    data_file_patterns: The list of file patterns to use to read in data. Set
+      to `None` if you want to create a placeholder for the input data. The
+      `problems` flag is a list of problem names joined by the `-` character.
+      The flag's string is then split along the `-` and each problem gets its
+      own example queue.
+    num_datashards: An integer.
+    fixed_problem: An integer indicating the problem to fetch data for, or
+      None if the input is to be randomly selected.
+
+  Returns:
+    A function that returns a dictionary of features and the target labels.
+  """
+
+  def input_fn():
+    """Supplies input to our model.
+
+    This function supplies input to our model, where this input is a
+    function of the mode. For example, we supply different data if
+    we're performing training versus evaluation.
+
+    Returns:
+      A tuple consisting of 1) a dictionary of tensors whose keys are
+      the feature names, and 2) a tensor of target labels if the mode
+      is not INFER (and None, otherwise).
+
+    Raises:
+      ValueError: if one of the parameters has an unsupported value.
+    """
+    problem_count, batches = len(data_file_patterns), []
+    with tf.name_scope("input_queues"):
+      for n in xrange(problem_count):
+        if fixed_problem is not None and n != fixed_problem:
+          continue
+        with tf.name_scope("problem_%d" % n):
+          with tf.device("/cpu:0"):  # Input queues are on CPU.
+            capacity = hparams.problems[n].max_expected_batch_size_per_shard
+            capacity *= num_datashards
+            examples = data_reader.input_pipeline(data_file_patterns[n],
+                                                  capacity, mode)
+            drop_long_sequences = mode == tf.contrib.learn.ModeKeys.TRAIN
+            batch_size_multiplier = hparams.problems[n].batch_size_multiplier
+            feature_map = data_reader.batch_examples(
+                examples,
+                data_reader.hparams_to_batching_scheme(
+                    hparams,
+                    shard_multiplier=num_datashards,
+                    drop_long_sequences=drop_long_sequences,
+                    length_multiplier=batch_size_multiplier))
+
+            # Reverse inputs and targets features if the problem was reversed.
+            if hparams.problems[n].was_reversed:
+              inputs = feature_map["inputs"]
+              targets = feature_map["targets"]
+              feature_map["inputs"] = targets
+              feature_map["targets"] = inputs
+
+            # Use the inputs as the targets if the problem is a copy problem.
+            if hparams.problems[n].was_copy:
+              feature_map["targets"] = feature_map["inputs"]
+
+            # Ensure inputs and targets are proper rank.
+            while len(feature_map["inputs"].get_shape()) != 4:
+              feature_map["inputs"] = tf.expand_dims(
+                  feature_map["inputs"], axis=-1)
+            while len(feature_map["targets"].get_shape()) != 4:
+              feature_map["targets"] = tf.expand_dims(
+                  feature_map["targets"], axis=-1)
+
+            batches.append(
+                (feature_map["inputs"], feature_map["targets"], tf.constant(n),
+                 tf.constant(hparams.problems[n].input_space_id),
+                 tf.constant(hparams.problems[n].target_space_id)))
+
+    # We choose which problem to process.
+    loss_moving_avgs = []  # Need loss moving averages for that.
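+    # For example (illustrative): with problems "wmt_ende_tokens_32k-lm1b_32k"
+    # and problem_choice="adaptive", the moving-average losses below act as
+    # logits for tf.multinomial, so the problem with the higher average loss
+    # is sampled more often for the next training batch.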
+    for n in xrange(problem_count):
+      with tf.variable_scope("losses_avg"):
+        loss_moving_avgs.append(
+            tf.get_variable(
+                "problem_%d/total_loss" % n, initializer=100.0,
+                trainable=False))
+        tf.get_variable(
+            "problem_%d/training_loss" % n, initializer=100.0, trainable=False)
+        tf.get_variable(
+            "problem_%d/extra_loss" % n, initializer=100.0, trainable=False)
+    if fixed_problem is None:
+      if (hparams.problem_choice == "uniform" or
+          mode != tf.contrib.learn.ModeKeys.TRAIN):
+        problem_choice = tf.random_uniform(
+            [], maxval=problem_count, dtype=tf.int32)
+      elif hparams.problem_choice == "adaptive":
+        loss_moving_avgs = tf.stack(loss_moving_avgs)
+        problem_choice = tf.multinomial(
+            tf.reshape(loss_moving_avgs, [1, -1]), 1)
+        problem_choice = tf.to_int32(tf.squeeze(problem_choice))
+      elif hparams.problem_choice == "distributed":
+        assert FLAGS.worker_replicas >= problem_count
+        assert FLAGS.worker_replicas % problem_count == 0
+        problem_choice = tf.to_int32(FLAGS.worker_id % problem_count)
+      else:
+        raise ValueError("Value of hparams.problem_choice is %s and must be "
+                         "one of [uniform, adaptive, distributed]" %
+                         hparams.problem_choice)
+
+      # Inputs and targets conditional on problem_choice.
+      rand_inputs, rand_target, choice, inp_id, tgt_id = _cond_on_index(
+          lambda n: batches[n], problem_choice, 0, problem_count - 1)
+    else:
+      problem_choice = tf.constant(fixed_problem)
+      # Take the only constructed batch, which is the fixed_problem.
+      rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0]
+
+    # Set shapes so the ranks are clear.
+    rand_inputs.set_shape([None, None, None, None])
+    rand_target.set_shape([None, None, None, None])
+    choice.set_shape([])
+    inp_id.set_shape([])
+    tgt_id.set_shape([])
+    # Forced shape obfuscation is necessary for inference.
+    if mode == tf.contrib.learn.ModeKeys.INFER:
+      rand_inputs._shape = tf.TensorShape([None, None, None, None])  # pylint: disable=protected-access
+      rand_target._shape = tf.TensorShape([None, None, None, None])  # pylint: disable=protected-access
+
+    # Final feature map.
+    rand_feature_map = {
+        "inputs": rand_inputs,
+        "problem_choice": choice,
+        "input_space_id": inp_id,
+        "target_space_id": tgt_id
+    }
+    if mode == tf.contrib.learn.ModeKeys.INFER:
+      rand_feature_map["infer_targets"] = rand_target
+      rand_target = None
+    return rand_feature_map, rand_target
+
+  return input_fn
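+
+
+# Illustrative usage sketch (editor's addition; `hparams`, `data_dir` and
+# `num_datashards` are assumed to be in scope, as in the surrounding training
+# setup):
+#
+#   train_input_fn = get_input_fn(
+#       mode=tf.contrib.learn.ModeKeys.TRAIN,
+#       hparams=hparams,
+#       data_file_patterns=get_datasets_for_mode(
+#           data_dir, tf.contrib.learn.ModeKeys.TRAIN),
+#       num_datashards=num_datashards)
+#   features, targets = train_input_fn()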
+
+
+class _ConditionalOptimizer(tf.train.Optimizer):
+  """Conditional optimizer."""
+
+  def __init__(self, optimizer_name, lr, hparams, skip_condition_tensor=False):
+    # `skip_condition_tensor` is either False (always apply gradients) or a
+    # boolean scalar tensor; when it evaluates to True, the gradient update
+    # is skipped for that step.
+    self._skip_condition = skip_condition_tensor
+    if optimizer_name == "Adam":
+      # We change the default epsilon for Adam and re-scale lr.
+      # Using LazyAdam as it's much faster for large vocabulary embeddings.
+      self._opt = tf.contrib.opt.LazyAdamOptimizer(
+          lr / 500.0,
+          beta1=hparams.optimizer_adam_beta1,
+          beta2=hparams.optimizer_adam_beta2,
+          epsilon=hparams.optimizer_adam_epsilon)
+    elif optimizer_name == "Momentum":
+      self._opt = tf.train.MomentumOptimizer(
+          lr, momentum=hparams.optimizer_momentum_momentum)
+    else:
+      self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)
+
+  def compute_gradients(self, loss, var_list, colocate_gradients_with_ops):
+    return self._opt.compute_gradients(
+        loss, var_list, colocate_gradients_with_ops=colocate_gradients_with_ops)
+
+  def apply_gradients(self, gradients, global_step=None, name=None):
+
+    def opt_gradients():
+      return self._opt.apply_gradients(
+          gradients, global_step=global_step, name=name)
+
+    # `is False` distinguishes the literal default from a boolean tensor.
+    if self._skip_condition is False:
+      return opt_gradients()
+    return tf.cond(
+        self._skip_condition,
+        tf.no_op,
+        opt_gradients,
+        name="conditional_optimizer_gradients_skip_cond")
+
+
+def _sqrt_decay(step):
+  """Decay like 1 / sqrt(step), multiplied by 500 to normalize."""
+  return 500.0 / tf.sqrt(tf.maximum(step, 1.0))
+
+
+def _exp_decay_after(step, rate, from_which_step):
+  """Decay exponentially by rate (per step) starting at from_which_step."""
+  return tf.cond(
+      step < from_which_step,
+      lambda: tf.constant(1.0),
+      lambda: rate**(step - from_which_step),
+      name="exponential_decay_step_cond")
+
+
+def _ps_replicas(all_workers=False):
+  if all_workers:
+    return list(range(FLAGS.ps_replicas))
+  # Worker K will be using replicas {0,...n-1} + K*n if we have n replicas.
+  num_replicas = FLAGS.ps_replicas // FLAGS.worker_replicas
+  return [d + FLAGS.worker_id * num_replicas for d in xrange(num_replicas)]
+
+
+def _gpu_order(num_gpus):
+  if FLAGS.gpu_order:
+    ret = [int(s) for s in FLAGS.gpu_order.split(" ")]
+    if len(ret) == num_gpus:
+      return ret
+  return list(range(num_gpus))
+
+
+def _ps_gpus(all_workers=False):
+  ps_gpus = []
+  for d in _ps_replicas(all_workers=all_workers):
+    ps_gpus.extend([(d, gpu) for gpu in _gpu_order(FLAGS.ps_gpu)])
+  return ps_gpus
+
+
+def _ps_devices(all_workers=False):
+  """List of ps devices (where to put the experts).
+
+  Args:
+    all_workers: whether the list is for all async workers or just this one.
+
+  Returns:
+    a list of device names
+  """
+  if FLAGS.ps_replicas > 0:
+    if FLAGS.ps_gpu > 0:
+      return [
+          FLAGS.ps_job + "/task:%d/GPU:%d" % (d, gpu)
+          for (d, gpu) in _ps_gpus(all_workers=all_workers)
+      ]
+    else:
+      return [
+          FLAGS.ps_job + "/task:%d" % d
+          for d in _ps_replicas(all_workers=all_workers)
+      ]
+  else:
+    if FLAGS.worker_gpu > 0:
+      return ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)]
+    else:
+      return [""]
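+
+
+# For example (illustrative, assuming FLAGS.ps_job="/job:ps" and an unset
+# FLAGS.gpu_order): with ps_replicas=2 and ps_gpu=2,
+# _ps_devices(all_workers=True) returns
+# ["/job:ps/task:0/GPU:0", "/job:ps/task:0/GPU:1",
+#  "/job:ps/task:1/GPU:0", "/job:ps/task:1/GPU:1"].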
+ """ + + def _replica_device_setter(worker_device): + if FLAGS.ps_replicas == 0: + return worker_device + return tf.train.replica_device_setter( + worker_device=worker_device, + ps_tasks=FLAGS.ps_replicas, + ps_device=FLAGS.ps_job + "/GPU:0" if FLAGS.ps_gpu > 0 else FLAGS.ps_job) + + if FLAGS.schedule == "local_run": + assert not FLAGS.sync + datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] + caching_devices = None + elif FLAGS.sync: + assert FLAGS.ps_replicas > 0 + datashard_devices = [ + _replica_device_setter(d) for d in _ps_devices(all_workers=all_workers) + ] + if FLAGS.ps_gpu > 0 and FLAGS.ps_replicas > 1: + caching_devices = [ + FLAGS.ps_job + "/task:%d/cpu:0" % d + for (d, _) in _ps_gpus(all_workers=all_workers) + ] + else: + caching_devices = None + else: + # old fashioned async - compute on worker + if FLAGS.worker_gpu > 1: + datashard_devices = [ + _replica_device_setter(FLAGS.worker_job + "/GPU:%d" % d) + for d in _gpu_order(FLAGS.worker_gpu) + ] + caching_devices = [FLAGS.worker_job + "/GPU:0"] * FLAGS.worker_gpu + else: + datashard_devices = [_replica_device_setter(FLAGS.worker_job)] + caching_devices = None + tf.logging.info("datashard_devices: %s", datashard_devices) + tf.logging.info("caching_devices: %s", caching_devices) + return eu.Parallelism( + datashard_devices, + reuse=True, + caching_devices=caching_devices, + daisy_chain_variables=FLAGS.daisy_chain_variables) diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py new file mode 100644 index 000000000..4e0807d4e --- /dev/null +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -0,0 +1,41 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for trainer_utils.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.utils import registry +from tensor2tensor.utils import trainer_utils as utils # pylint: disable=unused-import + +import tensorflow as tf + + +class TrainerUtilsTest(tf.test.TestCase): + + def testModelsImported(self): + models = registry.list_models() + self.assertTrue("baseline_lstm_seq2seq" in models) + + def testHParamsImported(self): + hparams = registry.list_hparams() + self.assertTrue("transformer_base" in hparams) + + +if __name__ == "__main__": + tf.test.main()