From 3d9c62f2aca9492db5c22676416974005b9dcbae Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 15 Jun 2017 11:27:28 -0700 Subject: [PATCH] initial push --- AUTHORS | 7 + CONTRIBUTING.md | 23 + LICENSE | 202 +++ README.md | 103 +- setup.py | 23 + tensor2tensor/__init__.py | 14 + tensor2tensor/bin/t2t-datagen | 361 +++++ tensor2tensor/bin/t2t-trainer | 56 + tensor2tensor/data_generators/README.md | 71 + tensor2tensor/data_generators/__init__.py | 14 + tensor2tensor/data_generators/algorithmic.py | 178 +++ .../data_generators/algorithmic_math.py | 580 +++++++ .../data_generators/algorithmic_math_test.py | 84 ++ .../data_generators/algorithmic_test.py | 84 ++ tensor2tensor/data_generators/audio.py | 156 ++ tensor2tensor/data_generators/audio_test.py | 62 + .../data_generators/concatenate_examples.py | 180 +++ .../data_generators/generator_utils.py | 264 ++++ .../data_generators/generator_utils_test.py | 88 ++ tensor2tensor/data_generators/image.py | 306 ++++ tensor2tensor/data_generators/image_test.py | 71 + tensor2tensor/data_generators/lm_example.py | 123 ++ .../data_generators/problem_hparams.py | 702 +++++++++ .../data_generators/problem_hparams_test.py | 48 + tensor2tensor/data_generators/replace_oov.py | 76 + tensor2tensor/data_generators/snli.py | 167 ++ tensor2tensor/data_generators/text_encoder.py | 451 ++++++ .../text_encoder_build_subword.py | 67 + .../text_encoder_inspect_subword.py | 64 + tensor2tensor/data_generators/tokenizer.py | 117 ++ .../data_generators/tokenizer_test.py | 64 + tensor2tensor/data_generators/wmt.py | 269 ++++ tensor2tensor/data_generators/wmt_test.py | 72 + tensor2tensor/data_generators/wsj_parsing.py | 109 ++ tensor2tensor/models/README.md | 16 + tensor2tensor/models/__init__.py | 14 + tensor2tensor/models/attention_lm.py | 169 +++ tensor2tensor/models/baseline.py | 72 + tensor2tensor/models/baseline_test.py | 55 + tensor2tensor/models/bytenet.py | 112 ++ tensor2tensor/models/bytenet_test.py | 54 + tensor2tensor/models/common_attention.py | 344 +++++ tensor2tensor/models/common_hparams.py | 193 +++ tensor2tensor/models/common_layers.py | 1340 +++++++++++++++++ tensor2tensor/models/common_layers_test.py | 290 ++++ tensor2tensor/models/models.py | 32 + tensor2tensor/models/multimodel.py | 159 ++ tensor2tensor/models/multimodel_test.py | 55 + tensor2tensor/models/neural_gpu.py | 123 ++ tensor2tensor/models/neural_gpu_test.py | 62 + tensor2tensor/models/slicenet.py | 391 +++++ tensor2tensor/models/slicenet_test.py | 54 + tensor2tensor/models/transformer.py | 495 ++++++ tensor2tensor/models/transformer_test.py | 63 + tensor2tensor/models/xception.py | 89 ++ tensor2tensor/models/xception_test.py | 54 + tensor2tensor/utils/__init__.py | 14 + tensor2tensor/utils/avg_checkpoints.py | 98 ++ tensor2tensor/utils/beam_search.py | 419 ++++++ tensor2tensor/utils/beam_search_test.py | 281 ++++ tensor2tensor/utils/bleu_hook.py | 123 ++ tensor2tensor/utils/bleu_hook_test.py | 59 + tensor2tensor/utils/data_reader.py | 346 +++++ tensor2tensor/utils/data_reader_test.py | 147 ++ tensor2tensor/utils/expert_utils.py | 1284 ++++++++++++++++ tensor2tensor/utils/metrics.py | 155 ++ tensor2tensor/utils/metrics_test.py | 88 ++ tensor2tensor/utils/modality.py | 564 +++++++ tensor2tensor/utils/modality_test.py | 88 ++ tensor2tensor/utils/registry.py | 184 +++ tensor2tensor/utils/registry_test.py | 202 +++ tensor2tensor/utils/t2t_model.py | 429 ++++++ tensor2tensor/utils/trainer_utils.py | 1302 ++++++++++++++++ tensor2tensor/utils/trainer_utils_test.py | 41 + 74 files changed, 15315 
insertions(+), 1 deletion(-) create mode 100644 AUTHORS create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 setup.py create mode 100644 tensor2tensor/__init__.py create mode 100644 tensor2tensor/bin/t2t-datagen create mode 100644 tensor2tensor/bin/t2t-trainer create mode 100644 tensor2tensor/data_generators/README.md create mode 100644 tensor2tensor/data_generators/__init__.py create mode 100644 tensor2tensor/data_generators/algorithmic.py create mode 100644 tensor2tensor/data_generators/algorithmic_math.py create mode 100644 tensor2tensor/data_generators/algorithmic_math_test.py create mode 100644 tensor2tensor/data_generators/algorithmic_test.py create mode 100644 tensor2tensor/data_generators/audio.py create mode 100644 tensor2tensor/data_generators/audio_test.py create mode 100644 tensor2tensor/data_generators/concatenate_examples.py create mode 100644 tensor2tensor/data_generators/generator_utils.py create mode 100644 tensor2tensor/data_generators/generator_utils_test.py create mode 100644 tensor2tensor/data_generators/image.py create mode 100644 tensor2tensor/data_generators/image_test.py create mode 100644 tensor2tensor/data_generators/lm_example.py create mode 100644 tensor2tensor/data_generators/problem_hparams.py create mode 100644 tensor2tensor/data_generators/problem_hparams_test.py create mode 100644 tensor2tensor/data_generators/replace_oov.py create mode 100644 tensor2tensor/data_generators/snli.py create mode 100644 tensor2tensor/data_generators/text_encoder.py create mode 100644 tensor2tensor/data_generators/text_encoder_build_subword.py create mode 100644 tensor2tensor/data_generators/text_encoder_inspect_subword.py create mode 100644 tensor2tensor/data_generators/tokenizer.py create mode 100644 tensor2tensor/data_generators/tokenizer_test.py create mode 100644 tensor2tensor/data_generators/wmt.py create mode 100644 tensor2tensor/data_generators/wmt_test.py create mode 100644 tensor2tensor/data_generators/wsj_parsing.py create mode 100644 tensor2tensor/models/README.md create mode 100644 tensor2tensor/models/__init__.py create mode 100644 tensor2tensor/models/attention_lm.py create mode 100644 tensor2tensor/models/baseline.py create mode 100644 tensor2tensor/models/baseline_test.py create mode 100644 tensor2tensor/models/bytenet.py create mode 100644 tensor2tensor/models/bytenet_test.py create mode 100644 tensor2tensor/models/common_attention.py create mode 100644 tensor2tensor/models/common_hparams.py create mode 100644 tensor2tensor/models/common_layers.py create mode 100644 tensor2tensor/models/common_layers_test.py create mode 100644 tensor2tensor/models/models.py create mode 100644 tensor2tensor/models/multimodel.py create mode 100644 tensor2tensor/models/multimodel_test.py create mode 100644 tensor2tensor/models/neural_gpu.py create mode 100644 tensor2tensor/models/neural_gpu_test.py create mode 100644 tensor2tensor/models/slicenet.py create mode 100644 tensor2tensor/models/slicenet_test.py create mode 100644 tensor2tensor/models/transformer.py create mode 100644 tensor2tensor/models/transformer_test.py create mode 100644 tensor2tensor/models/xception.py create mode 100644 tensor2tensor/models/xception_test.py create mode 100644 tensor2tensor/utils/__init__.py create mode 100644 tensor2tensor/utils/avg_checkpoints.py create mode 100644 tensor2tensor/utils/beam_search.py create mode 100644 tensor2tensor/utils/beam_search_test.py create mode 100644 tensor2tensor/utils/bleu_hook.py create mode 100644 tensor2tensor/utils/bleu_hook_test.py 
create mode 100644 tensor2tensor/utils/data_reader.py create mode 100644 tensor2tensor/utils/data_reader_test.py create mode 100644 tensor2tensor/utils/expert_utils.py create mode 100644 tensor2tensor/utils/metrics.py create mode 100644 tensor2tensor/utils/metrics_test.py create mode 100644 tensor2tensor/utils/modality.py create mode 100644 tensor2tensor/utils/modality_test.py create mode 100644 tensor2tensor/utils/registry.py create mode 100644 tensor2tensor/utils/registry_test.py create mode 100644 tensor2tensor/utils/t2t_model.py create mode 100644 tensor2tensor/utils/trainer_utils.py create mode 100644 tensor2tensor/utils/trainer_utils_test.py diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..38e5bc724 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,7 @@ +# This is the list of T2T authors for copyright purposes. +# +# This does not necessarily list everyone who has contributed code, since in +# some cases, their employer may be the copyright holder. To see the full list +# of contributors, see the revision history in source control. + +Google Inc. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..ae319c70a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,23 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to <https://cla.developers.google.com/> to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files.
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 1a992ff9a..1a650a5c2 100644 --- a/README.md +++ b/README.md @@ -1 +1,102 @@ -# tensor2tensor +# T2T: Tensor2Tensor Transformers + +[T2T](https://github.com/tensorflow/tensor2tensor) is a modular and extensible library and +set of binaries for supervised learning with TensorFlow, with a focus on sequence +tasks. Actively used and maintained by researchers and engineers within Google +Brain, T2T strives to maximize idea bandwidth and minimize execution latency. + +T2T is particularly well-suited to researchers working on sequence tasks. We're +eager to collaborate with you on extending T2T's powers, so please feel free to +open an issue on GitHub to kick off a discussion and send along pull requests. +See [our contribution doc](CONTRIBUTING.md) for details and our +[open issues](https://github.com/tensorflow/tensor2tensor/issues).
+ +## T2T overview + +``` +pip install tensor2tensor + +DATA_DIR=$HOME/data +PROBLEM=wmt_ende_tokens_32k +MODEL=transformer +HPARAMS=transformer_base +TRAIN_DIR=$HOME/train + +# Generate data +t2t-datagen \ + --data_dir=$DATA_DIR \ + --problem=$PROBLEM + +# Train +t2t-trainer \ + --data_dir=$DATA_DIR \ + --problems=$PROBLEM \ + --model=$MODEL \ + --hparams_set=$HPARAMS \ + --output_dir=$TRAIN_DIR + +# Decode +t2t-trainer \ + --data_dir=$DATA_DIR \ + --problems=$PROBLEM \ + --model=$MODEL \ + --hparams_set=$HPARAMS \ + --output_dir=$TRAIN_DIR \ + --decode_from_file=$DATA_DIR/decode_this.txt +``` + +T2T modularizes training into several components, each of which can be seen in +use in the above commands. + +### Datasets + +**Datasets** are all standardized on TFRecord files with `tensorflow.Example` +protocol buffers. All datasets are registered and generated with the +[`t2t-datagen`](tensor2tensor/bin/t2t-datagen) binary and many common +sequence datasets are already available for generation and use. + +### Problems and Modalities + +**Problems** define training-time hyperparameters for the dataset and task, +mainly by setting input and output **modalities** (e.g. symbol, image, audio, +label) and vocabularies, if applicable. All problems are defined in +[`problem_hparams.py`](tensor2tensor/data_generators/problem_hparams.py). **Modalities**, +defined in [`modality.py`](tensor2tensor/utils/modality.py), abstract away the input and +output data types so that **models** may deal with modality-independent tensors. + +### Models + +**`T2TModel`s** define the core tensor-to-tensor transformation, independent of +input/output modality or task. Models take dense tensors in and produce dense +tensors that may then be transformed in a final step by a **modality** depending +on the task (e.g. fed through a final linear transform to produce logits for a +softmax over classes). All models are imported in +[`models.py`](tensor2tensor/models/models.py), inherit from `T2TModel` - defined in +[`t2t_model.py`](tensor2tensor/utils/t2t_model.py) - and are registered with +[`@registry.register_model`](tensor2tensor/utils/registry.py). + +### Hyperparameter Sets + +**Hyperparameter sets** are defined and registered in code with +[`@registry.register_hparams`](tensor2tensor/utils/registry.py) and are encoded in +[`tf.contrib.training.HParams`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/training/python/training/hparam.py) +objects. The `HParams` are available to both the problem specification and the +model. A basic set of hyperparameters is defined in +[`common_hparams.py`](tensor2tensor/models/common_hparams.py) and hyperparameter set +functions can compose other hyperparameter set functions (see the example +sketch at the end of this README). + +### Trainer + +The **trainer** binary is the main entrypoint for training, evaluation, and +inference. Users can easily switch between problems, models, and hyperparameter +sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific +hyperparameters can be overridden with the `--hparams` flag. `--schedule` and +related flags control local and distributed training/evaluation. + +## Adding a dataset + +See the data generators [README](tensor2tensor/data_generators/README.md).
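+
+## Example: registering a hyperparameter set
+
+To make the registry mechanism above concrete, here is a minimal sketch of a
+custom hyperparameter set. It relies only on what is described above: that
+`transformer_base` is a registered hyperparameter set function and that such
+functions can compose one another. The module path, the name
+`transformer_base_batch8k`, and the overridden value are illustrative, not
+part of the library.
+
+```
+# Hypothetical module, e.g. tensor2tensor/models/my_hparams.py
+from tensor2tensor.models import transformer
+from tensor2tensor.utils import registry
+
+
+@registry.register_hparams
+def transformer_base_batch8k():
+  """transformer_base with a larger batch size (illustrative override)."""
+  hparams = transformer.transformer_base()  # compose an existing hparams set
+  hparams.batch_size = 8192  # override a single hyperparameter
+  return hparams
+```
+
+After the defining module is imported (e.g. from `models.py`), the new set can
+be selected with `--hparams_set=transformer_base_batch8k`, and individual
+values can still be overridden at run time with the `--hparams` flag.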
+ +--- + +*Note: This is not an official Google product.* diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..cac1a5125 --- /dev/null +++ b/setup.py @@ -0,0 +1,23 @@ +"""Install tensor2tensor.""" + +from distutils.core import setup + +setup( + name='tensor2tensor', + version='1.0', + description='Tensor2Tensor', + author='Google Inc.', + author_email='no-reply@google.com', + url='http://github.com/tensorflow/tensor2tensor', + license='Apache 2.0', + packages=[ + 'tensor2tensor', 'tensor2tensor.utils', 'tensor2tensor.data_generators', + 'tensor2tensor.models' + ], + scripts=['tensor2tensor/bin/t2t-trainer', 'tensor2tensor/bin/t2t-datagen'], + install_requires=[ + 'numpy', + 'sympy', + 'six', + 'tensorflow-gpu>=1.2.0rc1', + ],) diff --git a/tensor2tensor/__init__.py b/tensor2tensor/__init__.py new file mode 100644 index 000000000..27d533abc --- /dev/null +++ b/tensor2tensor/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen new file mode 100644 index 000000000..002544052 --- /dev/null +++ b/tensor2tensor/bin/t2t-datagen @@ -0,0 +1,361 @@ +#!/usr/bin/env python +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Produces the training and dev data for --problem into --data_dir. + +This binary produces sharded and shuffled TFRecord files of tensorflow.Example +protocol buffers for a variety of datasets registered in this file. + +All datasets are registered in _SUPPORTED_PROBLEM_GENERATORS. Each entry maps a +string name (selectable on the command-line with --problem) to a function that +takes 2 arguments - input_directory and mode (one of "train" or "dev") - and +yields for each training example a dictionary mapping string feature names to +lists of {string, int, float}. The generator will be run once for each mode.
+""" + +import random +import tempfile + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import algorithmic +from tensor2tensor.data_generators import algorithmic_math +from tensor2tensor.data_generators import audio +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import image +from tensor2tensor.data_generators import snli +from tensor2tensor.data_generators import wmt +from tensor2tensor.data_generators import wsj_parsing + +import tensorflow as tf + +flags = tf.flags +FLAGS = flags.FLAGS + +flags.DEFINE_string("data_dir", "", "Data directory.") +flags.DEFINE_string("tmp_dir", + tempfile.gettempdir(), "Temporary storage directory.") +flags.DEFINE_string("problem", "", + "The name of the problem to generate data for.") +flags.DEFINE_integer("num_shards", 1, "How many shards to use.") +flags.DEFINE_integer("max_cases", 0, + "Maximum number of cases to generate (unbounded if 0).") +flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") + +# Mapping from problems that we can generate data for to their generators. +# pylint: disable=g-long-lambda +_SUPPORTED_PROBLEM_GENERATORS = { + "algorithmic_identity_binary40": ( + lambda: algorithmic.identity_generator(2, 40, 100000), + lambda: algorithmic.identity_generator(2, 400, 10000)), + "algorithmic_identity_decimal40": ( + lambda: algorithmic.identity_generator(10, 40, 100000), + lambda: algorithmic.identity_generator(10, 400, 10000)), + "algorithmic_shift_decimal40": ( + lambda: algorithmic.shift_generator(20, 10, 40, 100000), + lambda: algorithmic.shift_generator(20, 10, 80, 10000)), + "algorithmic_reverse_binary40": ( + lambda: algorithmic.reverse_generator(2, 40, 100000), + lambda: algorithmic.reverse_generator(2, 400, 10000)), + "algorithmic_reverse_decimal40": ( + lambda: algorithmic.reverse_generator(10, 40, 100000), + lambda: algorithmic.reverse_generator(10, 400, 10000)), + "algorithmic_addition_binary40": ( + lambda: algorithmic.addition_generator(2, 40, 100000), + lambda: algorithmic.addition_generator(2, 400, 10000)), + "algorithmic_addition_decimal40": ( + lambda: algorithmic.addition_generator(10, 40, 100000), + lambda: algorithmic.addition_generator(10, 400, 10000)), + "algorithmic_multiplication_binary40": ( + lambda: algorithmic.multiplication_generator(2, 40, 100000), + lambda: algorithmic.multiplication_generator(2, 400, 10000)), + "algorithmic_multiplication_decimal40": ( + lambda: algorithmic.multiplication_generator(10, 40, 100000), + lambda: algorithmic.multiplication_generator(10, 400, 10000)), + "algorithmic_algebra_inverse": ( + lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), + lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), + "algorithmic_algebra_simplify": ( + lambda: algorithmic_math.algebra_simplify(8, 0, 2, 100000), + lambda: algorithmic_math.algebra_simplify(8, 3, 3, 10000)), + "algorithmic_calculus_integrate": ( + lambda: algorithmic_math.calculus_integrate(8, 0, 2, 100000), + lambda: algorithmic_math.calculus_integrate(8, 3, 3, 10000)), + "wmt_parsing_characters": ( + lambda: wmt.parsing_character_generator(FLAGS.tmp_dir, True), + lambda: wmt.parsing_character_generator(FLAGS.tmp_dir, False)), + "wmt_parsing_tokens_8k": ( + lambda: wmt.parsing_token_generator(FLAGS.tmp_dir, True, 2**13), + lambda: wmt.parsing_token_generator(FLAGS.tmp_dir, False, 2**13)), + "wsj_parsing_tokens_16k": ( + lambda: wsj_parsing.parsing_token_generator(FLAGS.tmp_dir, True, + 2**14, 2**9), + lambda: 
wsj_parsing.parsing_token_generator(FLAGS.tmp_dir, False, + 2**14, 2**9)), + "wsj_parsing_tokens_32k": ( + lambda: wsj_parsing.parsing_token_generator(FLAGS.tmp_dir, True, + 2**15, 2**9), + lambda: wsj_parsing.parsing_token_generator(FLAGS.tmp_dir, False, + 2**15, 2**9)), + "wmt_enfr_characters": ( + lambda: wmt.enfr_character_generator(FLAGS.tmp_dir, True), + lambda: wmt.enfr_character_generator(FLAGS.tmp_dir, False)), + "wmt_enfr_tokens_8k": ( + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**13), + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**13) + ), + "wmt_enfr_tokens_32k": ( + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**15), + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**15) + ), + "wmt_enfr_tokens_128k": ( + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**17), + lambda: wmt.enfr_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**17) + ), + "wmt_ende_characters": ( + lambda: wmt.ende_character_generator(FLAGS.tmp_dir, True), + lambda: wmt.ende_character_generator(FLAGS.tmp_dir, False)), + "wmt_ende_bpe32k": ( + lambda: wmt.ende_bpe_token_generator(FLAGS.tmp_dir, True), + lambda: wmt.ende_bpe_token_generator(FLAGS.tmp_dir, False)), + "wmt_ende_tokens_8k": ( + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**13), + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**13) + ), + "wmt_ende_tokens_32k": ( + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**15), + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**15) + ), + "wmt_ende_tokens_128k": ( + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**17), + lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**17) + ), + "image_mnist_tune": ( + lambda: image.mnist_generator(FLAGS.tmp_dir, True, 55000), + lambda: image.mnist_generator(FLAGS.tmp_dir, True, 5000, 55000)), + "image_mnist_test": ( + lambda: image.mnist_generator(FLAGS.tmp_dir, True, 60000), + lambda: image.mnist_generator(FLAGS.tmp_dir, False, 10000)), + "image_cifar10_tune": ( + lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 48000), + lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 2000, 48000)), + "image_cifar10_test": ( + lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 50000), + lambda: image.cifar10_generator(FLAGS.tmp_dir, False, 10000)), + "image_mscoco_characters_tune": ( + lambda: image.mscoco_generator(FLAGS.tmp_dir, True, 70000), + lambda: image.mscoco_generator(FLAGS.tmp_dir, True, 10000, 70000)), + "image_mscoco_characters_test": ( + lambda: image.mscoco_generator(FLAGS.tmp_dir, True, 80000), + lambda: image.mscoco_generator(FLAGS.tmp_dir, False, 40000)), + "image_mscoco_tokens_8k_tune": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 70000, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 10000, + 70000, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13)), + "image_mscoco_tokens_8k_test": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 80000, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + False, + 40000, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13)), + "image_mscoco_tokens_32k_tune": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 70000, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15), + lambda: 
image.mscoco_generator( + FLAGS.tmp_dir, + True, + 10000, + 70000, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15)), + "image_mscoco_tokens_32k_test": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 80000, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + False, + 40000, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15)), + "image_mscoco_tokens_128k_tune": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 70000, + vocab_filename="tokens.vocab.%d" % 2**17, + vocab_size=2**17), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 10000, + 70000, + vocab_filename="tokens.vocab.%d" % 2**17, + vocab_size=2**17)), + "image_mscoco_tokens_128k_test": ( + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + True, + 80000, + vocab_filename="tokens.vocab.%d" % 2**17, + vocab_size=2**17), + lambda: image.mscoco_generator( + FLAGS.tmp_dir, + False, + 40000, + vocab_filename="tokens.vocab.%d" % 2**17, + vocab_size=2**17)), + "snli_32k": ( + lambda: snli.snli_token_generator(FLAGS.tmp_dir, True, 2**15), + lambda: snli.snli_token_generator(FLAGS.tmp_dir, False, 2**15), + ), + "audio_timit_characters_tune": ( + lambda: audio.timit_generator(FLAGS.tmp_dir, True, 1374), + lambda: audio.timit_generator(FLAGS.tmp_dir, True, 344, 1374)), + "audio_timit_characters_test": ( + lambda: audio.timit_generator(FLAGS.tmp_dir, True, 1718), + lambda: audio.timit_generator(FLAGS.tmp_dir, False, 626)), + "audio_timit_tokens_8k_tune": ( + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 1374, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13), + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 344, + 1374, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13)), + "audio_timit_tokens_8k_test": ( + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 1718, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13), + lambda: audio.timit_generator( + FLAGS.tmp_dir, + False, + 626, + vocab_filename="tokens.vocab.%d" % 2**13, + vocab_size=2**13)), + "audio_timit_tokens_32k_tune": ( + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 1374, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15), + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 344, + 1374, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15)), + "audio_timit_tokens_32k_test": ( + lambda: audio.timit_generator( + FLAGS.tmp_dir, + True, + 1718, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15), + lambda: audio.timit_generator( + FLAGS.tmp_dir, + False, + 626, + vocab_filename="tokens.vocab.%d" % 2**15, + vocab_size=2**15)), +} + +# pylint: enable=g-long-lambda + +UNSHUFFLED_SUFFIX = "-unshuffled" + + +def set_random_seed(): + """Set the random seed from flag everywhere.""" + tf.set_random_seed(FLAGS.random_seed) + random.seed(FLAGS.random_seed) + np.random.seed(FLAGS.random_seed) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + if FLAGS.problem not in _SUPPORTED_PROBLEM_GENERATORS: + problems_str = "\n * ".join(sorted(_SUPPORTED_PROBLEM_GENERATORS)) + error_msg = ("You must specify one of the supported problems to " + "generate data for:\n * " + problems_str + "\n") + raise ValueError(error_msg) + + if not FLAGS.data_dir: + FLAGS.data_dir = tempfile.gettempdir() + tf.logging.warning("It is strongly recommended to specify --data_dir. 
" + "Data will be written to default data_dir=%s.", + FLAGS.data_dir) + + set_random_seed() + + training_gen, dev_gen = _SUPPORTED_PROBLEM_GENERATORS[FLAGS.problem] + + tf.logging.info("Generating training data for %s.", FLAGS.problem) + train_output_files = generator_utils.generate_files( + training_gen(), FLAGS.problem + UNSHUFFLED_SUFFIX + "-train", + FLAGS.data_dir, FLAGS.num_shards, FLAGS.max_cases) + + tf.logging.info("Generating development data for %s.", FLAGS.problem) + dev_output_files = generator_utils.generate_files( + dev_gen(), FLAGS.problem + UNSHUFFLED_SUFFIX + "-dev", FLAGS.data_dir, 1) + + tf.logging.info("Shuffling data...") + for fname in train_output_files + dev_output_files: + records = generator_utils.read_records(fname) + random.shuffle(records) + out_fname = fname.replace(UNSHUFFLED_SUFFIX, "") + generator_utils.write_records(records, out_fname) + tf.gfile.Remove(fname) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer new file mode 100644 index 000000000..c14fac783 --- /dev/null +++ b/tensor2tensor/bin/t2t-trainer @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Trainer for T2T models. + +This binary perform training, evaluation, and inference using +the Estimator API with tf.learn Experiment objects. + +To train your model, for example: + t2t-trainer \ + --data_dir ~/data \ + --problems=algorithmic_identity_binary40 \ + --model=transformer + --hparams_set=transformer_base +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.utils import trainer_utils as utils + +import tensorflow as tf + +FLAGS = tf.flags.FLAGS + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + utils.log_registry() + utils.validate_flags() + # TODO(rsepassi): Document distributed training + utils.run( + data_dir=FLAGS.data_dir, + model=FLAGS.model, + output_dir=FLAGS.output_dir, + train_steps=FLAGS.train_steps, + eval_steps=FLAGS.eval_steps, + schedule=FLAGS.schedule) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/README.md b/tensor2tensor/data_generators/README.md new file mode 100644 index 000000000..813eb4f7e --- /dev/null +++ b/tensor2tensor/data_generators/README.md @@ -0,0 +1,71 @@ +# Data generators for T2T models. + +This directory contains data generators for a number of problems. We use a +naming scheme for the problems, they have names of the form +`[task-family]_[task]_[specifics]`. Data for all currently supported problems +can be generated by calling the main generator binary (`t2t-datagen`). 
For +example: + +``` +t2t-datagen \ + --problem=algorithmic_identity_binary40 \ + --data_dir=/tmp +``` + +will generate training and development data for the algorithmic copy task - +`/tmp/algorithmic_identity_binary40-dev-00000-of-00001` and +`/tmp/algorithmic_identity_binary40-train-00000-of-00001`. +All tasks produce TFRecord files of `tensorflow.Example` protocol buffers. + + +## Adding a new problem + +1. Implement and register a Python generator for the dataset +1. Add a problem specification to `problem_hparams.py` specifying input and + output modalities + +To add a new problem, you first need to create python generators for training +and development data for the problem. The python generators should yield +dictionaries with string keys and values being lists of {int, float, str}. +Here is a very simple generator for a data-set where inputs are lists of 1s with +length upto 100 and targets are lists of length 1 with an integer denoting the +length of the input list. + +``` +def length_generator(nbr_cases): + for _ in xrange(nbr_cases): + length = np.random.randint(100) + 1 + yield {"inputs": [1] * length, "targets": [length]} +``` + +Note that our data reader uses 0 for padding, so it is a good idea to never +generate 0s, except if all your examples have the same size (in which case +they'll never be padded anyway) or if you're doing padding on your own (in which +case please use 0s for padding). When adding the python generator function, +please also add unit tests to check if the code runs. + +The generator can do arbitrary setup before beginning to yield examples - for +example, downloading data, generating vocabulary files, etc. + +Some examples: + +* [Algorithmic generators](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/algorithmic.py) + and their [unit tests](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/algorithmic_test.py) +* [WMT generators](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/wmt.py) + and their [unit tests](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/wmt_test.py) + +When your python generator is ready and tested, add it to the +`_SUPPORTED_PROBLEM_GENERATORS` dictionary in +[generator.py](https://github.com/tensorflow/tensor2tensor/tree/master/data_generators/generator.py). +The keys are problem names, and the values are pairs of (training-set-generator +function, dev-set-generator function). For the generator above, one could add +the following lines: + +``` + "algorithmic_length_upto100": + (lambda: algorithmic.length_generator(10000), + lambda: algorithmic.length_generator(1000)), +``` + +Note the lambdas above: we don't want to call the generators too early. + diff --git a/tensor2tensor/data_generators/__init__.py b/tensor2tensor/data_generators/__init__.py new file mode 100644 index 000000000..27d533abc --- /dev/null +++ b/tensor2tensor/data_generators/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py new file mode 100644 index 000000000..46ebb27a3 --- /dev/null +++ b/tensor2tensor/data_generators/algorithmic.py @@ -0,0 +1,178 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Algorithmic data generators.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from six.moves import xrange # pylint: disable=redefined-builtin + + +def identity_generator(nbr_symbols, max_length, nbr_cases): + """Generator for the identity (copy) task on sequences of symbols. + + The length of the sequence is drawn uniformly at random from [1, max_length] + and then symbols are drawn uniformly at random from [1, nbr_symbols] until + nbr_cases sequences have been produced. + + Args: + nbr_symbols: number of symbols to use in each sequence. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list and target-list are the same. + """ + for _ in xrange(nbr_cases): + l = np.random.randint(max_length) + 1 + inputs = [np.random.randint(nbr_symbols) + 1 for _ in xrange(l)] + yield {"inputs": inputs, "targets": inputs} + + +def shift_generator(nbr_symbols, shift, max_length, nbr_cases): + """Generator for the shift task on sequences of symbols. + + The length of the sequence is drawn uniformly at random from [1, max_length] + and then symbols are drawn uniformly at random from [1, nbr_symbols - shift] + until nbr_cases sequences have been produced (output[i] = input[i] + shift). + + Args: + nbr_symbols: number of symbols to use in each sequence (input + output). + shift: by how much to shift the input. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + target-list[i] = input-list[i] + shift. + """ + for _ in xrange(nbr_cases): + l = np.random.randint(max_length) + 1 + inputs = [np.random.randint(nbr_symbols - shift) + 1 for _ in xrange(l)] + yield {"inputs": inputs, "targets": [i + shift for i in inputs]} + + +def reverse_generator(nbr_symbols, max_length, nbr_cases): + """Generator for the reversing task on sequences of symbols. + + The length of the sequence is drawn uniformly at random from [1, max_length] + and then symbols are drawn uniformly at random from [1, nbr_symbols] until + nbr_cases sequences have been produced. + + Args: + nbr_symbols: number of symbols to use in each sequence. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. 
+ + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + target-list is input-list reversed. + """ + for _ in xrange(nbr_cases): + l = np.random.randint(max_length) + 1 + inputs = [np.random.randint(nbr_symbols) + 1 for _ in xrange(l)] + yield {"inputs": inputs, "targets": list(reversed(inputs))} + + +def lower_endian_to_number(l, base): + """Helper function: convert a list of digits in the given base to a number.""" + return sum([d * (base**i) for i, d in enumerate(l)]) + + +def number_to_lower_endian(n, base): + """Helper function: convert a number to a list of digits in the given base.""" + if n < base: + return [n] + return [n % base] + number_to_lower_endian(n // base, base) + + +def random_number_lower_endian(length, base): + """Helper function: generate a random number as a lower-endian digits list.""" + if length == 1: # Last digit can be 0 only if length is 1. + return [np.random.randint(base)] + prefix = [np.random.randint(base) for _ in xrange(length - 1)] + return prefix + [np.random.randint(base - 1) + 1] # Last digit is not 0. + + +def addition_generator(base, max_length, nbr_cases): + """Generator for the addition task. + + The length of each number is drawn uniformly at random from [1, max_length/2] + and then digits are drawn uniformly at random. The numbers are added and + separated by [base+1] in the input. Stops at nbr_cases. + + Args: + base: in which base are the numbers. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list are the 2 numbers and target-list is the result of adding them. + + Raises: + ValueError: if max_length is lower than 3. + """ + if max_length < 3: + raise ValueError("Maximum length must be at least 3.") + for _ in xrange(nbr_cases): + l1 = np.random.randint(max_length // 2) + 1 + l2 = np.random.randint(max_length - l1 - 1) + 1 + n1 = random_number_lower_endian(l1, base) + n2 = random_number_lower_endian(l2, base) + result = lower_endian_to_number(n1, base) + lower_endian_to_number(n2, base) + # We shift digits by 1 on input and output to leave 0 for padding. + inputs = [i + 1 for i in n1] + [base + 1] + [i + 1 for i in n2] + targets = [i + 1 for i in number_to_lower_endian(result, base)] + yield {"inputs": inputs, "targets": targets} + + +def multiplication_generator(base, max_length, nbr_cases): + """Generator for the multiplication task. + + The length of each number is drawn uniformly at random from [1, max_length/2] + and then digits are drawn uniformly at random. The numbers are multiplied + and separated by [base+1] in the input. Stops at nbr_cases. + + Args: + base: in which base are the numbers. + max_length: integer, maximum length of sequences to generate. + nbr_cases: the number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list are the 2 numbers and target-list is the result of multiplying + them. + + Raises: + ValueError: if max_length is lower than 3. + """ + if max_length < 3: + raise ValueError("Maximum length must be at least 3.") + for _ in xrange(nbr_cases): + l1 = np.random.randint(max_length // 2) + 1 + l2 = np.random.randint(max_length - l1 - 1) + 1 + n1 = random_number_lower_endian(l1, base) + n2 = random_number_lower_endian(l2, base) + result = lower_endian_to_number(n1, base) * lower_endian_to_number(n2, base) + # We shift digits by 1 on input and output to leave 0 for padding. 
+ inputs = [i + 1 for i in n1] + [base + 1] + [i + 1 for i in n2] + targets = [i + 1 for i in number_to_lower_endian(result, base)] + yield {"inputs": inputs, "targets": targets} diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py new file mode 100644 index 000000000..932c080e1 --- /dev/null +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -0,0 +1,580 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Algorithmic data generators for symbolic math tasks. + +See go/symbolic-math-dataset +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import namedtuple +import random + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import sympy + + +class ExprOp(object): + """Represents an algebraic operation, such as '+', '-', etc.""" + + def __init__(self, symbol, precedence, associative=False): + """Constructor. + + Args: + symbol: The character which represents this operation, such as '+' for + addition. + precedence: Operator precedence. This will determine where parentheses + are used. + associative: If true, the order of the operands does not matter. + """ + self.symbol = symbol + self.precedence = precedence + self.associative = associative + + def __str__(self): + return self.symbol + + def __eq__(self, other): + return isinstance(other, ExprOp) and self.symbol == other.symbol + + +class ExprNode(object): + """A node in an expression tree. + + ExprNode always holds an operator. Leaves are strings. + """ + + def __init__(self, left, right, op): + self.left = left + self.right = right + self.op = op + left_depth = left.depth if isinstance(left, ExprNode) else 0 + right_depth = right.depth if isinstance(right, ExprNode) else 0 + self.depth = max(left_depth, right_depth) + 1 + + def __str__(self): + left_str = str(self.left) + right_str = str(self.right) + left_use_parens = (isinstance(self.left, ExprNode) and + self.left.op.precedence < self.op.precedence) + right_use_parens = (isinstance(self.right, ExprNode) and + self.right.op.precedence <= self.op.precedence and + not (self.op.associative and self.right.op == self.op)) + left_final = "(" + left_str + ")" if left_use_parens else left_str + right_final = "(" + right_str + ")" if right_use_parens else right_str + return left_final + str(self.op) + right_final + + def is_in(self, expr): + """Returns True if `expr` is a subtree.""" + if expr == self: + return True + is_in_left = is_in_expr(self.left, expr) + is_in_right = is_in_expr(self.right, expr) + return is_in_left or is_in_right + + +def is_in_expr(expr, find): + """Returns True if `find` is a subtree of `expr`.""" + return expr == find or (isinstance(expr, ExprNode) and expr.is_in(find)) + + +def random_expr_with_required_var(depth, required_var, optional_list, ops): + """Generate a random expression tree with a required variable. 
+ + The required variable appears exactly once in the expression. + + Args: + depth: At least one leaf will be this many levels down from the top. + required_var: A char. This char is guaranteed to be placed exactly once at + a leaf somewhere in the tree. This is the var to solve for. + optional_list: A list of chars. These chars are randomly selected as leaf + values. These are constant vars. + ops: A list of ExprOp instances. + + Returns: + An ExprNode instance which is the root of the generated expression tree. + """ + if not depth: + if required_var: + return required_var + return str(optional_list[random.randrange(len(optional_list))]) + + max_depth_side = random.randrange(2) + other_side_depth = random.randrange(depth) + + required_var_side = random.randrange(2) + + left = random_expr_with_required_var( + depth - 1 if max_depth_side else other_side_depth, required_var + if required_var_side else None, optional_list, ops) + right = random_expr_with_required_var( + depth - 1 if not max_depth_side else other_side_depth, required_var + if not required_var_side else None, optional_list, ops) + + op = ops[random.randrange(len(ops))] + return ExprNode(left, right, op) + + +def random_expr(depth, vlist, ops): + """Generate a random expression tree. + + Args: + depth: At least one leaf will be this many levels down from the top. + vlist: A list of chars. These chars are randomly selected as leaf values. + ops: A list of ExprOp instances. + + Returns: + An ExprNode instance which is the root of the generated expression tree. + """ + if not depth: + return str(vlist[random.randrange(len(vlist))]) + + max_depth_side = random.randrange(2) + other_side_depth = random.randrange(depth) + + left = random_expr(depth - 1 + if max_depth_side else other_side_depth, vlist, ops) + right = random_expr(depth - 1 + if not max_depth_side else other_side_depth, vlist, ops) + + op = ops[random.randrange(len(ops))] + return ExprNode(left, right, op) + + +def algebra_inverse_solve(left, right, var, solve_ops): + """Solves for the value of the given var in an expression. + + See go/symbolic-math-dataset. + + Args: + left: The root of the ExprNode tree on the left side of the equals sign. + right: The root of the ExprNode tree on the right side of the equals sign. + var: A char. The variable to solve for. + solve_ops: A dictionary with the following properties. + * For each operator in the expression, there is a rule that determines + how to cancel out a value either to the left or the right of that + operator. + * For each rule, there is an entry in the dictionary. The key is two + chars- the op char, and either 'l' or 'r' meaning rule for canceling + out the left or right sides. For example, '+l', '+r', '-l', '-r'. + * The value of each entry is a function with the following signature: + (left, right, to_tree) -> (new_from_tree, new_to_tree) + left- Expression on left side of the op. + right- Expression on the right side of the op. + to_tree- The tree on the other side of the equal sign. The canceled + out expression will be moved here. + new_from_tree- The resulting from_tree after the algebraic + manipulation. + new_to_tree- The resulting to_tree after the algebraic manipulation. + + Returns: + The root of an ExprNode tree which holds the value of `var` after solving. + + Raises: + ValueError: If `var` does not appear exactly once in the equation (which + includes the left and right sides).
+ """ + is_in_left = is_in_expr(left, var) + is_in_right = is_in_expr(right, var) + if is_in_left == is_in_right: + if is_in_left: + raise ValueError("Solve-variable '%s' is on both sides of the equation. " + "Only equations where the solve variable-appears once " + "are supported by this solver. Left: '%s', right: '%s'" % + (var, str(left), str(right))) + else: + raise ValueError("Solve-variable '%s' is not present in the equation. It " + "must appear once. Left: '%s', right: '%s'" % + (var, str(left), str(right))) + + from_tree = left if is_in_left else right + to_tree = left if not is_in_left else right + while from_tree != var: + is_in_left = is_in_expr(from_tree.left, var) + is_in_right = is_in_expr(from_tree.right, var) + from_tree, to_tree = (solve_ops[str(from_tree.op) + + ("l" if is_in_left else "r")]( + from_tree.left, from_tree.right, + to_tree)) + return to_tree + + +def format_sympy_expr(sympy_expr, functions=None): + """Convert sympy expression into a string which can be encoded. + + Args: + sympy_expr: Any sympy expression tree or string. + functions: Defines special functions. A dict mapping human readable string + names, like "log", "exp", "sin", "cos", etc., to single chars. Each + function gets a unique token, like "L" for "log". + + Returns: + A string representation of the expression suitable for encoding as a + sequence input. + """ + if functions is None: + functions = {} + str_expr = str(sympy_expr) + result = str_expr.replace(" ", "") + for fn_name, char in six.iteritems(functions): + result = result.replace(fn_name, char) + return result + + +def generate_algebra_inverse_sample(vlist, ops, solve_ops, min_depth, + max_depth): + """Randomly generate an algebra inverse dataset sample. + + Given an input equation and variable, produce the expression equal to the + variable. + + See go/symbolic-math-dataset. + + Args: + vlist: Variable list. List of chars that can be used in the expression. + ops: List of ExprOp instances. The allowed operators for the expression. + solve_ops: See `solve_ops` documentation in `algebra_inverse_solve`. + min_depth: Expression trees will not have a smaller depth than this. 0 means + there is just a variable. 1 means there is one operation. + max_depth: Expression trees will not have a larger depth than this. To make + all trees have the same depth, set this equal to `min_depth`. + + Returns: + sample: String representation of the input. Will be of the form + 'solve_var:left_side=right_side'. + target: String representation of the solution. + """ + side = random.randrange(2) + left_depth = random.randrange(min_depth if side else 0, max_depth + 1) + right_depth = random.randrange(min_depth if not side else 0, max_depth + 1) + + var_index = random.randrange(len(vlist)) + var = vlist[var_index] + consts = vlist[:var_index] + vlist[var_index + 1:] + + left = random_expr_with_required_var(left_depth, var + if side else None, consts, ops) + right = random_expr_with_required_var(right_depth, var + if not side else None, consts, ops) + + left_str = str(left) + right_str = str(right) + target = str(algebra_inverse_solve(left, right, var, solve_ops)) + sample = var + ":" + left_str + "=" + right_str + + return sample, target + + +def generate_algebra_simplify_sample(vlist, ops, min_depth, max_depth): + """Randomly generate an algebra simplify dataset sample. + + Given an input expression, produce the simplified expression. + + See go/symbolic-math-dataset. + + Args: + vlist: Variable list. List of chars that can be used in the expression. 
+def generate_algebra_simplify_sample(vlist, ops, min_depth, max_depth):
+  """Randomly generate an algebra simplify dataset sample.
+
+  Given an input expression, produce the simplified expression.
+
+  See go/symbolic-math-dataset.
+
+  Args:
+    vlist: Variable list. List of chars that can be used in the expression.
+    ops: List of ExprOp instances. The allowed operators for the expression.
+    min_depth: Expression trees will not have a smaller depth than this. 0 means
+      there is just a variable. 1 means there is one operation.
+    max_depth: Expression trees will not have a larger depth than this. To make
+      all trees have the same depth, set this equal to `min_depth`.
+
+  Returns:
+    sample: String representation of the input.
+    target: String representation of the solution.
+  """
+  depth = random.randrange(min_depth, max_depth + 1)
+  expr = random_expr(depth, vlist, ops)
+
+  sample = str(expr)
+  target = format_sympy_expr(sympy.simplify(sample))
+  return sample, target
+
+
+def generate_calculus_integrate_sample(vlist, ops, min_depth, max_depth,
+                                       functions):
+  """Randomly generate a symbolic integral dataset sample.
+
+  Given an input expression, produce the indefinite integral.
+
+  See go/symbolic-math-dataset.
+
+  Args:
+    vlist: Variable list. List of chars that can be used in the expression.
+    ops: List of ExprOp instances. The allowed operators for the expression.
+    min_depth: Expression trees will not have a smaller depth than this. 0 means
+      there is just a variable. 1 means there is one operation.
+    max_depth: Expression trees will not have a larger depth than this. To make
+      all trees have the same depth, set this equal to `min_depth`.
+    functions: Defines special functions. A dict mapping human readable string
+      names, like "log", "exp", "sin", "cos", etc., to single chars. Each
+      function gets a unique token, like "L" for "log".
+
+  Returns:
+    sample: String representation of the input. Will be of the form
+      'var:expression'.
+    target: String representation of the solution.
+  """
+  var_index = random.randrange(len(vlist))
+  var = vlist[var_index]
+  consts = vlist[:var_index] + vlist[var_index + 1:]
+
+  depth = random.randrange(min_depth, max_depth + 1)
+  expr = random_expr_with_required_var(depth, var, consts, ops)
+
+  expr_str = str(expr)
+  sample = var + ":" + expr_str
+  target = format_sympy_expr(
+      sympy.integrate(expr_str, sympy.Symbol(var)), functions=functions)
+  return sample, target
+
+
+# AlgebraConfig holds objects required to generate the algebra inverse
+# dataset. See go/symbolic-math-dataset.
+# vlist: Variable list. A list of chars.
+# dlist: Numerical digit list. A list of chars.
+# flist: List of special function names. A list of chars.
+# functions: Dict of special function names. Maps human readable string names to
+#   single char names used in flist.
+# ops: Dict mapping op symbols (chars) to ExprOp instances.
+# solve_ops: Encodes rules for how to algebraically cancel out each operation. See
+#   docstring for `algebra_inverse_solve`.
+# int_encoder: Function that maps a string to a list of tokens. Use this to
+#   encode an expression to feed into a model.
+# int_decoder: Function that maps a list of tokens to a string. Use this to
+#   convert model input or output into a human readable string.
+AlgebraConfig = namedtuple("AlgebraConfig", [
+    "vlist", "dlist", "flist", "functions", "ops", "solve_ops", "int_encoder",
+    "int_decoder"
+])
+
+
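For intuition, here is one inverse step in isolation. The `'+l'` rule below mirrors the entry built in `math_dataset_init` later in this file; it moves a canceled term to the other side of the equals sign (illustrative sketch only):

```python
add = ExprOp("+", 0, True)
sub = ExprOp("-", 0, False)
plus_l = lambda l, r, to: (l, ExprNode(to, r, sub))

# Solve x + a = b for x: cancel "+ a" by subtracting it on the other side.
from_tree = ExprNode("x", "a", add)  # x+a
new_from, new_to = plus_l(from_tree.left, from_tree.right, "b")
print(new_from, str(new_to))  # -> x b-a
```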
+def math_dataset_init(alphabet_size=26, digits=None, functions=None):
+  """Initializes required objects to generate symbolic math datasets.
+
+  See go/symbolic-math-dataset.
+
+  Produces token set, ExprOp instances, solve_op dictionary, encoders, and
+  decoders needed to generate the algebra inverse dataset.
+
+  Args:
+    alphabet_size: How many possible variables there are. Max 52.
+    digits: How many numerical digits to encode as tokens, "0" through
+      str(digits-1), or None to encode no digits.
+    functions: Defines special functions. A dict mapping human readable string
+      names, like "log", "exp", "sin", "cos", etc., to single chars. Each
+      function gets a unique token, like "L" for "log".
+      WARNING: Make sure these tokens do not conflict with the list of
+      possible variable names.
+
+  Returns:
+    AlgebraConfig instance holding all the objects listed above.
+
+  Raises:
+    ValueError: If `alphabet_size` is not in range [2, 52].
+  """
+  ops_list = ["+", "-", "*", "/"]
+  ops = {
+      "+": ExprOp("+", 0, True),
+      "-": ExprOp("-", 0, False),
+      "*": ExprOp("*", 1, True),
+      "/": ExprOp("/", 1, False)
+  }
+  solve_ops = {
+      "+l": lambda l, r, to: (l, ExprNode(to, r, ops["-"])),
+      "+r": lambda l, r, to: (r, ExprNode(to, l, ops["-"])),
+      "-l": lambda l, r, to: (l, ExprNode(to, r, ops["+"])),
+      "-r": lambda l, r, to: (r, ExprNode(l, to, ops["-"])),
+      "*l": lambda l, r, to: (l, ExprNode(to, r, ops["/"])),
+      "*r": lambda l, r, to: (r, ExprNode(to, l, ops["/"])),
+      "/l": lambda l, r, to: (l, ExprNode(to, r, ops["*"])),
+      "/r": lambda l, r, to: (r, ExprNode(l, to, ops["/"])),
+  }
+  alphabet = (
+      [six.int2byte(ord("a") + c)
+       for c in range(26)] + [six.int2byte(ord("A") + c) for c in range(26)])
+  if alphabet_size > 52:
+    raise ValueError(
+        "alphabet_size cannot be greater than 52. Got %s." % alphabet_size)
+  if alphabet_size < 2:
+    raise ValueError(
+        "alphabet_size cannot be less than 2. Got %s." % alphabet_size)
+  if digits is not None and not 1 <= digits <= 10:
+    raise ValueError("digits must be between 1 and 10. Got %s." % digits)
+  vlist = alphabet[:alphabet_size]
+  if digits is not None:
+    dlist = [str(d) for d in xrange(digits)]
+  else:
+    dlist = []
+  if functions is None:
+    functions = {}
+  flist = sorted(functions.values())
+  pad = "_"
+  tokens = [pad] + [":", "(", ")", "="] + ops_list + vlist + dlist + flist
+  if len(tokens) != len(set(tokens)):
+    raise ValueError("Duplicate token. Tokens: %s" % tokens)
+  token_map = dict([(t, i) for i, t in enumerate(tokens)])
+
+  def int_encoder(sequence):
+    return [token_map[s] for s in sequence]
+
+  def int_decoder(tensor_1d):
+    return "".join([tokens[i] for i in tensor_1d])
+
+  return AlgebraConfig(
+      vlist=vlist,
+      dlist=dlist,
+      flist=flist,
+      functions=functions,
+      ops=ops,
+      solve_ops=solve_ops,
+      int_encoder=int_encoder,
+      int_decoder=int_decoder)
+
+
+def algebra_inverse(alphabet_size=26, min_depth=0, max_depth=2,
+                    nbr_cases=10000):
+  """Generate the algebra inverse dataset.
+
+  Each sample is a symbolic math equation involving unknown variables. The
+  task is to solve for the given variable. The target is the resulting
+  expression.
+
+  Args:
+    alphabet_size: How many possible variables there are. Max 52.
+    min_depth: Minimum depth of the expression trees on both sides of the
+      equals sign in the equation.
+    max_depth: Maximum depth of the expression trees on both sides of the
+      equals sign in the equation.
+    nbr_cases: The number of cases to generate.
+
+  Yields:
+    A dictionary {"inputs": input-list, "targets": target-list} where
+    input-list are the tokens encoding the variable to solve for and the math
+    equation, and target-list is a list of tokens encoding the resulting math
+    expression after solving for the variable.
+
+  Raises:
+    ValueError: If `max_depth` < `min_depth`.
+  """
+
+  if max_depth < min_depth:
+    raise ValueError("max_depth must be greater than or equal to min_depth. 
" + "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth)) + + alg_cfg = math_dataset_init(alphabet_size) + for _ in xrange(nbr_cases): + sample, target = generate_algebra_inverse_sample( + alg_cfg.vlist, + list(alg_cfg.ops.values()), alg_cfg.solve_ops, min_depth, max_depth) + yield { + "inputs": alg_cfg.int_encoder(sample), + "targets": alg_cfg.int_encoder(target) + } + + +def algebra_simplify(alphabet_size=26, + min_depth=0, + max_depth=2, + nbr_cases=10000): + """Generate the algebra simplify dataset. + + Each sample is a symbolic math expression involving unknown variables. The + task is to simplify the expression. The target is the resulting expression. + + Args: + alphabet_size: How many possible variables there are. Max 52. + min_depth: Minimum depth of the expression trees on both sides of the + equals sign in the equation. + max_depth: Maximum depth of the expression trees on both sides of the + equals sign in the equation. + nbr_cases: The number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list are the tokens encoding the expression to simplify, and + target-list is a list of tokens encoding the resulting math expression after + simplifying. + + Raises: + ValueError: If `max_depth` < `min_depth`. + """ + if max_depth < min_depth: + raise ValueError("max_depth must be greater than or equal to min_depth. " + "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth)) + + alg_cfg = math_dataset_init(alphabet_size, digits=5) + for _ in xrange(nbr_cases): + sample, target = generate_algebra_simplify_sample( + alg_cfg.vlist, list(alg_cfg.ops.values()), min_depth, max_depth) + yield { + "inputs": alg_cfg.int_encoder(sample), + "targets": alg_cfg.int_encoder(target) + } + + +def calculus_integrate(alphabet_size=26, + min_depth=0, + max_depth=2, + nbr_cases=10000): + """Generate the calculus integrate dataset. + + Each sample is a symbolic math expression involving unknown variables. The + task is to take the indefinite integral of the expression. The target is the + resulting expression. + + Args: + alphabet_size: How many possible variables there are. Max 26. + min_depth: Minimum depth of the expression trees on both sides of the + equals sign in the equation. + max_depth: Maximum depth of the expression trees on both sides of the + equals sign in the equation. + nbr_cases: The number of cases to generate. + + Yields: + A dictionary {"inputs": input-list, "targets": target-list} where + input-list are the tokens encoding the variable to integrate with respect + to and the expression to integrate, and target-list is a list of tokens + encoding the resulting math expression after integrating. + + Raises: + ValueError: If `max_depth` < `min_depth`, or if alphabet_size > 26. + """ + if max_depth < min_depth: + raise ValueError("max_depth must be greater than or equal to min_depth. " + "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth)) + + # Don't allow alphabet to use capital letters. Those are reserved for function + # names. + if alphabet_size > 26: + raise ValueError( + "alphabet_size must not be greater than 26. Got %s." 
% alphabet_size) + + functions = {"log": "L"} + alg_cfg = math_dataset_init(alphabet_size, digits=5, functions=functions) + for _ in xrange(nbr_cases): + sample, target = generate_calculus_integrate_sample( + alg_cfg.vlist, + list(alg_cfg.ops.values()), min_depth, max_depth, alg_cfg.functions) + yield { + "inputs": alg_cfg.int_encoder(sample), + "targets": alg_cfg.int_encoder(target) + } diff --git a/tensor2tensor/data_generators/algorithmic_math_test.py b/tensor2tensor/data_generators/algorithmic_math_test.py new file mode 100644 index 000000000..6c4b63054 --- /dev/null +++ b/tensor2tensor/data_generators/algorithmic_math_test.py @@ -0,0 +1,84 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for tensor2tensor.data_generators.algorithmic_math.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import six +import sympy +from tensor2tensor.data_generators import algorithmic_math + +import tensorflow as tf + + +class AlgorithmicMathTest(tf.test.TestCase): + + def testAlgebraInverse(self): + dataset_objects = algorithmic_math.math_dataset_init(26) + counter = 0 + for d in algorithmic_math.algebra_inverse(26, 0, 3, 10): + counter += 1 + decoded_input = dataset_objects.int_decoder(d["inputs"]) + solve_var, expression = decoded_input.split(":") + lhs, rhs = expression.split("=") + + # Solve for the solve-var. + result = sympy.solve("%s-(%s)" % (lhs, rhs), solve_var) + target_expression = dataset_objects.int_decoder(d["targets"]) + + # Check that the target and sympy's solutions are equivalent. + self.assertEqual( + 0, sympy.simplify(str(result[0]) + "-(%s)" % target_expression)) + self.assertEqual(counter, 10) + + def testAlgebraSimplify(self): + dataset_objects = algorithmic_math.math_dataset_init(8, digits=5) + counter = 0 + for d in algorithmic_math.algebra_simplify(8, 0, 3, 10): + counter += 1 + expression = dataset_objects.int_decoder(d["inputs"]) + target = dataset_objects.int_decoder(d["targets"]) + + # Check that the input and output are equivalent expressions. + self.assertEqual(0, sympy.simplify("%s-(%s)" % (expression, target))) + self.assertEqual(counter, 10) + + def testCalculusIntegrate(self): + dataset_objects = algorithmic_math.math_dataset_init( + 8, digits=5, functions={"log": "L"}) + counter = 0 + for d in algorithmic_math.calculus_integrate(8, 0, 3, 10): + counter += 1 + decoded_input = dataset_objects.int_decoder(d["inputs"]) + var, expression = decoded_input.split(":") + target = dataset_objects.int_decoder(d["targets"]) + + for fn_name, fn_char in six.iteritems(dataset_objects.functions): + target = target.replace(fn_char, fn_name) + + # Take the derivative of the target. + derivative = str(sympy.diff(target, var)) + + # Check that the derivative of the integral equals the input. 
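The assertion just below relies on a sympy round-trip: integrate, differentiate, and check that the difference simplifies to zero. The same invariant as a standalone sketch (illustrative only):

```python
import sympy

x = sympy.Symbol("x")
expr = sympy.sympify("a*x + b")
integral = sympy.integrate(expr, x)  # a*x**2/2 + b*x
assert sympy.simplify(sympy.diff(integral, x) - expr) == 0
```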
+ self.assertEqual(0, sympy.simplify("%s-(%s)" % (expression, derivative))) + self.assertEqual(counter, 10) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/algorithmic_test.py b/tensor2tensor/data_generators/algorithmic_test.py new file mode 100644 index 000000000..7bc2fb5bb --- /dev/null +++ b/tensor2tensor/data_generators/algorithmic_test.py @@ -0,0 +1,84 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Algorithmic generators test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.data_generators import algorithmic + +import tensorflow as tf + + +class AlgorithmicTest(tf.test.TestCase): + + def testIdentityGenerator(self): + counter = 0 + for d in algorithmic.identity_generator(3, 8, 10): + counter += 1 + self.assertEqual(d["inputs"], d["targets"]) + self.assertEqual(counter, 10) + + def testReverseGenerator(self): + counter = 0 + for d in algorithmic.reverse_generator(3, 8, 10): + counter += 1 + self.assertEqual(list(reversed(d["inputs"])), d["targets"]) + self.assertEqual(counter, 10) + + def testLowerEndianToNumber(self): + self.assertEqual(algorithmic.lower_endian_to_number([0], 2), 0) + self.assertEqual(algorithmic.lower_endian_to_number([0], 7), 0) + self.assertEqual(algorithmic.lower_endian_to_number([1], 2), 1) + self.assertEqual(algorithmic.lower_endian_to_number([5], 8), 5) + self.assertEqual(algorithmic.lower_endian_to_number([0, 1], 2), 2) + self.assertEqual(algorithmic.lower_endian_to_number([0, 1, 1], 2), 6) + self.assertEqual(algorithmic.lower_endian_to_number([7, 3, 1, 2], 10), 2137) + + def testNumberToLowerEndian(self): + self.assertEqual(algorithmic.number_to_lower_endian(0, 2), [0]) + self.assertEqual(algorithmic.number_to_lower_endian(0, 7), [0]) + self.assertEqual(algorithmic.number_to_lower_endian(1, 2), [1]) + self.assertEqual(algorithmic.number_to_lower_endian(5, 8), [5]) + self.assertEqual(algorithmic.number_to_lower_endian(2, 2), [0, 1]) + self.assertEqual(algorithmic.number_to_lower_endian(6, 2), [0, 1, 1]) + self.assertEqual(algorithmic.number_to_lower_endian(2137, 10), [7, 3, 1, 2]) + + def testAdditionGenerator(self): + counter = 0 + for d in algorithmic.addition_generator(4, 8, 10): + counter += 1 + self.assertEqual(d["inputs"].count(5), 1) + self.assertEqual(d["inputs"].count(0), 0) + self.assertEqual(d["targets"].count(5), 0) + self.assertEqual(d["targets"].count(0), 0) + self.assertEqual(counter, 10) + + def testMultiplicationGenerator(self): + counter = 0 + for d in algorithmic.multiplication_generator(4, 8, 10): + counter += 1 + self.assertEqual(d["inputs"].count(5), 1) + self.assertEqual(d["inputs"].count(0), 0) + self.assertEqual(d["targets"].count(5), 0) + self.assertEqual(d["targets"].count(0), 0) + self.assertEqual(counter, 10) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/audio.py 
b/tensor2tensor/data_generators/audio.py
new file mode 100644
index 000000000..12e0c7b43
--- /dev/null
+++ b/tensor2tensor/data_generators/audio.py
@@ -0,0 +1,156 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TIMIT data generator."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from subprocess import call
+import tarfile
+import wave
+
+# Dependency imports
+
+from tensor2tensor.data_generators import generator_utils
+
+import tensorflow as tf
+
+flags = tf.flags
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("timit_paths", "",
+                    "Comma-separated list of tarfiles containing TIMIT "
+                    "datasets")
+
+_TIMIT_TRAIN_DATASETS = [
+    ["timit/TIMIT/TRAIN", (".WAV", ".WRD")],
+]
+_TIMIT_TEST_DATASETS = [
+    ["timit/TIMIT/TEST", (".WAV", ".WRD")],
+]
+
+
+def _get_timit(directory):
+  """Extract TIMIT datasets to directory unless directory/timit exists."""
+  if os.path.exists(os.path.join(directory, "timit")):
+    return
+
+  assert FLAGS.timit_paths
+  for path in FLAGS.timit_paths.split(","):
+    with tf.gfile.GFile(path) as f:
+      with tarfile.open(fileobj=f, mode="r:gz") as timit_compressed:
+        timit_compressed.extractall(directory)
+
+
+def _collect_data(directory, input_ext, target_ext):
+  """Traverses directory collecting input and target files."""
+  # Directory from string to tuple pair of strings
+  # key: the filepath to a datafile including the datafile's basename. Example,
+  #   if the datafile was "/path/to/datafile.wav" then the key would be
+  #   "/path/to/datafile"
+  # value: a pair of strings (input_filepath, target_filepath)
+  data_files = dict()
+  for root, _, filenames in os.walk(directory):
+    input_files = [filename for filename in filenames if input_ext in filename]
+    for input_filename in input_files:
+      basename = input_filename[:-len(input_ext)]
+      input_file = os.path.join(root, input_filename)
+      target_file = os.path.join(root, basename + target_ext)
+      key = os.path.join(root, basename)
+      assert os.path.exists(target_file)
+      assert key not in data_files
+      data_files[key] = (input_file, target_file)
+  return data_files
+
+
+def _get_audio_data(filepath):
+  # Construct a true .wav file.
+  out_filepath = filepath[:-len(".WAV")] + ".wav"
+  # Assumes sox is installed on system. Sox converts from NIST SPHERE to WAV.
+ call(["sox", filepath, out_filepath]) + wav_file = wave.open(open(out_filepath)) + frame_count = wav_file.getnframes() + byte_array = wav_file.readframes(frame_count) + data = [int(b.encode("hex"), base=16) for b in byte_array] + return data, frame_count, wav_file.getsampwidth(), wav_file.getnchannels() + + +def _get_text_data(filepath): + with tf.gfile.GFile(filepath, mode="r") as text_file: + words = [] + for line in text_file: + word = line.strip().split()[2] + words.append(word) + return " ".join(words) + + +def timit_generator(tmp_dir, + training, + how_many, + start_from=0, + eos_list=None, + vocab_filename=None, + vocab_size=0): + """Data generator for TIMIT transcription problem. + + Args: + tmp_dir: path to temporary storage directory. + training: a Boolean; if true, we use the train set, otherwise the test set. + how_many: how many inputs and labels to generate. + start_from: from which input to start. + eos_list: optional list of end of sentence tokens, otherwise use default + value `1`. + vocab_filename: file within `tmp_dir` to read vocabulary from. If this is + not provided then the target sentence will be encoded by character. + vocab_size: integer target to generate vocabulary size to. + + Yields: + A dictionary representing the images with the following fields: + * inputs: a float sequence containing the audio data + * audio/channel_count: an integer + * audio/sample_count: an integer + * audio/sample_width: an integer + * targets: an integer sequence representing the encoded sentence + """ + eos_list = [1] if eos_list is None else eos_list + if vocab_filename is not None: + vocab_symbolizer = generator_utils.get_or_generate_vocab( + tmp_dir, vocab_filename, vocab_size) + _get_timit(tmp_dir) + datasets = (_TIMIT_TRAIN_DATASETS if training else _TIMIT_TEST_DATASETS) + i = 0 + for data_dir, (audio_ext, transcription_ext) in datasets: + data_dir = os.path.join(tmp_dir, data_dir) + data_files = _collect_data(data_dir, audio_ext, transcription_ext) + data_pairs = data_files.values() + for input_file, target_file in sorted(data_pairs)[start_from:]: + if i == how_many: + return + i += 1 + audio_data, sample_count, sample_width, num_channels = _get_audio_data( + input_file) + text_data = _get_text_data(target_file) + if vocab_filename is None: + label = [ord(c) for c in text_data] + eos_list + else: + label = vocab_symbolizer.encode(text_data) + eos_list + yield { + "inputs": audio_data, + "audio/channel_count": [num_channels], + "audio/sample_count": [sample_count], + "audio/sample_width": [sample_width], + "targets": label + } diff --git a/tensor2tensor/data_generators/audio_test.py b/tensor2tensor/data_generators/audio_test.py new file mode 100644 index 000000000..f1830043f --- /dev/null +++ b/tensor2tensor/data_generators/audio_test.py @@ -0,0 +1,62 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for tensor2tensor.data_generators.audio.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import io +import os + +# Dependency imports + +from tensor2tensor.data_generators import audio + +import tensorflow as tf + + +class AudioTest(tf.test.TestCase): + + def testDataCollection(self): + # Generate a trivial source and target file. + tmp_dir = self.get_temp_dir() + test_files = [ + "dir1/file1", + "dir1/file2", + "dir1/dir2/file3", + "dir1/dir2/dir3/file4", + ] + for filename in test_files: + input_filename = os.path.join(tmp_dir, filename + ".WAV") + target_filename = os.path.join(tmp_dir, filename + ".WRD") + directories = os.path.dirname(input_filename) + if not os.path.exists(directories): + os.makedirs(directories) + io.open(input_filename, "wb") + io.open(target_filename, "wb") + + data_dict = audio._collect_data(tmp_dir, ".WAV", ".WRD") + expected = [os.path.join(tmp_dir, filename) for filename in test_files] + self.assertEqual(sorted(list(data_dict)), sorted(expected)) + + # Clean up. + for filename in test_files: + os.remove(os.path.join(tmp_dir, "%s.WAV" % filename)) + os.remove(os.path.join(tmp_dir, "%s.WRD" % filename)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/concatenate_examples.py b/tensor2tensor/data_generators/concatenate_examples.py new file mode 100644 index 000000000..b346b6c08 --- /dev/null +++ b/tensor2tensor/data_generators/concatenate_examples.py @@ -0,0 +1,180 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Convert seq-seq examples to "concatenated" examples. + +The concatenated example has no "inputs". +Instead the source is at the beginning of the target. + +We can now use a simple language model. + +Example: +seq-seq mode: +{ + "inputs": subtokenizer.encode("I love you.") + [1] + "targets": subtokenizer.encode("Je t'aime.") + [1] +} +-> +concatenated mode: +{ + "inputs": [0] + "targets": (subtokenizer.encode("source English I love you.") + [1] + + subtokenizer.encode("target French Je t'aime.") + [1]) +} + +We add a dummy feature "inputs"=[0] for compatability with seq-to-seq models. + +If FLAGS.combine_to_length is nonzero, then we combine multiple examples into +examples of a constant length, possibly with some padding at the end. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import random + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import text_encoder +import tensorflow as tf + +tf.app.flags.DEFINE_string("vocab_file", "", + "SubwordTextEncoder vocabulary file") + +tf.app.flags.DEFINE_boolean( + "random_reverse", False, + "If true, write half of the example with source/target reversed") + +tf.app.flags.DEFINE_boolean( + "count_everything", False, + "If true, assign positive weights to designators, source and target. 
" + "If false, assign positive weights only to target.") + +tf.app.flags.DEFINE_string("source_domain_string", "English", "") +tf.app.flags.DEFINE_string("target_domain_string", "French", "") + +tf.app.flags.DEFINE_integer( + "combine_to_length", 0, + "If positive, concatenate examples to form examples with target length " + " equal to this value. Targets are padded with subtoken id=0.") + +tf.app.flags.DEFINE_string("in_file", "", "input filename") + +tf.app.flags.DEFINE_string( + "out_prefix", "/usr/local/google/tmp/concat", + "The output filename is equal to out_prefix plus " + "the last 15 characters of in_file. (e.g. -00001-of-00100)") + +FLAGS = tf.app.flags.FLAGS + + +def _make_example(ids, weights, raw_num_bytes): + if FLAGS.combine_to_length > 0: + ids += [0] * (FLAGS.combine_to_length - len(ids)) + return generator_utils.to_example({ + "targets": ids, + "target_weights": weights, + "inputs": [0], + "raw_num_bytes": [raw_num_bytes] + }).SerializeToString() + + +def main(_): + """Convert a file to examples.""" + subtokenizer = text_encoder.SubwordTextEncoder(FLAGS.vocab_file) + total_bytes = 0 + total_subtokens = 0 + total_examples = 0 + dropped_examples = 0 + + combined_subtokens = [] + combined_num_bytes = 0 + combined_weights = [] + + source_specifier = subtokenizer.encode("source " + FLAGS.source_domain_string) + target_specifier = subtokenizer.encode("target " + FLAGS.target_domain_string) + if FLAGS.random_reverse: + r_source_specifier = subtokenizer.encode("source " + + FLAGS.target_domain_string) + r_target_specifier = subtokenizer.encode("target " + + FLAGS.source_domain_string) + + reader = tf.python_io.tf_record_iterator(FLAGS.in_file) + + out_file = FLAGS.out_prefix + FLAGS.in_file[-15:] + writer = tf.python_io.TFRecordWriter(out_file) + + for record in reader: + total_examples += 1 + if total_examples % 1000 == 0: + tf.logging.info("total_examples: %d", total_examples) + x = tf.train.Example() + x.ParseFromString(record) + inputs = [i for i in x.features.feature["inputs"].int64_list.value] + targets = [i for i in x.features.feature["targets"].int64_list.value] + should_reverse = FLAGS.random_reverse and random.random() < 0.5 + source_bytes = len(subtokenizer.decode(inputs[:-1])) + 1 + target_bytes = len(subtokenizer.decode(targets[:-1])) + 1 + if not should_reverse: + subtokens = source_specifier + inputs + target_specifier + targets + weights = ([0.0] * + (len(source_specifier) + len(inputs) + len(target_specifier)) + + [1.0] * len(targets)) + num_bytes = target_bytes + else: + subtokens = r_source_specifier + targets + r_target_specifier + inputs + weights = ( + [0.0] * + (len(r_source_specifier) + len(targets) + len(r_target_specifier)) + + [1.0] * len(inputs)) + num_bytes = source_bytes + if FLAGS.count_everything: + weights = [1.0] * len(subtokens) + num_bytes = source_bytes + target_bytes + total_bytes += num_bytes + total_subtokens += sum(weights) + if FLAGS.combine_to_length: + if combined_subtokens and (len(combined_subtokens) + len(subtokens) > + FLAGS.combine_to_length): + writer.write( + _make_example(combined_subtokens, combined_weights, + combined_num_bytes)) + combined_subtokens = [] + combined_weights = [] + combined_num_bytes = 0 + if len(subtokens) <= FLAGS.combine_to_length: + combined_subtokens.extend(subtokens) + combined_weights.extend(weights) + combined_num_bytes += num_bytes + else: + dropped_examples += 1 + else: + writer.write(_make_example(subtokens, weights, num_bytes)) + if combined_subtokens: + writer.write( + 
_make_example(combined_subtokens, combined_weights, combined_num_bytes)) + writer.close() + + tf.logging.info("total bytes: %d", total_bytes) + tf.logging.info("total subtokens: %d", total_subtokens) + tf.logging.info("bytes per subtoken: %f", total_bytes / total_subtokens) + tf.logging.info("total documents: %d", total_examples) + tf.logging.info("dropped documents: %d", dropped_examples) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py new file mode 100644 index 000000000..487546e16 --- /dev/null +++ b/tensor2tensor/data_generators/generator_utils.py @@ -0,0 +1,264 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for data generators.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gzip +import io +import os +import tarfile +import urllib + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder +from tensor2tensor.data_generators.tokenizer import Tokenizer + +import tensorflow as tf + + +def to_example(dictionary): + """Helper: build tf.Example from (string -> int/float/str list) dictionary.""" + features = {} + for (k, v) in six.iteritems(dictionary): + if not v: + raise ValueError("Empty generated field: %s", str((k, v))) + if isinstance(v[0], six.integer_types): + features[k] = tf.train.Feature(int64_list=tf.train.Int64List(value=v)) + elif isinstance(v[0], float): + features[k] = tf.train.Feature(float_list=tf.train.FloatList(value=v)) + elif isinstance(v[0], six.string_types): + features[k] = tf.train.Feature(bytes_list=tf.train.BytesList(value=v)) + else: + raise ValueError("Value is neither an int nor a float; v: %s type: %s" % + (str(v[0]), str(type(v[0])))) + return tf.train.Example(features=tf.train.Features(feature=features)) + + +def generate_files_distributed(generator, + output_name, + output_dir, + num_shards=1, + max_cases=None, + task_id=0): + """generate_files but with a single writer writing to shard task_id.""" + assert task_id < num_shards + output_filename = "%s-%.5d-of-%.5d" % (output_name, task_id, num_shards) + output_file = os.path.join(output_dir, output_filename) + tf.logging.info("Writing to file %s", output_file) + writer = tf.python_io.TFRecordWriter(output_file) + + counter = 0 + for case in generator: + if counter % 100000 == 0: + tf.logging.info("Generating case %d for %s." % (counter, output_name)) + counter += 1 + if max_cases and counter > max_cases: + break + sequence_example = to_example(case) + writer.write(sequence_example.SerializeToString()) + + writer.close() + return output_file + + +def generate_files(generator, + output_name, + output_dir, + num_shards=1, + max_cases=None): + """Generate cases from a generator and save as TFRecord files. 
+ + Generated cases are transformed to tf.Example protos and saved as TFRecords + in sharded files named output_dir/output_name-00..N-of-00..M=num_shards. + + Args: + generator: a generator yielding (string -> int/float/str list) dictionaries. + output_name: the file name prefix under which output will be saved. + output_dir: directory to save the output to. + num_shards: how many shards to use (defaults to 1). + max_cases: maximum number of cases to get from the generator; + if None (default), we use the generator until StopIteration is raised. + + Returns: + List of output file paths. + """ + writers = [] + output_files = [] + for shard in xrange(num_shards): + output_filename = "%s-%.5d-of-%.5d" % (output_name, shard, num_shards) + output_file = os.path.join(output_dir, output_filename) + output_files.append(output_file) + writers.append(tf.python_io.TFRecordWriter(output_file)) + + counter, shard = 0, 0 + for case in generator: + if counter % 100000 == 0: + tf.logging.info("Generating case %d for %s." % (counter, output_name)) + counter += 1 + if max_cases and counter > max_cases: + break + sequence_example = to_example(case) + writers[shard].write(sequence_example.SerializeToString()) + shard = (shard + 1) % num_shards + + for writer in writers: + writer.close() + + return output_files + + +def maybe_download(directory, filename, url): + """Download filename from url unless it's already in directory. + + Args: + directory: path to the directory that will be used. + filename: name of the file to download to (do nothing if it already exists). + url: URL to download from. + + Returns: + The path to the downloaded file. + """ + if not tf.gfile.Exists(directory): + tf.logging.info("Creating directory %s" % directory) + os.mkdir(directory) + filepath = os.path.join(directory, filename) + if not tf.gfile.Exists(filepath): + tf.logging.info("Downloading %s to %s" % (url, filepath)) + filepath, _ = urllib.urlretrieve(url, filepath) + statinfo = os.stat(filepath) + tf.logging.info("Succesfully downloaded %s, %s bytes." % (filename, + statinfo.st_size)) + else: + tf.logging.info("Not downloading, file already found: %s" % filepath) + return filepath + + +def gunzip_file(gz_path, new_path): + """Unzips from gz_path into new_path. + + Args: + gz_path: path to the zipped file. + new_path: path to where the file will be unzipped. 
+ """ + tf.logging.info("Unpacking %s to %s" % (gz_path, new_path)) + with gzip.open(gz_path, "rb") as gz_file: + with io.open(new_path, "wb") as new_file: + for line in gz_file: + new_file.write(line) + + +# TODO(aidangomez): en-fr tasks are significantly over-represented below +_DATA_FILE_URLS = [ + # German-English + [ + "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + [ + "training-parallel-nc-v11/news-commentary-v11.de-en.en", + "training-parallel-nc-v11/news-commentary-v11.de-en.de" + ] + ], + # German-English & French-English + [ + "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", [ + "commoncrawl.de-en.en", "commoncrawl.de-en.de", + "commoncrawl.fr-en.en", "commoncrawl.fr-en.fr" + ] + ], + [ + "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", [ + "training/europarl-v7.de-en.en", "training/europarl-v7.de-en.de", + "training/europarl-v7.fr-en.en", "training/europarl-v7.fr-en.fr" + ] + ], + # French-English + [ + "http://www.statmt.org/wmt10/training-giga-fren.tar", + ["giga-fren.release2.fixed.en.gz", "giga-fren.release2.fixed.fr.gz"] + ], + [ + "http://www.statmt.org/wmt13/training-parallel-un.tgz", + ["un/undoc.2000.fr-en.en", "un/undoc.2000.fr-en.fr"] + ], +] + + +def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size): + """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS.""" + vocab_filepath = os.path.join(tmp_dir, vocab_filename) + if os.path.exists(vocab_filepath): + vocab = SubwordTextEncoder(vocab_filepath) + return vocab + + tokenizer = Tokenizer() + for source in _DATA_FILE_URLS: + url = source[0] + filename = os.path.basename(url) + read_type = "r:gz" if "tgz" in filename else "r" + + compressed_file = maybe_download(tmp_dir, filename, url) + + with tarfile.open(compressed_file, read_type) as corpus_tar: + corpus_tar.extractall(tmp_dir) + + for lang_file in source[1]: + tf.logging.info("Reading file: %s" % lang_file) + filepath = os.path.join(tmp_dir, lang_file) + + # For some datasets a second extraction is necessary. + if ".gz" in lang_file: + tf.logging.info("Unpacking subdirectory %s" % filepath) + new_filepath = os.path.join(tmp_dir, lang_file[:-3]) + gunzip_file(filepath, new_filepath) + filepath = new_filepath + + # Use Tokenizer to count the word occurrences. + with tf.gfile.GFile(filepath, mode="r") as source_file: + file_byte_budget = 3.5e5 if "en" in filepath else 7e5 + for line in source_file: + if file_byte_budget <= 0: + break + line = line.strip() + file_byte_budget -= len(line) + _ = tokenizer.encode(line) + + vocab = SubwordTextEncoder.build_to_target_size( + vocab_size, tokenizer.token_counts, vocab_filepath, 1, 1e3) + return vocab + + +def read_records(filename): + reader = tf.python_io.tf_record_iterator(filename) + records = [] + for record in reader: + records.append(record) + if len(records) % 10000 == 0: + tf.logging.info("read: %d", len(records)) + return records + + +def write_records(records, out_filename): + writer = tf.python_io.TFRecordWriter(out_filename) + for count, record in enumerate(records): + writer.write(record) + if count % 10000 == 0: + tf.logging.info("write: %d", count) + writer.close() diff --git a/tensor2tensor/data_generators/generator_utils_test.py b/tensor2tensor/data_generators/generator_utils_test.py new file mode 100644 index 000000000..726763f7a --- /dev/null +++ b/tensor2tensor/data_generators/generator_utils_test.py @@ -0,0 +1,88 @@ +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generator utilities test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gzip +import io +import os +import tempfile + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils + +import tensorflow as tf + + +class GeneratorUtilsTest(tf.test.TestCase): + + def testGenerateFiles(self): + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + tmp_file_name = os.path.basename(tmp_file_path) + + # Generate a trivial file and assert the file exists. + def test_generator(): + yield {"inputs": [1], "target": [1]} + + generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir) + self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001")) + + # Clean up. + os.remove(tmp_file_path + "-00000-of-00001") + os.remove(tmp_file_path) + + def testMaybeDownload(self): + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + tmp_file_name = os.path.basename(tmp_file_path) + + # Download Google index to the temporary file.http. + res_path = generator_utils.maybe_download(tmp_dir, tmp_file_name + ".http", + "http://google.com") + self.assertEqual(res_path, tmp_file_path + ".http") + + # Clean up. + os.remove(tmp_file_path + ".http") + os.remove(tmp_file_path) + + def testGunzipFile(self): + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + + # Create a test zip file and unzip it. + with gzip.open(tmp_file_path + ".gz", "wb") as gz_file: + gz_file.write("test line") + generator_utils.gunzip_file(tmp_file_path + ".gz", tmp_file_path + ".txt") + + # Check that the unzipped result is as expected. + lines = [] + for line in io.open(tmp_file_path + ".txt", "rb"): + lines.append(line.strip()) + self.assertEqual(len(lines), 1) + self.assertEqual(lines[0], "test line") + + # Clean up. + os.remove(tmp_file_path + ".gz") + os.remove(tmp_file_path + ".txt") + os.remove(tmp_file_path) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py new file mode 100644 index 000000000..55b5f2fc7 --- /dev/null +++ b/tensor2tensor/data_generators/image.py @@ -0,0 +1,306 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
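All of the generators in this patch yield plain dicts that `generator_utils.to_example` (shown earlier) turns into `tf.train.Example` protos; a minimal sketch of that contract, with made-up feature values:

```python
from tensor2tensor.data_generators import generator_utils

example = generator_utils.to_example(
    {"inputs": [1, 2, 3], "targets": [4, 5]})
serialized = example.SerializeToString()  # bytes, ready for a TFRecordWriter
```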
+ +"""Data generators for image data-sets.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cPickle +import gzip +import io +import json +import os +import random +import tarfile +import zipfile + +# Dependency imports + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import zip # pylint: disable=redefined-builtin +from tensor2tensor.data_generators import generator_utils + +import tensorflow as tf + + +def image_generator(images, labels): + """Generator for images that takes image and labels lists and creates pngs. + + Args: + images: list of images given as [width x height x channels] numpy arrays. + labels: list of ints, same length as images. + + Yields: + A dictionary representing the images with the following fields: + * image/encoded: the string encoding the image as PNG, + * image/format: the string "png" representing image format, + * image/class/label: an integer representing the label, + * image/height: an integer representing the height, + * image/width: an integer representing the width. + Every field is actually a singleton list of the corresponding type. + + Raises: + ValueError: if images is an empty list. + """ + if not images: + raise ValueError("Must provide some images for the generator.") + (width, height, channels) = images[0].shape + with tf.Graph().as_default(): + image_t = tf.placeholder(dtype=tf.uint8, shape=(width, height, channels)) + encoded_image_t = tf.image.encode_png(image_t) + with tf.Session() as sess: + for (image, label) in zip(images, labels): + enc_string = sess.run(encoded_image_t, feed_dict={image_t: image}) + yield { + "image/encoded": [enc_string], + "image/format": ["png"], + "image/class/label": [label], + "image/height": [height], + "image/width": [width] + } + + +# URLs and filenames for MNIST data. +_MNIST_URL = "http://yann.lecun.com/exdb/mnist/" +_MNIST_TRAIN_DATA_FILENAME = "train-images-idx3-ubyte.gz" +_MNIST_TRAIN_LABELS_FILENAME = "train-labels-idx1-ubyte.gz" +_MNIST_TEST_DATA_FILENAME = "t10k-images-idx3-ubyte.gz" +_MNIST_TEST_LABELS_FILENAME = "t10k-labels-idx1-ubyte.gz" +_MNIST_IMAGE_SIZE = 28 + + +def _get_mnist(directory): + """Download all MNIST files to directory unless they are there.""" + for filename in [ + _MNIST_TRAIN_DATA_FILENAME, _MNIST_TRAIN_LABELS_FILENAME, + _MNIST_TEST_DATA_FILENAME, _MNIST_TEST_LABELS_FILENAME + ]: + generator_utils.maybe_download(directory, filename, _MNIST_URL + filename) + + +def _extract_mnist_images(filename, num_images): + """Extract images from an MNIST file into a numpy array. + + Args: + filename: The path to an MNIST images file. + num_images: The number of images in the file. + + Returns: + A numpy array of shape [number_of_images, height, width, channels]. + """ + with gzip.open(filename) as bytestream: + bytestream.read(16) + buf = bytestream.read(_MNIST_IMAGE_SIZE * _MNIST_IMAGE_SIZE * num_images) + data = np.frombuffer(buf, dtype=np.uint8) + data = data.reshape(num_images, _MNIST_IMAGE_SIZE, _MNIST_IMAGE_SIZE, 1) + return data + + +def _extract_mnist_labels(filename, num_labels): + """Extract labels from an MNIST file into integers. + + Args: + filename: The path to an MNIST labels file. + num_labels: The number of labels in the file. 
+ + Returns: + A int64 numpy array of shape [num_labels] + """ + with gzip.open(filename) as bytestream: + bytestream.read(8) + buf = bytestream.read(num_labels) + labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64) + return labels + + +def mnist_generator(tmp_dir, training, how_many, start_from=0): + """Image generator for MNIST. + + Args: + tmp_dir: path to temporary storage directory. + training: a Boolean; if true, we use the train set, otherwise the test set. + how_many: how many images and labels to generate. + start_from: from which image to start. + + Returns: + An instance of image_generator that produces MNIST images. + """ + _get_mnist(tmp_dir) + d = _MNIST_TRAIN_DATA_FILENAME if training else _MNIST_TEST_DATA_FILENAME + l = _MNIST_TRAIN_LABELS_FILENAME if training else _MNIST_TEST_LABELS_FILENAME + data_path = os.path.join(tmp_dir, d) + labels_path = os.path.join(tmp_dir, l) + images = _extract_mnist_images(data_path, 60000 if training else 10000) + labels = _extract_mnist_labels(labels_path, 60000 if training else 10000) + # Shuffle the data to make sure classes are well distributed. + data = list(zip(images, labels)) + random.shuffle(data) + images, labels = list(zip(*data)) + return image_generator(images[start_from:start_from + how_many], + labels[start_from:start_from + how_many]) + + +# URLs and filenames for CIFAR data. +_CIFAR10_URL = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" +_CIFAR10_PREFIX = "cifar-10-batches-py/" +_CIFAR10_TRAIN_FILES = [ + "data_batch_1", "data_batch_2", "data_batch_3", "data_batch_4", + "data_batch_5" +] +_CIFAR10_TEST_FILES = ["test_batch"] +_CIFAR10_IMAGE_SIZE = 32 + + +def _get_cifar10(directory): + """Download and extract CIFAR to directory unless it is there.""" + filename = os.path.basename(_CIFAR10_URL) + path = generator_utils.maybe_download(directory, filename, _CIFAR10_URL) + tarfile.open(path, "r:gz").extractall(directory) + + +def cifar10_generator(tmp_dir, training, how_many, start_from=0): + """Image generator for CIFAR-10. + + Args: + tmp_dir: path to temporary storage directory. + training: a Boolean; if true, we use the train set, otherwise the test set. + how_many: how many images and labels to generate. + start_from: from which image to start. + + Returns: + An instance of image_generator that produces CIFAR-10 images and labels. + """ + _get_cifar10(tmp_dir) + data_files = _CIFAR10_TRAIN_FILES if training else _CIFAR10_TEST_FILES + all_images, all_labels = [], [] + for filename in data_files: + path = os.path.join(tmp_dir, _CIFAR10_PREFIX, filename) + with tf.gfile.Open(path, "r") as f: + data = cPickle.load(f) + images = data["data"] + num_images = images.shape[0] + images = images.reshape((num_images, 3, _CIFAR10_IMAGE_SIZE, + _CIFAR10_IMAGE_SIZE)) + all_images.extend([ + np.squeeze(images[j]).transpose((1, 2, 0)) for j in xrange(num_images) + ]) + labels = data["labels"] + all_labels.extend([labels[j] for j in xrange(num_images)]) + # Shuffle the data to make sure classes are well distributed. + data = zip(all_images, all_labels) + random.shuffle(data) + all_images, all_labels = zip(*data) + return image_generator(all_images[start_from:start_from + how_many], + all_labels[start_from:start_from + how_many]) + + +# URLs and filenames for MSCOCO data. 
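The MNIST and CIFAR generators above compose with `generator_utils.generate_files` like the other generators in this patch; a hedged usage sketch (directory and count are hypothetical):

```python
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import image

# Writes 100 MNIST training digits as image/* features into one shard.
generator_utils.generate_files(
    image.mnist_generator("/tmp/t2t", training=True, how_many=100),
    "mnist_example", "/tmp/t2t", num_shards=1)
```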
+_MSCOCO_ROOT_URL = "http://msvocds.blob.core.windows.net/" +_MSCOCO_URLS = [ + "coco2014/train2014.zip", "coco2014/val2014.zip", "coco2014/test2014.zip", + "annotations-1-0-3/captions_train-val2014.zip" +] +_MSCOCO_TRAIN_PREFIX = "train2014" +_MSCOCO_EVAL_PREFIX = "val2014" +_MSCOCO_TRAIN_CAPTION_FILE = "annotations/captions_train2014.json" +_MSCOCO_EVAL_CAPTION_FILE = "annotations/captions_val2014.json" + + +def _get_mscoco(directory): + """Download and extract MSCOCO datasets to directory unless it is there.""" + for url in _MSCOCO_URLS: + filename = os.path.basename(url) + download_url = os.path.join(_MSCOCO_ROOT_URL, url) + path = generator_utils.maybe_download(directory, filename, download_url) + unzip_dir = os.path.join(directory, filename.strip(".zip")) + if not tf.gfile.Exists(unzip_dir): + zipfile.ZipFile(path, "r").extractall(directory) + + +def mscoco_generator(tmp_dir, + training, + how_many, + start_from=0, + eos_list=None, + vocab_filename=None, + vocab_size=0): + """Image generator for MSCOCO captioning problem with token-wise captions. + + Args: + tmp_dir: path to temporary storage directory. + training: a Boolean; if true, we use the train set, otherwise the test set. + how_many: how many images and labels to generate. + start_from: from which image to start. + eos_list: optional list of end of sentence tokens, otherwise use default + value `1`. + vocab_filename: file within `tmp_dir` to read vocabulary from. + vocab_size: integer target to generate vocabulary size to. + + Yields: + A dictionary representing the images with the following fields: + * image/encoded: the string encoding the image as JPEG, + * image/format: the string "jpeg" representing image format, + * image/class/label: a list of integers representing the caption, + * image/height: an integer representing the height, + * image/width: an integer representing the width. + Every field is actually a list of the corresponding type. + """ + eos_list = [1] if eos_list is None else eos_list + if vocab_filename is not None: + vocab_symbolizer = generator_utils.get_or_generate_vocab( + tmp_dir, vocab_filename, vocab_size) + _get_mscoco(tmp_dir) + caption_filepath = (_MSCOCO_TRAIN_CAPTION_FILE + if training else _MSCOCO_EVAL_CAPTION_FILE) + caption_filepath = os.path.join(tmp_dir, caption_filepath) + prefix = _MSCOCO_TRAIN_PREFIX if training else _MSCOCO_EVAL_PREFIX + caption_file = io.open(caption_filepath) + caption_json = json.load(caption_file) + # Dictionary from image_id to ((filename, height, width), captions). 
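A sketch of the shape `image_dict` takes after the loop below; the id, dimensions, and captions are made up:

```python
# image_id -> [(file_name, height, width), captions]
image_dict = {
    42: [("COCO_train2014_000000000042.jpg", 480, 640),
         ["A man riding a horse.", "A person on horseback."]],
}
```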
+ image_dict = dict() + for image in caption_json["images"]: + image_dict[image["id"]] = [(image["file_name"], image["height"], + image["width"]), []] + annotations = caption_json["annotations"] + annotation_count = len(annotations) + image_count = len(image_dict) + tf.logging.info("Processing %d images and %d labels\n" % (image_count, + annotation_count)) + for annotation in annotations: + image_id = annotation["image_id"] + image_dict[image_id][1].append(annotation["caption"]) + + data = list(image_dict.values())[start_from:start_from + how_many] + random.shuffle(data) + for image_info, labels in data: + image_filename = image_info[0] + image_filepath = os.path.join(tmp_dir, prefix, image_filename) + with tf.gfile.Open(image_filepath, "r") as f: + encoded_image_data = f.read() + height, width = image_info[1], image_info[2] + for label in labels: + if vocab_filename is None: + label = [ord(c) for c in label] + eos_list + else: + label = vocab_symbolizer.encode(label) + eos_list + yield { + "image/encoded": [encoded_image_data], + "image/format": ["jpeg"], + "image/class/label": label, + "image/height": [height], + "image/width": [width] + } diff --git a/tensor2tensor/data_generators/image_test.py b/tensor2tensor/data_generators/image_test.py new file mode 100644 index 000000000..c5b4f14be --- /dev/null +++ b/tensor2tensor/data_generators/image_test.py @@ -0,0 +1,71 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Image generators test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +from tensor2tensor.data_generators import image + +import tensorflow as tf + + +class ImageTest(tf.test.TestCase): + + def testImageGenerator(self): + # 2 random images + np.random.seed(1111) # To avoid any flakiness. + image1 = np.random.randint(0, 255, size=(10, 12, 3)) + image2 = np.random.randint(0, 255, size=(10, 12, 3)) + # Call image generator on the 2 images with labels [1, 2]. + encoded_imgs, labels = [], [] + for dictionary in image.image_generator([image1, image2], [1, 2]): + self.assertEqual( + sorted(list(dictionary)), [ + "image/class/label", "image/encoded", "image/format", + "image/height", "image/width" + ]) + self.assertEqual(dictionary["image/format"], ["png"]) + self.assertEqual(dictionary["image/height"], [12]) + self.assertEqual(dictionary["image/width"], [10]) + encoded_imgs.append(dictionary["image/encoded"]) + labels.append(dictionary["image/class/label"]) + + # Check that the result labels match the inputs. + self.assertEqual(len(labels), 2) + self.assertEqual(labels[0], [1]) + self.assertEqual(labels[1], [2]) + + # Decode images and check that they match the inputs. 
+    self.assertEqual(len(encoded_imgs), 2)
+    image_t = tf.placeholder(dtype=tf.string)
+    decoded_png_t = tf.image.decode_png(image_t)
+    with self.test_session() as sess:
+      encoded_img1 = encoded_imgs[0]
+      self.assertEqual(len(encoded_img1), 1)
+      decoded1 = sess.run(decoded_png_t, feed_dict={image_t: encoded_img1[0]})
+      self.assertAllClose(decoded1, image1)
+      encoded_img2 = encoded_imgs[1]
+      self.assertEqual(len(encoded_img2), 1)
+      decoded2 = sess.run(decoded_png_t, feed_dict={image_t: encoded_img2[0]})
+      self.assertAllClose(decoded2, image2)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensor2tensor/data_generators/lm_example.py b/tensor2tensor/data_generators/lm_example.py
new file mode 100644
index 000000000..9cf930afc
--- /dev/null
+++ b/tensor2tensor/data_generators/lm_example.py
@@ -0,0 +1,123 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Convert language modeling data to tf.Example format.
+
+Uses SubwordTextEncoder.
+
+For each line, we generate a tf.Example, with "targets" equal to a sequence
+of subtokens (integers), ending in subtoken id 1 for end-of-sequence. We add
+a dummy feature "inputs"=[0] for compatibility with seq-to-seq models.
+
+If FLAGS.combine_to_length is positive, then we combine multiple sequences
+into examples of a constant length, possibly with some padding at the end.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from tensor2tensor.data_generators import generator_utils
+from tensor2tensor.data_generators import text_encoder
+
+import tensorflow as tf
+
+tf.app.flags.DEFINE_string(
+    "vocab_file", "", "SubwordTextEncoder vocabulary file")
+
+tf.app.flags.DEFINE_integer(
+    "combine_to_length", 0,
+    "If positive, concatenate documents to form examples with length exactly"
+    " equal to this value. Documents are still suffixed with subtoken id=1. "
+    " Examples are padded with subtoken id=0.")
+
+tf.app.flags.DEFINE_string("in_filepattern", "", "input filename")
+
+tf.app.flags.DEFINE_string(
+    "out_prefix", "", "The output filename is equal to out_prefix plus "
-00001-of-00100)") + +FLAGS = tf.app.flags.FLAGS + + +def _make_example(ids, raw_num_bytes): + if FLAGS.combine_to_length > 0: + ids += [0] * (FLAGS.combine_to_length - len(ids)) + return generator_utils.to_example({ + "targets": ids, + "inputs": [0], + "raw_num_bytes": [raw_num_bytes] + }).SerializeToString() + + +def convert_file(in_file, encoder): + """Convert a file to examples.""" + total_bytes = 0 + total_subtokens = 0 + total_documents = 0 + dropped_documents = 0 + + combined_subtokens = [] + combined_num_bytes = 0 + + out_file = FLAGS.out_prefix + in_file[-15:] + writer = tf.python_io.TFRecordWriter(out_file) + out_file = FLAGS.out_prefix + in_file[-15:] + print ("in_file", in_file, "out_file", out_file) + for line in tf.gfile.Open(in_file): + total_documents += 1 + assert line[-1] == "\n" + num_bytes = len(line) + total_bytes += num_bytes + line = line[:-1] + subtokens = encoder.encode(line) + [1] + total_subtokens += len(subtokens) + if FLAGS.combine_to_length: + if len(combined_subtokens) + len(subtokens) > FLAGS.combine_to_length: + writer.write(_make_example(combined_subtokens, combined_num_bytes)) + combined_subtokens = [] + combined_num_bytes = 0 + if len(subtokens) <= FLAGS.combine_to_length: + combined_subtokens.extend(subtokens) + combined_num_bytes += num_bytes + else: + dropped_documents += 1 + else: + writer.write(_make_example(subtokens, num_bytes)) + if combined_subtokens: + writer.write(_make_example(combined_subtokens, combined_num_bytes)) + writer.close() + + tf.logging.info("total bytes: %d", total_bytes) + tf.logging.info("total subtokens: %d", total_subtokens) + tf.logging.info("bytes per subtoken: %f", total_bytes / total_subtokens) + tf.logging.info("total documents: %d", total_documents) + tf.logging.info("dropped documents: %d", dropped_documents) + + +def main(_): + """Convert a file to examples.""" + encoder = text_encoder.SubwordTextEncoder(FLAGS.vocab_file) + + in_files = tf.gfile.Glob(FLAGS.in_filepattern) + assert in_files, "No matching input files" + for in_file in in_files: + convert_file(in_file, encoder) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py new file mode 100644 index 000000000..26249d2bc --- /dev/null +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -0,0 +1,702 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Hyperparameters defining different problems. + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +# Dependency imports + +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import modality + +import tensorflow as tf + + +def default_problem_hparams(): + """A set of basic model hyperparameters.""" + return tf.contrib.training.HParams( + # Use this parameter to get comparable perplexity numbers with different + # tokenizations. 
+
+
+def default_problem_hparams():
+  """A set of basic model hyperparameters."""
+  return tf.contrib.training.HParams(
+      # Use this parameter to get comparable perplexity numbers with different
+      # tokenizations. This value should be set to the ratio of the number of
+      # tokens in the test set according to the tokenization used, to the
+      # number of tokens in the test set in the "official" tokenization. For
+      # example, if we are using a word-piece based model and we want to
+      # compute per-word perplexity, then we set loss_multiplier to the number
+      # of wordpieces per word in the test set.
+      loss_multiplier=1.0,
+
+      # Use this parameter to allow for larger sequences in the batch. Without
+      # the use of this parameter, the size of the inner two dimensions will be
+      # used to judge the sequence length.
+      batch_size_multiplier=1,
+
+      # To make queues of the right capacity, it's good to know the maximal
+      # expected batch size, as it can vary a lot. It only affects performance
+      # of input readers and memory use. The defaults should be safe and fast,
+      # but decrease if your reader uses a lot of memory and increase if slow.
+      max_expected_batch_size_per_shard=64,
+
+      # Modalities used to map from input features to a space compatible with
+      # chosen model architecture. One modality per feature key.
+      input_modality={},
+
+      # Modality used to map from hidden representation to the target space.
+      target_modality=None,
+
+      # Identifiers used to tell the model which input/target space will be
+      # expected. For example, it can tell that we expect French as characters
+      # as output, or Spanish as sound. An integer with the following
+      # semantics:
+      #   0: Generic / unknown output space (default)
+      #   1: Image labels
+      #   2: English characters
+      #   3: English tokens
+      #   4: English bpe tokens
+      #   5: French characters
+      #   6: French tokens
+      #   7: German characters
+      #   8: German tokens
+      #   9: German bpe tokens
+      #  10: Digit cipher lexicon 0
+      #  11: Digit cipher lexicon 1
+      #  12: Audio waveform domain
+      #  13: Audio spectral domain
+      #  14: Parse characters
+      #  15: Parse tokens
+      # Add more above if needed.
+      input_space_id=0,
+      target_space_id=0,
+
+      # Vocabulary per feature key.
+      # A vocabulary converts to/from human-readable strings.
+      # E.g. {"inputs": text_encoder.ByteTextEncoder(),
+      #       "targets": wordpiece.WordpieceVocab("vocab_filename.txt")}
+      vocabulary={
+          "inputs": text_encoder.TextEncoder(),
+          "targets": text_encoder.TextEncoder()
+      },
+
+      # These markers keep track of whether the problem was reversed or
+      # copied. They are set automatically; do not override the defaults.
+      #
+      # These tags can be combined in order to perform copies of the input or
+      # the targets. For instance `problem_copy` will copy the inputs, but
+      # `problem_rev_copy` will copy the targets.
+      was_reversed=False,
+      was_copy=False,)
+
+
+def parse_problem_name(problem_name):
+  """Determines if problem_name specifies a copy and/or reversal.
+
+  Args:
+    problem_name: A string containing a single problem name from
+      FLAGS.problems.
+
+  Returns:
+    base_name: A string with the base problem name.
+    was_reversed: A boolean.
+    was_copy: A boolean.
+  """
+  # Recursively strip tags until we reach a base name.
+  if len(problem_name) > 4 and problem_name[-4:] == "_rev":
+    base, _, was_copy = parse_problem_name(problem_name[:-4])
+    return base, True, was_copy
+  elif len(problem_name) > 5 and problem_name[-5:] == "_copy":
+    base, was_reversed, _ = parse_problem_name(problem_name[:-5])
+    return base, was_reversed, True
+  else:
+    return problem_name, False, False
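A quick illustration of the tag semantics implemented above ("translate_xy"
is a made-up base name):

    parse_problem_name("translate_xy")           # ("translate_xy", False, False)
    parse_problem_name("translate_xy_rev")       # ("translate_xy", True, False)
    parse_problem_name("translate_xy_rev_copy")  # ("translate_xy", True, True)

problem_hparams below consumes these flags: "_rev" swaps the input/target
modalities, vocabularies and space ids, while "_copy" duplicates the input
side onto the target side.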
+
+
+def problem_hparams(problem_name, model_hparams):
+  """Generate problem hyperparameters based on problem name.
+
+  Args:
+    problem_name: a string
+    model_hparams: a tf.contrib.training.HParams
+
+  Returns:
+    a tf.contrib.training.HParams
+
+  Raises:
+    ValueError: if problem_name is unknown.
+  """
+  base_name, was_reversed, was_copy = parse_problem_name(problem_name)
+  if base_name not in _problem_hparams_map:
+    map_str = "* " + "\n* ".join(_problem_hparams_map.keys())
+    error_msg = "%s not in the supported set of problems:\n%s" % (base_name,
+                                                                  map_str)
+    raise ValueError(error_msg)
+  p = _problem_hparams_map.get(base_name)(model_hparams)
+  if was_reversed:
+    # Swap modalities.
+    input_modality = p.input_modality["inputs"]
+    target_modality = p.target_modality
+    p.input_modality["inputs"] = target_modality
+    p.target_modality = input_modality
+    # Swap vocabularies.
+    input_vocabulary = p.vocabulary["inputs"]
+    target_vocabulary = p.vocabulary["targets"]
+    p.vocabulary["inputs"] = target_vocabulary
+    p.vocabulary["targets"] = input_vocabulary
+    # Swap input/target space ids.
+    input_space_id = p.input_space_id
+    target_space_id = p.target_space_id
+    p.input_space_id = target_space_id
+    p.target_space_id = input_space_id
+    # Mark that p was reversed.
+    p.was_reversed = True
+  if was_copy:
+    # Duplicate input modality.
+    p.target_modality = p.input_modality["inputs"]
+    # Duplicate input vocabulary.
+    p.vocabulary["targets"] = p.vocabulary["inputs"]
+    # Duplicate input space ids.
+    p.target_space_id = p.input_space_id
+    # Mark that p was copied.
+    p.was_copy = True
+  return p
+
+
+def test_problem_hparams(model_hparams, input_vocab_size, target_vocab_size):
+  """Problem hparams for testing model bodies."""
+  p = default_problem_hparams()
+  p.input_modality = {
+      "inputs": modality.SymbolModality(model_hparams, input_vocab_size)
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, target_vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": text_encoder.TextEncoder()
+  }
+  return p
+
+
+def algorithmic(vocab_size, model_hparams):
+  """Default parameters for algorithmic tasks."""
+  p = default_problem_hparams()
+  p.input_modality = {
+      "inputs": modality.SymbolModality(model_hparams, vocab_size)
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(num_reserved_ids=1),
+      "targets": text_encoder.TextEncoder(num_reserved_ids=1),
+  }
+  p.input_space_id = 10
+  p.target_space_id = 11
+  return p
+
+
+def audio_timit_characters(model_hparams):
+  """English audio transcription benchmark."""
+  p = default_problem_hparams()
+  p.input_modality = {
+      "inputs": modality.AudioModality(model_hparams),
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, 256)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": text_encoder.ByteTextEncoder(),
+  }
+  p.batch_size_multiplier = 256
+  p.loss_multiplier = 2.0
+  p.input_space_id = 12
+  p.target_space_id = 2
+  return p
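Before the token-based variants that follow, a note on their
wrong_vocab_size argument: the number only selects the vocab filename;
vocabulary generation aims for that size but rarely hits it exactly, so the
authoritative size is always re-read from the encoder. A sketch, assuming
the data directory contains a file named tokens.vocab.8192:

    vocab_filename = os.path.join(model_hparams.data_dir, "tokens.vocab.8192")
    subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
    # subtokenizer.vocab_size is the real size (e.g. 8267 rather than 8192)
    # and is what gets passed to modality.SymbolModality.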
+
+
+def audio_timit_tokens(model_hparams, wrong_vocab_size):
+  """English audio transcription benchmark.
+
+  Args:
+    model_hparams: a tf.contrib.training.HParams
+    wrong_vocab_size: a number used in the filename indicating the approximate
+      vocabulary size. This is not to be confused with the actual vocabulary
+      size.
+  Returns:
+    a tf.contrib.training.HParams
+  """
+  p = default_problem_hparams()
+  # This vocab file must be present within the data directory.
+  vocab_filename = os.path.join(model_hparams.data_dir,
+                                "tokens.vocab.%d" % wrong_vocab_size)
+  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
+  p.input_modality = {
+      "inputs": modality.AudioModality(model_hparams),
+  }
+  p.target_modality = modality.SymbolModality(model_hparams,
+                                              subtokenizer.vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": subtokenizer,
+  }
+  p.batch_size_multiplier = 256
+  p.loss_multiplier = 2.0
+  p.input_space_id = 12
+  p.target_space_id = 3
+  return p
+
+
+def audio_wsj_characters(model_hparams):
+  """English audio transcription benchmark."""
+  p = default_problem_hparams()
+  p.input_modality = {
+      "inputs": modality.AudioSpectralModality(model_hparams),
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, 256)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": text_encoder.ByteTextEncoder(),
+  }
+  p.batch_size_multiplier = 512
+  p.loss_multiplier = 2.0
+  p.input_space_id = 13
+  p.target_space_id = 2
+  return p
+
+
+def audio_wsj_tokens(model_hparams, wrong_vocab_size):
+  """English audio transcription benchmark.
+
+  Args:
+    model_hparams: a tf.contrib.training.HParams
+    wrong_vocab_size: a number used in the filename indicating the approximate
+      vocabulary size. This is not to be confused with the actual vocabulary
+      size.
+  Returns:
+    a tf.contrib.training.HParams
+  """
+  p = default_problem_hparams()
+  # This vocab file must be present within the data directory.
+  vocab_filename = os.path.join(model_hparams.data_dir,
+                                "tokens.vocab.%d" % wrong_vocab_size)
+  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
+  p.input_modality = {
+      "inputs": modality.AudioModality(model_hparams),
+  }
+  p.target_modality = modality.SymbolModality(model_hparams,
+                                              subtokenizer.vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.TextEncoder(),
+      "targets": subtokenizer,
+  }
+  p.batch_size_multiplier = 512
+  p.loss_multiplier = 2.0
+  p.input_space_id = 13
+  p.target_space_id = 3
+  return p
+
+
+def lm1b_16k(model_hparams):
+  """Billion-word language-modeling benchmark, 16k subtoken vocabulary."""
+  p = default_problem_hparams()
+  p.perplexity_exponent = 1.184206
+  p.input_modality = {}
+  p.target_modality = modality.SymbolModality(model_hparams, 16384)
+  p.vocabulary = {
+      "targets":
+          text_encoder.SubwordTextEncoder(
+              os.path.join(model_hparams.data_dir,
+                           "lm1b_16k.subword_text_encoder"))
+  }
+  p.target_space_id = 3
+  return p
+
+
+def lm1b_64k(model_hparams):
+  """Billion-word language-modeling benchmark, 64k subtoken vocabulary."""
+  p = default_problem_hparams()
+  p.perplexity_exponent = 1.067068
+  p.input_modality = {}
+  p.target_modality = modality.SymbolModality(model_hparams, 65536)
+  p.vocabulary = {
+      "targets":
+          text_encoder.SubwordTextEncoder(
+              os.path.join(model_hparams.data_dir,
+                           "lm1b_64k.subword_text_encoder"))
+  }
+  p.target_space_id = 3
+  return p
+
+
+def wmt_enfr_characters(model_hparams):
+  """English to French translation benchmark."""
+  p = default_problem_hparams()
+  p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)}
+  p.target_modality = modality.SymbolModality(model_hparams, 256)
+  p.vocabulary = {
+      "inputs": text_encoder.ByteTextEncoder(),
+      "targets": text_encoder.ByteTextEncoder(),
+  }
+  p.loss_multiplier = 2.0
+  p.input_space_id = 2
+  p.target_space_id = 5
+  return p
+
+
+def wmt_enfr_tokens(model_hparams, wrong_vocab_size):
+  """English to French translation benchmark.
+ + Args: + model_hparams: a tf.contrib.training.HParams + wrong_vocab_size: a number used in the filename indicating the approximate + vocabulary size. This is not to be confused with the actual vocabulary + size. + Returns: + a tf.contrib.training.HParams + """ + p = default_problem_hparams() + # This vocab file must be present within the data directory. + vocab_filename = os.path.join(model_hparams.data_dir, + "tokens.vocab.%d" % wrong_vocab_size) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + p.input_modality = { + "inputs": modality.SymbolModality(model_hparams, subtokenizer.vocab_size) + } + p.target_modality = modality.SymbolModality(model_hparams, + subtokenizer.vocab_size) + p.vocabulary = { + "inputs": subtokenizer, + "targets": subtokenizer, + } + p.input_space_id = 3 + p.target_space_id = 6 + return p + + +def wmt_ende_bpe32k(model_hparams): + """English to German translation benchmark.""" + p = default_problem_hparams() + # single modality object enables embedding sharing between inputs and target + # when model_hparams.shared_source_target_embedding is True. + vocab_size = 40960 + m = modality.SymbolModality(model_hparams, vocab_size) + p.input_modality = {"inputs": m} + p.target_modality = m + # This vocab file must be present within the data directory. + vocab_filename = os.path.join(model_hparams.data_dir, "vocab.bpe.32000") + p.vocabulary = { + "inputs": text_encoder.TokenTextEncoder(vocab_filename=vocab_filename), + "targets": text_encoder.TokenTextEncoder(vocab_filename=vocab_filename), + } + p.loss_multiplier = 1.4 + p.input_space_id = 4 + p.target_space_id = 9 + return p + + +def wmt_ende_characters(model_hparams): + """English to German translation benchmark.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)} + p.target_modality = modality.SymbolModality(model_hparams, 256) + p.vocabulary = { + "inputs": text_encoder.ByteTextEncoder(), + "targets": text_encoder.ByteTextEncoder(), + } + p.loss_multiplier = 2.0 + p.input_space_id = 2 + p.target_space_id = 7 + return p + + +def wmt_ende_tokens(model_hparams, wrong_vocab_size): + """English to German translation benchmark.""" + p = default_problem_hparams() + # This vocab file must be present within the data directory. + vocab_filename = os.path.join(model_hparams.data_dir, + "tokens.vocab.%d" % wrong_vocab_size) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + p.input_modality = { + "inputs": modality.SymbolModality(model_hparams, subtokenizer.vocab_size) + } + p.target_modality = modality.SymbolModality(model_hparams, + subtokenizer.vocab_size) + p.vocabulary = { + "inputs": subtokenizer, + "targets": subtokenizer, + } + p.input_space_id = 3 + p.target_space_id = 8 + return p + + +def wmt_ende_v2(model_hparams, vocab_size): + """English to German translation benchmark with separate vocabularies.""" + p = default_problem_hparams() + # These vocab files must be present within the data directory. 
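One detail worth flagging from wmt_ende_bpe32k above: a single SymbolModality
instance is installed on both sides, so the model can share one source/target
embedding matrix when model_hparams.shared_source_target_embedding is True.
The aliasing, spelled out:

    m = modality.SymbolModality(model_hparams, vocab_size)
    p.input_modality = {"inputs": m}
    p.target_modality = m
    assert p.target_modality is p.input_modality["inputs"]  # one shared object

wmt_ende_v2, by contrast, loads the two separate per-language vocab files
named on the next lines.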
+  source_vocab_filename = os.path.join(model_hparams.data_dir,
+                                       "wmt_ende_v2.en.vocab.%d" % vocab_size)
+  target_vocab_filename = os.path.join(model_hparams.data_dir,
+                                       "wmt_ende_v2.de.vocab.%d" % vocab_size)
+  p.input_modality = {
+      "inputs": modality.SymbolModality(model_hparams, vocab_size)
+  }
+  p.target_modality = modality.SymbolModality(model_hparams, vocab_size)
+  p.vocabulary = {
+      "inputs": text_encoder.SubwordTextEncoder(source_vocab_filename),
+      "targets": text_encoder.SubwordTextEncoder(target_vocab_filename),
+  }
+  p.input_space_id = 3
+  p.target_space_id = 8
+  return p
+
+
+def wmt_concat(model_hparams, wrong_vocab_size):
+  """English to German translation benchmark."""
+  p = default_problem_hparams()
+  # This vocab file must be present within the data directory.
+  vocab_filename = os.path.join(model_hparams.data_dir,
+                                "tokens.vocab.%d" % wrong_vocab_size)
+  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
+  vocab_size = subtokenizer.vocab_size
+  p.input_modality = {}
+  p.target_modality = modality.SymbolModality(model_hparams, vocab_size)
+  p.vocabulary = {"targets": subtokenizer}
+  return p
+
+
+def wmt_parsing_characters(model_hparams):
+  """English to parse tree translation benchmark."""
+  p = default_problem_hparams()
+  p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)}
+  p.target_modality = modality.SymbolModality(model_hparams, 256)
+  p.vocabulary = {
+      "inputs": text_encoder.ByteTextEncoder(),
+      "targets": text_encoder.ByteTextEncoder(),
+  }
+  p.loss_multiplier = 2.0
+  p.input_space_id = 2
+  p.target_space_id = 14
+  return p
+
+
+def wmt_parsing_tokens(model_hparams, wrong_vocab_size):
+  """English to parse tree translation benchmark.
+
+  Args:
+    model_hparams: a tf.contrib.training.HParams
+    wrong_vocab_size: a number used in the filename indicating the approximate
+      vocabulary size. This is not to be confused with the actual vocabulary
+      size.
+  Returns:
+    a tf.contrib.training.HParams
+  """
+  p = default_problem_hparams()
+  # This vocab file must be present within the data directory.
+  vocab_filename = os.path.join(model_hparams.data_dir,
+                                "tokens.vocab.%d" % wrong_vocab_size)
+  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
+  p.input_modality = {
+      "inputs": modality.SymbolModality(model_hparams, subtokenizer.vocab_size)
+  }
+  p.target_modality = modality.SymbolModality(model_hparams,
+                                              subtokenizer.vocab_size)
+  p.vocabulary = {
+      "inputs": subtokenizer,
+      "targets": subtokenizer,
+  }
+  p.input_space_id = 3
+  p.target_space_id = 15
+  return p
+
+
+def wsj_parsing_tokens(model_hparams, wrong_source_vocab_size,
+                       wrong_target_vocab_size):
+  """English to parse tree translation benchmark.
+
+  Args:
+    model_hparams: a tf.contrib.training.HParams
+    wrong_source_vocab_size: a number used in the filename indicating the
+      approximate vocabulary size. This is not to be confused with the actual
+      vocabulary size.
+    wrong_target_vocab_size: a number used in the filename indicating the
+      approximate target vocabulary size. This is not to be confused with the
+      actual target vocabulary size.
+  Returns:
+    a tf.contrib.training.HParams
+  """
+  p = default_problem_hparams()
+  # These vocab files must be present within the data directory.
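For reference, vocab files like the two named below are built offline from a
token corpus. A hedged sketch using the SubwordTextEncoder API that appears
later in this patch (the corpus pattern, line limit and output path are
illustrative):

    token_counts = text_encoder.SubwordTextEncoder.get_token_counts(
        "/tmp/wsj_corpus-*", corpus_max_lines=100000)
    text_encoder.SubwordTextEncoder.build_to_target_size(
        2**14, token_counts, "/tmp/data/wsj_source.tokens.vocab.16384",
        min_val=1, max_val=1000)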
+ source_vocab_filename = os.path.join( + model_hparams.data_dir, + "wsj_source.tokens.vocab.%d" % wrong_source_vocab_size) + target_vocab_filename = os.path.join( + model_hparams.data_dir, + "wsj_target.tokens.vocab.%d" % wrong_target_vocab_size) + source_subtokenizer = text_encoder.SubwordTextEncoder( + source_vocab_filename) + target_subtokenizer = text_encoder.SubwordTextEncoder( + target_vocab_filename) + p.input_modality = { + "inputs": modality.SymbolModality(model_hparams, + source_subtokenizer.vocab_size) + } + p.target_modality = modality.SymbolModality(model_hparams, + target_subtokenizer.vocab_size) + p.vocabulary = { + "inputs": source_subtokenizer, + "targets": target_subtokenizer, + } + p.input_space_id = 3 + p.target_space_id = 15 + return p + + +def image_cifar10(model_hparams): + """CIFAR-10.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.SmallImageModality(model_hparams)} + p.target_modality = modality.ClassLabelModality(model_hparams, 10) + p.batch_size_multiplier = 4 + p.max_expected_batch_size_per_shard = 8 + p.loss_multiplier = 3.0 + p.input_space_id = 1 + p.target_space_id = 1 + return p + + +def image_mnist(model_hparams): + """MNIST.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)} + p.target_modality = modality.ClassLabelModality(model_hparams, 10) + p.batch_size_multiplier = 4 + p.max_expected_batch_size_per_shard = 8 + p.loss_multiplier = 3.0 + p.input_space_id = 1 + p.target_space_id = 1 + return p + + +def image_imagenet(model_hparams): + """ImageNet.""" + p = default_problem_hparams() + p.input_modality = { + "inputs": modality.ImageModality(model_hparams), + } + p.target_modality = modality.ClassLabelModality( + model_hparams, 1000, is2d=model_hparams.imagenet_use_2d) + p.batch_size_multiplier = 256 + p.max_expected_batch_size_per_shard = 2 + p.loss_multiplier = 0.7 + p.input_space_id = 1 + p.target_space_id = 1 + return p + + +def image_mscoco_characters(model_hparams): + """COCO image captioning with captions as characters.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.ImageModality(model_hparams)} + p.target_modality = modality.SymbolModality(model_hparams, 256) + p.vocabulary = { + "inputs": text_encoder.TextEncoder(), + "targets": text_encoder.ByteTextEncoder(), + } + p.batch_size_multiplier = 128 + p.max_expected_batch_size_per_shard = 2 + p.loss_multiplier = 2.0 + p.input_space_id = 1 + p.target_space_id = 2 + return p + + +def image_mscoco_tokens(model_hparams, vocab_count): + """COCO image captioning with captions as tokens.""" + p = default_problem_hparams() + p.input_modality = {"inputs": modality.ImageModality(model_hparams)} + # This vocab file must be present within the data directory. + vocab_filename = os.path.join(model_hparams.data_dir, + "tokens.vocab.%d" % vocab_count) + subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) + p.target_modality = modality.SymbolModality(model_hparams, + subtokenizer.vocab_size) + p.vocabulary = { + "inputs": text_encoder.TextEncoder(), + "targets": subtokenizer, + } + p.batch_size_multiplier = 256 + p.max_expected_batch_size_per_shard = 2 + p.input_space_id = 1 + p.target_space_id = 3 + return p + + +# Dictionary of named hyperparameter settings for various problems. +# This is only accessed through the problem_hparams function below. 
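To tie the registry below to the functions above, here is how one example
flag value resolves (a sketch):

    name = "wmt_ende_tokens_32k_rev"
    base, was_reversed, was_copy = parse_problem_name(name)
    # base == "wmt_ende_tokens_32k", was_reversed == True, was_copy == False
    p = _problem_hparams_map[base](model_hparams)
    # problem_hparams() then swaps the modalities, vocabularies and space
    # ids, making German the source side.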
+_problem_hparams_map = { + "algorithmic_addition_binary40": lambda p: algorithmic(3, p), + "algorithmic_addition_decimal40": lambda p: algorithmic(11, p), + "algorithmic_identity_binary40": lambda p: algorithmic(3, p), + "algorithmic_identity_decimal40": lambda p: algorithmic(11, p), + "algorithmic_multiplication_binary40": lambda p: algorithmic(3, p), + "algorithmic_multiplication_decimal40": lambda p: algorithmic(11, p), + "algorithmic_reverse_binary40": lambda p: algorithmic(3, p), + "algorithmic_reverse_decimal40": lambda p: algorithmic(11, p), + "algorithmic_shift_decimal40": lambda p: algorithmic(21, p), + "audio_timit_characters_tune": audio_timit_characters, + "audio_timit_characters_test": audio_timit_characters, + "audio_timit_tokens_8k_tune": lambda p: audio_timit_tokens(p, 2**13), + "audio_timit_tokens_8k_test": lambda p: audio_timit_tokens(p, 2**13), + "audio_wsj_characters_tune": audio_wsj_characters, + "audio_wsj_characters_test": audio_wsj_characters, + "audio_wsj_tokens_8k_tune": lambda p: audio_wsj_tokens(p, 2**13), + "audio_wsj_tokens_8k_test": lambda p: audio_wsj_tokens(p, 2**13), + "lm1b_16k": lm1b_16k, + "lm1b_64k": lm1b_64k, + "wmt_parsing_characters": wmt_parsing_characters, + "wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), + "wsj_parsing_tokens_16k": lambda p: wsj_parsing_tokens(p, 2**14, 2**9), + "wsj_parsing_tokens_32k": lambda p: wsj_parsing_tokens(p, 2**15, 2**9), + "wmt_enfr_characters": wmt_enfr_characters, + "wmt_enfr_tokens_8k": lambda p: wmt_enfr_tokens(p, 2**13), + "wmt_enfr_tokens_32k": lambda p: wmt_enfr_tokens(p, 2**15), + "wmt_enfr_tokens_32k_shuffled": lambda p: wmt_enfr_tokens(p, 2**15), + "wmt_enfr_tokens_32k_combined": lambda p: wmt_enfr_tokens(p, 2**15), + "wmt_enfr_tokens_128k": lambda p: wmt_enfr_tokens(p, 2**17), + # bytes per subtoken: 3.267350 + "wmt_ende_concat_8k": lambda p: wmt_concat(p, 2**13), + # bytes per subtoken: 4.236272 + "wmt_ende_concat_32k": lambda p: wmt_concat(p, 2**15), + "wmt_ende_characters": wmt_ende_characters, + "wmt_ende_tokens_8k": lambda p: wmt_ende_tokens(p, 2**13), + "wmt_ende_tokens_32k": lambda p: wmt_ende_tokens(p, 2**15), + "wmt_ende_tokens_128k": lambda p: wmt_ende_tokens(p, 2**17), + # bytes per subtoken: 4.59291664162 + "wmt_ende_bpe32k": wmt_ende_bpe32k, + "wmt_ende_bpe32k_shuffled": wmt_ende_bpe32k, + "wmt_ende_bpe32k_combined": wmt_ende_bpe32k, + "wmt_ende_bpe32k_160": wmt_ende_bpe32k, + "wmt_ende_v2_32k_combined": lambda p: wmt_ende_v2(p, 2**15), + "wmt_ende_v2_16k_combined": lambda p: wmt_ende_v2(p, 2**14), + "image_cifar10_tune": image_cifar10, + "image_cifar10_test": image_cifar10, + "image_mnist_tune": image_mnist, + "image_mnist_test": image_mnist, + "image_mscoco_characters_tune": image_mscoco_characters, + "image_mscoco_characters_test": image_mscoco_characters, + "image_mscoco_tokens_8k_tune": lambda p: image_mscoco_tokens(p, 2**13), + "image_mscoco_tokens_8k_test": lambda p: image_mscoco_tokens(p, 2**13), + "image_mscoco_tokens_32k_tune": lambda p: image_mscoco_tokens(p, 2**15), + "image_mscoco_tokens_32k_test": lambda p: image_mscoco_tokens(p, 2**15), + "image_mscoco_tokens_128k_tune": lambda p: image_mscoco_tokens(p, 2**17), + "image_mscoco_tokens_128k_test": lambda p: image_mscoco_tokens(p, 2**17), + "image_imagenet": image_imagenet, +} diff --git a/tensor2tensor/data_generators/problem_hparams_test.py b/tensor2tensor/data_generators/problem_hparams_test.py new file mode 100644 index 000000000..5c8bc5516 --- /dev/null +++ 
b/tensor2tensor/data_generators/problem_hparams_test.py
@@ -0,0 +1,48 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for tensor2tensor.problem_hparams."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from tensor2tensor.data_generators import problem_hparams
+
+import tensorflow as tf
+
+
+class ProblemHparamsTest(tf.test.TestCase):
+
+  def testParseProblemName(self):
+    problem_name = "base"
+    self.assertEqual(problem_hparams.parse_problem_name(problem_name),
+                     ("base", False, False))
+    problem_name = "base_rev"
+    self.assertEqual(
+        problem_hparams.parse_problem_name(problem_name),
+        ("base", True, False))
+    problem_name = "base_copy"
+    self.assertEqual(
+        problem_hparams.parse_problem_name(problem_name),
+        ("base", False, True))
+    problem_name = "base_copy_rev"
+    self.assertEqual(
+        problem_hparams.parse_problem_name(problem_name),
+        ("base", True, True))
+    problem_name = "base_rev_copy"
+    self.assertEqual(
+        problem_hparams.parse_problem_name(problem_name),
+        ("base", True, True))
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensor2tensor/data_generators/replace_oov.py b/tensor2tensor/data_generators/replace_oov.py
new file mode 100644
index 000000000..7e2c8dc50
--- /dev/null
+++ b/tensor2tensor/data_generators/replace_oov.py
@@ -0,0 +1,76 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Data preprocessor for the lm1b benchmark.
+
+Process the raw text file to replace out-of-vocab words with "<UNK>".
+
+The input consists of a tokenized text file, where tokens are separated with
+whitespace.
+
+Outputs a similar text file where the OOV words have been replaced with UNK.
+The whitespace in the output may be different.
+
+This maintains compatibility with the benchmark, which does the same thing.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+import tensorflow as tf
+
+tf.app.flags.DEFINE_string("vocab_file", "",
+                           "text file containing one word per line")
+
+tf.app.flags.DEFINE_string("in_filepattern", "", "input filename")
+
+tf.app.flags.DEFINE_string(
+    "out_prefix", "", "The output filename is equal to out_prefix plus "
-00001-of-00100)") + +FLAGS = tf.app.flags.FLAGS + + +def replace_oov(vocab, in_file): + """Replace out-of-vocab words with .""" + out_file = FLAGS.out_prefix + in_file[-15:] + print ("in_file", in_file, "out_file", out_file) + with tf.gfile.Open(out_file, "w") as out: + for line in tf.gfile.Open(in_file): + words = line.split() + for i in xrange(len(words)): + if not vocab.get(words[i]): + words[i] = "UNK" + out_line = " ".join(words) + "\n" + out.write(out_line) + + +def main(_): + vocab = {} + with tf.gfile.Open(FLAGS.vocab_file) as vocab_file: + for line in vocab_file: + vocab[line.strip()] = True + + in_files = tf.gfile.Glob(FLAGS.in_filepattern) + assert in_files, "No matching input files" + for in_file in in_files: + replace_oov(vocab, in_file) + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/snli.py b/tensor2tensor/data_generators/snli.py new file mode 100644 index 000000000..5613ece4d --- /dev/null +++ b/tensor2tensor/data_generators/snli.py @@ -0,0 +1,167 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data generators for the SNLI data-set.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import zipfile + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import text_encoder + +import tensorflow as tf + +_EOS = 1 +_SEP = 2 + +_LABEL_INDEX = 0 +_PARSE1_INDEX = 3 +_PARSE2_INDEX = 4 +_SENT1_INDEX = 5 +_SENT2_INDEX = 6 + +_LABEL_TO_ID = { + 'contradiction': 0, + 'entailment': 1, + 'neutral': 2, +} + +_EXAMPLES_FILE = 'examples.txt' +_SNLI_DATA_PATH = 'snli_1.0/snli_1.0_%s.txt' +_SNLI_ZIP = 'snli_1.0.zip' +_SNLI_URL = 'https://nlp.stanford.edu/projects/snli/' + _SNLI_ZIP + + +def _download_and_parse_dataset(tmp_dir, train): + """Downloads and prepairs the dataset to be parsed by the data_generator.""" + file_path = generator_utils.maybe_download(tmp_dir, _SNLI_ZIP, _SNLI_URL) + zip_ref = zipfile.ZipFile(file_path, 'r') + zip_ref.extractall(tmp_dir) + zip_ref.close() + + file_name = 'train' if train else 'dev' + dataset_file_path = os.path.join(tmp_dir, _SNLI_DATA_PATH % file_name) + _parse_dataset(dataset_file_path, tmp_dir, train) + + +def _get_tokens_and_tags(parse_str): + """Parse str to tokens and pos tags.""" + tokens = [] + parse_split = parse_str.split(' ') + for p in parse_split: + assert p.startswith('(') or p.endswith(')') + if p.endswith(')'): + token = p.replace(')', '') + tokens.append(token) + + return tokens + + +def _parse_dataset(file_path, tmp_dir, train): + """Convert the dataset in to a simpler format. + + This function creates two files. One for being processed to produce a vocab + and another to generate the data. + + Args: + file_path: string, path to the file to parse. + tmp_dir: string, path to the directory to output the files. + train: bool, indicating if we are parsing the training set. 
+ """ + input_path = file_path + file_name = 'train' if train else 'dev' + gen_output_path = os.path.join(tmp_dir, file_name + '.txt') + example_output_path = os.path.join(tmp_dir, _EXAMPLES_FILE) + + print('input path: ' + input_path) + print('gen_output_path: ' + gen_output_path) + print('example_output_path: ' + example_output_path) + + input_file = tf.gfile.Open(input_path, mode='r') + examples = [] + for counter, line in enumerate(input_file): + if counter == 0: # Ignore first line since its a header. + continue + # Get the token and embedding vector. + line_split = line.split('\t') + + parse1 = line_split[_PARSE1_INDEX] + parse2 = line_split[_PARSE2_INDEX] + consensus_label = line_split[_LABEL_INDEX] + + tokens1 = _get_tokens_and_tags(parse1) + tokens2 = _get_tokens_and_tags(parse2) + + tokens1_str = ' '.join(tokens1) + tokens2_str = ' '.join(tokens2) + + if consensus_label != '-': + examples.append([tokens1_str, tokens2_str, consensus_label]) + + input_file.close() + + # Output tab delimited file of lines of examples (sentence1, sentence2, label) + with tf.gfile.GFile(gen_output_path, 'w') as f: + for tokens1_str, tokens2_str, consensus_label in examples: + f.write('%s\t%s\t%s\n' % (tokens1_str, tokens2_str, consensus_label)) + + if train: + # Output file containing all the sentences for generating the vocab from. + with tf.gfile.GFile(example_output_path, 'w') as f: + for tokens1_str, tokens2_str, consensus_label in examples: + f.write('%s %s\n' % (tokens1_str, tokens2_str)) + + +def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size): + vocab_filepath = os.path.join(tmp_dir, vocab_filename) + print('Vocab file written to: ' + vocab_filepath) + + if tf.gfile.Exists(vocab_filepath): + gs = text_encoder.SubwordTextEncoder(vocab_filepath) + return gs + else: + example_file = os.path.join(tmp_dir, _EXAMPLES_FILE) + gs = text_encoder.SubwordTextEncoder() + token_counts = text_encoder.SubwordTextEncoder.get_token_counts( + example_file, corpus_max_lines=1000000) + gs = gs.build_to_target_size( + vocab_size, token_counts, vocab_filepath, min_val=1, max_val=1e3) + return gs + + +def snli_token_generator(tmp_dir, train, vocab_size): + _download_and_parse_dataset(tmp_dir, train) + + symbolizer_vocab = _get_or_generate_vocab( + tmp_dir, 'vocab.subword_text_encoder', vocab_size) + + file_name = 'train' if train else 'dev' + data_file = os.path.join(tmp_dir, file_name + '.txt') + with tf.gfile.GFile(data_file, mode='r') as f: + for line in f: + sent1, sent2, label = line.strip().split('\t') + sent1_enc = symbolizer_vocab.encode(sent1) + sent2_enc = symbolizer_vocab.encode(sent2) + + inputs = sent1_enc + [_SEP] + sent2_enc + [_EOS] + yield { + 'inputs': inputs, + 'targets': [_LABEL_TO_ID[label]], + } diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py new file mode 100644 index 000000000..6d9ecb4a8 --- /dev/null +++ b/tensor2tensor/data_generators/text_encoder.py @@ -0,0 +1,451 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Encoders for text data.
+
+* TextEncoder: base class
+* ByteTextEncoder: for ascii text
+* TokenTextEncoder: with user-supplied vocabulary file
+* SubwordTextEncoder: invertible
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
+from tensor2tensor.data_generators import tokenizer
+
+import tensorflow as tf
+
+# Reserved tokens for things like padding and EOS symbols.
+PAD = '<pad>'
+EOS = '<EOS>'
+RESERVED_TOKENS = [PAD, EOS]
+
+
+class TextEncoder(object):
+  """Base class for converting between int ids and human readable strings."""
+
+  def __init__(self, num_reserved_ids=2):
+    self._num_reserved_ids = num_reserved_ids
+
+  def encode(self, s):
+    """Transform a human-readable string into a sequence of int ids.
+
+    The ids should be in the range [num_reserved_ids, vocab_size). Ids [0,
+    num_reserved_ids) are reserved.
+
+    EOS is not appended.
+
+    Args:
+      s: human-readable string to be converted.
+
+    Returns:
+      ids: list of integers
+    """
+    return [int(w) + self._num_reserved_ids for w in s.split()]
+
+  def decode(self, ids):
+    """Transform a sequence of int ids into a human-readable string.
+
+    EOS is not expected in ids.
+
+    Args:
+      ids: list of integers to be converted.
+
+    Returns:
+      s: human-readable string.
+    """
+    decoded_ids = []
+    for id_ in ids:
+      if 0 <= id_ < self._num_reserved_ids:
+        decoded_ids.append(RESERVED_TOKENS[int(id_)])
+      else:
+        decoded_ids.append(id_)
+    return '%s' % decoded_ids
+
+  @property
+  def vocab_size(self):
+    raise NotImplementedError()
+
+
+class ByteTextEncoder(TextEncoder):
+  """Encodes each byte to an id. For 8-bit strings only."""
+
+  def encode(self, s):
+    return [ord(c) + self._num_reserved_ids for c in s]
+
+  def decode(self, ids):
+    decoded_ids = []
+    for id_ in ids:
+      if 0 <= id_ < self._num_reserved_ids:
+        decoded_ids.append(RESERVED_TOKENS[int(id_)])
+      else:
+        decoded_ids.append(chr(id_))
+
+    return ''.join(decoded_ids)
+
+  @property
+  def vocab_size(self):
+    return 2**8 + self._num_reserved_ids
+
+
+class TokenTextEncoder(TextEncoder):
+  """Encoder based on a user-supplied vocabulary."""
+
+  def __init__(self, vocab_filename, reverse=False, num_reserved_ids=2):
+    """Initialize from a file, one token per line."""
+    self._reverse = reverse
+    if vocab_filename is not None:
+      self._load_vocab_from_file(vocab_filename)
+
+    super(TokenTextEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
+
+  def encode(self, sentence):
+    """Converts a space-separated string of tokens to a list of ids."""
+    ret = [self._token_to_id[tok] for tok in sentence.strip().split()]
+    if self._reverse:
+      ret = ret[::-1]
+    return ret
+
+  def decode(self, ids):
+    if self._reverse:
+      ids = ids[::-1]
+    return ' '.join([self._safe_id_to_token(i) for i in ids])
+
+  @property
+  def vocab_size(self):
+    return len(self._id_to_token)
+
+  def _safe_id_to_token(self, idx):
+    return self._id_to_token.get(idx, 'ID_%d' % idx)
+
+  def _load_vocab_from_file(self, filename):
+    """Load vocab from a file."""
+    self._token_to_id = {}
+    self._id_to_token = {}
+
+    for idx, tok in enumerate(RESERVED_TOKENS):
+      self._token_to_id[tok] = idx
+      self._id_to_token[idx] = tok
+
+    token_start_idx = self._num_reserved_ids
+    with tf.gfile.Open(filename) as f:
+      for i, line in enumerate(f):
+        idx = token_start_idx + i
+        tok = line.strip()
+        self._token_to_id[tok] = idx
+        self._id_to_token[idx] = tok
+
+
+class SubwordTextEncoder(TextEncoder):
+  """Class for breaking tokens into subtokens.
+
+  Invertibly encodes a string as a sequence of subtokens from a limited
+  vocabulary.
+
+  A SubwordTextEncoder is built from a corpus (so it is tailored to the text in
+  the corpus), and stored to a file. See text_encoder_build_subword.py.
+
+  It can then be loaded and used to encode/decode any text.
+  """
+
+  def __init__(self, filename=None, num_reserved_ids=2):
+    """Read from a file."""
+    self._tokenizer = tokenizer.Tokenizer()
+    if filename is not None:
+      self._load_from_file(filename)
+
+    super(SubwordTextEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
+
+  def encode(self, raw_text):
+    """Converts a string to a list of subtoken ids.
+
+    Args:
+      raw_text: a string.
+    Returns:
+      a list of integers in the range [0, vocab_size)
+    """
+    return self._tokens_to_subtokens(self._tokenizer.encode(raw_text))
+
+  def decode(self, subtokens):
+    """Converts a sequence of subtoken ids to a string.
+
+    Args:
+      subtokens: a list of integers in the range [0, vocab_size)
+    Returns:
+      a string
+    """
+    return self._tokenizer.decode(self._subtokens_to_tokens(subtokens))
+
+  @property
+  def vocab_size(self):
+    """The subtoken vocabulary size."""
+    return len(self._all_subtoken_strings)
+
+  def _tokens_to_subtokens(self, tokens):
+    """Converts a list of tokens to a list of subtoken ids.
+
+    Args:
+      tokens: a list of strings.
+    Returns:
+      a list of integers in the range [0, vocab_size)
+    """
+    ret = []
+    for token in tokens:
+      ret.extend(self._escaped_token_to_subtokens(self._escape_token(token)))
+    return ret
+
+  def _subtokens_to_tokens(self, subtokens):
+    """Converts a list of subtoken ids to a list of tokens.
+ + Args: + subtokens: a list of integers in the range [0, vocab_size) + Returns: + a list of strings. + """ + concatenated = ''.join( + [self.subtoken_to_subtoken_string(s) for s in subtokens]) + split = concatenated.split('_') + return [self._unescape_token(t + '_') for t in split if t] + + def subtoken_to_subtoken_string(self, subtoken): + """Subtoken_String (string) corresponding to the given subtoken (id).""" + if (subtoken >= 0 and subtoken < self.vocab_size and + self._all_subtoken_strings[subtoken]): + return self._all_subtoken_strings[subtoken] + else: + return 'ID%d_' % subtoken + + def _escaped_token_to_subtokens(self, escaped_token): + """Converts an escaped token string to a list of subtokens. + + Args: + escaped_token: an escaped token + Returns: + a list of one or more integers. + """ + ret = [] + pos = 0 + while pos < len(escaped_token): + end = len(escaped_token) + while True: + subtoken = self._subtoken_string_to_id.get(escaped_token[pos:end], -1) + if subtoken != -1: + break + end -= 1 + ret.append(subtoken) + pos = end + return ret + + @classmethod + def build_to_target_size(cls, + target_size, + token_counts, + store_filename, + min_val, + max_val, + num_iterations=4): + """Builds a SubwordTextEncoder that has `vocab_size` near `target_size`. + + Uses simple recursive binary search to find a `min_count` value that most + closely matches the `target_size`. + + Args: + target_size: desired vocab_size to approximate. + token_counts: a dictionary of string to int. + store_filename: a string - where to write the vocabulary. + min_val: an integer - lower bound for `min_count`. + max_val: an integer - upper bound for `min_count`. + num_iterations: an integer. how many iterations of refinement. + + Returns: + a SubwordTextEncoder instance. + """ + present_count = (max_val + min_val) // 2 + tf.logging.info('Trying min_count %d' % present_count) + subtokenizer = cls() + subtokenizer.build_from_token_counts(token_counts, store_filename, + present_count, num_iterations) + + if min_val == max_val or subtokenizer.vocab_size == target_size: + return subtokenizer + elif subtokenizer.vocab_size > target_size: + other_subtokenizer = cls.build_to_target_size( + target_size, token_counts, store_filename, present_count + 1, max_val, + num_iterations) + if (abs(other_subtokenizer.vocab_size - target_size) < + abs(subtokenizer.vocab_size - target_size)): + return other_subtokenizer + else: + return subtokenizer + else: + other_subtokenizer = cls.build_to_target_size( + target_size, token_counts, store_filename, min_val, present_count - 1, + num_iterations) + if (abs(other_subtokenizer.vocab_size - target_size) < + abs(subtokenizer.vocab_size - target_size)): + return other_subtokenizer + else: + return subtokenizer + + def build_from_token_counts(self, + token_counts, + store_filename, + min_count, + num_iterations=4): + """Train a SubwordTextEncoder based on a dictionary of word counts. + + Args: + token_counts: a dictionary of string to int. + store_filename: a string - where to write the vocabulary. + min_count: an integer - discard subtokens with lower counts. + num_iterations: an integer. how many iterations of refinement. + """ + # We build iteratively. On each iteration, we segment all the words, + # then count the resulting potential subtokens, keeping the ones + # with high enough counts for our new vocabulary. 
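The segmentation step below relies on _escaped_token_to_subtokens above,
which is a greedy longest-match scan: from each position it tries the
longest remaining slice first and backs off one character at a time. Traced
on a toy vocabulary (hypothetical contents):

    # _subtoken_string_to_id == {"hell": 0, "he": 1, "o_": 2, "l": 3}
    # _escaped_token_to_subtokens("hello_"):
    #   pos=0: "hello_" and "hello" miss, "hell" matches -> id 0, pos -> 4
    #   pos=4: "o_" matches -> id 2
    # returns [0, 2]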
+ for i in xrange(num_iterations): + counts = {} + for token, count in six.iteritems(token_counts): + escaped_token = self._escape_token(token) + # we will count all tails of the escaped_token, starting from boundaries + # determined by our current segmentation. + if i == 0: + starts = list(range(len(escaped_token))) + else: + subtokens = self._escaped_token_to_subtokens(escaped_token) + pos = 0 + starts = [] + for subtoken in subtokens: + starts.append(pos) + pos += len(self.subtoken_to_subtoken_string(subtoken)) + for start in starts: + for end in xrange(start + 1, len(escaped_token) + 1): + subtoken_string = escaped_token[start:end] + counts[subtoken_string] = counts.get(subtoken_string, 0) + count + # array of lists of candidate subtoken strings, by length + len_to_subtoken_strings = [] + for subtoken_string, count in six.iteritems(counts): + if count < min_count or len(subtoken_string) <= 1: + continue + while len(len_to_subtoken_strings) <= len(subtoken_string): + len_to_subtoken_strings.append([]) + len_to_subtoken_strings[len(subtoken_string)].append(subtoken_string) + new_subtoken_strings = [] + # consider the candidates longest to shortest, so that if we accept + # a longer subtoken string, we can decrement the counts of its prefixes. + for subtoken_strings in len_to_subtoken_strings[::-1]: + for subtoken_string in subtoken_strings: + count = counts[subtoken_string] + if count < min_count: + continue + new_subtoken_strings.append((-count, subtoken_string)) + for l in xrange(1, len(subtoken_string)): + counts[subtoken_string[:l]] -= count + # make sure we have all single characters. + new_subtoken_strings.extend([(-counts.get(chr(i), 0), chr(i)) + for i in xrange(2**8)]) + new_subtoken_strings.sort() + self._init_from_list([''] * self._num_reserved_ids + + [p[1] for p in new_subtoken_strings]) + print('vocab_size = %d' % self.vocab_size) + + original = 'This sentence was encoded by the SubwordTextEncoder.' + encoded = self.encode(original) + print(encoded) + print([self.subtoken_to_subtoken_string(s) for s in encoded]) + decoded = self.decode(encoded) + print(decoded) + assert decoded == original + self._store_to_file(store_filename) + + def _init_from_list(self, subtoken_strings): + """Initialize from a list of subtoken strings.""" + self._all_subtoken_strings = subtoken_strings + self._subtoken_string_to_id = {} + for i in xrange(len(subtoken_strings)): + subtoken_string = subtoken_strings[i] + if subtoken_string: + self._subtoken_string_to_id[subtoken_string] = i + + def _load_from_file(self, filename): + """Load from a file.""" + subtoken_strings = [] + with tf.gfile.Open(filename) as f: + for line in f: + subtoken_strings.append(line.strip()[1:-1].decode('string-escape')) + self._init_from_list(subtoken_strings) + + def _store_to_file(self, filename): + with tf.gfile.Open(filename, 'w') as f: + for subtoken_string in self._all_subtoken_strings: + f.write('\'' + subtoken_string.encode('string-escape') + '\'\n') + + def _escape_token(self, token): + r"""Translate '\'->'\\' and '_'->'\u', then append '_'. + + Args: + token: a string + Returns: + escaped_token: a string + """ + return token.replace('\\', '\\\\').replace('_', '\\u') + '_' + + def _unescape_token(self, escaped_token): + r"""Remove '_' from end, then translate '\\'->'\' and '\u'->'_'. + + TODO(noam): There must be some better way to do this with regexps. 
+ + Args: + escaped_token: a string + Returns: + token: a string + """ + assert escaped_token[-1] == '_' + escaped_token = escaped_token[:-1] + if '\\' not in escaped_token: + return escaped_token + ret = '' + pos = 0 + while pos < len(escaped_token): + if escaped_token[pos] == '\\' and pos + 1 < len(escaped_token): + if escaped_token[pos + 1] == 'u': + ret += '_' + else: + ret += escaped_token[pos + 1] + pos += 1 + pos += 1 + return ret + + @classmethod + def get_token_counts(cls, text_filepattern, corpus_max_lines): + """Read the corpus and compute a dictionary of word counts.""" + tok = tokenizer.Tokenizer() + token_counts = {} + lines_read = 0 + filenames = tf.gfile.Glob(text_filepattern) + for text_filename in filenames: + with tf.gfile.Open(text_filename) as f: + for line in f: + tokens = tok.encode(line.strip()) + for t in tokens: + token_counts[t] = token_counts.get(t, 0) + 1 + lines_read += 1 + if corpus_max_lines > 0 and lines_read > corpus_max_lines: + return token_counts + return token_counts diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py new file mode 100644 index 000000000..ee71af9f6 --- /dev/null +++ b/tensor2tensor/data_generators/text_encoder_build_subword.py @@ -0,0 +1,67 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Program to build a SubwordTextEncoder. + +The flags --min_count and --corpus_max_lines will affect the size of the +vocabulary. Try changing these flags until you get a vocabulary +of the size you want. 
+ +Example usage: + +python data_generators/text_encoder_build_subword.py \ + --corpus_filepattern=$LM1B_DIR/train-unk-* \ + --corpus_max_lines=17500 \ + --output_fn=$DATA_DIR/lm1b16k.subword_text_encoder \ + --logtostderr + +python data_generators/text_encoder_build_subword.py \ + --corpus_filepattern=$LM1B_DIR/train-unk-* \ + --corpus_max_lines=270000 \ + --output_fn=$DATA_DIR/lm1b64k.subword_text_encoder \ + --logtostderr +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.data_generators import text_encoder + +import tensorflow as tf + +tf.app.flags.DEFINE_string('output_fn', '/tmp/my.subword_text_encoder', + 'where to store the SubwordTextEncoder') +tf.app.flags.DEFINE_string('corpus_filepattern', '', + 'Corpus of one or more text files') +tf.app.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus') +tf.app.flags.DEFINE_integer('corpus_max_lines', 10000, + 'How many lines of corpus to read') +tf.app.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations') +FLAGS = tf.app.flags.FLAGS + + +def main(unused_argv): + gs = text_encoder.SubwordTextEncoder() + if not FLAGS.corpus_filepattern: + raise ValueError('Must provide --corpus_filepattern') + token_counts = text_encoder.SubwordTextEncoder.get_token_counts( + FLAGS.corpus_filepattern, FLAGS.corpus_max_lines) + gs.build_from_token_counts(token_counts, FLAGS.output_fn, FLAGS.min_count, + FLAGS.num_iterations) + + +if __name__ == '__main__': + tf.app.run() diff --git a/tensor2tensor/data_generators/text_encoder_inspect_subword.py b/tensor2tensor/data_generators/text_encoder_inspect_subword.py new file mode 100644 index 000000000..0ad9a2701 --- /dev/null +++ b/tensor2tensor/data_generators/text_encoder_inspect_subword.py @@ -0,0 +1,64 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Inspect a TFRecord file of tensorflow.Example and show tokenizations. 
+
+python data_generators/text_encoder_inspect_subword.py \
+  --logtostderr \
+  --vocab_file=$DATA_DIR/tokens.vocab.8192 \
+  --in_file=$DATA_DIR/wmt_ende_tokens_8k-train-00000-of-00100
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from tensor2tensor.data_generators import text_encoder
+
+import tensorflow as tf
+
+tf.app.flags.DEFINE_string("vocab_file", "",
+                           "SubwordTextEncoder vocabulary file")
+
+tf.app.flags.DEFINE_string("in_file", "", "input filename")
+
+FLAGS = tf.app.flags.FLAGS
+
+
+def ShowSequence(subtokenizer, subtokens, label):
+  print("%s decoded = %s" % (label, subtokenizer.decode(subtokens)))
+  print("%s subtoken ids = %s" % (label, subtokens))
+  print("%s subtoken strings = %s" %
+        (label,
+         [subtokenizer.subtoken_to_subtoken_string(s) for s in subtokens]))
+  print("")
+
+
+def main(_):
+  """Inspect a TFRecord file of examples and show their tokenizations."""
+  subtokenizer = text_encoder.SubwordTextEncoder(FLAGS.vocab_file)
+  reader = tf.python_io.tf_record_iterator(FLAGS.in_file)
+  for record in reader:
+    x = tf.train.Example()
+    x.ParseFromString(record)
+    inputs = [int(i) for i in x.features.feature["inputs"].int64_list.value]
+    targets = [int(i) for i in x.features.feature["targets"].int64_list.value]
+    ShowSequence(subtokenizer, inputs, "inputs")
+    ShowSequence(subtokenizer, targets, "targets")
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py
new file mode 100644
index 000000000..15b199907
--- /dev/null
+++ b/tensor2tensor/data_generators/tokenizer.py
@@ -0,0 +1,117 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A simple invertible tokenizer.
+
+Converts from a raw string to a list of tokens (strings).
+
+This tokenizer has the following desirable properties:
+ - It is invertible.
+ - Punctuation is broken away from adjacent letters.
+ - A single space between words does not produce an extra token.
+
+The tokenization algorithm is as follows:
+
+0. We classify the 256 characters into "word characters" and
+   "separator characters".  Separator characters are defined as the union of
+   string.punctuation and string.whitespace.  All other characters are
+   "word characters".
+
+1. Split the text into a list of tokens, splitting at every boundary between
+   a "word character" and a "separator character".  This produces a list
+   which alternates between "word tokens" (strings of word characters) and
+   "separator tokens" (strings of separator characters).
+
+2. Remove every token consisting of a single space, unless it is
+   the very first or very last token in the list.  Such a token is
+   implied by the fact that there are two adjacent word tokens.
+
+e.g.  "Dude - that's so cool."
+  -> ["Dude", " - ", "that", "'", "s", "so", "cool", "."]
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import array
+import string
+
+# Dependency imports
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+
+class Tokenizer(object):
+  """Invertible tokenizer that splits text into word and separator tokens."""
+
+  def __init__(self):
+    self._separator_chars = string.punctuation + string.whitespace
+    self._separator_char_mask = array.array(
+        "l", [chr(i) in self._separator_chars for i in xrange(256)])
+    self.token_counts = dict()
+
+  def _increment_token_count(self, token):
+    if token in self.token_counts:
+      self.token_counts[token] += 1
+    else:
+      self.token_counts[token] = 1
+
+  def encode(self, raw_text):
+    """Encode a raw string as a list of tokens.
+
+    Args:
+      raw_text: a string
+    Returns:
+      a list of strings.
+    """
+    if not raw_text:
+      return []
+    ret = []
+    token_start = 0
+    for pos in xrange(1, len(raw_text)):
+      if (self._is_separator_char(raw_text[pos]) !=
+          self._is_separator_char(raw_text[pos - 1])):
+        token = raw_text[token_start:pos]
+        if token != " " or token_start == 0:
+          ret.append(token)
+          self._increment_token_count(token)
+        token_start = pos
+    final_token = raw_text[token_start:]
+    ret.append(final_token)
+    self._increment_token_count(final_token)
+    return ret
+
+  def decode(self, tokens):
+    """Decode a list of tokens to a string.
+
+    Args:
+      tokens: a list of strings
+    Returns:
+      a string.
+    """
+    ret = ""
+    for i, token in enumerate(tokens):
+      if (i > 0 and self._is_word_char(tokens[i - 1][0]) and
+          self._is_word_char(token[0])):
+        ret += " "
+      ret += token
+    return ret
+
+  def _is_separator_char(self, c):
+    return self._separator_char_mask[ord(c)]
+
+  def _is_word_char(self, c):
+    return not self._is_separator_char(c)
diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py
new file mode 100644
index 000000000..4102051e6
--- /dev/null
+++ b/tensor2tensor/data_generators/tokenizer_test.py
@@ -0,0 +1,64 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
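A quick illustration of the single-space rule from the tokenizer docstring, mirroring what the tests below verify (runnable sketch):

```python
from tensor2tensor.data_generators import tokenizer

t = tokenizer.Tokenizer()
assert t.encode("hello world") == ["hello", "world"]      # single space dropped
assert t.decode(["hello", "world"]) == "hello world"      # and restored
assert t.encode("double  space") == ["double", "  ", "space"]  # longer runs kept
```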
+ +# coding=utf-8 +"""Tests for tensor2tensor.data_generators.tokenizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import random + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +from tensor2tensor.data_generators import tokenizer + +import tensorflow as tf + + +class TokenizerTest(tf.test.TestCase): + + def testEncode(self): + t = tokenizer.Tokenizer() + self.assertEqual( + t.encode("Dude - that's so cool."), + ["Dude", " - ", "that", "'", "s", "so", "cool", "."]) + self.assertEqual( + t.encode("Łukasz est né en 1981."), + ["Łukasz", "est", "né", "en", "1981", "."]) + self.assertEqual( + t.encode(" Spaces at the ends "), + [" ", "Spaces", "at", "the", "ends", " "]) + self.assertEqual(t.encode("802.11b"), ["802", ".", "11b"]) + self.assertEqual(t.encode("two. \nlines"), ["two", ". \n", "lines"]) + + def testDecode(self): + t = tokenizer.Tokenizer() + self.assertEqual( + t.decode(["Dude", " - ", "that", "'", "s", "so", "cool", "."]), + "Dude - that's so cool.") + + def testInvertibilityOnRandomStrings(self): + t = tokenizer.Tokenizer() + random.seed(123) + for _ in xrange(10000): + s = "".join([six.int2byte(random.randint(0, 255)) for _ in xrange(10)]) + self.assertEqual(s, t.decode(t.encode(s))) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py new file mode 100644 index 000000000..4ac669f71 --- /dev/null +++ b/tensor2tensor/data_generators/wmt.py @@ -0,0 +1,269 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data generators for WMT data-sets.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tarfile + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import text_encoder + +import tensorflow as tf + + +def character_generator(source_path, target_path, eos=None): + """Generator for sequence-to-sequence tasks that just uses characters. + + This generator assumes the files at source_path and target_path have + the same number of lines and yields dictionaries of "inputs" and "targets" + where inputs are characters from the source lines converted to integers, + and targets are characters from the target lines, also converted to integers. + + Args: + source_path: path to the file with source sentences. + target_path: path to the file with target sentences. + eos: integer to append at the end of each sequence (default: None). + + Yields: + A dictionary {"inputs": source-line, "targets": target-line} where + the lines are integer lists converted from characters in the file lines. 
+  """
+  eos_list = [] if eos is None else [eos]
+  with tf.gfile.GFile(source_path, mode="r") as source_file:
+    with tf.gfile.GFile(target_path, mode="r") as target_file:
+      source, target = source_file.readline(), target_file.readline()
+      while source and target:
+        source_ints = [ord(c) for c in source.strip()] + eos_list
+        target_ints = [ord(c) for c in target.strip()] + eos_list
+        yield {"inputs": source_ints, "targets": target_ints}
+        source, target = source_file.readline(), target_file.readline()
+
+
+def token_generator(source_path, target_path, token_vocab, eos=None):
+  """Generator for sequence-to-sequence tasks that uses tokens.
+
+  This generator assumes the files at source_path and target_path have
+  the same number of lines and yields dictionaries of "inputs" and "targets"
+  where inputs are token ids from the " "-split source (and target, resp.)
+  lines converted to integers using the token_vocab.
+
+  Args:
+    source_path: path to the file with source sentences.
+    target_path: path to the file with target sentences.
+    token_vocab: text_encoder.TextEncoder object.
+    eos: integer to append at the end of each sequence (default: None).
+
+  Yields:
+    A dictionary {"inputs": source-line, "targets": target-line} where
+    the lines are integer lists converted from tokens in the file lines.
+  """
+  eos_list = [] if eos is None else [eos]
+  with tf.gfile.GFile(source_path, mode="r") as source_file:
+    with tf.gfile.GFile(target_path, mode="r") as target_file:
+      source, target = source_file.readline(), target_file.readline()
+      while source and target:
+        source_ints = token_vocab.encode(source.strip()) + eos_list
+        target_ints = token_vocab.encode(target.strip()) + eos_list
+        yield {"inputs": source_ints, "targets": target_ints}
+        source, target = source_file.readline(), target_file.readline()
+
+
+def _get_wmt_ende_dataset(directory, filename):
+  """Extract the WMT en-de corpus `filename` to directory unless it's there."""
+  train_path = os.path.join(directory, filename)
+  if not (tf.gfile.Exists(train_path + ".de") and
+          tf.gfile.Exists(train_path + ".en")):
+    # We expect that this file has been downloaded from:
+    # https://drive.google.com/open?id=0B_bZck-ksdkpM25jRUN2X2UxMm8 and placed
+    # in `directory`.
+    corpus_file = os.path.join(directory, "wmt16_en_de.tar.gz")
+    with tarfile.open(corpus_file, "r:gz") as corpus_tar:
+      corpus_tar.extractall(directory)
+  return train_path
+
+
+def ende_bpe_token_generator(tmp_dir, train):
+  """Instance of token generator for the WMT en->de task (train or dev set)."""
+  dataset_path = ("train.tok.clean.bpe.32000"
+                  if train else "newstest2013.tok.bpe.32000")
+  train_path = _get_wmt_ende_dataset(tmp_dir, dataset_path)
+  token_path = os.path.join(tmp_dir, "vocab.bpe.32000")
+  token_vocab = text_encoder.TokenTextEncoder(vocab_filename=token_path)
+  return token_generator(train_path + ".en", train_path + ".de", token_vocab, 1)
+
+
+_ENDE_TRAIN_DATASETS = [
+    [
+        "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz",  # pylint: disable=line-too-long
+        ("training-parallel-nc-v11/news-commentary-v11.de-en.en",
+         "training-parallel-nc-v11/news-commentary-v11.de-en.de")
+    ],
+    [
+        "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
+        ("commoncrawl.de-en.en", "commoncrawl.de-en.de")
+    ],
+    [
+        "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
+        ("training/europarl-v7.de-en.en", "training/europarl-v7.de-en.de")
+    ],
+]
+_ENDE_TEST_DATASETS = [
+    [
+        "http://data.statmt.org/wmt16/translation-task/dev.tgz",
+        ("dev/newstest2013.en", "dev/newstest2013.de")
+    ],
+]
+
+_ENFR_TRAIN_DATASETS = [
+    [
+        "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
+        ("commoncrawl.fr-en.en", "commoncrawl.fr-en.fr")
+    ],
+    [
+        "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
+        ("training/europarl-v7.fr-en.en", "training/europarl-v7.fr-en.fr")
+    ],
+    [
+        "http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz",
+        ("training/news-commentary-v9.fr-en.en",
+         "training/news-commentary-v9.fr-en.fr")
+    ],
+    [
+        "http://www.statmt.org/wmt10/training-giga-fren.tar",
+        ("giga-fren.release2.fixed.en.gz", "giga-fren.release2.fixed.fr.gz")
+    ],
+    [
+        "http://www.statmt.org/wmt13/training-parallel-un.tgz",
+        ("un/undoc.2000.fr-en.en", "un/undoc.2000.fr-en.fr")
+    ],
+]
+_ENFR_TEST_DATASETS = [
+    [
+        "http://data.statmt.org/wmt16/translation-task/dev.tgz",
+        ("dev/newstest2013.en", "dev/newstest2013.fr")
+    ],
+]
+
+
+def _compile_data(tmp_dir, datasets, filename):
+  """Concatenate all `datasets` and save to `filename`."""
+  filename = os.path.join(tmp_dir, filename)
+  lang1_lines, lang2_lines = [], []
+  for dataset in datasets:
+    url = dataset[0]
+    compressed_filename = os.path.basename(url)
+    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
+
+    lang1_filename, lang2_filename = dataset[1]
+    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
+    lang2_filepath = os.path.join(tmp_dir, lang2_filename)
+
+    if not os.path.exists(compressed_filepath):
+      generator_utils.maybe_download(tmp_dir, compressed_filename, url)
+    if not os.path.exists(lang1_filepath) or not os.path.exists(lang2_filepath):
+      mode = "r:gz" if "gz" in compressed_filepath else "r"
+      with tarfile.open(compressed_filepath, mode) as corpus_tar:
+        corpus_tar.extractall(tmp_dir)
+    # Note: str.strip(".gz") would remove *characters* rather than the suffix
+    # and mangles names like "giga-fren.release2.fixed.en.gz"; slice instead.
+    if lang1_filepath.endswith(".gz"):
+      new_filepath = lang1_filepath[:-len(".gz")]
+      generator_utils.gunzip_file(lang1_filepath, new_filepath)
+      lang1_filepath = new_filepath
+    if lang2_filepath.endswith(".gz"):
+      new_filepath = lang2_filepath[:-len(".gz")]
+      generator_utils.gunzip_file(lang2_filepath, new_filepath)
+      lang2_filepath = new_filepath
+    with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
+      with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
+        lang1_file_lines = lang1_file.readlines()
+        lang2_file_lines = lang2_file.readlines()
+        assert len(lang1_file_lines) == len(lang2_file_lines), lang1_filepath
+        lang1_lines.extend(lang1_file_lines)
+        lang2_lines.extend(lang2_file_lines)
+
+  write_chunk_size = 10000
+  assert len(lang1_lines) == len(lang2_lines)
+  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_file:
+    i = 0
+    while i * write_chunk_size < len(lang1_lines):
+      lang1_file.writelines(
+          lang1_lines[i * write_chunk_size:(i + 1) * write_chunk_size])
+      i += 1
+  with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_file:
+    i = 0
+    while i * write_chunk_size < len(lang2_lines):
+      lang2_file.writelines(
+          lang2_lines[i * write_chunk_size:(i + 1) * write_chunk_size])
+      i += 1
+  return filename
+
+
+def ende_wordpiece_token_generator(tmp_dir, train, vocab_size):
+  symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
+  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag)
+  return token_generator(data_path + ".lang1", data_path + ".lang2",
+                         symbolizer_vocab, 1)
+
+
+def ende_character_generator(tmp_dir, train):
+  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "wmt_ende_chr_%s" % tag)
+  return character_generator(data_path + ".lang1", data_path + ".lang2", 1)
+
+
+def enfr_wordpiece_token_generator(tmp_dir, train, vocab_size):
+  """Instance of token generator for the WMT en->fr task."""
+  symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
+  datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag)
+  return token_generator(data_path + ".lang1", data_path + ".lang2",
+                         symbolizer_vocab, 1)
+
+
+def enfr_character_generator(tmp_dir, train):
+  """Instance of character generator for the WMT en->fr task."""
+  datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_chr_%s" % tag)
+  return character_generator(data_path + ".lang1", data_path + ".lang2", 1)
+
+
+def parsing_character_generator(tmp_dir, train):
+  filename = "parsing_%s" % ("train" if train else "dev")
+  text_filepath = os.path.join(tmp_dir, filename + ".text")
+  tags_filepath = os.path.join(tmp_dir, filename + ".tags")
+  return character_generator(text_filepath, tags_filepath, 1)
+
+
+def parsing_token_generator(tmp_dir, train, vocab_size):
+  symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
+  filename = "parsing_%s" % ("train" if train else "dev")
+  text_filepath = os.path.join(tmp_dir, filename + ".text")
+  tags_filepath = os.path.join(tmp_dir, filename + ".tags")
+  return token_generator(text_filepath, tags_filepath, symbolizer_vocab, 1)
diff --git a/tensor2tensor/data_generators/wmt_test.py b/tensor2tensor/data_generators/wmt_test.py
new file mode 100644
index 000000000..7121e3d8a
--- /dev/null
+++ b/tensor2tensor/data_generators/wmt_test.py
@@ -0,0 +1,72 @@
+# Copyright 2017 Google Inc.
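An aside on the `.gz` handling in `_compile_data` above: `str.strip(".gz")` removes characters, not a suffix, which is why the fixed code slices instead. A runnable sketch:

```python
def strip_gz_suffix(path):
  # Intended behavior: drop a literal ".gz" suffix only.
  return path[:-len(".gz")] if path.endswith(".gz") else path

assert strip_gz_suffix("giga-fren.release2.fixed.en.gz") == (
    "giga-fren.release2.fixed.en")
# str.strip removes leading/trailing characters from the set {".", "g", "z"}:
assert "giga-fren.release2.fixed.en.gz".strip(".gz") == (
    "iga-fren.release2.fixed.en")
```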
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""WMT generators test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import io +import os +import tempfile + +# Dependency imports + +import six +from tensor2tensor.data_generators import wmt + +import tensorflow as tf + + +class WMTTest(tf.test.TestCase): + + def testCharacterGenerator(self): + # Generate a trivial source and target file. + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + with io.open(tmp_file_path + ".src", "wb") as src_file: + src_file.write("source1\n") + src_file.write("source2\n") + with io.open(tmp_file_path + ".tgt", "wb") as tgt_file: + tgt_file.write("target1\n") + tgt_file.write("target2\n") + + # Call character generator on the generated files. + results_src, results_tgt = [], [] + for dictionary in wmt.character_generator(tmp_file_path + ".src", + tmp_file_path + ".tgt"): + self.assertEqual(sorted(list(dictionary)), ["inputs", "targets"]) + results_src.append(dictionary["inputs"]) + results_tgt.append(dictionary["targets"]) + + # Check that the results match the files. + self.assertEqual(len(results_src), 2) + self.assertEqual("".join([six.int2byte(i) + for i in results_src[0]]), "source1") + self.assertEqual("".join([six.int2byte(i) + for i in results_src[1]]), "source2") + self.assertEqual("".join([six.int2byte(i) + for i in results_tgt[0]]), "target1") + self.assertEqual("".join([six.int2byte(i) + for i in results_tgt[1]]), "target2") + + # Clean up. + os.remove(tmp_file_path + ".src") + os.remove(tmp_file_path + ".tgt") + os.remove(tmp_file_path) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/wsj_parsing.py b/tensor2tensor/data_generators/wsj_parsing.py new file mode 100644 index 000000000..a2dda4d9d --- /dev/null +++ b/tensor2tensor/data_generators/wsj_parsing.py @@ -0,0 +1,109 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data generators for parsing data-sets.""" + +import os + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils + +import tensorflow as tf + + +def words_and_tags_from_wsj_tree(tree_string): + """Generates linearized trees and tokens from the wsj tree format. + + It uses the linearized algorithm described in https://arxiv.org/abs/1412.7449. 
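A worked example of the linearization that `words_and_tags_from_wsj_tree` (continued below) performs; the toy tree here is illustrative only:

```python
from tensor2tensor.data_generators import wsj_parsing

tree = "(TOP (S (NP (DT the) (NN cat)) (VP (VBZ sleeps))))"
words, tags = wsj_parsing.words_and_tags_from_wsj_tree(tree)
assert words == "the cat sleeps"
assert tags == "S NP DT NN /NP VP VBZ /VP /S"  # "TOP" is stripped.
```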
+
+  Args:
+    tree_string: tree in wsj format
+
+  Returns:
+    tuple: (words, linearized tree)
+  """
+  stack, tags, words = [], [], []
+  for tok in tree_string.strip().split():
+    if tok[0] == "(":
+      symbol = tok[1:]
+      tags.append(symbol)
+      stack.append(symbol)
+    else:
+      assert tok[-1] == ")"
+      stack.pop()  # Pop the POS-tag.
+      while tok[-2] == ")":
+        tags.append("/" + stack.pop())
+        tok = tok[:-1]
+      words.append(tok[:-1])
+  return " ".join(words), " ".join(tags[1:-1])  # Strip "TOP" tag.
+
+
+def token_generator(tree_path, source_token_vocab, target_token_vocab,
+                    eos=None):
+  """Generator for parsing as a sequence-to-sequence task that uses tokens.
+
+  This generator assumes the file at tree_path contains one tree per line
+  and yields dictionaries of "inputs" and "targets", where the words form
+  the inputs and the linearized tags form the targets, each converted to
+  integer ids using the respective token vocabulary.
+
+  Args:
+    tree_path: path to the file with wsj format trees, one per line.
+    source_token_vocab: text_encoder.TextEncoder object for the source (word)
+      vocabulary.
+    target_token_vocab: text_encoder.TextEncoder object for the target (tag)
+      vocabulary.
+    eos: integer to append at the end of each sequence (default: None).
+
+  Yields:
+    A dictionary {"inputs": source-line, "targets": target-line} where
+    the lines are integer lists converted from tokens in the file lines.
+  """
+  eos_list = [] if eos is None else [eos]
+  with tf.gfile.GFile(tree_path, mode="r") as tree_file:
+    tree_line = tree_file.readline()
+    while tree_line:
+      source, target = words_and_tags_from_wsj_tree(tree_line)
+      source_ints = source_token_vocab.encode(source.strip()) + eos_list
+      target_ints = target_token_vocab.encode(target.strip()) + eos_list
+      yield {"inputs": source_ints, "targets": target_ints}
+      tree_line = tree_file.readline()
+
+
+def parsing_token_generator(tmp_dir, train, source_vocab_size,
+                            target_vocab_size):
+  """Generator for parsing as a sequence-to-sequence task that uses tokens.
+
+  This generator assumes the files parsing_{train,dev}.trees, which contain
+  trees in wsj format, and the vocabulary files
+  wsj_{source,target}.tokens.vocab.<vocab_size> exist in tmp_dir.
+
+  Args:
+    tmp_dir: path to the directory with the data files.
+    train: whether to use the training set (True) or the dev set (False).
+    source_vocab_size: source vocab size.
+    target_vocab_size: target vocab size.
+
+  Returns:
+    A generator to a dictionary of inputs and outputs.
+  """
+  source_symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "wsj_source.tokens.vocab.%d" % source_vocab_size,
+      source_vocab_size)
+  target_symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "wsj_target.tokens.vocab.%d" % target_vocab_size,
+      target_vocab_size)
+  filename = "parsing_%s.trees" % ("train" if train else "dev")
+  tree_filepath = os.path.join(tmp_dir, filename)
+  return token_generator(tree_filepath, source_symbolizer_vocab,
+                         target_symbolizer_vocab, 1)
diff --git a/tensor2tensor/models/README.md b/tensor2tensor/models/README.md
new file mode 100644
index 000000000..29b88484f
--- /dev/null
+++ b/tensor2tensor/models/README.md
@@ -0,0 +1,16 @@
+# Constructing T2T Models.
+
+This directory contains T2T models, their hyperparameters, and a number
+of common layers and hyperparameter settings to help construct new models.
+Common building blocks are in `common_layers.py` and `common_attention.py`.
+Common hyperparameters are in `common_hparams.py`.  Models are imported in
+`models.py`.
+
+## Adding a new model.
+
+To add a model to the built-in set, create a new file (see, e.g.,
+`neural_gpu.py`), write your model class there inheriting from `T2TModel`,
+and decorate it with `registry.register_model`.  Import it in `models.py`.
+
+It is then available to the trainer binary (`t2t-trainer`) via the
+`--model=model_name` flag.
diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py
new file mode 100644
index 000000000..27d533abc
--- /dev/null
+++ b/tensor2tensor/models/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py
new file mode 100644
index 000000000..581cd767f
--- /dev/null
+++ b/tensor2tensor/models/attention_lm.py
@@ -0,0 +1,169 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Self-attention based language model.
+
+Like transformer.py, but no encoder.
+
+decoder: [Self-Attention, Feed-forward] x n
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+# Dependency imports
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensor2tensor.models import common_attention
+from tensor2tensor.models import common_hparams
+from tensor2tensor.models import common_layers
+from tensor2tensor.utils import registry
+from tensor2tensor.utils import t2t_model
+
+import tensorflow as tf
+
+
+@registry.register_model
+class AttentionLM(t2t_model.T2TModel):
+  """Attention net.  See file docstring."""
+
+  def model_fn_body(self, features, train):
+    # Remove dropout if not training.
+    hparams = copy.copy(self._hparams)
+    if not train:
+      hparams.attention_dropout = 0.
+      hparams.relu_dropout = 0.
+      hparams.residual_dropout = 0.
+    targets = features["targets"]
+    targets = tf.squeeze(targets, 2)
+
+    (decoder_input, decoder_self_attention_bias) = attention_lm_prepare_decoder(
+        targets, hparams)
+
+    def residual_fn(x, y):
+      return common_layers.layer_norm(x + tf.nn.dropout(
+          y, 1.0 - hparams.residual_dropout))
+
+    decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout)
+    decoder_output = attention_lm_decoder(
+        decoder_input, residual_fn, decoder_self_attention_bias, hparams)
+    decoder_output = tf.expand_dims(decoder_output, 2)
+
+    return decoder_output
+
+
+def attention_lm_prepare_decoder(targets, hparams):
+  """Prepare one shard of the model for the decoder.
+
+  Args:
+    targets: a Tensor.
+    hparams: run hyperparameters
+
+  Returns:
+    decoder_input: a Tensor, bottom of decoder stack
+    decoder_self_attention_bias: a Tensor, containing large negative values
+      to implement masked attention and possibly biases for diagonal
+      alignments
+  """
+  decoder_self_attention_bias = (
+      common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
+  decoder_input = common_layers.shift_left_3d(targets)
+  if hparams.pos == "timing":
+    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
+  return (decoder_input, decoder_self_attention_bias)
+
+
+def attention_lm_decoder(decoder_input,
+                         residual_fn,
+                         decoder_self_attention_bias,
+                         hparams,
+                         name="decoder"):
+  """A stack of attention_lm layers.
+
+  Args:
+    decoder_input: a Tensor
+    residual_fn: a function from (layer_input, layer_output) -> combined_output
+    decoder_self_attention_bias: bias Tensor for self-attention
+      (see common_attention.attention_bias())
+    hparams: hyperparameters for model
+    name: a string
+
+  Returns:
+    y: a Tensor
+  """
+  x = decoder_input
+  # Summaries don't work in multi-problem setting yet.
+  summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
+  with tf.variable_scope(name):
+    for layer in xrange(hparams.num_hidden_layers):
+      with tf.variable_scope("layer_%d" % layer):
+        x = residual_fn(
+            x,
+            common_attention.multihead_attention(
+                x,
+                None,
+                decoder_self_attention_bias,
+                hparams.attention_key_channels or hparams.hidden_size,
+                hparams.attention_value_channels or hparams.hidden_size,
+                hparams.hidden_size,
+                hparams.num_heads,
+                hparams.attention_dropout,
+                summaries=summaries,
+                name="decoder_self_attention"))
+        x = residual_fn(x,
+                        common_layers.conv_hidden_relu(
+                            x,
+                            hparams.filter_size,
+                            hparams.hidden_size,
+                            dropout=hparams.relu_dropout))
+  return x
+
+
+@registry.register_hparams
+def attention_lm_base():
+  """Set of hyperparameters."""
+  hparams = common_hparams.basic_params1()
+  hparams.hidden_size = 1024
+  hparams.batch_size = 8192
+  hparams.max_length = 256
+  hparams.dropout = 0.0
+  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
+  hparams.optimizer_adam_epsilon = 1e-9
+  hparams.learning_rate_decay_scheme = "noam"
+  hparams.learning_rate = 1.0
+  hparams.learning_rate_warmup_steps = 1000
+  hparams.initializer_gain = 1.0
+  hparams.num_hidden_layers = 6
+  hparams.initializer = "uniform_unit_scaling"
+  hparams.weight_decay = 0.0
+  hparams.optimizer_adam_beta1 = 0.9
+  hparams.optimizer_adam_beta2 = 0.98
+  hparams.num_sampled_classes = 0
+  hparams.label_smoothing = 0.1
+  hparams.shared_embedding_and_softmax_weights = int(False)
+
+  hparams.add_hparam("filter_size", 4096)  # Add new ones like this.
+  # attention-related flags
+  hparams.add_hparam("num_heads", 8)
+  hparams.add_hparam("attention_key_channels", 0)
+  hparams.add_hparam("attention_value_channels", 0)
+  hparams.add_hparam("attention_dropout", 0.0)
+  hparams.add_hparam("relu_dropout", 0.0)
+  hparams.add_hparam("pos", "timing")  # timing, none
+  hparams.add_hparam("residual_dropout", 0.1)
+  return hparams
diff --git a/tensor2tensor/models/baseline.py b/tensor2tensor/models/baseline.py
new file mode 100644
index 000000000..78f79eed0
--- /dev/null
+++ b/tensor2tensor/models/baseline.py
@@ -0,0 +1,72 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Baseline models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def lstm(inputs, hparams, train, name, initial_state=None): + """Run LSTM cell on inputs, assuming they are [batch x time x size].""" + + def dropout_lstm_cell(): + return tf.contrib.rnn.DropoutWrapper( + tf.contrib.rnn.BasicLSTMCell(hparams.hidden_size), + input_keep_prob=1.0 - hparams.dropout * tf.to_float(train)) + + layers = [dropout_lstm_cell() for _ in range(hparams.num_hidden_layers)] + with tf.variable_scope(name): + return tf.nn.dynamic_rnn( + tf.contrib.rnn.MultiRNNCell(layers), + inputs, + initial_state=initial_state, + dtype=tf.float32, + time_major=False) + + +def lstm_seq2seq_internal(inputs, targets, hparams, train): + """The basic LSTM seq2seq model, main step used for training.""" + with tf.variable_scope("lstm_seq2seq"): + # Flatten inputs. + inputs = common_layers.flatten4d3d(inputs) + # LSTM encoder. + _, final_encoder_state = lstm( + tf.reverse(inputs, axis=[1]), hparams, train, "encoder") + # LSTM decoder. + shifted_targets = common_layers.shift_left(targets) + decoder_outputs, _ = lstm( + common_layers.flatten4d3d(shifted_targets), + hparams, + train, + "decoder", + initial_state=final_encoder_state) + return tf.expand_dims(decoder_outputs, axis=2) + + +@registry.register_model("baseline_lstm_seq2seq") +class LSTMSeq2Seq(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return lstm_seq2seq_internal(features["inputs"], features["targets"], + self._hparams, train) diff --git a/tensor2tensor/models/baseline_test.py b/tensor2tensor/models/baseline_test.py new file mode 100644 index 000000000..25e191d6f --- /dev/null +++ b/tensor2tensor/models/baseline_test.py @@ -0,0 +1,55 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
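The `register_model` pattern above (and in models/README.md) can be sketched with a deliberately trivial model; the class and its body below are hypothetical, not part of the patch:

```python
from tensor2tensor.models import common_layers
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model

import tensorflow as tf


@registry.register_model
class ToyIdentity(t2t_model.T2TModel):
  """Sketch: projects the embedded inputs and returns them as the body output."""

  def model_fn_body(self, features, train):
    del train  # Unused in this sketch.
    inputs = common_layers.flatten4d3d(features["inputs"])
    outputs = tf.layers.dense(inputs, self._hparams.hidden_size)
    return tf.expand_dims(outputs, axis=2)
```

Assuming the registry's default snake-casing of class names, this would be selectable as `--model=toy_identity`; baseline.py sidesteps the question by passing an explicit name to the decorator.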
+ +"""Baseline models tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import baseline +from tensor2tensor.models import common_hparams + +import tensorflow as tf + + +class BaselineTest(tf.test.TestCase): + + def testLSTMSeq2Seq(self): + vocab_size = 9 + x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1)) + y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1)) + hparams = common_hparams.basic_params1() + p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size, + vocab_size) + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + } + model = baseline.LSTMSeq2Seq(hparams, p_hparams) + sharded_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py new file mode 100644 index 000000000..42db05700 --- /dev/null +++ b/tensor2tensor/models/bytenet.py @@ -0,0 +1,112 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ByteNet.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def residual_dilated_conv(x, repeat, padding, name, hparams, train): + """A stack of convolution blocks with residual connections.""" + with tf.variable_scope(name): + k = (hparams.kernel_height, hparams.kernel_width) + dilations_and_kernels = [((2**i, 1), k) + for i in xrange(hparams.num_hidden_layers)] + for i in xrange(repeat): + with tf.variable_scope("repeat_%d" % i): + y = common_layers.conv_block( + x, + hparams.hidden_size, + dilations_and_kernels, + padding=padding, + name="residual_conv") + x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") + x = tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + return x + + +def bytenet_internal(inputs, targets, hparams, train): + """ByteNet, main step used for training.""" + with tf.variable_scope("bytenet"): + # Flatten inputs and extend length by 50%. 
+ inputs = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2) + extend_length = tf.to_int32(0.5 * tf.to_float(tf.shape(inputs)[1])) + inputs_shape = inputs.shape.as_list() + inputs = tf.pad(inputs, [[0, 0], [0, extend_length], [0, 0], [0, 0]]) + inputs_shape[1] = None + inputs.set_shape(inputs_shape) # Don't lose the other shapes when padding. + # Pad inputs and targets to be the same length, divisible by 50. + inputs, targets = common_layers.pad_to_same_length( + inputs, targets, final_length_divisible_by=50) + final_encoder = residual_dilated_conv( + inputs, hparams.num_block_repeat, "SAME", "encoder", hparams, train) + + shifted_targets = common_layers.shift_left(targets) + kernel = (hparams.kernel_height, hparams.kernel_width) + decoder_start = common_layers.conv_block( + tf.concat([final_encoder, shifted_targets], axis=3), + hparams.hidden_size, [((1, 1), kernel)], + padding="LEFT") + + return residual_dilated_conv( + decoder_start, hparams.num_block_repeat, + "LEFT", "decoder", hparams, train) + + +@registry.register_model +class ByteNet(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return bytenet_internal(features["inputs"], features["targets"], + self._hparams, train) + + +@registry.register_hparams +def bytenet_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.batch_size = 2048 + hparams.hidden_size = 768 + hparams.dropout = 0.2 + hparams.symbol_dropout = 0.2 + hparams.label_smoothing = 0.1 + hparams.clip_grad_norm = 2.0 + hparams.num_hidden_layers = 4 + hparams.kernel_height = 3 + hparams.kernel_width = 1 + hparams.learning_rate_decay_scheme = "exp50k" + hparams.learning_rate = 0.05 + hparams.learning_rate_warmup_steps = 3000 + hparams.initializer_gain = 1.0 + hparams.weight_decay = 3.0 + hparams.num_sampled_classes = 0 + hparams.sampling_method = "argmax" + hparams.optimizer_adam_epsilon = 1e-6 + hparams.optimizer_adam_beta1 = 0.85 + hparams.optimizer_adam_beta2 = 0.997 + hparams.add_hparam("num_block_repeat", 4) + return hparams diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py new file mode 100644 index 000000000..676220cc8 --- /dev/null +++ b/tensor2tensor/models/bytenet_test.py @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
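To make the dilation schedule in `residual_dilated_conv` concrete: with the default `num_hidden_layers=4` and a 3x1 kernel, each repeat stacks convolutions whose height dilation doubles per layer. A runnable sketch of the list it builds:

```python
# Dilation/kernel pairs as built by residual_dilated_conv above, assuming
# num_hidden_layers=4, kernel_height=3, kernel_width=1.
k = (3, 1)
dilations_and_kernels = [((2**i, 1), k) for i in range(4)]
assert dilations_and_kernels == [
    ((1, 1), (3, 1)), ((2, 1), (3, 1)), ((4, 1), (3, 1)), ((8, 1), (3, 1))]
```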
+
+"""ByteNet tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+import numpy as np
+
+from tensor2tensor.data_generators import problem_hparams
+from tensor2tensor.models import bytenet
+
+import tensorflow as tf
+
+
+class ByteNetTest(tf.test.TestCase):
+
+  def testByteNet(self):
+    vocab_size = 9
+    x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1))
+    y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1))
+    hparams = bytenet.bytenet_base()
+    p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size,
+                                                     vocab_size)
+    with self.test_session() as session:
+      features = {
+          "inputs": tf.constant(x, dtype=tf.int32),
+          "targets": tf.constant(y, dtype=tf.int32),
+      }
+      model = bytenet.ByteNet(hparams, p_hparams)
+      sharded_logits, _, _ = model.model_fn(features, True)
+      logits = tf.concat(sharded_logits, 0)
+      session.run(tf.global_variables_initializer())
+      res = session.run(logits)
+      self.assertEqual(res.shape, (3, 50, 1, 1, vocab_size))
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py
new file mode 100644
index 000000000..ccf288a09
--- /dev/null
+++ b/tensor2tensor/models/common_attention.py
@@ -0,0 +1,344 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for attention."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+# Dependency imports
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensor2tensor.models import common_layers
+
+import tensorflow as tf
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+  """Adds a bunch of sinusoids of different frequencies to a Tensor.
+
+  Each channel of the input Tensor is incremented by a sinusoid of a different
+  frequency and phase.
+
+  This allows attention to learn to use absolute and relative positions.
+  Timing signals should be added to some precursors of both the query and the
+  memory inputs to attention.
+
+  The use of relative position is possible because sin(x+y) and cos(x+y) can be
+  expressed in terms of y, sin(x) and cos(x).
+
+  In particular, we use a geometric sequence of timescales starting with
+  min_timescale and ending with max_timescale.  The number of different
+  timescales is equal to channels / 2.  For each timescale, we
+  generate the two sinusoidal signals sin(timestep/timescale) and
+  cos(timestep/timescale).  All of these sinusoids are concatenated in
+  the channels dimension.
+
+  Args:
+    x: a Tensor with shape [batch, length, channels]
+    min_timescale: a float
+    max_timescale: a float
+
+  Returns:
+    a Tensor the same shape as x.
+  """
+  length = tf.shape(x)[1]
+  channels = tf.shape(x)[2]
+  position = tf.to_float(tf.range(length))
+  num_timescales = channels // 2
+  log_timescale_increment = (
+      math.log(float(max_timescale) / float(min_timescale)) /
+      (tf.to_float(num_timescales) - 1))
+  inv_timescales = min_timescale * tf.exp(
+      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
+  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
+  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
+  signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
+  signal = tf.reshape(signal, [1, length, channels])
+  return x + signal
+
+
+def add_timing_signal_nd(x, min_timescale=1.0, max_timescale=1.0e4):
+  """Adds a bunch of sinusoids of different frequencies to a Tensor.
+
+  Each channel of the input Tensor is incremented by a sinusoid of a different
+  frequency and phase in one of the positional dimensions.
+
+  This allows attention to learn to use absolute and relative positions.
+  Timing signals should be added to some precursors of both the query and the
+  memory inputs to attention.
+
+  The use of relative position is possible because sin(a+b) and cos(a+b) can be
+  expressed in terms of b, sin(a) and cos(a).
+
+  x is a Tensor with n "positional" dimensions, e.g. one dimension for a
+  sequence or two dimensions for an image.
+
+  We use a geometric sequence of timescales starting with
+  min_timescale and ending with max_timescale.  The number of different
+  timescales is equal to channels // (n * 2).  For each timescale, we
+  generate the two sinusoidal signals sin(timestep/timescale) and
+  cos(timestep/timescale).  All of these sinusoids are concatenated in
+  the channels dimension.
+
+  Args:
+    x: a Tensor with shape [batch, d1 ... dn, channels]
+    min_timescale: a float
+    max_timescale: a float
+
+  Returns:
+    a Tensor the same shape as x.
+  """
+  static_shape = x.get_shape().as_list()
+  num_dims = len(static_shape) - 2
+  channels = tf.shape(x)[-1]
+  num_timescales = channels // (num_dims * 2)
+  log_timescale_increment = (
+      math.log(float(max_timescale) / float(min_timescale)) /
+      (tf.to_float(num_timescales) - 1))
+  inv_timescales = min_timescale * tf.exp(
+      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
+  for dim in xrange(num_dims):
+    length = tf.shape(x)[dim + 1]
+    position = tf.to_float(tf.range(length))
+    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
+        inv_timescales, 0)
+    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
+    prepad = dim * 2 * num_timescales
+    postpad = channels - (dim + 1) * 2 * num_timescales
+    signal = tf.pad(signal, [[0, 0], [prepad, postpad]])
+    for _ in xrange(1 + dim):
+      signal = tf.expand_dims(signal, 0)
+    for _ in xrange(num_dims - 1 - dim):
+      signal = tf.expand_dims(signal, -2)
+    x += signal
+  return x
+
+
+def embedding_to_padding(emb):
+  """Input embeddings -> is_padding.
+
+  We have hacked symbol_modality to return all-zero embeddings for padding.
+
+  Args:
+    emb: a Tensor with shape [..., depth].
+  Returns:
+    a boolean Tensor with shape [...].
+  """
+  emb_sum = tf.reduce_sum(tf.abs(emb), axis=-1)
+  return tf.equal(emb_sum, 0.0)
+
+
+def attention_bias_lower_triangle(length):
+  """Create a bias tensor to be added to attention logits.
+
+  Args:
+    length: a Scalar.
+
+  Returns:
+    a `Tensor` with shape [1, 1, length, length].
+  """
+  lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
+  ret = -1e9 * (1.0 - lower_triangle)
+  return tf.reshape(ret, [1, 1, length, length])
+
+
+def attention_bias_ignore_padding(memory_padding):
+  """Create a bias tensor to be added to attention logits.
+
+  Args:
+    memory_padding: a boolean `Tensor` with shape [batch, memory_length].
+
+  Returns:
+    a `Tensor` with shape [batch, 1, 1, memory_length].
+  """
+  ret = tf.to_float(memory_padding) * -1e9
+  return tf.expand_dims(tf.expand_dims(ret, 1), 1)
+
+
+def split_last_dimension(x, n):
+  """Reshape x so that the last dimension becomes two dimensions.
+
+  The first of these two dimensions is n.
+
+  Args:
+    x: a Tensor with shape [..., m]
+    n: an integer.
+
+  Returns:
+    a Tensor with shape [..., n, m/n]
+  """
+  old_shape = x.get_shape().dims
+  last = old_shape[-1]
+  new_shape = old_shape[:-1] + [n] + [last // n if last else None]
+  ret = tf.reshape(x, tf.concat([tf.shape(x)[:-1], [n, -1]], 0))
+  ret.set_shape(new_shape)
+  return ret
+
+
+def combine_last_two_dimensions(x):
+  """Reshape x so that the last two dimensions become one.
+
+  Args:
+    x: a Tensor with shape [..., a, b]
+
+  Returns:
+    a Tensor with shape [..., ab]
+  """
+  old_shape = x.get_shape().dims
+  a, b = old_shape[-2:]
+  new_shape = old_shape[:-2] + [a * b if a and b else None]
+  ret = tf.reshape(x, tf.concat([tf.shape(x)[:-2], [-1]], 0))
+  ret.set_shape(new_shape)
+  return ret
+
+
+def split_heads(x, num_heads):
+  """Split channels (dimension 3) into multiple heads (becomes dimension 1).
+
+  Args:
+    x: a Tensor with shape [batch, length, channels]
+    num_heads: an integer
+
+  Returns:
+    a Tensor with shape [batch, num_heads, length, channels / num_heads]
+  """
+  return tf.transpose(split_last_dimension(x, num_heads), [0, 2, 1, 3])
+
+
+def combine_heads(x):
+  """Inverse of split_heads.
+
+  Args:
+    x: a Tensor with shape [batch, num_heads, length, channels / num_heads]
+
+  Returns:
+    a Tensor with shape [batch, length, channels]
+  """
+  return combine_last_two_dimensions(tf.transpose(x, [0, 2, 1, 3]))
+
+
+def attention_image_summary(attn):
+  """Compute color image summary.
+
+  Args:
+    attn: a Tensor with shape [batch, num_heads, query_length, memory_length]
+  """
+  num_heads = attn.get_shape().as_list()[1]
+  # [batch, query_length, memory_length, num_heads]
+  image = tf.transpose(attn, [0, 2, 3, 1])
+  image = tf.pow(image, 0.2)  # for high-dynamic-range
+  # Each head will correspond to one of RGB.
+  # pad the heads to be a multiple of 3
+  image = tf.pad(image, [[0, 0], [0, 0], [0, 0], [0, -num_heads % 3]])
+  image = split_last_dimension(image, 3)
+  image = tf.reduce_max(image, 4)
+  tf.summary.image("attention", image, max_outputs=1)
+
+
+def dot_product_attention(q,
+                          k,
+                          v,
+                          bias,
+                          dropout_rate=0.0,
+                          summaries=False,
+                          name=None):
+  """Dot-product attention.
+
+  Args:
+    q: a Tensor with shape [batch, heads, length_q, depth_k]
+    k: a Tensor with shape [batch, heads, length_kv, depth_k]
+    v: a Tensor with shape [batch, heads, length_kv, depth_v]
+    bias: bias Tensor (see attention_bias())
+    dropout_rate: a floating point number
+    summaries: a boolean
+    name: an optional string
+
+  Returns:
+    A Tensor.
+ """ + with tf.variable_scope( + name, default_name="dot_product_attention", values=[q, k, v]): + # [batch, num_heads, query_length, memory_length] + logits = tf.matmul(q, k, transpose_b=True) + if bias is not None: + logits += bias + weights = tf.nn.softmax(logits, name="attention_weights") + # dropping out the attention links for each of the heads + weights = tf.nn.dropout(weights, 1.0 - dropout_rate) + if summaries and not tf.get_variable_scope().reuse: + attention_image_summary(weights) + return tf.matmul(weights, v) + + +def multihead_attention(query_antecedent, + memory_antecedent, + bias, + total_key_depth, + total_value_depth, + output_depth, + num_heads, + dropout_rate, + summaries=False, + name=None): + """Multihead scaled-dot-product attention with input/output transformations. + + Args: + query_antecedent: a Tensor with shape [batch, length_q, channels] + memory_antecedent: a Tensor with shape [batch, length_m, channels] + bias: bias Tensor (see attention_bias()) + total_key_depth: an integer + total_value_depth: an integer + output_depth: an integer + num_heads: an integer dividing total_key_depth and total_value_depth + dropout_rate: a floating point number + summaries: a boolean + name: an optional string + + Returns: + A Tensor. + """ + with tf.variable_scope( + name, + default_name="multihead_attention", + values=[query_antecedent, memory_antecedent]): + if memory_antecedent is None: + # self attention + combined = common_layers.conv1d( + query_antecedent, + total_key_depth * 2 + total_value_depth, + 1, + name="qkv_transform") + q, k, v = tf.split( + combined, [total_key_depth, total_key_depth, total_value_depth], + axis=2) + else: + q = common_layers.conv1d( + query_antecedent, total_key_depth, 1, name="q_transform") + combined = common_layers.conv1d( + memory_antecedent, + total_key_depth + total_value_depth, + 1, + name="kv_transform") + k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2) + q = split_heads(q, num_heads) + k = split_heads(k, num_heads) + v = split_heads(v, num_heads) + key_depth_per_head = total_key_depth // num_heads + q *= key_depth_per_head**-0.5 + x = dot_product_attention(q, k, v, bias, dropout_rate, summaries) + x = combine_heads(x) + x = common_layers.conv1d(x, output_depth, 1, name="output_transform") + return x diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py new file mode 100644 index 000000000..81c41dcc5 --- /dev/null +++ b/tensor2tensor/models/common_hparams.py @@ -0,0 +1,193 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
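As a shape sanity-check for the attention utilities above, a small masked self-attention sketch (sizes arbitrary, dropout disabled):

```python
import tensorflow as tf

from tensor2tensor.models import common_attention

# Masked self-attention over a [batch=2, length=7, channels=64] input.
x = tf.random_normal([2, 7, 64])
bias = common_attention.attention_bias_lower_triangle(7)
y = common_attention.multihead_attention(
    x, None, bias,
    total_key_depth=64, total_value_depth=64, output_depth=64,
    num_heads=4, dropout_rate=0.0)
print(y.get_shape())  # (2, 7, 64)
```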
+ +"""Hyperparameters and ranges common to multiple models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import six +from six.moves import zip # pylint: disable=redefined-builtin +from tensor2tensor.utils import registry + +import tensorflow as tf + + +@registry.register_hparams("basic1") +def basic_params1(): + """A set of basic hyperparameters.""" + return tf.contrib.training.HParams( + batch_size=4096, # in tokens per batch per gpu + # This flag controls the number of length buckets in the data reader. + # Too many buckets slows down data reading - this needs fixing. + # Too few buckets mean lots of wasted padding. + # If this value is 1, we have buckets with maximum lengths: + # [8, 12, 16, 24, 32, 48 ... (max_length or batch_size)] + # If this value is 2, we have buckets with maximum lengths: + # [8, 10, 12, 14, 16, 20, 24 ... (max_length or batch_size)] + batching_mantissa_bits=1, + num_hidden_layers=4, + kernel_height=3, + kernel_width=1, + hidden_size=64, + compress_steps=0, + dropout=0.2, + clip_grad_norm=2.0, + initializer="orthogonal", + initializer_gain=1.5, + label_smoothing=0.1, + optimizer="Adam", + optimizer_adam_epsilon=1e-6, + optimizer_adam_beta1=0.85, + optimizer_adam_beta2=0.997, + optimizer_momentum_momentum=0.9, + weight_decay=0.1, + weight_noise=0.0, + learning_rate_decay_scheme="none", + learning_rate_warmup_steps=100, + learning_rate=0.1, + sampling_method="argmax", # "argmax" or "random" + problem_choice="adaptive", # "uniform", "adaptive", "distributed" + multiply_embedding_mode="sqrt_depth", + symbol_modality_num_shards=16, + # setting the max length in a minibatch. 0 means default behavior, + # max_length = hparams.batch_size * length_multiplier + max_length=0, + # in SymbolModality, share the output embeddings and the softmax + # variables. + # You can also share the input embeddings with the output embeddings + # by using a problem_hparams that uses the same modality object for + # the input_modality and target_modality. 
+      shared_embedding_and_softmax_weights=int(False),)
+
+
+class RangedHParams(object):
+  """Defines parameter ranges for tuning."""
+
+  # From ParameterConfig proto
+  LINEAR_SCALE = 1
+  LOG_SCALE = 2
+  REVERSE_LOG_SCALE = 3
+
+  def __init__(self):
+    self._categorical_params = {}
+    self._discrete_params = {}
+    self._float_params = {}
+    self._int_params = {}
+
+  def _check_reset_and_type_change(self, name, orig_ctr):
+    """Check if name is in orig_ctr or in one of the other type containers."""
+    # Resetting a hyperparameter
+    if name in orig_ctr:
+      tf.logging.warning("Overwriting hparam %s", name)
+
+    ctr_names = [(self._categorical_params,
+                  "categorical"), (self._discrete_params, "discrete"),
+                 (self._float_params, "float"), (self._int_params, "int")]
+    ctrs, names = list(zip(*ctr_names))
+    orig_name = names[ctrs.index(orig_ctr)]
+
+    for ctr, ctr_name in ctr_names:
+      if ctr is orig_ctr:
+        continue
+
+      # Using a different type for the same hyperparameter name
+      if name in ctr:
+        raise ValueError("Setting hyperparameter %s as type %s, but a "
+                         "hyperparameter of the same name was originally "
+                         "registered as type %s" % (name, ctr_name, orig_name))
+
+  def set_categorical(self, name, categories, length=None):
+    self._check_reset_and_type_change(name, self._categorical_params)
+    self._categorical_params[name] = (name, categories, length)
+
+  def set_discrete(self, name, feasible_points, scale=None, length=None):
+    self._check_reset_and_type_change(name, self._discrete_params)
+    self._discrete_params[name] = (name, feasible_points, scale, length)
+
+  def set_float(self, name, min_val, max_val, scale=None, length=None):
+    self._check_reset_and_type_change(name, self._float_params)
+    self._float_params[name] = (name, min_val, max_val, scale, length)
+
+  def set_int(self, name, min_val, max_val, scale=None, length=None):
+    self._check_reset_and_type_change(name, self._int_params)
+    self._int_params[name] = (name, min_val, max_val, scale, length)
+
+
+def fill_ranged_hparams_from_hparams(hparams, ranged_hparams):
+  """Fill ranged_hparams with singleton values from hparams.
+
+  HParams are placed in RangedHParams with the following functions, according
+  to type:
+    * int: set_discrete
+    * float: set_float
+    * str: set_categorical
+
+  Args:
+    hparams: tf.contrib.training.HParams; contains the hyperparameters to copy
+      over to ranged_hparams.
+    ranged_hparams: RangedHParams; will have hparams values copied to it.
+
+  Raises:
+    ValueError: if hparams contains a hyperparameter not of type
+      {int, float, str, bool}.
+  """
+  for name, (hp_type, is_multivalent) in six.iteritems(hparams._hparam_types):  # pylint: disable=protected-access
+
+    if is_multivalent:
+      raise ValueError("Multivalent hparams not supported in RangedHParams. "
+                       "Hyperparameter %s is multivalent."
% name) + val = getattr(hparams, name) + if hp_type == int: + ranged_hparams.set_discrete(name, [val]) + elif hp_type == float: + ranged_hparams.set_float(name, val, val) + elif hp_type == str: + ranged_hparams.set_categorical(name, [val]) + else: + raise ValueError("Unsupported type %s for param %s" % (hp_type, name)) + + +@registry.register_ranged_hparams("basic1") +def basic_range1(ranged_hparams): + """A basic range of hyperparameters.""" + rhp = ranged_hparams + + hparams = basic_params1() + fill_ranged_hparams_from_hparams(hparams, rhp) + + rhp.set_discrete("batch_size", [1024, 2048, 4096]) + rhp.set_discrete("num_hidden_layers", [1, 2, 3, 4, 5, 6]) + rhp.set_discrete("hidden_size", [32, 64, 128, 256, 512], scale=rhp.LOG_SCALE) + rhp.set_discrete("kernel_height", [1, 3, 5, 7]) + rhp.set_discrete("kernel_width", [1, 3, 5, 7]) + rhp.set_discrete("compress_steps", [0, 1, 2]) + rhp.set_float("dropout", 0.0, 0.5) + rhp.set_float("weight_decay", 1e-4, 10.0, scale=rhp.LOG_SCALE) + rhp.set_float("label_smoothing", 0.0, 0.2) + rhp.set_float("clip_grad_norm", 0.01, 50.0, scale=rhp.LOG_SCALE) + rhp.set_float("learning_rate", 0.005, 2.0, scale=rhp.LOG_SCALE) + rhp.set_categorical("initializer", + ["uniform", "orthogonal", "uniform_unit_scaling"]) + rhp.set_float("initializer_gain", 0.5, 3.5) + rhp.set_categorical("learning_rate_decay_scheme", + ["none", "sqrt", "noam", "exp10k"]) + rhp.set_float("optimizer_adam_epsilon", 1e-7, 1e-2, scale=rhp.LOG_SCALE) + rhp.set_float("optimizer_adam_beta1", 0.8, 0.9) + rhp.set_float("optimizer_adam_beta2", 0.995, 0.999) + rhp.set_categorical("optimizer", + ["Adam", "Adagrad", "Momentum", "RMSProp", "SGD"]) diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py new file mode 100644 index 000000000..ef6559f9e --- /dev/null +++ b/tensor2tensor/models/common_layers.py @@ -0,0 +1,1340 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Layers common to multiple models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +# Dependency imports + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +from tensor2tensor.utils import expert_utils as eu + +import tensorflow as tf + +from tensorflow.python.framework import function + +# This is a global setting. When turned off, no @function.Defun is used. 
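+# Illustrative usage sketch (assumed, not shown elsewhere in this patch): a
+# caller could flip this module attribute before building a graph to make
+# layer_norm below use the pure-Python computation instead of the compiled
+# @function.Defun path, e.g.
+#   from tensor2tensor.models import common_layers
+#   common_layers.allow_defun = False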
+allow_defun = True + + +def saturating_sigmoid(x): + """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1].""" + with tf.name_scope("saturating_sigmoid", [x]): + y = tf.sigmoid(x) + return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1)) + + +def hard_sigmoid(x, saturation_limit=0.9): + saturation_cost = tf.reduce_mean(tf.nn.relu(tf.abs(x) - saturation_limit)) + x_shifted = 0.5 * x + 0.5 + return tf.minimum(1.0, tf.nn.relu(x_shifted)), saturation_cost + + +def hard_tanh(x, saturation_limit=0.9): + saturation_cost = tf.reduce_mean(tf.nn.relu(tf.abs(x) - saturation_limit)) + return tf.minimum(1.0, tf.maximum(x, -1.0)), saturation_cost + + +def inverse_exp_decay(max_step, min_value=0.01): + """Inverse-decay exponentially from 0.01 to 1.0 reached at max_step.""" + inv_base = tf.exp(tf.log(min_value) / float(max_step)) + step = tf.to_float(tf.contrib.framework.get_global_step()) + return inv_base**tf.maximum(float(max_step) - step, 0.0) + + +def standardize_images(x): + """Image standardization on batches (tf.image.per_image_standardization).""" + with tf.name_scope("standardize_images", [x]): + x = tf.to_float(x) + x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keep_dims=True) + x_variance = tf.reduce_mean( + tf.square(x - x_mean), axis=[1, 2, 3], keep_dims=True) + num_pixels = tf.to_float(tf.shape(x)[1] * tf.shape(x)[2] * 3) + x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels)) + # TODO(lukaszkaiser): remove hack below, needed for greedy decoding for now. + if x.shape and len(x.shape) == 4 and x.shape[3] == 1: + x = tf.concat([x, x, x], axis=3) # Not used, just a dead tf.cond branch. + x.set_shape([None, None, None, 3]) + return x + + +def image_augmentation(images, do_colors=False): + """Image augmentation: cropping, flipping, and color transforms.""" + images = tf.random_crop(images, [299, 299, 3]) + images = tf.image.random_flip_left_right(images) + if do_colors: # More augmentation, but might be slow. + images = tf.image.random_brightness(images, max_delta=32. / 255.) + images = tf.image.random_saturation(images, lower=0.5, upper=1.5) + images = tf.image.random_hue(images, max_delta=0.2) + images = tf.image.random_contrast(images, lower=0.5, upper=1.5) + return images + + +def flatten4d3d(x): + """Flatten a 4d-tensor into a 3d-tensor by joining width and height.""" + xshape = tf.shape(x) + result = tf.reshape(x, [xshape[0], xshape[1] * xshape[2], xshape[3]]) + # Preserve static shapes when available. + xshape_static = x.get_shape() + result.set_shape([xshape_static[0], None, xshape_static[3]]) + return result + + +def embedding(x, vocab_size, dense_size, name=None, reuse=None, multiplier=1.0): + """Embed x of type int64 into dense vectors, reducing to max 4 dimensions.""" + with tf.variable_scope( + name, default_name="embedding", values=[x], reuse=reuse): + embedding_var = tf.get_variable("kernel", [vocab_size, dense_size]) + # On the backwards pass, we want to convert the gradient from + # an indexed-slices to a regular tensor before sending it back to the + # parameter server. This avoids excess computation on the parameter server. + embedding_var = eu.ConvertGradientToTensor(embedding_var) + emb_x = tf.gather(embedding_var, x) + if multiplier != 1.0: + emb_x *= multiplier + shape, static_shape = tf.shape(emb_x), emb_x.shape.as_list() + if not static_shape or len(static_shape) < 5: + return emb_x + # If we had extra channel dimensions, assume it's 1, i.e. shape[3] == 1. 
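+    # Illustrative shape walk-through: for x of shape [batch, h, w, 1],
+    # tf.gather yields emb_x of rank 5, [batch, h, w, 1, dense_size], which
+    # the reshape below collapses back to rank 4, [batch, h, w, dense_size].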
+    assert len(static_shape) == 5
+    return tf.reshape(emb_x, [shape[0], shape[1], shape[2], static_shape[4]])
+
+
+def shift_left(x, pad_value=None):
+  """Shift the second dimension of x right by one (prepend padding, drop last)."""
+  if pad_value is None:
+    shifted_targets = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])[:, :-1, :, :]
+  else:
+    shifted_targets = tf.concat([pad_value, x], axis=1)[:, :-1, :, :]
+  return shifted_targets
+
+
+def shift_left_3d(x, pad_value=None):
+  """Shift the second dimension of x right by one (prepend padding, drop last)."""
+  if pad_value is None:
+    shifted_targets = tf.pad(x, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
+  else:
+    shifted_targets = tf.concat([pad_value, x], axis=1)[:, :-1, :]
+  return shifted_targets
+
+
+def conv_stride2_multistep(x, nbr_steps, output_filters, name=None, reuse=None):
+  """Use a strided convolution to downsample x by 2, `nbr_steps` times.
+
+  We use stride and filter size 2 to avoid the checkerboard problem of deconvs.
+  As detailed in http://distill.pub/2016/deconv-checkerboard/.
+
+  Args:
+    x: a `Tensor` with shape `[batch, spatial, depth]` or
+     `[batch, spatial_1, spatial_2, depth]`
+    nbr_steps: number of halving downsample rounds to apply
+    output_filters: an int specifying the filter count for the convolutions
+    name: a string
+    reuse: a boolean
+
+  Returns:
+    a pair (out, hidden_states): out is a `Tensor` with shape
+     `[batch, spatial / (2**nbr_steps), output_filters]` or
+     `[batch, spatial_1 / (2**nbr_steps), spatial_2 / (2**nbr_steps),
+       output_filters]`; hidden_states is the list of intermediate `Tensor`s.
+  """
+  with tf.variable_scope(
+      name, default_name="conv_stride2_multistep", values=[x], reuse=reuse):
+    if nbr_steps == 0:
+      out = conv(x, output_filters, (1, 1))
+      return out, [out]
+    hidden_layers = [x]
+    for i in xrange(nbr_steps):
+      hidden_layers.append(
+          conv(
+              hidden_layers[-1],
+              output_filters, (2, 2),
+              strides=2,
+              activation=tf.nn.relu,
+              name="conv" + str(i)))
+    return hidden_layers[-1], hidden_layers
+
+
+def deconv_stride2_multistep(x,
+                             nbr_steps,
+                             output_filters,
+                             name=None,
+                             reuse=None):
+  """Use a deconvolution to upsample x by 2**`nbr_steps`.
+
+  Args:
+    x: a `Tensor` with shape `[batch, spatial, depth]` or
+     `[batch, spatial_1, spatial_2, depth]`
+    nbr_steps: an int specifying the number of doubling upsample rounds to
+     apply.
+ output_filters: an int specifying the filter count for the deconvolutions + name: a string + reuse: a boolean + + Returns: + a `Tensor` with shape `[batch, spatial * (2**nbr_steps), output_filters]` or + `[batch, spatial_1 * (2**nbr_steps), spatial_2 * (2**nbr_steps), + output_filters]` + """ + with tf.variable_scope( + name, default_name="deconv_stride2_multistep", values=[x], reuse=reuse): + + def deconv1d(cur, i): + cur_shape = tf.shape(cur) + thicker = conv( + cur, + output_filters * 2, (1, 1), + padding="SAME", + activation=tf.nn.relu, + name="deconv1d" + str(i)) + return tf.reshape(thicker, + [cur_shape[0], cur_shape[1] * 2, 1, output_filters]) + + def deconv2d(cur, i): + thicker = conv( + cur, + output_filters * 4, (1, 1), + padding="SAME", + activation=tf.nn.relu, + name="deconv2d" + str(i)) + return tf.depth_to_space(thicker, 2) + + cur = x + for i in xrange(nbr_steps): + if cur.get_shape()[2] == 1: + cur = deconv1d(cur, i) + else: + cur = tf.cond( + tf.equal(tf.shape(cur)[2], 1), + lambda idx=i: deconv1d(cur, idx), + lambda idx=i: deconv2d(cur, idx)) + return cur + + +def conv_internal(conv_fn, inputs, filters, kernel_size, **kwargs): + """Conditional conv_fn making kernel 1d or 2d depending on inputs shape.""" + static_shape = inputs.get_shape() + if not static_shape or len(static_shape) != 4: + raise ValueError("Inputs to conv must have statically known rank 4.") + inputs.set_shape([static_shape[0], None, None, static_shape[3]]) + # Add support for left padding. + if "padding" in kwargs and kwargs["padding"] == "LEFT": + dilation_rate = (1, 1) + if "dilation_rate" in kwargs: + dilation_rate = kwargs["dilation_rate"] + assert kernel_size[0] % 2 == 1 and kernel_size[1] % 2 == 1 + height_padding = 2 * (kernel_size[0] // 2) * dilation_rate[0] + cond_padding = tf.cond( + tf.equal(tf.shape(inputs)[2], 1), lambda: tf.constant(0), + lambda: tf.constant(2 * (kernel_size[1] // 2) * dilation_rate[1])) + width_padding = 0 if static_shape[2] == 1 else cond_padding + padding = [[0, 0], [height_padding, 0], [width_padding, 0], [0, 0]] + inputs = tf.pad(inputs, padding) + kwargs["padding"] = "VALID" + force2d = False # Special argument we use to force 2d kernels (see below). + if "force2d" in kwargs: + force2d = kwargs["force2d"] + + def conv2d_kernel(kernel_size_arg, name_suffix): + """Call conv2d but add suffix to name.""" + if "name" in kwargs: + original_name = kwargs["name"] + name = kwargs.pop("name") + "_" + name_suffix + else: + original_name = None + name = "conv_" + name_suffix + original_force2d = None + if "force2d" in kwargs: + original_force2d = kwargs.pop("force2d") + result = conv_fn(inputs, filters, kernel_size_arg, name=name, **kwargs) + if original_name is not None: + kwargs["name"] = original_name # Restore for other calls. + if original_force2d is not None: + kwargs["force2d"] = original_force2d + return result + + # Manually setting the shape to be unknown in the middle two dimensions so + # that the `tf.cond` below won't throw an error based on the convolution + # kernels being too large for the data. + inputs._shape = tf.TensorShape([static_shape[0], None, None, static_shape[3]]) # pylint: disable=protected-access + if kernel_size[1] == 1 or force2d: + # Avoiding the cond below can speed up graph and gradient construction. 
+ return conv2d_kernel(kernel_size, "single") + return tf.cond( + tf.equal(tf.shape(inputs)[2], + 1), lambda: conv2d_kernel((kernel_size[0], 1), "small"), + lambda: conv2d_kernel(kernel_size, "std")) + + +def conv(inputs, filters, kernel_size, **kwargs): + return conv_internal(tf.layers.conv2d, inputs, filters, kernel_size, **kwargs) + + +def conv1d(inputs, filters, kernel_size, **kwargs): + return tf.squeeze( + conv(tf.expand_dims(inputs, 2), filters, (kernel_size, 1), **kwargs), 2) + + +def separable_conv(inputs, filters, kernel_size, **kwargs): + return conv_internal(tf.layers.separable_conv2d, inputs, filters, kernel_size, + **kwargs) + + +def subseparable_conv(inputs, filters, kernel_size, **kwargs): + """Sub-separable convolution. If separability == 0 it's a separable_conv.""" + + def conv_fn(inputs, filters, kernel_size, **kwargs): + """Sub-separable convolution, splits into separability-many blocks.""" + separability = None + if "separability" in kwargs: + separability = kwargs.pop("separability") + if separability: + parts = [] + abs_sep = separability if separability > 0 else -1 * separability + for split_idx, split in enumerate(tf.split(inputs, abs_sep, axis=3)): + with tf.variable_scope("part_%d" % split_idx): + if separability > 0: + parts.append( + tf.layers.conv2d(split, filters // separability, kernel_size, ** + kwargs)) + else: + parts.append( + tf.layers.separable_conv2d(split, filters // abs_sep, + kernel_size, **kwargs)) + if separability > 1: + result = tf.layers.conv2d(tf.concat(parts, axis=3), filters, (1, 1)) + elif abs_sep == 1: # If we have just one block, return it. + assert len(parts) == 1 + result = parts[0] + else: + result = tf.concat(parts, axis=3) + else: + result = tf.layers.separable_conv2d(inputs, filters, kernel_size, + **kwargs) + if separability is not None: + kwargs["separability"] = separability + return result + + return conv_internal(conv_fn, inputs, filters, kernel_size, **kwargs) + + +def layer_norm_compute_python(x, epsilon, scale, bias): + """Layer norm raw computation.""" + mean = tf.reduce_mean(x, axis=[-1], keep_dims=True) + variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True) + norm_x = (x - mean) * tf.rsqrt(variance + epsilon) + return norm_x * scale + bias + + +@function.Defun(compiled=True) +def layer_norm_compute_grad(x, epsilon, scale, bias, dy): + y = layer_norm_compute_python(x, epsilon, scale, bias) + dx = tf.gradients(ys=[y], xs=[x, epsilon, scale, bias], grad_ys=[dy]) + return dx + + +@function.Defun( + compiled=True, + separate_compiled_gradients=True, + grad_func=layer_norm_compute_grad) +def layer_norm_compute(x, epsilon, scale, bias): + return layer_norm_compute_python(x, epsilon, scale, bias) + + +def layer_norm(x, filters=None, epsilon=1e-6, name=None, reuse=None): + """Layer normalize the tensor x, averaging over the last dimension.""" + if filters is None: + filters = x.get_shape()[-1] + with tf.variable_scope( + name, default_name="layer_norm", values=[x], reuse=reuse): + scale = tf.get_variable( + "layer_norm_scale", [filters], initializer=tf.ones_initializer()) + bias = tf.get_variable( + "layer_norm_bias", [filters], initializer=tf.zeros_initializer()) + if allow_defun: + result = layer_norm_compute(x, tf.constant(epsilon), scale, bias) + result.set_shape(x.get_shape()) + else: + result = layer_norm_compute_python(x, epsilon, scale, bias) + return result + + +def noam_norm(x, name=None): + """One version of layer normalization.""" + with tf.name_scope(name, default_name="noam_norm", values=[x]): + 
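+    # Sketch of the computation below: l2-normalize along the depth axis, then
+    # rescale by sqrt(depth), so each position's vector has norm sqrt(depth),
+    # i.e. unit root-mean-square per channel (vectors with sum of squares
+    # below the epsilon=1.0 floor are left unscaled).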
shape = x.get_shape()
+    ndims = len(shape)
+    return (tf.nn.l2_normalize(x, ndims - 1, epsilon=1.0) *
+            tf.sqrt(tf.to_float(shape[-1])))
+
+
+def residual_function(hparams):
+  """Returns a function for combining layer input and layer output.
+
+  The returned function on x (layer input) and y (layer output) computes:
+    norm_function(x + dropout(y))
+
+  Args:
+    hparams: model hyperparameters
+
+  Returns:
+    a function from (x, y) to the combined, normalized output
+  """
+
+  def residual_fn(x, y):
+    return hparams.norm_function(x + tf.nn.dropout(
+        y, 1.0 - hparams.residual_dropout))
+
+  return residual_fn
+
+
+def conv_block_internal(conv_fn,
+                        inputs,
+                        filters,
+                        dilation_rates_and_kernel_sizes,
+                        first_relu=True,
+                        use_elu=False,
+                        separabilities=None,
+                        **kwargs):
+  """A block of convolutions.
+
+  Args:
+    conv_fn: convolution function, e.g. conv or separable_conv.
+    inputs: a Tensor
+    filters: an Integer
+    dilation_rates_and_kernel_sizes: a list of tuples (dilation, (k_w, k_h))
+    first_relu: whether to do a relu at start (defaults to True)
+    use_elu: whether to use ELUs instead of ReLUs (defaults to False)
+    separabilities: list of separability factors (per-layer).
+    **kwargs: additional arguments (e.g., pooling)
+
+  Returns:
+    a Tensor.
+  """
+  name = kwargs.pop("name") if "name" in kwargs else None
+  mask = kwargs.pop("mask") if "mask" in kwargs else None
+  norm = kwargs.pop("normalizer_fn") if "normalizer_fn" in kwargs else None
+  if norm is None and "normalizer_fn" not in kwargs:
+    norm = lambda x, name: layer_norm(x, filters, name=name)
+  with tf.variable_scope(name, "conv_block", [inputs]):
+    cur, counter = inputs, -1
+    for dilation_rate, kernel_size in dilation_rates_and_kernel_sizes:
+      counter += 1
+      if first_relu or counter > 0:
+        cur = tf.nn.elu(cur) if use_elu else tf.nn.relu(cur)
+      if mask is not None:
+        cur *= mask
+      if separabilities:
+        cur = conv_fn(
+            cur,
+            filters,
+            kernel_size,
+            dilation_rate=dilation_rate,
+            name="conv_block_%d" % counter,
+            use_bias=norm is None,
+            separability=separabilities[counter],
+            **kwargs)
+      else:
+        cur = conv_fn(
+            cur,
+            filters,
+            kernel_size,
+            dilation_rate=dilation_rate,
+            name="conv_block_%d" % counter,
+            use_bias=norm is None,
+            **kwargs)
+      if norm is not None:
+        cur = norm(cur, name="conv_block_norm_%d" % counter)
+    return cur
+
+
+def conv_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs):
+  """A block of standard convolutions."""
+  return conv_block_internal(conv, inputs, filters,
+                             dilation_rates_and_kernel_sizes, **kwargs)
+
+
+def separable_conv_block(inputs, filters, dilation_rates_and_kernel_sizes,
+                         **kwargs):
+  """A block of separable convolutions."""
+  return conv_block_internal(separable_conv, inputs, filters,
+                             dilation_rates_and_kernel_sizes, **kwargs)
+
+
+def subseparable_conv_block(inputs, filters, dilation_rates_and_kernel_sizes,
+                            **kwargs):
+  """A block of sub-separable convolutions."""
+  return conv_block_internal(subseparable_conv, inputs, filters,
+                             dilation_rates_and_kernel_sizes, **kwargs)
+
+
+def pool(inputs, window_size, pooling_type, padding, strides=(1, 1)):
+  """Pooling (supports "LEFT")."""
+  with tf.name_scope("pool", [inputs]):
+    static_shape = inputs.get_shape()
+    if not static_shape or len(static_shape) != 4:
+      raise ValueError("Inputs to pool must have statically known rank 4.")
+    # Add support for left padding.
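+    # Illustrative note: "LEFT" pads only before each spatial axis, so an
+    # output at position t depends only on inputs at positions <= t (causal
+    # pooling). E.g. a (3, 1) window prepends two rows along the time axis.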
+ if padding == "LEFT": + assert window_size[0] % 2 == 1 and window_size[1] % 2 == 1 + if len(static_shape) == 3: + width_padding = 2 * (window_size[1] // 2) + padding_ = [[0, 0], [width_padding, 0], [0, 0]] + else: + height_padding = 2 * (window_size[0] // 2) + cond_padding = tf.cond( + tf.equal(tf.shape(inputs)[2], 1), lambda: tf.constant(0), + lambda: tf.constant(2 * (window_size[1] // 2))) + width_padding = 0 if static_shape[2] == 1 else cond_padding + padding_ = [[0, 0], [height_padding, 0], [width_padding, 0], [0, 0]] + inputs = tf.pad(inputs, padding_) + inputs.set_shape([static_shape[0], None, None, static_shape[3]]) + padding = "VALID" + window_size_small = (window_size[0], 1) + strides_small = (strides[0], 1) + # Manually setting the shape to be unknown in the middle two dimensions so + # that the `tf.cond` below won't throw an error based on the convolution + # kernels being too large for the data. + inputs._shape = tf.TensorShape( # pylint: disable=protected-access + [static_shape[0], None, None, static_shape[3]]) + return tf.cond( + tf.equal(tf.shape(inputs)[2], 1), + lambda: tf.nn.pool( # pylint: disable=g-long-lambda + inputs, window_size_small, pooling_type, padding, + strides=strides_small), + lambda: tf.nn.pool( # pylint: disable=g-long-lambda + inputs, window_size, pooling_type, padding, strides=strides)) + + +def conv_block_downsample(x, + kernel, + strides, + padding, + separability=0, + name=None, + reuse=None): + """Implements a downwards-striding conv block, like Xception exit flow.""" + with tf.variable_scope( + name, default_name="conv_block_downsample", values=[x], reuse=reuse): + hidden_size = int(x.get_shape()[-1]) + res = conv_block( + x, + int(1.25 * hidden_size), [((1, 1), kernel)], + padding=padding, + strides=strides, + name="res_conv") + + x = subseparable_conv_block( + x, + hidden_size, [((1, 1), kernel)], + padding=padding, + separability=separability, + name="conv0") + x = subseparable_conv_block( + x, + int(1.25 * hidden_size), [((1, 1), kernel)], + padding=padding, + separability=separability, + name="conv1") + x = pool(x, kernel, "MAX", padding, strides=strides) + + x += res + + x = subseparable_conv_block( + x, + 2 * hidden_size, [((1, 1), kernel)], + first_relu=False, + padding=padding, + separability=separability, + name="conv2") + x = subseparable_conv_block( + x, + int(2.5 * hidden_size), [((1, 1), kernel)], + padding=padding, + separability=separability, + name="conv3") + return x + + +def decompress_seqcnn(x, + targets, + targets_vocab_size, + dilations_and_kernels, + block_size, + is_2d=False, + embedding_var=None, + name=None, + reuse=None): + """Decompress x into targets size using a Sequence CNN at every element.""" + with tf.variable_scope( + name, + default_name="decompress_batch_seqcnn", + values=[x, targets], + reuse=reuse): + # We assume targets are [batch x block_size * N x block_size * N x C] if + # is_2d=True or [batch, block_size * N, 1, C] otherwise, and C is static. + # Let's shift targets to depth and embed. 
+    targets_shape, targets_shape_static = tf.shape(targets), targets.get_shape()
+    channels = int(targets_shape_static[-1])
+    hidden_size = int(x.get_shape()[-1])
+    if is_2d:
+      depth_targets = tf.space_to_depth(targets, block_size)
+      factor = channels * block_size * block_size
+    else:
+      depth_targets = tf.reshape(targets, [
+          targets_shape[0], targets_shape[1] // block_size, 1,
+          channels * block_size
+      ])
+      factor = channels * block_size
+    if embedding_var is None:
+      embedding_var = tf.get_variable("targets_embedding",
+                                      [targets_vocab_size, hidden_size])
+    targets_emb = tf.gather(embedding_var, depth_targets)
+    # Flatten x and embedded targets. Flat targets are factor* larger on axis=1.
+    flat_x = tf.reshape(x, [-1, 1, 1, hidden_size])
+    flat_targets = tf.reshape(targets_emb, [-1, factor, 1, hidden_size])
+    shifted_targets = shift_left(flat_targets)
+    # Run a SeqCNN large-batch to produce factor outputs out of every target.
+    flat_x += tf.zeros_like(shifted_targets)  # Broadcast on axis=1.
+    flat_outputs = conv_block(
+        tf.concat([flat_x, shifted_targets], axis=3),
+        hidden_size,
+        dilations_and_kernels,
+        padding="LEFT")
+    # Reshape back to embedded targets shape.
+    outputs = tf.reshape(flat_outputs, [
+        tf.shape(targets_emb)[0],
+        tf.shape(targets_emb)[1],
+        tf.shape(targets_emb)[2], factor * hidden_size
+    ])
+    # Move depth back to target space.
+    if is_2d:
+      outputs = tf.depth_to_space(outputs, 2)
+    else:
+      outputs = tf.reshape(outputs, [
+          tf.shape(outputs)[0], block_size * tf.shape(outputs)[1], 1,
+          hidden_size
+      ])
+    # Final reshape before prediction to ensure target size.
+    outputs = tf.reshape(outputs, [
+        targets_shape[0], targets_shape[1], targets_shape[2], channels,
+        hidden_size
+    ])
+    return tf.layers.dense(outputs, targets_vocab_size)
+
+
+def moe_layer(data_parallelism,
+              ps_devices,
+              xs,
+              train,
+              model_hidden_size,
+              expert_hidden_size,
+              n1,
+              n2,
+              loss_coef,
+              autoscale=True,
+              name=None):
+  """A mixture-of-experts layer.
+
+  Args:
+    data_parallelism: an expert_utils.Parallelism object.
+    ps_devices: a list of strings
+    xs: a list of input tensors.
+    train: a boolean scalar.
+    model_hidden_size: an integer (input/output size for this layer)
+    expert_hidden_size: an integer (size of each expert's hidden layer)
+    n1: an integer - number of experts (or # of groups for hierarchical MoE)
+    n2: optional integer - size of each group of experts for hierarchical MoE
+    loss_coef: a scalar - multiplier on load-balancing losses
+    autoscale: a boolean
+    name: a string
+
+  Returns:
+    ys: a list of tensors
+    extra_training_loss: a scalar
+  """
+  dp = data_parallelism
+  with tf.variable_scope(name, default_name="moe"):
+    # Set up the hyperparameters for the gating networks.
+    primary_gating_hp = eu.NoisyTopKGatingParams()
+    primary_gating_hp.num_experts = n1
+    if n2:
+      # Hierarchical MoE containing moe_n1 groups of moe_n2 experts.
+      assert n2 > 1
+      secondary_gating_hp = eu.NoisyTopKGatingParams()
+      secondary_gating_hp.num_experts = n2
+    else:
+      # Flat mixture of moe_n1 experts.
+      secondary_gating_hp = None
+    # Set up the hyperparameters for the expert networks.
+    # Each expert contains a hidden RELU layer of size filter_size.
+    expert_hp = eu.FeedForwardExpertParams()
+    expert_hp.autoscale = autoscale
+    expert_hp.hidden_layer_sizes = [expert_hidden_size]
+    # Create the mixture of experts.
+    moe = eu.DistributedMixtureOfExperts(primary_gating_hp, secondary_gating_hp,
+                                         expert_hp, model_hidden_size,
+                                         model_hidden_size, ps_devices, "moe")
+    # MoE expects input tensors to be 2d.
+    # Flatten out spatial dimensions.
+    xs_2d = dp(tf.reshape, xs, [[-1, model_hidden_size]] * dp.n)
+    # Call the MoE
+    moe_out_2d, importance, load, _, _ = moe.Eval(
+        dp.devices, xs_2d, train, identifiers=None, summaries=True)
+    # Reshape the output to the original shape.
+    moe_out = dp(tf.reshape, moe_out_2d, dp(tf.shape, xs))
+    # These losses encourage equal load on the different experts.
+    loss = loss_coef * (eu.CVSquared(importance) + eu.CVSquared(load))
+    return moe_out, loss
+
+
+def simple_attention(target, source, bias=None, summaries=True):
+  """A simple attention function.
+
+  Args:
+    target: a `Tensor` with shape `[batch, target_timesteps, depth]` or
+     `[batch, target_timesteps_1, target_timesteps_2, depth]`
+    source: a `Tensor` with shape `[batch, source_timesteps, depth]` or
+     `[batch, source_timesteps_1, source_timesteps_2, depth]`
+    bias: an optional `Tensor` with shape `[batch, timesteps, 1, 1]` used
+     to mask the attention to not attend to padding of input.
+    summaries: Boolean, whether to output summaries.
+
+  Returns:
+    a `Tensor` with same shape as `target`
+  """
+  with tf.name_scope("simple_attention", [target, source]):
+    target_shape = tf.shape(target)
+    source_shape = tf.shape(source)
+    target = tf.reshape(target, [
+        target_shape[0], target_shape[1] * target_shape[2], target_shape[3]
+    ])
+    source = tf.reshape(source, [
+        source_shape[0], source_shape[1] * source_shape[2], source_shape[3]
+    ])
+    attention = tf.matmul(target, source, transpose_b=True)
+    attention *= tf.rsqrt(tf.to_float(tf.shape(target)[2]))
+    if bias is not None:
+      attention += tf.expand_dims(tf.squeeze(bias, axis=[2, 3]), axis=1)
+    attention = tf.nn.softmax(attention)
+    if summaries and not tf.get_variable_scope().reuse:
+      tf.summary.image("attention", tf.expand_dims(attention, 3), max_outputs=5)
+    attended = tf.matmul(attention, source)
+    return tf.reshape(attended, target_shape)
+
+
+def multiscale_conv_sum(inputs, output_size, dilation_rates_and_kernel_sizes,
+                        pooling_type, **kwargs):
+  """Sum of several dilated convolutions.
+
+  For all convolutions with dilation_rate > 1, we first pool the input with
+  width dilation_rate.
+
+  Args:
+    inputs: a Tensor
+    output_size: an Integer
+    dilation_rates_and_kernel_sizes: a list of pairs (dilation, kernel_size)
+    pooling_type: "AVG" or "MAX"
+    **kwargs: additional keyword args for conv
+
+  Returns:
+    a Tensor.
+  """
+  name = kwargs.pop("name") if "name" in kwargs else None
+  with tf.variable_scope(name, "multiscale_conv_sum", [inputs]):
+    padding = kwargs["padding"]
+    results, counter = [], -1
+    for dilation_rate, kernel_size in dilation_rates_and_kernel_sizes:
+      counter += 1
+      if dilation_rate > 1:
+        pooled = pool(inputs, kernel_size, pooling_type, padding)
+      else:
+        pooled = inputs
+      results.append(
+          conv(
+              pooled,
+              output_size,
+              kernel_size,
+              dilation_rate=dilation_rate,
+              name="conv_layer%d" % counter,
+              **kwargs))
+    return tf.add_n(results) * (len(results)**-0.5)
+
+
+def multiscale_conv_and_attention(x,
+                                  padding,
+                                  hparams,
+                                  source=None,
+                                  summaries=True):
+  """A common part of t2t layers.
+
+  First, do a linear multiscale convolution.
+  Second, do attention (if source is not None).
+
+  Applies residuals and normalization on both steps.
+
+  Args:
+    x: a Tensor.
+    padding: a padding type
+    hparams: hyperparameters for model
+    source: optional source tensor for attention. (encoder output)
+    summaries: Boolean, whether to output summaries.
+
+  Returns:
+    a Tensor.
+  """
+  # TODO(noam): The number of different scales should be a hyperparameter.
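+  # For example (illustrative): with kernel_height=3 and kernel_width=1, the
+  # call below uses three scales with dilation rates (1, 1), (3, 1) and (9, 1),
+  # all sharing the same (3, 1) kernel, combined inside multiscale_conv_sum
+  # with a 1/sqrt(3) weighting.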
+  conv_sum = multiscale_conv_sum(
+      x,
+      hparams.hidden_size,
+      [((hparams.kernel_height**i, hparams.kernel_width**i),
+        (hparams.kernel_height, hparams.kernel_width)) for i in xrange(3)],
+      "AVG",
+      padding=padding)
+  # For the residual connection, rescale x if the channel counts differ.
+  if x.get_shape().as_list()[-1] != conv_sum.get_shape().as_list()[-1]:
+    x = conv(x, hparams.hidden_size, (1, 1))
+  x = noam_norm(x + conv_sum)
+  if source is not None:
+    x = noam_norm(x + simple_attention(x, source, summaries=summaries))
+  return x
+
+
+def conv_with_pools(inputs, output_size, kernel_size, pool_sizes, pooling_type,
+                    **kwargs):
+  """Convolution plus 1x1 convolution applied to specified pools.
+
+  For example we might do a regular convolution with kernel size (3, 1),
+  and pools of sizes [(9, 1), (27, 1)].
+
+  Args:
+    inputs: a Tensor
+    output_size: an Integer
+    kernel_size: a tuple of integers
+    pool_sizes: a list of tuples of integers.
+    pooling_type: "AVG" or "MAX"
+    **kwargs: additional keyword args for conv
+
+  Returns:
+    a Tensor.
+  """
+  name = kwargs.pop("name") if "name" in kwargs else None
+  with tf.variable_scope(name, "conv_with_pools", [inputs]):
+    padding = kwargs["padding"]
+    results = []
+    results.append(conv(inputs, output_size, kernel_size, **kwargs))
+    for i, pool_size in enumerate(pool_sizes):
+      pooled = pool(inputs, pool_size, pooling_type, padding)
+      results.append(
+          conv(pooled, output_size, (1, 1), name="pool_%d" % i, **kwargs))
+    return tf.add_n(results) * (len(results)**-0.5)
+
+
+def conv_with_pools_and_attention(x,
+                                  padding,
+                                  hparams,
+                                  source=None,
+                                  summaries=True):
+  """A common part of t2t layers.
+
+  First, do conv_with_pools.
+  Second, do attention (if source is not None).
+
+  Applies residuals and normalization on both steps.
+
+  Args:
+    x: a Tensor.
+    padding: a padding type
+    hparams: hyperparameters for model
+    source: optional source tensor for attention. (encoder output)
+    summaries: Boolean, whether to output summaries.
+
+  Returns:
+    a Tensor.
+  """
+  conv_sum = conv_with_pools(
+      x,
+      hparams.hidden_size, (hparams.kernel_height, hparams.kernel_width),
+      hparams.pool_sizes,
+      "AVG",
+      padding=padding)
+  if x.get_shape().as_list()[-1] == conv_sum.get_shape().as_list()[-1]:
+    conv_sum += x
+  x = noam_norm(conv_sum)
+  if source is not None:
+    x = noam_norm(x + simple_attention(x, source, summaries=summaries))
+  return x
+
+
+def get_timing_signal(length,
+                      min_timescale=1,
+                      max_timescale=1e4,
+                      num_timescales=16):
+  """Create Tensor of sinusoids of different frequencies.
+
+  Args:
+    length: length of the Tensor to create, i.e. number of steps.
+    min_timescale: a float
+    max_timescale: a float
+    num_timescales: an int
+
+  Returns:
+    Tensor of shape (length, 2 * num_timescales)
+  """
+  positions = tf.to_float(tf.range(length))
+  log_timescale_increment = (math.log(max_timescale / min_timescale) /
+                             (num_timescales - 1))
+  inv_timescales = min_timescale * tf.exp(
+      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
+  scaled_time = tf.expand_dims(positions, 1) * tf.expand_dims(inv_timescales, 0)
+  return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
+
+
+def add_timing_signal(x, min_timescale=1, max_timescale=1e4, num_timescales=16):
+  """Adds a bunch of sinusoids of different frequencies to a Tensor.
+
+  This allows attention to learn to use absolute and relative positions.
+  The timing signal should be added to some precursor of both the source
+  and the target of the attention.
+
+  The use of relative position is possible because sin(x+y) and cos(x+y) can
+  be expressed in terms of y, sin(x) and cos(x), e.g.
+  sin(x+y) = sin(x)cos(y) + cos(x)sin(y).
+
+  In particular, we use a geometric sequence of timescales starting with
+  min_timescale and ending with max_timescale. For each timescale, we
+  generate the two sinusoidal signals sin(timestep/timescale) and
+  cos(timestep/timescale). All of these sinusoids are concatenated in
+  the depth dimension, padded with zeros to be the same depth as the input,
+  and added into input.
+
+  Args:
+    x: a Tensor with shape [?, length, ?, depth]
+    min_timescale: a float
+    max_timescale: a float
+    num_timescales: an int <= depth/2
+
+  Returns:
+    a Tensor the same shape as x.
+  """
+  length = tf.shape(x)[1]
+  depth = tf.shape(x)[3]
+  signal = get_timing_signal(length, min_timescale, max_timescale,
+                             num_timescales)
+  padded_signal = tf.pad(signal, [[0, 0], [0, depth - 2 * num_timescales]])
+  return x + tf.reshape(padded_signal, [1, length, 1, depth])
+
+
+def mask_from_embedding(emb):
+  """Input embeddings -> padding mask.
+
+  We have hacked symbol_modality to return all-zero embeddings for padding.
+  Returns a mask with 0.0 in the padding positions and 1.0 elsewhere.
+
+  Args:
+    emb: a Tensor with shape [batch, width, height, depth].
+  Returns:
+    a 0.0/1.0 Tensor with shape [batch, width, height, 1].
+  """
+  return weights_nonzero(tf.reduce_sum(tf.abs(emb), axis=3, keep_dims=True))
+
+
+def mask_leq(target_length, source_length):
+  """A mask with 1.0 wherever source_pos <= target_pos and 0.0 elsewhere.
+
+  Args:
+    target_length: an integer
+    source_length: an integer
+  Returns:
+    a Tensor with shape [1, target_length, source_length]
+  """
+  return tf.expand_dims(
+      tf.matrix_band_part(tf.ones([target_length, source_length]), -1, 0), 0)
+
+
+def attention_1d_v0(source,
+                    target,
+                    attention_size,
+                    output_size,
+                    num_heads,
+                    mask=None,
+                    transform_source=True,
+                    transform_target=True,
+                    transform_output=True,
+                    summaries=True,
+                    name=None):
+  """Multi-headed attention.
+
+  TODO(noam): this could probably be extended to 2d.
+
+  Args:
+    source: a Tensor of shape [batch, source_length, source_depth]
+    target: a Tensor of shape [batch, target_length, target_depth]
+    attention_size: an integer
+    output_size: an integer
+    num_heads: an integer divisor of attention_size
+    mask: a float32 Tensor of shape [batch, target_length, source_length]
+     1.0 means can-see; 0.0 means can't-see.
+     Any dimension can be 1 (supports broadcasting).
+    transform_source: a boolean
+    transform_target: a boolean
+    transform_output: a boolean
+    summaries: a boolean
+    name: an optional string
+
+  Returns:
+    a Tensor of shape [batch, length, output_size]
+  """
+  with tf.variable_scope(name, default_name="attention", values=[target]):
+    source_length = tf.shape(source)[1]
+    target_length = tf.shape(target)[1]
+    batch = tf.shape(source)[0]
+
+    def _maybe_transform(t, size, should_transform, name):
+      if should_transform:
+        return conv1d(t, size, 1, name=name)
+      else:
+        assert t.get_shape()[-1] == size
+        return t
+
+    source_attention = _maybe_transform(source, attention_size,
+                                        transform_source, "source_attention")
+    target_attention = _maybe_transform(target, attention_size,
+                                        transform_target, "target_attention")
+    assert attention_size % num_heads == 0
+    size_per_head = attention_size // num_heads
+    source_attention = tf.reshape(
+        source_attention, [batch, source_length, num_heads, size_per_head])
+    target_attention = tf.reshape(
+        target_attention, [batch, target_length, num_heads, size_per_head])
+    # [batch, num_heads, length, size_per_head]
+    source_attention = tf.transpose(source_attention, [0, 2, 1, 3])
+    target_attention = tf.transpose(target_attention, [0, 2, 1, 3])
+
+    # [batch, num_heads, target_length, source_length]
+    attention = tf.matmul(target_attention, source_attention, transpose_b=True)
+    attention *= size_per_head**-0.5
+
+    if mask is not None:
+      mask = tf.expand_dims(mask, 1)
+      mask = (1.0 - mask) * -1e9
+      attention += mask
+    attention = tf.nn.softmax(attention)
+    if summaries and not tf.get_variable_scope().reuse:
+      # Compute a color image summary.
+      image = tf.reshape(attention,
+                         [batch, num_heads, target_length, source_length])
+      image = tf.transpose(image, [0, 2, 3, 1])
+      image = tf.pow(image, 0.2)  # for high-dynamic-range
+      # Each head will correspond to one of RGB.
+      # Pad the heads to be a multiple of 3.
+      extra_heads = -num_heads % 3
+      image = tf.pad(image, [[0, 0], [0, 0], [0, 0], [0, extra_heads]])
+      image = tf.reshape(image, [
+          batch, target_length, source_length, 3, (num_heads + extra_heads) // 3
+      ])
+      image = tf.reduce_max(image, 4)
+      tf.summary.image("local_attention", image, max_outputs=1)
+    # output: [batch, num_heads, target_length, size_per_head]
+    output = tf.matmul(attention, source_attention)
+    output = tf.transpose(output, [0, 2, 1, 3])
+    output = tf.reshape(output, [batch, target_length, attention_size])
+    output = _maybe_transform(output, output_size, transform_output,
+                              "attention_output")
+    return output
+
+
+def relu_density_logit(x, reduce_dims):
+  """logit(density(x)).
+
+  Useful for histograms.
+
+  Args:
+    x: a Tensor, typically the output of tf.nn.relu
+    reduce_dims: a list of dimensions
+
+  Returns:
+    a Tensor
+  """
+  frac = tf.reduce_mean(tf.to_float(x > 0.0), reduce_dims)
+  scaled = tf.log(frac + math.exp(-10)) - tf.log((1.0 - frac) + math.exp(-10))
+  return scaled
+
+
+def conv_hidden_relu(inputs,
+                     hidden_size,
+                     output_size,
+                     kernel_size=(1, 1),
+                     summaries=True,
+                     dropout=0.0,
+                     **kwargs):
+  """Hidden layer with RELU activation followed by linear projection."""
+  name = kwargs.pop("name") if "name" in kwargs else None
+  with tf.variable_scope(name, "conv_hidden_relu", [inputs]):
+    if inputs.get_shape().ndims == 3:
+      is_3d = True
+      inputs = tf.expand_dims(inputs, 2)
+    else:
+      is_3d = False
+    h = conv(
+        inputs,
+        hidden_size,
+        kernel_size,
+        activation=tf.nn.relu,
+        name="conv1",
+        **kwargs)
+    if dropout != 0.0:
+      h = tf.nn.dropout(h, 1.0 - dropout)
+    if summaries and not tf.get_variable_scope().reuse:
+      tf.summary.histogram("hidden_density_logit",
+                           relu_density_logit(
+                               h, list(range(inputs.shape.ndims - 1))))
+    ret = conv(h, output_size, (1, 1), name="conv2", **kwargs)
+    if is_3d:
+      ret = tf.squeeze(ret, 2)
+    return ret
+
+
+def conv_gru(x,
+             kernel_size,
+             filters,
+             padding="SAME",
+             dilation_rate=(1, 1),
+             name=None,
+             reuse=None):
+  """Convolutional GRU in 1 dimension."""
+
+  # Let's make a shorthand for conv call first.
+  def do_conv(args, name, bias_start, padding):
+    return conv(
+        args,
+        filters,
+        kernel_size,
+        padding=padding,
+        dilation_rate=dilation_rate,
+        bias_initializer=tf.constant_initializer(bias_start),
+        name=name)
+
+  # Here comes the GRU gate.
+  with tf.variable_scope(
+      name, default_name="conv_gru", values=[x], reuse=reuse):
+    reset = saturating_sigmoid(do_conv(x, "reset", 1.0, padding))
+    gate = saturating_sigmoid(do_conv(x, "gate", 1.0, padding))
+    candidate = tf.tanh(do_conv(reset * x, "candidate", 0.0, padding))
+    return gate * x + (1 - gate) * candidate
+
+
+def conv_lstm(x,
+              kernel_size,
+              filters,
+              padding="SAME",
+              dilation_rate=(1, 1),
+              name=None,
+              reuse=None):
+  """Convolutional LSTM in 1 dimension."""
+  with tf.variable_scope(
+      name, default_name="conv_lstm", values=[x], reuse=reuse):
+    gates = conv(
+        x,
+        4 * filters,
+        kernel_size,
+        padding=padding,
+        dilation_rate=dilation_rate)
+    g = tf.split(layer_norm(gates, 4 * filters), 4, axis=3)
+    new_cell = tf.sigmoid(g[0]) * x + tf.sigmoid(g[1]) * tf.tanh(g[3])
+    return tf.sigmoid(g[2]) * tf.tanh(new_cell)
+
+
+def diagonal_conv_gru(x,
+                      kernel_size,
+                      filters,
+                      train,
+                      dropout=0.0,
+                      name=None,
+                      reuse=None):
+  """Diagonal Convolutional GRU as in https://arxiv.org/abs/1702.08727."""
+
+  # Let's make a shorthand for conv call first.
+  def do_conv(args, name, bias_start):
+    return conv(
+        args,
+        filters,
+        kernel_size,
+        padding="SAME",
+        bias_initializer=tf.constant_initializer(bias_start),
+        name=name)
+
+  # Here comes the GRU gate.
+  with tf.variable_scope(
+      name, default_name="diagonal_conv_gru", values=[x], reuse=reuse):
+    reset, reset_cost = hard_sigmoid(do_conv(x, "reset", 0.5))
+    gate, gate_cost = hard_sigmoid(do_conv(x, "gate", 0.7))
+    candidate = tf.tanh(do_conv(reset * x, "candidate", 0.0))
+
+    # Dropout if training.
+    if dropout > 0.0 and train:
+      candidate = tf.nn.dropout(candidate, 1.0 - dropout)
+
+    # Diagonal shift.
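+    # Illustrative example: with filters=6 (so shift_filters=2), base_filter
+    # below has two identity rows [0, 1, 0] (those channels stay at position
+    # w), two rows [1, 0, 0] (channels read position w - 1) and two rows
+    # [0, 0, 1] (channels read position w + 1) - hence the "diagonal" shift.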
+ shift_filters = filters // 3 + base_filter = ([[0, 1, 0]] * (filters - 2 * shift_filters) + + [[1, 0, 0]] * shift_filters + [[0, 0, 1]] * shift_filters) + shift_filter = tf.constant(np.transpose(base_filter), dtype=tf.float32) + shift_filter = tf.expand_dims(tf.expand_dims(shift_filter, 0), 3) + x_shifted = tf.nn.depthwise_conv2d( + x, shift_filter, [1, 1, 1, 1], padding="SAME") + + # Return the gated result and cost. + total_cost_avg = 0.5 * (reset_cost + gate_cost) + return gate * x_shifted + (1 - gate) * candidate, total_cost_avg + + +def pad_to_same_length(x, y, final_length_divisible_by=1, axis=1): + """Pad tensors x and y on axis 1 so that they have the same length.""" + if axis not in [1, 2]: + raise ValueError("Only axis=1 and axis=2 supported for now.") + with tf.name_scope("pad_to_same_length", [x, y]): + x_length = tf.shape(x)[axis] + y_length = tf.shape(y)[axis] + max_length = tf.maximum(x_length, y_length) + if final_length_divisible_by > 1: + # Find the nearest larger-or-equal integer divisible by given number. + max_length += final_length_divisible_by - 1 + max_length //= final_length_divisible_by + max_length *= final_length_divisible_by + length_diff1 = max_length - x_length + length_diff2 = max_length - y_length + + def padding_list(length_diff, arg): + if axis == 1: + return [[[0, 0], [0, length_diff]], + tf.zeros([tf.rank(arg) - 2, 2], dtype=tf.int32)] + return [[[0, 0], [0, 0], [0, length_diff]], + tf.zeros([tf.rank(arg) - 3, 2], dtype=tf.int32)] + + paddings1 = tf.concat(padding_list(length_diff1, x), axis=0) + paddings2 = tf.concat(padding_list(length_diff2, y), axis=0) + res_x = tf.pad(x, paddings1) + res_y = tf.pad(y, paddings2) + # Static shapes are the same except for axis=1. + x_shape = x.shape.as_list() + x_shape[axis] = None + res_x.set_shape(x_shape) + y_shape = y.shape.as_list() + y_shape[axis] = None + res_y.set_shape(y_shape) + return res_x, res_y + + +def pad_with_zeros(logits, labels): + """Pad labels on the length dimension to match logits length.""" + with tf.name_scope("pad_with_zeros", [logits, labels]): + logits, labels = pad_to_same_length(logits, labels) + if len(labels.shape.as_list()) == 3: # 2-d labels. + logits, labels = pad_to_same_length(logits, labels, axis=2) + return labels + + +def weights_nonzero(labels): + """Assign weight 1.0 to all labels except for padding (id=0).""" + return tf.to_float(tf.not_equal(labels, 0)) + + +def weights_all(labels): + """Assign weight 1.0 to all labels.""" + return tf.ones_like(labels, dtype=tf.float32) + + +def weights_concatenated(labels): + """Assign weight 1.0 to the "target" part of the concatenated labels. + + The labels look like: + source English I love you . ID1 target French Je t'aime . ID1 source + English the cat ID1 target French le chat ID1 source English ... + + We want to assign weight 1.0 to all words in the target text (including the + ID1 end symbol), but not to the source text or the boilerplate. In the + above example, the target words that get positive weight are: + Je t'aime . ID1 le chat ID1 + + Args: + labels: a Tensor + Returns: + a Tensor + """ + eos_mask = tf.to_int32(tf.equal(labels, 1)) + sentence_num = tf.cumsum(eos_mask, axis=1, exclusive=True) + in_target = tf.equal(tf.mod(sentence_num, 2), 1) + # first two tokens of each sentence are boilerplate. 
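+  # Sketch of the trick below: comparing sentence_num + 1 with a copy of
+  # itself shifted right by two positions is unequal exactly at the first two
+  # tokens of each sentence (where the sentence id just changed), which zeroes
+  # out the "target French"-style boilerplate tokens.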
+  sentence_num_plus_one = sentence_num + 1
+  shifted = tf.pad(sentence_num_plus_one,
+                   [[0, 0], [2, 0], [0, 0], [0, 0]])[:, :-2, :, :]
+  nonboilerplate = tf.equal(sentence_num_plus_one, shifted)
+  ret = tf.to_float(tf.logical_and(nonboilerplate, in_target))
+  return ret
+
+
+def padded_cross_entropy(logits,
+                         labels,
+                         label_smoothing,
+                         weights_fn=weights_nonzero,
+                         reduce_sum=True):
+  """Compute cross-entropy assuming 0s are padding.
+
+  Computes a loss numerator (the sum of losses), and loss denominator
+  (the number of non-padding tokens).
+
+  Args:
+    logits: a `Tensor` with shape `[batch, timesteps, vocab_size]`.
+    labels: an integer `Tensor` with shape `[batch, timesteps]`.
+    label_smoothing: a floating point `Scalar`.
+    weights_fn: A function from labels to weights.
+    reduce_sum: a Boolean, whether to sum at the end or not.
+
+  Returns:
+    loss_numerator: a `Scalar`. Sum of losses.
+    loss_denominator: a `Scalar`. The number of non-padding target tokens.
+  """
+  confidence = 1.0 - label_smoothing
+  vocab_size = tf.shape(logits)[-1]
+  with tf.name_scope("padded_cross_entropy", [logits, labels]):
+    pad_labels = pad_with_zeros(logits, labels)
+    xent = smoothing_cross_entropy(logits, pad_labels, vocab_size, confidence)
+    weights = weights_fn(pad_labels)
+    if not reduce_sum:
+      return xent * weights, weights
+    return tf.reduce_sum(xent * weights), tf.reduce_sum(weights)
+
+
+def smoothing_cross_entropy(logits, labels, vocab_size, confidence):
+  """Cross entropy with label smoothing to limit over-confidence."""
+  with tf.name_scope("smoothing_cross_entropy", [logits, labels]):
+    # Low confidence is given to all non-true labels, uniformly.
+    low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1)
+    # Normalizing constant is the best cross-entropy value with soft targets.
+    # We subtract it just for readability; it makes no difference on learning.
+    normalizing = -(confidence * tf.log(confidence) + tf.to_float(
+        vocab_size - 1) * low_confidence * tf.log(low_confidence + 1e-20))
+    # Soft targets.
+    soft_targets = tf.one_hot(
+        tf.cast(labels, tf.int32),
+        depth=vocab_size,
+        on_value=confidence,
+        off_value=low_confidence)
+    xentropy = tf.nn.softmax_cross_entropy_with_logits(
+        logits=logits, labels=soft_targets)
+    return xentropy - normalizing
diff --git a/tensor2tensor/models/common_layers_test.py b/tensor2tensor/models/common_layers_test.py
new file mode 100644
index 000000000..2bd6a53ad
--- /dev/null
+++ b/tensor2tensor/models/common_layers_test.py
@@ -0,0 +1,290 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
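+# These tests can be run directly, e.g. (illustrative invocation):
+#   python tensor2tensor/models/common_layers_test.py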
+ +"""Tests for common layers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +from tensor2tensor.models import common_layers + +import tensorflow as tf + + +class CommonLayersTest(tf.test.TestCase): + + def testStandardizeImages(self): + x = np.random.rand(5, 7, 7, 3) + with self.test_session() as session: + y = common_layers.standardize_images(tf.constant(x)) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 7, 3)) + + def testImageAugmentation(self): + x = np.random.rand(500, 500, 3) + with self.test_session() as session: + y = common_layers.image_augmentation(tf.constant(x)) + res = session.run(y) + self.assertEqual(res.shape, (299, 299, 3)) + + def testSaturatingSigmoid(self): + x = np.array([-120.0, -100.0, 0.0, 100.0, 120.0], dtype=np.float32) + with self.test_session() as session: + y = common_layers.saturating_sigmoid(tf.constant(x)) + res = session.run(y) + self.assertAllClose(res, [0.0, 0.0, 0.5, 1.0, 1.0]) + + def testFlatten4D3D(self): + x = np.random.random_integers(1, high=8, size=(3, 5, 2)) + with self.test_session() as session: + y = common_layers.flatten4d3d(common_layers.embedding(x, 10, 7)) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (3, 5 * 2, 7)) + + def testEmbedding(self): + x = np.random.random_integers(1, high=8, size=(3, 5)) + with self.test_session() as session: + y = common_layers.embedding(x, 10, 16) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (3, 5, 16)) + + def testConv(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.conv(tf.constant(x, dtype=tf.float32), 13, (3, 3)) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 5, 1, 13)) + + def testSeparableConv(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.separable_conv( + tf.constant(x, dtype=tf.float32), 13, (3, 3)) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 5, 1, 13)) + + def testSubSeparableConv(self): + for sep in [0, 1, 2, 4]: + x = np.random.rand(5, 7, 1, 12) + with self.test_session() as session: + with tf.variable_scope("sep_%d" % sep): + y = common_layers.subseparable_conv( + tf.constant(x, dtype=tf.float32), 16, (3, 3), separability=sep) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 5, 1, 16)) + + def testConvBlock(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.conv_block( + tf.constant(x, dtype=tf.float32), + 13, [(1, (3, 3)), (1, (3, 3))], + padding="SAME", + normalizer_fn=common_layers.noam_norm) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 1, 13)) + + def testSeparableConvBlock(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.separable_conv_block( + tf.constant(x, dtype=tf.float32), + 13, [(1, (3, 3)), (1, (3, 3))], + padding="SAME") + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 1, 13)) + + def testSubSeparableConvBlock(self): + for sep in [0, 1, 2, 4]: + x = np.random.rand(5, 7, 1, 12) + with self.test_session() as session: + with tf.variable_scope("sep_%d" 
% sep): + y = common_layers.subseparable_conv_block( + tf.constant(x, dtype=tf.float32), + 16, [(1, (3, 3)), (1, (3, 3))], + padding="SAME", + separability=sep) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 1, 16)) + + def testPool(self): + x = np.random.rand(5, 8, 1, 11) + with self.test_session() as session: + y = common_layers.pool( + tf.constant(x, dtype=tf.float32), (2, 2), "AVG", "SAME") + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 8, 1, 11)) + + def testConvBlockDownsample(self): + x = np.random.rand(5, 7, 1, 11) + with self.test_session() as session: + y = common_layers.conv_block_downsample( + tf.constant(x, dtype=tf.float32), (3, 1), (2, 1), "SAME") + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 4, 1, 27)) + + def testSimpleAttention(self): + x = np.random.rand(5, 7, 1, 11) + y = np.random.rand(5, 9, 1, 11) + with self.test_session() as session: + a = common_layers.simple_attention( + tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32)) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (5, 7, 1, 11)) + + def testGetTimingSignal(self): + length = 7 + num_timescales = 10 + with self.test_session() as session: + a = common_layers.get_timing_signal(length, num_timescales=num_timescales) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (length, 2 * num_timescales)) + + def testAddTimingSignal(self): + batch = 5 + length = 7 + height = 3 + depth = 35 + x = np.random.rand(batch, length, height, depth) + with self.test_session() as session: + a = common_layers.add_timing_signal(tf.constant(x, dtype=tf.float32)) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (batch, length, height, depth)) + + def testAttention1D(self): + batch = 5 + target_length = 7 + source_length = 13 + source_depth = 9 + target_depth = 11 + attention_size = 21 + output_size = 15 + num_heads = 7 + source = np.random.rand(batch, source_length, source_depth) + target = np.random.rand(batch, target_length, target_depth) + mask = np.random.rand(batch, target_length, source_length) + with self.test_session() as session: + a = common_layers.attention_1d_v0( + tf.constant(source, dtype=tf.float32), + tf.constant(target, dtype=tf.float32), attention_size, output_size, + num_heads, tf.constant(mask, dtype=tf.float32)) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (batch, target_length, output_size)) + + def testMultiscaleConvSum(self): + x = np.random.rand(5, 9, 1, 11) + with self.test_session() as session: + y = common_layers.multiscale_conv_sum( + tf.constant(x, dtype=tf.float32), + 13, [((1, 1), (5, 5)), ((2, 2), (3, 3))], + "AVG", + padding="SAME") + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 9, 1, 13)) + + def testConvGRU(self): + x = np.random.rand(5, 7, 3, 11) + with self.test_session() as session: + y = common_layers.conv_gru(tf.constant(x, dtype=tf.float32), (1, 3), 11) + z = common_layers.conv_gru( + tf.constant(x, dtype=tf.float32), (1, 3), 11, padding="LEFT") + session.run(tf.global_variables_initializer()) + res1 = session.run(y) + res2 = session.run(z) + self.assertEqual(res1.shape, (5, 7, 3, 11)) + self.assertEqual(res2.shape, (5, 7, 3, 11)) + + 
def testLayerNorm(self): + x = np.random.rand(5, 7, 11) + with self.test_session() as session: + y = common_layers.layer_norm(tf.constant(x, dtype=tf.float32), 11) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 11)) + + def testConvLSTM(self): + x = np.random.rand(5, 7, 11, 13) + with self.test_session() as session: + y = common_layers.conv_lstm(tf.constant(x, dtype=tf.float32), (1, 3), 13) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 11, 13)) + + def testPadToSameLength(self): + x1 = np.random.rand(5, 7, 11) + x2 = np.random.rand(5, 9, 11) + with self.test_session() as session: + a, b = common_layers.pad_to_same_length( + tf.constant(x1, dtype=tf.float32), tf.constant(x2, dtype=tf.float32)) + c, d = common_layers.pad_to_same_length( + tf.constant(x1, dtype=tf.float32), + tf.constant(x2, dtype=tf.float32), + final_length_divisible_by=4) + res1, res2 = session.run([a, b]) + res1a, res2a = session.run([c, d]) + self.assertEqual(res1.shape, (5, 9, 11)) + self.assertEqual(res2.shape, (5, 9, 11)) + self.assertEqual(res1a.shape, (5, 12, 11)) + self.assertEqual(res2a.shape, (5, 12, 11)) + + def testShiftLeft(self): + x1 = np.zeros((5, 7, 1, 11)) + x1[:, 0, :] = np.ones_like(x1[:, 0, :]) + expected = np.zeros((5, 7, 1, 11)) + expected[:, 1, :] = np.ones_like(expected[:, 1, :]) + with self.test_session() as session: + a = common_layers.shift_left(tf.constant(x1, dtype=tf.float32)) + actual = session.run(a) + self.assertAllEqual(actual, expected) + + def testConvStride2MultiStep(self): + x1 = np.random.rand(5, 32, 1, 11) + with self.test_session() as session: + a = common_layers.conv_stride2_multistep( + tf.constant(x1, dtype=tf.float32), 4, 16) + session.run(tf.global_variables_initializer()) + actual = session.run(a[0]) + self.assertEqual(actual.shape, (5, 2, 1, 16)) + + def testDeconvStride2MultiStep(self): + x1 = np.random.rand(5, 2, 1, 11) + with self.test_session() as session: + a = common_layers.deconv_stride2_multistep( + tf.constant(x1, dtype=tf.float32), 4, 16) + session.run(tf.global_variables_initializer()) + actual = session.run(a) + self.assertEqual(actual.shape, (5, 32, 1, 16)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py new file mode 100644 index 000000000..bf19a307b --- /dev/null +++ b/tensor2tensor/models/models.py @@ -0,0 +1,32 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Models defined in T2T. 
Imports here force registration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +# pylint: disable=unused-import +from tensor2tensor.models import attention_lm +from tensor2tensor.models import baseline +from tensor2tensor.models import bytenet +from tensor2tensor.models import multimodel +from tensor2tensor.models import neural_gpu +from tensor2tensor.models import slicenet +from tensor2tensor.models import transformer +from tensor2tensor.models import xception +# pylint: enable=unused-import diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py new file mode 100644 index 000000000..bcbf16995 --- /dev/null +++ b/tensor2tensor/models/multimodel.py @@ -0,0 +1,159 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MultiModel.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.models import common_layers +from tensor2tensor.models import slicenet +from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def experts(xs, moe_n1, moe_n2, hidden_size, filter_size, dp, ps, train): + """Mixture-of-Experts layer.""" + # Set up the hyperparameters for the gating networks. + primary_gating_hp = eu.NoisyTopKGatingParams() + primary_gating_hp.num_experts = moe_n1 + if moe_n2: + # Hierarchical MoE containing moe_n1 groups of moe_n2 experts. + assert moe_n2 > 1 + secondary_gating_hp = eu.NoisyTopKGatingParams() + secondary_gating_hp.num_experts = moe_n2 + else: + # Flat mixture of moe_n1 experts. + secondary_gating_hp = None + # Set up the hyperparameters for the expert networks. + # Each expert contains a hidden RELU layer of size filter_size + expert_hp = eu.FeedForwardExpertParams() + expert_hp.hidden_layer_sizes = [filter_size] + # Create the mixture of experts. + moe = eu.DistributedMixtureOfExperts(primary_gating_hp, secondary_gating_hp, + expert_hp, hidden_size, hidden_size, ps, + "moe") + # MoE expects input tensors to be 2d. Flatten out spatial dimensions. + xs_2d = dp(tf.reshape, xs, [[-1, hidden_size]] * dp.n) + # Call the MoE + moe_out_2d, importance, load, _, _ = moe.Eval( + dp.devices, xs_2d, train, summaries=False, identifiers=None) + # Reshape the output to the original shape. + moe_out = dp(tf.reshape, moe_out_2d, dp(tf.shape, xs)) + # These losses encourage equal load on the different experts. + loss = eu.CVSquared(importance) + eu.CVSquared(load) + + # Apply residual and normalize. 
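+ # CVSquared above is the squared coefficient of variation, Var(x) / mean(x)**2, + # a scale-invariant penalty that is minimized when every expert receives the + # same total importance and load. The helper below applies the usual residual + # pattern: add the MoE output back to its input, then layer-normalize the sum.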
+ def add_and_normalize(x, y): + return common_layers.layer_norm(x + y, hidden_size, name="moe_norm") + + return dp(add_and_normalize, xs, moe_out), loss + + +@registry.register_model +class MultiModel(t2t_model.T2TModel): + + def model_fn_body_sharded(self, sharded_features, train): + dp = self._data_parallelism + hparams = self._hparams + targets = sharded_features["targets"] + + def flatten(inputs): + return tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2) + + inputs = dp(flatten, sharded_features["inputs"]) + + # Encode inputs. + def encode_half(inputs, inputs_mask, hparams): + # Add timing and encode. + inputs = common_layers.add_timing_signal(inputs) + return slicenet.multi_conv_res(inputs, "SAME", "encoder1", + hparams.num_hidden_layers // 2, + hparams, train, mask=inputs_mask) + + target_space_emb = dp(slicenet.embed_target_space, + sharded_features["target_space_id"], + hparams.hidden_size) + inputs_pad = dp(slicenet.embedding_to_padding, inputs) + inputs_mask = dp(lambda x: 1.0 - x, inputs_pad) + inputs_encoded = dp(encode_half, inputs, inputs_mask, hparams) + with tf.variable_scope("experts_enc"): + inputs_encoded, expert_loss = experts( + inputs_encoded, hparams.moe_n1, hparams.moe_n2, hparams.hidden_size, + hparams.hidden_size, dp, self._ps_devices, train) + expert_loss *= hparams.moe_loss_coef + inputs_encoded = dp( + slicenet.multi_conv_res, inputs_encoded, "SAME", + "encoder2", hparams.num_hidden_layers, hparams, train, + mask=inputs_mask) + + # If we're just predicting a class, there is no use for a decoder; return. + target_modality = hparams.problems[self._problem_idx].target_modality + if "class_label_modality" in target_modality.name: + return inputs_encoded, tf.reduce_mean(expert_loss) + + # Do the middle part. + decoder_start, similarity_loss = dp( + slicenet.slicenet_middle, inputs_encoded, targets, + target_space_emb, inputs_mask, hparams, train) + + # Decode. + decoder_half = dp( + slicenet.multi_conv_res, + decoder_start, + "LEFT", + "decoder1", + hparams.num_hidden_layers // 2, + hparams, + train, + mask=inputs_mask, + source=inputs_encoded) + with tf.variable_scope("experts_dec"): + decoder_half, expert_dec_loss = experts( + decoder_half, hparams.moe_n1, hparams.moe_n2, hparams.hidden_size, + hparams.hidden_size, dp, self._ps_devices, train) + expert_loss += expert_dec_loss * hparams.moe_loss_coef + decoder_final = dp( + slicenet.multi_conv_res, + decoder_half, + "LEFT", + "decoder2", + hparams.num_hidden_layers // 2, + hparams, + train, + mask=inputs_mask, + source=inputs_encoded) + + total_loss = tf.reduce_mean(expert_loss) + tf.reduce_mean(similarity_loss) + return decoder_final, total_loss + + +@registry.register_hparams("multimodel1p8") +def multimodel_params1_p8(): + """Version for eight-problem runs.""" + hparams = slicenet.slicenet_params1() + hparams.problem_choice = "distributed" + hparams.attention_type = "simple" # TODO(lukaszkaiser): add transformer. + hparams.hidden_size = 1536 + hparams.moe_n1 = 120 + hparams.shared_embedding_and_softmax_weights = int(False) + hparams.dropout = 0.1 + hparams.attention_dropout = 0.1 + hparams.learning_rate_decay_scheme = "exp500k" + return hparams diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py new file mode 100644 index 000000000..8df682c5c --- /dev/null +++ b/tensor2tensor/models/multimodel_test.py @@ -0,0 +1,55 @@ +# Copyright 2017 Google Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for MultiModel.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import multimodel +from tensor2tensor.models import slicenet + +import tensorflow as tf + + +class MultiModelTest(tf.test.TestCase): + + def testMultiModel(self): + x = np.random.random_integers(0, high=255, size=(3, 5, 4, 3)) + y = np.random.random_integers(0, high=9, size=(3, 5, 1, 1)) + hparams = slicenet.slicenet_params1_tiny() + p_hparams = problem_hparams.image_cifar10(hparams) + hparams.problems = [p_hparams] + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = multimodel.MultiModel(hparams, p_hparams) + sharded_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 1, 1, 1, 10)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py new file mode 100644 index 000000000..39aa735e1 --- /dev/null +++ b/tensor2tensor/models/neural_gpu.py @@ -0,0 +1,123 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""The Neural GPU model and its variants.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def neural_gpu(inputs, hparams, train, name=None): + """The core Neural GPU.""" + with tf.variable_scope(name, "neural_gpu"): + + def step(state, inp): # pylint: disable=missing-docstring + x = tf.nn.dropout(state, 1.0 - hparams.dropout * tf.to_float(train)) + for layer in xrange(hparams.num_hidden_layers): + x = common_layers.conv_gru( + x, (hparams.kernel_height, hparams.kernel_width), + hparams.hidden_size, + name="cgru_%d" % layer) + # Padding input is zeroed-out in the modality; we detect it here by summing.
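+ # An all-zero (padded) input position has absolute sum ~0, so the + # comparison below flags, per batch entry, whether this step's input is + # padding; tf.where then carries the previous state through unchanged + # wherever that is the case.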
+ padding_inp = tf.less(tf.reduce_sum(tf.abs(inp), axis=[1, 2]), 0.00001) + new_state = tf.where(padding_inp, state, x) # No-op where inp is padding. + return new_state + + return tf.foldl( + step, + tf.transpose(inputs, [1, 0, 2, 3]), + initializer=inputs, + parallel_iterations=1, + swap_memory=True) + + +@registry.register_model +class NeuralGPU(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return neural_gpu(features["inputs"], self._hparams, train) + + +def diagonal_neural_gpu(inputs, hparams, train, name=None): + """Improved Neural GPU as in https://arxiv.org/abs/1702.08727.""" + with tf.variable_scope(name, "diagonal_neural_gpu"): + + def step(state_tup, inp): + """Single step of the improved Neural GPU.""" + state, _ = state_tup + x = state + for layer in xrange(hparams.num_hidden_layers): + x, new_loss = common_layers.diagonal_conv_gru( + x, (hparams.kernel_height, hparams.kernel_width), + hparams.hidden_size, + train, + dropout=hparams.dropout, + name="dcgru_%d" % layer) + # Padding input is zeroed-out in the modality; we detect it here by summing. + padding_inp = tf.less(tf.reduce_sum(tf.abs(inp), axis=[1, 2]), 0.00001) + new_state = tf.where(padding_inp, state, x) # No-op where inp is padding. + return new_state, new_loss + + final_state, losses = tf.scan( + step, + tf.transpose(inputs, [1, 0, 2, 3]), + initializer=(inputs, tf.constant(0.0)), + parallel_iterations=1, + swap_memory=True) + return final_state[0, :, :, :, :], 2.0 * tf.reduce_mean(losses) + + +@registry.register_model +class DiagonalNeuralGPU(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return diagonal_neural_gpu(features["inputs"], self._hparams, train) + + +@registry.register_hparams("neural_gpu1") +def neural_gpu_params1(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.batch_size = 1024 + hparams.num_hidden_layers = 1 + hparams.hidden_size = 256 + hparams.dropout = 0.1 + hparams.label_smoothing = 0.0 + hparams.clip_grad_norm = 10.0 + hparams.kernel_height = 3 + hparams.kernel_width = 1 + hparams.learning_rate_decay_scheme = "exp50k" + hparams.learning_rate = 0.02 + hparams.learning_rate_warmup_steps = 3000 + hparams.initializer_gain = 1.0 + hparams.weight_decay = 0.0 + hparams.num_sampled_classes = 0 + hparams.sampling_method = "argmax" + hparams.optimizer_adam_epsilon = 1e-6 + hparams.optimizer_adam_beta1 = 0.85 + hparams.optimizer_adam_beta2 = 0.997 + return hparams diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py new file mode 100644 index 000000000..0d4937a5d --- /dev/null +++ b/tensor2tensor/models/neural_gpu_test.py @@ -0,0 +1,62 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +"""Tests for Neural GPU.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import common_hparams +from tensor2tensor.models import neural_gpu + +import tensorflow as tf + + +class NeuralGPUTest(tf.test.TestCase): + + def testNeuralGPU(self): + hparams = common_hparams.basic_params1() + batch_size = 3 + input_length = 5 + target_length = input_length + input_vocab_size = 9 + target_vocab_size = 11 + p_hparams = problem_hparams.test_problem_hparams(hparams, input_vocab_size, + target_vocab_size) + inputs = -1 + np.random.random_integers( + input_vocab_size, size=(batch_size, input_length, 1, 1)) + targets = -1 + np.random.random_integers( + target_vocab_size, size=(batch_size, target_length, 1, 1)) + with self.test_session() as session: + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.int32) + } + model = neural_gpu.NeuralGPU(hparams, p_hparams) + shadred_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(shadred_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (batch_size, target_length, 1, 1, + target_vocab_size)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py new file mode 100644 index 000000000..a7e2623cc --- /dev/null +++ b/tensor2tensor/models/slicenet.py @@ -0,0 +1,391 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""SliceNet.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import zip # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_attention +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def get_norm(hparams): + """Get the normalizer function.""" + if hparams.normalizer_fn == "layer": + return lambda x, name: common_layers.layer_norm( # pylint: disable=g-long-lambda + x, hparams.hidden_size, name=name) + if hparams.normalizer_fn == "batch": + return tf.layers.batch_normalization + if hparams.normalizer_fn == "noam": + return common_layers.noam_norm + if hparams.normalizer_fn == "none": + return lambda x, name: x + raise ValueError("Parameter normalizer_fn must be one of: 'layer', 'batch'," + "'noam', 'none'.") + + +def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, + bias=None): + """Complete attention layer with preprocessing.""" + separabilities = [hparams.separability, hparams.separability] + if hparams.separability < 0: + separabilities = [hparams.separability - 1, hparams.separability] + targets_timed = common_layers.subseparable_conv_block( + common_layers.add_timing_signal(targets_shifted), + hparams.hidden_size, [((1, 1), (5, 1)), ((4, 1), (5, 1))], + normalizer_fn=norm_fn, + padding="LEFT", + separabilities=separabilities, + name="targets_time") + if hparams.attention_type == "transformer": + targets_timed = tf.squeeze(targets_timed, 2) + target_shape = tf.shape(targets_timed) + targets_segment = tf.zeros([target_shape[0], target_shape[1]]) + target_attention_bias = common_attention.attention_bias( + targets_segment, targets_segment, lower_triangular=True) + inputs_attention_bias = tf.zeros([ + tf.shape(inputs_encoded)[0], hparams.num_heads, + tf.shape(targets_segment)[1], + tf.shape(inputs_encoded)[1] + ]) + + attention_dropout = hparams.attention_dropout * tf.to_float(train) + qv = common_attention.multihead_attention( + targets_timed, + None, + target_attention_bias, + hparams.hidden_size, + hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + attention_dropout, + name="self_attention", + summaries=False) + qv = common_attention.multihead_attention( + qv, + inputs_encoded, + inputs_attention_bias, + hparams.hidden_size, + hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + attention_dropout, + name="encdec_attention", + summaries=False) + return tf.expand_dims(qv, 2) + elif hparams.attention_type == "simple": + targets_with_attention = common_layers.simple_attention( + targets_timed, inputs_encoded, bias=bias, summaries=False) + return norm_fn(targets_shifted + targets_with_attention, name="attn_norm") + + +def multi_conv_res(x, padding, name, layers, hparams, train, + mask=None, source=None): + """A stack of separable convolution blocks with residual connections.""" + with tf.variable_scope(name): + padding_bias = None + if mask is not None: + padding_bias = (1.0 - mask) * -1e9 # Bias to not attend to padding. + if padding == "LEFT": # Do not mask anything when left-padding. 
+ mask = None + if (hparams.kernel_scheme in _KERNEL_SCHEMES and + hparams.dilation_scheme in _DILATION_SCHEMES): + kernels = _KERNEL_SCHEMES[hparams.kernel_scheme] + dilations = _DILATION_SCHEMES[hparams.dilation_scheme] + dilations_and_kernels = list(zip(dilations, kernels)) + dilations_and_kernels1 = dilations_and_kernels[:2] + dilations_and_kernels2 = dilations_and_kernels[2:] + else: + k = (hparams.kernel_height, hparams.kernel_width) + k2 = (hparams.large_kernel_size, 1) + dilations_and_kernels1 = [((1, 1), k), ((1, 1), k)] + dilations_and_kernels2 = [((1, 1), k2), ((4, 4), k2)] + separabilities1 = [hparams.separability, hparams.separability] + separabilities2 = [hparams.separability] * len(dilations_and_kernels2) + if hparams.separability < 0: + separabilities1 = [hparams.separability - 1, hparams.separability] + separabilities2 = [ + hparams.separability - i + for i in reversed(range(len(dilations_and_kernels2))) + ] + norm_fn = get_norm(hparams) + for layer in xrange(layers): + with tf.variable_scope("layer_%d" % layer): + y = common_layers.subseparable_conv_block( + x, + hparams.hidden_size, + dilations_and_kernels1, + normalizer_fn=norm_fn, + padding=padding, + mask=mask, + separabilities=separabilities1, + name="residual1") + x += common_layers.subseparable_conv_block( + x + y, + hparams.hidden_size, + dilations_and_kernels2, + normalizer_fn=norm_fn, + padding=padding, + mask=mask, + separabilities=separabilities2, + name="residual2") + y + if source is not None and hparams.attention_type != "none": + x += attention(x, source, norm_fn, hparams, train, bias=padding_bias) + if mask is not None: + x *= mask + return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + + +def rank_loss(sentence_emb, image_emb, margin=0.2): + """Experimental rank loss, thanks to kkurach@ for the code.""" + with tf.name_scope("rank_loss"): + # Normalize first as this is assumed in cosine similarity later. + sentence_emb = tf.nn.l2_normalize(sentence_emb, 1) + image_emb = tf.nn.l2_normalize(image_emb, 1) + # Both sentence_emb and image_emb have size [batch, depth]. + scores = tf.matmul(image_emb, tf.transpose(sentence_emb)) # [batch, batch] + diagonal = tf.diag_part(scores) # [batch] + cost_s = tf.maximum(0.0, margin - diagonal + scores) # [batch, batch] + cost_im = tf.maximum( + 0.0, margin - tf.reshape(diagonal, [-1, 1]) + scores) # [batch, batch] + # Clear diagonals. + batch_size = tf.shape(sentence_emb)[0] + empty_diagonal_mat = tf.ones_like(cost_s) - tf.eye(batch_size) + cost_s *= empty_diagonal_mat + cost_im *= empty_diagonal_mat + return tf.reduce_mean(cost_s) + tf.reduce_mean(cost_im) + + +def similarity_cost(inputs_encoded, targets_encoded): + """Loss encouraging inputs to be more similar to their own targets than to others.""" + # This is a first, very simple version: handle variable length by padding + # to the same length and putting everything into the batch. A better way + # is needed. + x, y = common_layers.pad_to_same_length(inputs_encoded, targets_encoded) + depth = tf.shape(inputs_encoded)[3] + x, y = tf.reshape(x, [-1, depth]), tf.reshape(y, [-1, depth]) + return rank_loss(x, y) + + +def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, + hparams, train): + """Middle part of slicenet, connecting encoder and decoder.""" + norm_fn = get_norm(hparams) + + # Flatten targets and embed target_space_id.
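+ # targets_flat below is [batch, length, 1, hidden_size]; the target-space + # embedding is tiled across the batch so it can later serve as the pad + # value for the first position when the targets are shifted (shift_left + # below).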
+ targets_flat = tf.expand_dims(common_layers.flatten4d3d(targets), axis=2) + target_space_emb = tf.tile(target_space_emb, + [tf.shape(targets_flat)[0], 1, 1, 1]) + + # Calculate similarity loss (but don't run if not needed). + if len(hparams.problems) > 1 and hparams.sim_loss_mult > 0.00001: + targets_timed = common_layers.add_timing_signal(targets_flat) + extra_layers = int(hparams.num_hidden_layers * 1.5) + with tf.variable_scope(tf.get_variable_scope(), reuse=True): + targets_encoded = multi_conv_res(targets_timed, "SAME", "encoder", + extra_layers, hparams, train) + with tf.variable_scope("similarity_loss"): + similarity_loss = similarity_cost(inputs_encoded, targets_encoded) + similarity_loss *= hparams.sim_loss_mult + else: + similarity_loss = 0.0 + + # Use attention from each target to look at input and retrieve. + targets_shifted = common_layers.shift_left( + targets_flat, pad_value=target_space_emb) + if hparams.attention_type == "none": + targets_with_attention = tf.zeros_like(targets_shifted) + else: + inputs_padding_bias = (1.0 - mask) * -1e9 # Bias to not attend to padding. + targets_with_attention = attention( + targets_shifted, inputs_encoded, norm_fn, hparams, train, + bias=inputs_padding_bias) + + # Positional targets: merge attention and raw. + kernel = (hparams.kernel_height, hparams.kernel_width) + targets_merged = common_layers.subseparable_conv_block( + tf.concat([targets_with_attention, targets_shifted], axis=3), + hparams.hidden_size, [((1, 1), kernel)], + normalizer_fn=norm_fn, + padding="LEFT", + separability=4, + name="targets_merge") + + return targets_merged, similarity_loss + + +def embed_target_space(target_space_id, hidden_size): + target_space_emb = common_layers.embedding( + target_space_id, 32, hidden_size, name="target_space_embedding") + return tf.reshape(target_space_emb, [1, 1, 1, -1]) + + +def embedding_to_padding(emb): + """Input embeddings -> is_padding.""" + emb_sum = tf.reduce_sum(tf.abs(emb), axis=-1, keep_dims=True) + return tf.to_float(tf.equal(emb_sum, 0.0)) + + +def slicenet_internal(inputs, targets, target_space, + problem_idx, hparams, train): + """The slicenet model, main step used for training.""" + with tf.variable_scope("slicenet"): + # Flatten inputs and encode. + inputs = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2) + inputs_mask = 1.0 - embedding_to_padding(inputs) + inputs = common_layers.add_timing_signal(inputs) # Add position info. + target_space_emb = embed_target_space(target_space, hparams.hidden_size) + extra_layers = int(hparams.num_hidden_layers * 1.5) + inputs_encoded = multi_conv_res(inputs, "SAME", "encoder", extra_layers, + hparams, train, mask=inputs_mask) + target_modality_name = hparams.problems[problem_idx].target_modality.name + if "class_label_modality" in target_modality_name: + # If we're just predicting a class, there is no use for a decoder. + return inputs_encoded + # Do the middle part. + decoder_start, similarity_loss = slicenet_middle( + inputs_encoded, targets, target_space_emb, inputs_mask, hparams, train) + # Decode.
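+ # "LEFT" padding makes the decoder convolutions causal, so each position + # sees only earlier target positions; the encoder output is attended to + # through the `source` argument of multi_conv_res.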
+ decoder_final = multi_conv_res( + decoder_start, + "LEFT", + "decoder", + hparams.num_hidden_layers, + hparams, + train, + mask=inputs_mask, + source=inputs_encoded) + return decoder_final, tf.reduce_mean(similarity_loss) + + +@registry.register_model +class SliceNet(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return slicenet_internal(features["inputs"], features["targets"], + features["target_space_id"], self._problem_idx, + self._hparams, train) + +_KERNEL_SCHEMES = { + "3.3.3.3": [(3, 1), (3, 1), (3, 1), (3, 1)], + "3.7.7.7": [(3, 1), (7, 1), (7, 1), (7, 1)], + "3.7.15.15": [(3, 1), (7, 1), (15, 1), (15, 1)], + "3.7.15.31": [(3, 1), (7, 1), (15, 1), (31, 1)], + "3.7.15.31.63": [(3, 1), (7, 1), (15, 1), (31, 1), (63, 1)], +} +_DILATION_SCHEMES = { + "1.1.1.1.1": [(1, 1), (1, 1), (1, 1), (1, 1), (1, 1)], + "1.1.1.1": [(1, 1), (1, 1), (1, 1), (1, 1)], + "1.1.1.2": [(1, 1), (1, 1), (1, 1), (2, 1)], + "1.1.2.4": [(1, 1), (1, 1), (2, 1), (4, 1)], + "1.2.4.8": [(1, 1), (2, 1), (4, 1), (8, 1)], +} + + +@registry.register_hparams("slicenet1") +def slicenet_params1(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.batch_size = 1024 + hparams.hidden_size = 768 + hparams.dropout = 0.5 + hparams.symbol_dropout = 0.2 + hparams.label_smoothing = 0.1 + hparams.clip_grad_norm = 2.0 + hparams.num_hidden_layers = 4 + hparams.kernel_height = 3 + hparams.kernel_width = 1 + hparams.add_hparam("normalizer_fn", "layer") # New ones are added like this. + hparams.learning_rate_decay_scheme = "exp50k" + hparams.learning_rate = 0.05 + hparams.learning_rate_warmup_steps = 3000 + hparams.initializer_gain = 1.0 + hparams.weight_decay = 3.0 + hparams.num_sampled_classes = 0 + hparams.sampling_method = "argmax" + hparams.optimizer_adam_epsilon = 1e-6 + hparams.optimizer_adam_beta1 = 0.85 + hparams.optimizer_adam_beta2 = 0.997 + hparams.add_hparam("large_kernel_size", 15) + hparams.add_hparam("separability", -2) + # A dilation scheme, one of _DILATION_SCHEMES. + hparams.add_hparam("dilation_scheme", "1.1.1.1") + # A kernel scheme, one of _KERNEL_SCHEMES; overrides large_kernel_size. + hparams.add_hparam("kernel_scheme", "3.7.15.31") + hparams.add_hparam("audio_compression", 8) + hparams.add_hparam("moe_n1", 32) + hparams.add_hparam("moe_n2", 0) + hparams.add_hparam("moe_loss_coef", 1e-2) + hparams.add_hparam("imagenet_use_2d", int(True)) + # attention-related flags + hparams.add_hparam("attention_type", "simple") + hparams.add_hparam("num_heads", 8) + hparams.add_hparam("attention_key_channels", 0) + hparams.add_hparam("attention_value_channels", 0) + hparams.add_hparam("sim_loss_mult", 0.0) # Try 10.0 for experiments. 
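+ # sim_loss_mult scales the similarity (rank) loss computed in + # slicenet_middle; it only takes effect when training on more than one + # problem (see the len(hparams.problems) > 1 check there).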
+ hparams.add_hparam("attention_dropout", 0.2) + hparams.shared_embedding_and_softmax_weights = int(True) + return hparams + + +@registry.register_hparams("slicenet1noam") +def slicenet_params1_noam(): + """Version with Noam's decay scheme.""" + hparams = slicenet_params1() + hparams.learning_rate_decay_scheme = "noam" + hparams.learning_rate = 1.0 + hparams.learning_rate_warmup_steps = 4000 + hparams.initializer = "uniform_unit_scaling" + hparams.optimizer_adam_epsilon = 1e-9 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.98 + return hparams + + +@registry.register_hparams("slicenet1tiny") +def slicenet_params1_tiny(): + """Version for fast local runs.""" + hparams = slicenet_params1() + hparams.attention_type = "simple" + hparams.separability = 0 + hparams.hidden_size = 128 + hparams.num_hidden_layers = 2 + hparams.moe_n1 = 2 + hparams.batch_size = 512 + hparams.learning_rate_warmup_steps = 200 + return hparams + + +@registry.register_ranged_hparams("slicenet1") +def slicenet_range1(ranged_hparams): + """Small range of hyperparameters.""" + rhp = ranged_hparams + + hparams = slicenet_params1() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + + rhp.set_float("clip_grad_norm", 1.0, 10.0, scale=rhp.LOG_SCALE) + rhp.set_float("learning_rate", 0.02, 1.0, scale=rhp.LOG_SCALE) + rhp.set_float("optimizer_adam_beta2", 0.995, 0.998) + rhp.set_float("weight_decay", 1.0, 5.0) diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py new file mode 100644 index 000000000..bbeb3a284 --- /dev/null +++ b/tensor2tensor/models/slicenet_test.py @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for SliceNet.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import slicenet + +import tensorflow as tf + + +class SliceNetTest(tf.test.TestCase): + + def testSliceNet(self): + x = np.random.random_integers(0, high=255, size=(3, 5, 4, 3)) + y = np.random.random_integers(0, high=9, size=(3, 5, 1, 1)) + hparams = slicenet.slicenet_params1_tiny() + p_hparams = problem_hparams.image_cifar10(hparams) + hparams.problems = [p_hparams] + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = slicenet.SliceNet(hparams, p_hparams) + sharded_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 1, 1, 1, 10)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py new file mode 100644 index 000000000..379210d67 --- /dev/null +++ b/tensor2tensor/models/transformer.py @@ -0,0 +1,495 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""transformer (attention). + +encoder: [Self-Attention, Feed-forward] x n +decoder: [Self-Attention, Source-Target-Attention, Feed-forward] x n + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_attention +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class Transformer(t2t_model.T2TModel): + """Attention net. See file docstring.""" + + def model_fn_body(self, features, train): + # Remove dropout if not training + hparams = copy.copy(self._hparams) + if not train: + hparams.attention_dropout = 0. + hparams.relu_dropout = 0. + hparams.residual_dropout = 0. 
+ targets = features["targets"] + inputs = features.get("inputs") + target_space = features.get("target_space_id") + + inputs = tf.squeeze(inputs, 2) + targets = tf.squeeze(targets, 2) + + (encoder_input, encoder_attention_bias, _) = (transformer_prepare_encoder( + inputs, target_space, hparams)) + (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( + targets, hparams) + + def residual_fn(x, y): + return common_layers.layer_norm(x + tf.nn.dropout( + y, 1.0 - hparams.residual_dropout)) + + # encoder_input = tf.squeeze(encoder_input, 2) + # decoder_input = tf.squeeze(decoder_input, 2) + encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) + decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) + encoder_output = transformer_encoder(encoder_input, residual_fn, + encoder_attention_bias, hparams) + + decoder_output = transformer_decoder( + decoder_input, encoder_output, residual_fn, decoder_self_attention_bias, + encoder_attention_bias, hparams) + decoder_output = tf.expand_dims(decoder_output, 2) + + return decoder_output + + +def transformer_prepare_encoder(inputs, target_space, hparams): + """Prepare one shard of the model for the encoder. + + Args: + inputs: a Tensor. + target_space: a Tensor. + hparams: run hyperparameters + + Returns: + encoder_input: a Tensor, bottom of encoder stack + encoder_self_attention_bias: a Tensor, containing large negative values + to implement masked attention and possibly biases for diagonal + alignments + encoder_padding: a Tensor + """ + # Flatten inputs. + ishape_static = inputs.shape.as_list() + encoder_input = inputs + encoder_padding = common_attention.embedding_to_padding(encoder_input) + encoder_self_attention_bias = common_attention.attention_bias_ignore_padding( + encoder_padding) + # Append target_space_id embedding to inputs. + emb_target_space = common_layers.embedding( + target_space, 32, ishape_static[-1], name="target_space_embedding") + emb_target_space = tf.reshape(emb_target_space, [1, 1, -1]) + encoder_input += emb_target_space + if hparams.pos == "timing": + encoder_input = common_attention.add_timing_signal_1d(encoder_input) + return (encoder_input, encoder_self_attention_bias, encoder_padding) + + +def transformer_prepare_decoder(targets, hparams): + """Prepare one shard of the model for the decoder. + + Args: + targets: a Tensor. + hparams: run hyperparameters + + Returns: + decoder_input: a Tensor, bottom of decoder stack + decoder_self_attention_bias: a Tensor, containing large negative values + to implement masked attention and possibly biases for diagonal alignments + """ + decoder_self_attention_bias = ( + common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) + decoder_input = common_layers.shift_left_3d(targets) + if hparams.pos == "timing": + decoder_input = common_attention.add_timing_signal_1d(decoder_input) + return (decoder_input, decoder_self_attention_bias) + + +def transformer_encoder(encoder_input, + residual_fn, + encoder_self_attention_bias, + hparams, + name="encoder"): + """A stack of transformer layers. + + Args: + encoder_input: a Tensor + residual_fn: a function from (layer_input, layer_output) -> combined_output + encoder_self_attention_bias: bias Tensor for self-attention + (see common_attention.attention_bias()) + hparams: hyperparameters for model + name: a string + + Returns: + y: a Tensor + """ + x = encoder_input + # Summaries don't work in multi-problem setting yet.
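+ # so attention summaries are enabled only when a single problem is trained. + # Each layer below applies multi-head self-attention and then a + # position-wise feed-forward block (conv_hidden_relu), both wrapped in + # residual_fn.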
+ summaries = "problems" not in hparams.values() or len(hparams.problems) == 1 + with tf.variable_scope(name): + for layer in xrange(hparams.num_hidden_layers): + with tf.variable_scope("layer_%d" % layer): + x = residual_fn( + x, + common_attention.multihead_attention( + x, + None, + encoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + summaries=summaries, + name="encoder_self_attention")) + x = residual_fn(x, + common_layers.conv_hidden_relu( + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout)) + return x + + +def transformer_decoder(decoder_input, + encoder_output, + residual_fn, + decoder_self_attention_bias, + encoder_decoder_attention_bias, + hparams, + name="decoder"): + """A stack of transformer layers. + + Args: + decoder_input: a Tensor + encoder_output: a Tensor + residual_fn: a function from (layer_input, layer_output) -> combined_output + decoder_self_attention_bias: bias Tensor for self-attention + (see common_attention.attention_bias()) + encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention + (see common_attention.attention_bias()) + hparams: hyperparameters for model + name: a string + + Returns: + y: a Tensor + """ + x = decoder_input + # Summaries don't work in multi-problem setting yet. + summaries = "problems" not in hparams.values() or len(hparams.problems) == 1 + with tf.variable_scope(name): + for layer in xrange(hparams.num_hidden_layers): + with tf.variable_scope("layer_%d" % layer): + x = residual_fn( + x, + common_attention.multihead_attention( + x, + None, + decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + summaries=summaries, + name="decoder_self_attention")) + x = residual_fn( + x, + common_attention.multihead_attention( + x, + encoder_output, + encoder_decoder_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + summaries=summaries, + name="encdec_attention")) + x = residual_fn(x, + common_layers.conv_hidden_relu( + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout)) + return x + + +@registry.register_hparams +def transformer_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.hidden_size = 512 + hparams.batch_size = 4096 + hparams.max_length = 256 + hparams.dropout = 0.0 + hparams.clip_grad_norm = 0. # i.e. no gradient clipping + hparams.optimizer_adam_epsilon = 1e-9 + hparams.learning_rate_decay_scheme = "noam" + hparams.learning_rate = 0.1 + hparams.learning_rate_warmup_steps = 4000 + hparams.initializer_gain = 1.0 + hparams.num_hidden_layers = 6 + hparams.initializer = "uniform_unit_scaling" + hparams.weight_decay = 0.0 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.98 + hparams.num_sampled_classes = 0 + hparams.label_smoothing = 0.1 + hparams.shared_embedding_and_softmax_weights = int(True) + + hparams.add_hparam("filter_size", 2048) # Add new ones like this.
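+ # These defaults correspond to the base model of "Attention Is All You + # Need": 6 layers, hidden size 512, filter size 2048, 8 heads (set just + # below), Adam, and the noam learning-rate schedule.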
+ # attention-related flags + hparams.add_hparam("num_heads", 8) + hparams.add_hparam("attention_key_channels", 0) + hparams.add_hparam("attention_value_channels", 0) + hparams.add_hparam("attention_dropout", 0.0) + hparams.add_hparam("relu_dropout", 0.0) + hparams.add_hparam("pos", "timing") # timing, none + hparams.add_hparam("residual_dropout", 0.1) + hparams.add_hparam("nbr_decoder_problems", 1) + return hparams + + +@registry.register_hparams +def transformer_single_gpu(): + hparams = transformer_base() + hparams.batch_size = 8192 + hparams.learning_rate_warmup_steps = 16000 + hparams.batching_mantissa_bits = 2 + return hparams + + +@registry.register_hparams +def transformer_tiny(): + hparams = transformer_base() + hparams.hidden_size = 64 + hparams.filter_size = 128 + hparams.num_heads = 4 + return hparams + + +@registry.register_hparams +def transformer_l2(): + hparams = transformer_base() + hparams.num_hidden_layers = 2 + return hparams + + +@registry.register_hparams +def transformer_l4(): + hparams = transformer_base() + hparams.num_hidden_layers = 4 + return hparams + + +@registry.register_hparams +def transformer_l8(): + hparams = transformer_base() + hparams.num_hidden_layers = 8 + return hparams + + +@registry.register_hparams +def transformer_h1(): + hparams = transformer_base() + hparams.num_heads = 1 + return hparams + + +@registry.register_hparams +def transformer_h4(): + hparams = transformer_base() + hparams.num_heads = 4 + return hparams + + +@registry.register_hparams +def transformer_h16(): + hparams = transformer_base() + hparams.num_heads = 16 + return hparams + + +@registry.register_hparams +def transformer_h32(): + hparams = transformer_base() + hparams.num_heads = 32 + return hparams + + +@registry.register_hparams +def transformer_k128(): + hparams = transformer_base() + hparams.attention_key_channels = 128 + return hparams + + +@registry.register_hparams +def transformer_k256(): + hparams = transformer_base() + hparams.attention_key_channels = 256 + return hparams + + +@registry.register_hparams +def transformer_ff1024(): + hparams = transformer_base() + hparams.filter_size = 1024 + return hparams + + +@registry.register_hparams +def transformer_ff4096(): + hparams = transformer_base() + hparams.filter_size = 4096 + return hparams + + +@registry.register_hparams +def transformer_dr0(): + hparams = transformer_base() + hparams.residual_dropout = 0.0 + return hparams + + +@registry.register_hparams +def transformer_dr2(): + hparams = transformer_base() + hparams.residual_dropout = 0.2 + return hparams + + +@registry.register_hparams +def transformer_ls0(): + hparams = transformer_base() + hparams.label_smoothing = 0.0 + return hparams + + +@registry.register_hparams +def transformer_ls2(): + hparams = transformer_base() + hparams.label_smoothing = 0.2 + return hparams + + +@registry.register_hparams +def transformer_hs256(): + hparams = transformer_base() + hparams.hidden_size = 256 + return hparams + + +@registry.register_hparams +def transformer_hs1024(): + hparams = transformer_base() + hparams.hidden_size = 1024 + return hparams + + +@registry.register_hparams +def transformer_big_dr1(): + hparams = transformer_base() + hparams.hidden_size = 1024 + hparams.filter_size = 4096 + hparams.num_heads = 16 + hparams.residual_dropout = 0.1 + hparams.batching_mantissa_bits = 2 + return hparams + + +@registry.register_hparams +def transformer_big_enfr(): + hparams = transformer_big_dr1() + hparams.shared_embedding_and_softmax_weights = int(False) + 
hparams.filter_size = 8192 + hparams.residual_dropout = 0.1 + return hparams + + +@registry.register_hparams +def transformer_big_dr2(): + hparams = transformer_big_dr1() + hparams.residual_dropout = 0.2 + return hparams + + +@registry.register_hparams +def transformer_big_dr3(): + hparams = transformer_big_dr1() + hparams.residual_dropout = 0.3 + return hparams + + +@registry.register_hparams +def transformer_big_single_gpu(): + hparams = transformer_big_dr1() + hparams.learning_rate_warmup_steps = 16000 + hparams.optimizer_adam_beta2 = 0.998 + hparams.batching_mantissa_bits = 3 + return hparams + + +@registry.register_hparams +def transformer_parsing_base_dr6(): + """HParams for parsing on WSJ only.""" + hparams = transformer_base() + hparams.attention_dropout = 0.2 + hparams.residual_dropout = 0.2 + hparams.max_length = 512 + hparams.learning_rate_warmup_steps = 16000 + hparams.hidden_size = 1024 + hparams.learning_rate = 0.5 + hparams.shared_embedding_and_softmax_weights = int(False) + return hparams + + +@registry.register_hparams +def transformer_parsing_big(): + """HParams for parsing on WSJ semi-supervised.""" + hparams = transformer_big_dr1() + hparams.max_length = 512 + hparams.shared_source_target_embedding = int(False) + hparams.learning_rate_warmup_steps = 4000 + hparams.batch_size = 2048 + hparams.learning_rate = 0.5 + return hparams + + +@registry.register_ranged_hparams("transformer_big_single_gpu") +def transformer_range1(rhp): + """Small range of hyperparameters.""" + hparams = transformer_big_single_gpu() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + + rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) + rhp.set_float("initializer_gain", 0.5, 2.0) + rhp.set_float("optimizer_adam_beta2", 0.97, 0.99) + rhp.set_float("weight_decay", 0.0, 2.0) diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py new file mode 100644 index 000000000..1b43ce625 --- /dev/null +++ b/tensor2tensor/models/transformer_test.py @@ -0,0 +1,63 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +"""Tests for Transformer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import transformer + +import tensorflow as tf + + +class TransformerTest(tf.test.TestCase): + + def _testTransformer(self, net): + batch_size = 3 + input_length = 5 + target_length = 7 + vocab_size = 9 + hparams = transformer.transformer_tiny() + p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size, + vocab_size) + inputs = -1 + np.random.random_integers( + vocab_size, size=(batch_size, input_length, 1, 1)) + targets = -1 + np.random.random_integers( + vocab_size, size=(batch_size, target_length, 1, 1)) + with self.test_session() as session: + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = net(hparams, p_hparams) + shadred_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(shadred_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (batch_size, target_length, 1, 1, vocab_size)) + + def testTransformer(self): + self._testTransformer(transformer.Transformer) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py new file mode 100644 index 000000000..b6e271c36 --- /dev/null +++ b/tensor2tensor/models/xception.py @@ -0,0 +1,89 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xception.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_hparams +from tensor2tensor.models import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def residual_block(x, hparams, train): + """A stack of convolution blocks with residual connection.""" + k = (hparams.kernel_height, hparams.kernel_width) + dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] + y = common_layers.subseparable_conv_block( + x, + hparams.hidden_size, + dilations_and_kernels, + padding="SAME", + separability=0, + name="residual_block") + x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") + return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + + +def xception_internal(inputs, hparams, train): + """Xception body.""" + with tf.variable_scope("xception"): + cur = inputs + for i in xrange(hparams.num_hidden_layers): + with tf.variable_scope("layer_%d" % i): + cur = residual_block(cur, hparams, train) + return cur + + +@registry.register_model +class Xception(t2t_model.T2TModel): + + def model_fn_body(self, features, train): + return xception_internal(features["inputs"], self._hparams, train) + + +@registry.register_hparams +def xception_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.batch_size = 4096 + hparams.hidden_size = 768 + hparams.dropout = 0.2 + hparams.symbol_dropout = 0.2 + hparams.label_smoothing = 0.1 + hparams.clip_grad_norm = 2.0 + hparams.num_hidden_layers = 8 + hparams.kernel_height = 3 + hparams.kernel_width = 3 + hparams.learning_rate_decay_scheme = "exp50k" + hparams.learning_rate = 0.05 + hparams.learning_rate_warmup_steps = 3000 + hparams.initializer_gain = 1.0 + hparams.weight_decay = 3.0 + hparams.num_sampled_classes = 0 + hparams.sampling_method = "argmax" + hparams.optimizer_adam_epsilon = 1e-6 + hparams.optimizer_adam_beta1 = 0.85 + hparams.optimizer_adam_beta2 = 0.997 + hparams.add_hparam("imagenet_use_2d", True) + return hparams diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py new file mode 100644 index 000000000..106604659 --- /dev/null +++ b/tensor2tensor/models/xception_test.py @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xception tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import xception + +import tensorflow as tf + + +class XceptionTest(tf.test.TestCase): + + def testXception(self): + vocab_size = 9 + x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1)) + y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 1, 1, 1)) + hparams = xception.xception_base() + p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size, + vocab_size) + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + } + model = xception.Xception(hparams, p_hparams) + sharded_logits, _, _ = model.model_fn(features, True) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 5, 1, 1, vocab_size)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/__init__.py b/tensor2tensor/utils/__init__.py new file mode 100644 index 000000000..27d533abc --- /dev/null +++ b/tensor2tensor/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/tensor2tensor/utils/avg_checkpoints.py b/tensor2tensor/utils/avg_checkpoints.py new file mode 100644 index 000000000..4e5286f62 --- /dev/null +++ b/tensor2tensor/utils/avg_checkpoints.py @@ -0,0 +1,98 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Script to average values of variables in a list of checkpoint files.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +import six +from six.moves import zip # pylint: disable=redefined-builtin +import tensorflow as tf + +flags = tf.flags +FLAGS = flags.FLAGS + +flags.DEFINE_string("checkpoints", "", + "Comma-separated list of checkpoints to average.") +flags.DEFINE_string("prefix", "", + "Prefix (e.g., directory) to append to each checkpoint.") +flags.DEFINE_string("output_path", "/tmp/averaged.ckpt", + "Path to output the averaged checkpoint to.") + + +def checkpoint_exists(path): + return (tf.gfile.Exists(path) or tf.gfile.Exists(path + ".meta") or + tf.gfile.Exists(path + ".index")) + + +def main(unused_argv): + # Get the checkpoints list from flags and run some basic checks. + checkpoints = [c.strip() for c in FLAGS.checkpoints.split(",")] + checkpoints = [c for c in checkpoints if c] + if not checkpoints: + raise ValueError("No checkpoints provided for averaging.") + if flags.FLAGS.prefix: + checkpoints = [FLAGS.prefix + c for c in checkpoints] + checkpoints = [c for c in checkpoints if checkpoint_exists(c)] + if not checkpoints: + raise ValueError( + "None of the provided checkpoints exist. %s" % FLAGS.checkpoints) + + # Read variables from all checkpoints and average them. + tf.logging.info("Reading variables and averaging checkpoints:") + for c in checkpoints: + tf.logging.info("%s ", c) + var_list = tf.contrib.framework.list_variables(checkpoints[0]) + var_values, var_dtypes = {}, {} + for (name, shape) in var_list: + if not name.startswith("global_step"): + var_values[name] = np.zeros(shape) + for checkpoint in checkpoints: + reader = tf.contrib.framework.load_checkpoint(checkpoint) + for name in var_values: + tensor = reader.get_tensor(name) + var_dtypes[name] = tensor.dtype + var_values[name] += tensor + tf.logging.info("Read from checkpoint %s", checkpoint) + for name in var_values: # Average. + var_values[name] /= len(checkpoints) + + tf_vars = [ + tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[name]) + for v in var_values + ] + placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars] + assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] + global_step = tf.Variable( + 0, name="global_step", trainable=False, dtype=tf.int64) + saver = tf.train.Saver(tf.all_variables()) + + # Build a model consisting only of variables, set them to the average values. + with tf.Session() as sess: + sess.run(tf.initialize_all_variables()) + for p, assign_op, (name, value) in zip(placeholders, assign_ops, + six.iteritems(var_values)): + sess.run(assign_op, {p: value}) + # Use the built saver to save the averaged checkpoint. + saver.save(sess, flags.FLAGS.output_path, global_step=global_step) + + tf.logging.info("Averaged checkpoints saved in %s", flags.FLAGS.output_path) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py new file mode 100644 index 000000000..eacbf467f --- /dev/null +++ b/tensor2tensor/utils/beam_search.py @@ -0,0 +1,419 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implementation of beam search with penalties."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+import tensorflow as tf
+
+# Assuming EOS_ID is 1
+EOS_ID = 1
+# Default value for INF
+INF = 1. * 1e7
+
+
+def log_prob_from_logits(logits):
+  return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
+
+
+def compute_batch_indices(batch_size, beam_size):
+  """Computes the i'th coordinate that contains the batch index for gathers.
+
+  batch_pos is a tensor like [[0,0,0,0],[1,1,1,1],...]. It says which
+  batch the beam item is in. This will create the i of the i,j coordinate
+  needed for the gather.
+
+  Args:
+    batch_size: Batch size
+    beam_size: Size of the beam.
+  Returns:
+    batch_pos: [batch_size, beam_size] tensor of ids
+  """
+  batch_pos = tf.range(batch_size * beam_size) // beam_size
+  batch_pos = tf.reshape(batch_pos, [batch_size, beam_size])
+  return batch_pos
+
+
+def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags,
+                                beam_size, batch_size):
+  """Given sequences and scores, will gather the top k=beam_size sequences.
+
+  This function is used to grow alive and finished. It takes sequences,
+  scores, and flags, and returns the top k from sequences, scores_to_gather,
+  and flags based on the values in scores.
+
+  Args:
+    sequences: Tensor of sequences that we need to gather from.
+      [batch_size, beam_size, seq_length]
+    scores: Tensor of scores for each sequence in sequences.
+      [batch_size, beam_size]. We will use these to compute the topk.
+    scores_to_gather: Tensor of scores for each sequence in sequences.
+      [batch_size, beam_size]. We will return the gathered scores from here.
+      scores_to_gather is different from scores because for grow_alive we
+      need to return log_probs, while for grow_finished we need to return
+      the length-penalized scores.
+    flags: Tensor of bools for sequences that say whether a sequence has
+      reached EOS or not.
+    beam_size: int
+    batch_size: int
+  Returns:
+    Tuple of
+    (topk_seq [batch_size, beam_size, decode_length],
+     topk_gathered_scores [batch_size, beam_size],
+     topk_finished_flags [batch_size, beam_size])
+  """
+  _, topk_indexes = tf.nn.top_k(scores, k=beam_size)
+  # The next three steps are to create coordinates for tf.gather_nd to pull
+  # out the topk sequences from sequences based on scores.
+  # batch_pos is a tensor like [[0,0,0,0],[1,1,1,1],...]. It says which
+  # batch the beam item is in. This will create the i of the i,j coordinate
+  # needed for the gather.
+  batch_pos = compute_batch_indices(batch_size, beam_size)
+
+  # top_coordinates will give us the actual coordinates to do the gather.
+  # stacking will create a tensor of dimension batch * beam * 2, where the
+  # last dimension contains the i,j gathering coordinates.
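+  # For example, with batch_size=2 and beam_size=3, batch_pos is
+  # [[0, 0, 0], [1, 1, 1]]. If topk_indexes is [[2, 0, 1], [1, 2, 0]],
+  # stacking gives coordinates [[[0, 2], [0, 0], [0, 1]],
+  # [[1, 1], [1, 2], [1, 0]]], so the gather below selects
+  # sequences[i, topk_indexes[i, j]] for each position (i, j).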
+  top_coordinates = tf.stack([batch_pos, topk_indexes], axis=2)
+
+  # Gather up the highest scoring sequences
+  topk_seq = tf.gather_nd(sequences, top_coordinates)
+  topk_flags = tf.gather_nd(flags, top_coordinates)
+  topk_gathered_scores = tf.gather_nd(scores_to_gather, top_coordinates)
+  return topk_seq, topk_gathered_scores, topk_flags
+
+
+def beam_search(symbols_to_logits_fn,
+                initial_ids,
+                beam_size,
+                decode_length,
+                vocab_size,
+                alpha,
+                eos_id=EOS_ID):
+  """Beam search with length penalties.
+
+  Uses an interface specific to the sequence CNN models; requires a function
+  that can take the currently decoded symbols and return the logits for the
+  next symbol. The implementation is inspired by
+  https://arxiv.org/abs/1609.08144.
+
+  Args:
+    symbols_to_logits_fn: Interface to the model, to provide logits.
+      Should take [batch_size, decoded_ids] and return
+      [batch_size, vocab_size].
+    initial_ids: Ids to start off the decoding, this will be the first thing
+      handed to symbols_to_logits_fn (after expanding to beam size)
+      [batch_size]
+    beam_size: Size of the beam.
+    decode_length: Number of steps to decode for.
+    vocab_size: Size of the vocab, must equal the size of the logits
+      returned by symbols_to_logits_fn.
+    alpha: alpha for length penalty.
+    eos_id: ID for end of sentence.
+  Returns:
+    Tuple of
+    (decoded beams [batch_size, beam_size, decode_length],
+     decoding probabilities [batch_size, beam_size])
+  """
+  batch_size = tf.shape(initial_ids)[0]
+
+  # Assume initial_ids are prob 1.0
+  initial_log_probs = tf.constant([[0.] + [-float("inf")] * (beam_size - 1)])
+  # Expand to beam_size (batch_size, beam_size)
+  alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])
+
+  # Expand each batch to beam_size
+  alive_seq = tf.tile(tf.expand_dims(initial_ids, 1), [1, beam_size])
+  alive_seq = tf.expand_dims(alive_seq, 2)  # (batch_size, beam_size, 1)
+
+  # Finished will keep track of all the sequences that have finished so far
+  # Finished log probs will be negative infinity in the beginning
+  # finished_flags will keep track of booleans
+  finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
+  # Setting the scores of the initial to negative infinity.
+  finished_scores = tf.ones([batch_size, beam_size]) * -INF
+  finished_flags = tf.zeros([batch_size, beam_size], tf.bool)
+
+  def grow_finished(finished_seq, finished_scores, finished_flags, curr_seq,
+                    curr_scores, curr_finished):
+    """Given sequences and scores, will gather the top k=beam_size sequences.
+
+    Args:
+      finished_seq: Current finished sequences.
+        [batch_size, beam_size, current_decoded_length]
+      finished_scores: scores for each of these sequences.
+        [batch_size, beam_size]
+      finished_flags: finished bools for each of these sequences.
+        [batch_size, beam_size]
+      curr_seq: current topk sequence that has been grown by one position.
+        [batch_size, beam_size, current_decoded_length]
+      curr_scores: scores for each of these sequences. [batch_size, beam_size]
+      curr_finished: Finished flags for each of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Tuple of
+      (Topk sequences based on scores,
+       log probs of these sequences,
+       Finished flags of these sequences)
+    """
+    # First append a column of 0-ids to finished_seq so it matches curr_seq
+    # in length.
+    finished_seq = tf.concat(
+        [finished_seq,
+         tf.zeros([batch_size, beam_size, 1], tf.int32)], axis=2)
+
+    # Set the scores of the unfinished seq in curr_seq to large negative
+    # values
+    curr_scores += (1. - tf.to_float(curr_finished)) * -INF
+    # Concatenate the sequences and scores along the beam axis.
+    curr_finished_seq = tf.concat([finished_seq, curr_seq], axis=1)
+    curr_finished_scores = tf.concat([finished_scores, curr_scores], axis=1)
+    curr_finished_flags = tf.concat([finished_flags, curr_finished], axis=1)
+    return compute_topk_scores_and_seq(
+        curr_finished_seq, curr_finished_scores, curr_finished_scores,
+        curr_finished_flags, beam_size, batch_size)
+
+  def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished):
+    """Given sequences and scores, will gather the top k=beam_size sequences.
+
+    Args:
+      curr_seq: current topk sequence that has been grown by one position.
+        [batch_size, beam_size, i+1]
+      curr_scores: scores for each of these sequences. [batch_size, beam_size]
+      curr_log_probs: log probs for each of these sequences.
+        [batch_size, beam_size]
+      curr_finished: Finished flags for each of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Tuple of
+      (Topk sequences based on scores,
+       log probs of these sequences,
+       Finished flags of these sequences)
+    """
+    # Set the scores of the finished seq in curr_seq to large negative
+    # values
+    curr_scores += tf.to_float(curr_finished) * -INF
+    return compute_topk_scores_and_seq(curr_seq, curr_scores, curr_log_probs,
+                                       curr_finished, beam_size, batch_size)
+
+  def grow_topk(i, alive_seq, alive_log_probs):
+    r"""Inner beam search loop.
+
+    This function takes the current alive sequences, and grows them to topk
+    sequences where k = 2*beam_size. We use 2*beam_size because we could
+    have beam_size sequences that hit EOS, which would leave no alive
+    sequences to continue. With 2*beam_size, this will not happen. This
+    relies on the assumption that vocab_size > beam_size. If this is true,
+    we'll have at least beam_size non-EOS extensions if we extract the next
+    top 2*beam_size words.
+    The length penalty is ((5 + len(decode)) / 6) ** alpha, and scores are
+    log probs divided by it; please refer to
+    https://arxiv.org/abs/1609.08144.
+
+    Args:
+      i: loop index
+      alive_seq: Topk sequences decoded so far [batch_size, beam_size, i+1]
+      alive_log_probs: probabilities of these sequences.
+        [batch_size, beam_size]
+    Returns:
+      Tuple of
+      (Topk sequences extended by the next word,
+       The log probs of these sequences,
+       The scores with length penalty of these sequences,
+       Flags indicating which of these sequences have finished decoding)
+    """
+    # Get the logits for all the possible next symbols
+    flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])
+
+    # (batch_size * beam_size, decoded_length)
+    flat_logits = symbols_to_logits_fn(flat_ids)
+    logits = tf.reshape(flat_logits, (batch_size, beam_size, -1))
+
+    # Convert logits to normalized log probs
+    candidate_log_probs = log_prob_from_logits(logits)
+
+    # Multiply the probabilities by the current probabilities of the beam,
+    # i.e. add their log probs.
+    # (batch_size, beam_size, vocab_size) + (batch_size, beam_size, 1)
+    log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
+
+    length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)
+
+    curr_scores = log_probs / length_penalty
+    # Flatten out (beam_size, vocab_size) probs into a list of possibilities
+    flat_curr_scores = tf.reshape(curr_scores, [-1, beam_size * vocab_size])
+
+    topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores, k=beam_size * 2)
+
+    # Recovering the log probs because we will need to send them back
+    topk_log_probs = topk_scores * length_penalty
+
+    # Work out what beam the top probs are in.
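+    # For example, with beam_size=2 and vocab_size=3, the flattened id 5 in
+    # flat_curr_scores came from beam 5 // 3 = 1 and corresponds to vocab
+    # symbol 5 % 3 = 2.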
+    topk_beam_index = topk_ids // vocab_size
+    topk_ids %= vocab_size  # Unflatten the ids
+
+    # The next three steps are to create coordinates for tf.gather_nd to
+    # pull out the correct sequences from the ids that we need to grow.
+    # We will also use the coordinates to gather the booleans of the beam
+    # items that survived.
+    batch_pos = compute_batch_indices(batch_size, beam_size * 2)
+
+    # top beams will give us the actual coordinates to do the gather.
+    # stacking will create a tensor of dimension batch * beam * 2, where the
+    # last dimension contains the i,j gathering coordinates.
+    topk_coordinates = tf.stack([batch_pos, topk_beam_index], axis=2)
+
+    # Gather up the most probable 2*beam_size candidates, both the ids and
+    # the finished_in_alive bools.
+    topk_seq = tf.gather_nd(alive_seq, topk_coordinates)
+
+    # Append the most probable alive
+    topk_seq = tf.concat([topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
+
+    topk_finished = tf.equal(topk_ids, eos_id)
+
+    return topk_seq, topk_log_probs, topk_scores, topk_finished
+
+  def inner_loop(i, alive_seq, alive_log_probs, finished_seq, finished_scores,
+                 finished_flags):
+    """Inner beam search loop.
+
+    There are three groups of tensors: alive, finished, and topk.
+    The alive group contains information about the current alive sequences.
+    The topk group contains information about alive + topk current decoded
+    words. The finished group contains information about finished sentences,
+    that is, the ones that have decoded to EOS. These are what we return.
+    The general beam search algorithm is as follows:
+    While we haven't terminated (see the termination condition below):
+      1. Grow the current alive to get beam*2 topk sequences
+      2. Among the topk, keep the top beam_size ones that haven't reached
+         EOS into alive
+      3. Among the topk, keep the top beam_size ones that have reached EOS
+         into finished
+    Repeat
+    To make things simple with using fixed size tensors, we will end
+    up inserting unfinished sequences into finished in the beginning. To
+    stop that we add -INF to the scores of the unfinished sequences so that
+    when a true finished sequence does appear, it will have a higher score
+    than all the unfinished ones.
+
+    Args:
+      i: loop index
+      alive_seq: Topk sequences decoded so far [batch_size, beam_size, i+1]
+      alive_log_probs: probabilities of the beams. [batch_size, beam_size]
+      finished_seq: Current finished sequences.
+        [batch_size, beam_size, i+1]
+      finished_scores: scores for each of these sequences.
+        [batch_size, beam_size]
+      finished_flags: finished bools for each of these sequences.
+        [batch_size, beam_size]
+
+    Returns:
+      Tuple of
+      (Incremented loop index,
+       New alive sequences,
+       Log probs of the alive sequences,
+       New finished sequences,
+       Scores of the new finished sequences,
+       Flags indicating which sequences in finished have reached EOS)
+    """
+
+    # Each inner loop, we carry out three steps:
+    # 1. Get the current topk items.
+    # 2. Separate the ones that have finished from the ones that haven't.
+    # 3. Recompute the contents of finished based on scores.
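+    # For example, with beam_size=2, grow_topk proposes 4 candidates;
+    # grow_alive keeps the best 2 that have not finished (finished
+    # candidates are masked with -INF), while grow_finished merges any newly
+    # finished candidates into the running finished set and keeps the best 2
+    # overall.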
+    topk_seq, topk_log_probs, topk_scores, topk_finished = grow_topk(
+        i, alive_seq, alive_log_probs)
+    alive_seq, alive_log_probs, _ = grow_alive(topk_seq, topk_scores,
+                                               topk_log_probs, topk_finished)
+    finished_seq, finished_scores, finished_flags = grow_finished(
+        finished_seq, finished_scores, finished_flags, topk_seq, topk_scores,
+        topk_finished)
+
+    return (i + 1, alive_seq, alive_log_probs, finished_seq, finished_scores,
+            finished_flags)
+
+  def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
+                   finished_scores, finished_in_finished):
+    """Checking termination condition.
+
+    We terminate when we decoded up to decode_length, or when the lowest
+    scoring item in finished has a greater score than the highest-probability
+    item in alive divided by the maximum length penalty.
+
+    Args:
+      i: loop index
+      alive_log_probs: probabilities of the beams. [batch_size, beam_size]
+      finished_scores: scores for each of these sequences.
+        [batch_size, beam_size]
+      finished_in_finished: finished bools for each of these sequences.
+        [batch_size, beam_size]
+
+    Returns:
+      Bool.
+    """
+    max_length_penalty = tf.pow(((5. + tf.to_float(decode_length)) / 6.),
+                                alpha)
+    # The best possible score of the most likely alive sequence
+    lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty
+
+    # Now to compute the lowest score of a finished sequence in finished.
+    # If the sequence isn't finished, we multiply its score by 0. Since
+    # scores are all negative, taking the min will give us the score of the
+    # lowest finished item.
+    lowest_score_of_finished_in_finished = tf.reduce_min(
+        finished_scores * tf.to_float(finished_in_finished), axis=1)
+    # If none of the sequences have finished, then the min will be 0 and
+    # we have to replace it by -INF. The score of any seq in alive will be
+    # much higher than -INF and the termination condition will not be met.
+    lowest_score_of_finished_in_finished += (
+        (1. - tf.to_float(tf.reduce_any(finished_in_finished, 1))) * -INF)
+
+    bound_is_met = tf.reduce_all(
+        tf.greater(lowest_score_of_finished_in_finished,
+                   lower_bound_alive_scores))
+
+    return tf.logical_and(
+        tf.less(i, decode_length), tf.logical_not(bound_is_met))
+
+  (_, alive_seq, alive_log_probs, finished_seq, finished_scores,
+   finished_flags) = tf.while_loop(
+       _is_finished,
+       inner_loop, [
+           tf.constant(0), alive_seq, alive_log_probs, finished_seq,
+           finished_scores, finished_flags
+       ],
+       shape_invariants=[
+           tf.TensorShape([]),
+           tf.TensorShape([None, None, None]),
+           alive_log_probs.get_shape(),
+           tf.TensorShape([None, None, None]),
+           finished_scores.get_shape(),
+           finished_flags.get_shape()
+       ],
+       parallel_iterations=1,
+       back_prop=False)
+
+  alive_seq.set_shape((None, beam_size, None))
+  finished_seq.set_shape((None, beam_size, None))
+
+  # Accounting for corner case: It's possible that no sequence in alive for
+  # a particular batch item ever reached EOS. If tf.reduce_any(
+  # finished_flags, 1) is False for a batch index, no sequence for that
+  # batch index reached EOS, so we should just copy the contents of alive
+  # for that batch item. We need to do the same for the scores as well.
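+  # For example, if finished_flags is [False, False] for some batch item,
+  # tf.reduce_any is False there, and tf.where below falls back to the
+  # alive sequences and alive log probs for that item.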
+  finished_seq = tf.where(
+      tf.reduce_any(finished_flags, 1), finished_seq, alive_seq)
+  finished_scores = tf.where(
+      tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
+  return finished_seq, finished_scores
diff --git a/tensor2tensor/utils/beam_search_test.py b/tensor2tensor/utils/beam_search_test.py
new file mode 100644
index 000000000..33439b41f
--- /dev/null
+++ b/tensor2tensor/utils/beam_search_test.py
@@ -0,0 +1,281 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for tensor2tensor.beam_search."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+import numpy as np
+from tensor2tensor.utils import beam_search
+
+import tensorflow as tf
+
+
+class BeamSearchTest(tf.test.TestCase):
+
+  def testShapes(self):
+    batch_size = 2
+    beam_size = 3
+    vocab_size = 4
+    decode_length = 10
+
+    initial_ids = tf.constant([0, 0])  # GO
+
+    def symbols_to_logits(_):
+      # Just return random logits
+      return tf.random_uniform((batch_size * beam_size, vocab_size))
+
+    final_ids, final_probs = beam_search.beam_search(
+        symbols_to_logits, initial_ids, beam_size, decode_length, vocab_size,
+        0.)
+
+    self.assertEqual(final_ids.get_shape().as_list(), [None, beam_size, None])
+
+    self.assertEqual(final_probs.get_shape().as_list(), [None, beam_size])
+
+  def testComputeTopkScoresAndSeq(self):
+    batch_size = 2
+    beam_size = 3
+
+    sequences = tf.constant([[[2, 3], [4, 5], [6, 7], [19, 20]],
+                             [[8, 9], [10, 11], [12, 13], [80, 17]]])
+
+    scores = tf.constant([[-0.1, -2.5, 0., -1.5],
+                          [-100., -5., -0.00789, -1.34]])
+    flags = tf.constant([[True, False, False, True],
+                         [False, False, False, True]])
+
+    topk_seq, topk_scores, topk_flags = beam_search.compute_topk_scores_and_seq(
+        sequences, scores, scores, flags, beam_size, batch_size)
+
+    with self.test_session():
+      topk_seq = topk_seq.eval()
+      topk_scores = topk_scores.eval()
+      topk_flags = topk_flags.eval()
+
+    exp_seq = [[[6, 7], [2, 3], [19, 20]], [[12, 13], [80, 17], [10, 11]]]
+    exp_scores = [[0., -0.1, -1.5], [-0.00789, -1.34, -5.]]
+
+    exp_flags = [[False, True, True], [False, True, False]]
+    self.assertAllEqual(exp_seq, topk_seq)
+    self.assertAllClose(exp_scores, topk_scores)
+    self.assertAllEqual(exp_flags, topk_flags)
+
+  def testGreedyBatchOne(self):
+    batch_size = 1
+    beam_size = 1
+    vocab_size = 2
+    decode_length = 3
+
+    initial_ids = tf.constant([0] * batch_size)  # GO
+
+    # Test that beam search finds the most probable sequence.
+    # These probabilities represent the following search
+    #
+    #               G0 (0)
+    #                 / \
+    #                /   \
+    #               /     \
+    #              /       \
+    #           0(0.7)    1(0.3)
+    #             / \
+    #            /   \
+    #           /     \
+    #        0(0.4)  1(0.6)
+    #          /\
+    #         /  \
+    #        /    \
+    #     0(0.5) 1(0.5)
+    # and the following decoding probabilities
+    # 0000 - 0.7 * 0.4 * 0.5
+    # 0001 - 0.7 * 0.4 * 0.5
+    # 001 - 0.7 * 0.6 (Best)
+    # 01 - 0.3
+    #
+    # 001 is the most likely sequence under these probabilities.
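+    # With eos_id=1 and alpha=0 (no length penalty), "001" scores
+    # log(0.7 * 0.6) = log(0.42), which beats the immediate "01" (0.3) and
+    # any continuation of "000" (at most 0.7 * 0.4 * 0.5 = 0.14), so the
+    # search must return [0, 0, 1].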
+ probabilities = tf.constant([[[0.7, 0.3]], [[0.4, 0.6]], [[0.5, 0.5]]]) + + def symbols_to_logits(ids): + pos = tf.shape(ids)[1] + logits = tf.to_float(tf.log(probabilities[pos - 1, :])) + return logits + + final_ids, final_probs = beam_search.beam_search( + symbols_to_logits, + initial_ids, + beam_size, + decode_length, + vocab_size, + 0.0, + eos_id=1) + + with self.test_session(): + ids = final_ids.eval() + probs = final_probs.eval() + self.assertAllEqual([[[0, 0, 1]]], ids) + self.assertAllClose([[0.7 * 0.6]], np.exp(probs)) + + def testNotGreedyBeamTwo(self): + batch_size = 1 + beam_size = 2 + vocab_size = 3 + decode_length = 3 + + initial_ids = tf.constant([0] * batch_size) # GO + probabilities = tf.constant([[[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]], + [[0.4, 0.5, 0.1], [0.2, 0.4, 0.4]], + [[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]]]) + + def symbols_to_logits(ids): + pos = tf.shape(ids)[1] + logits = tf.to_float(tf.log(probabilities[pos - 1, :])) + return logits + + final_ids, final_probs = beam_search.beam_search( + symbols_to_logits, + initial_ids, + beam_size, + decode_length, + vocab_size, + 0.0, + eos_id=1) + + with self.test_session(): + ids = final_ids.eval() + probs = final_probs.eval() + self.assertAllEqual([[[0, 2, 1, 0], [0, 2, 0, 1]]], ids) + self.assertAllClose([[0.8 * 0.5, 0.8 * 0.4 * 0.9]], np.exp(probs)) + + def testGreedyWithCornerCase(self): + batch_size = 1 + beam_size = 1 + vocab_size = 3 + decode_length = 2 + + initial_ids = tf.constant([0] * batch_size) # GO + probabilities = tf.constant([[0.2, 0.1, 0.7], [0.4, 0.1, 0.5]]) + + def symbols_to_logits(ids): + pos = tf.shape(ids)[1] + logits = tf.to_float(tf.log(probabilities[pos - 1, :])) + return logits + + final_ids, final_probs = beam_search.beam_search( + symbols_to_logits, + initial_ids, + beam_size, + decode_length, + vocab_size, + 0.0, + eos_id=1) + + with self.test_session(): + ids = final_ids.eval() + probs = final_probs.eval() + self.assertAllEqual([[[0, 2, 2]]], ids) + self.assertAllClose([[0.7 * 0.5]], np.exp(probs)) + + def testNotGreedyBatchTwoBeamTwoWithAlpha(self): + batch_size = 2 + beam_size = 2 + vocab_size = 3 + decode_length = 3 + + initial_ids = tf.constant([0] * batch_size) # GO + # Probabilities for position * batch * beam * vocab + # Probabilities have been set such that with alpha = 3.5, the less probable + # but longer sequence will have a better score than the shorter sequence + # with higher log prob in batch 1, and the order will be reverse in batch + # 2. That is, the shorter sequence will still have a higher score in spite + # of the length penalty + probabilities = tf.constant([[[[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]], + [[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]], + [[[0.4, 0.5, 0.1], [0.2, 0.4, 0.4]], + [[0.3, 0.6, 0.1], [0.2, 0.4, 0.4]]], + [[[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]], + [[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]]]]) + + def symbols_to_logits(ids): + pos = tf.shape(ids)[1] + logits = tf.to_float(tf.log(probabilities[pos - 1, :])) + return logits + + final_ids, final_scores = beam_search.beam_search( + symbols_to_logits, + initial_ids, + beam_size, + decode_length, + vocab_size, + 3.5, + eos_id=1) + + with self.test_session(): + ids = final_ids.eval() + scores = final_scores.eval() + self.assertAllEqual([[[0, 2, 0, 1], [0, 2, 1, 0]], [[0, 2, 1, 0], + [0, 2, 0, 1]]], ids) + self.assertAllClose([[ + np.log(0.8 * 0.4 * 0.9) / (8. / 6.)**3.5, + np.log(0.8 * 0.5) / (7. / 6.)**3.5 + ], [ + np.log(0.8 * 0.6) / (7. / 6.)**3.5, + np.log(0.8 * 0.3 * 0.9) / (8. 
/ 6.)**3.5
+      ]], scores)
+
+  def testNotGreedyBeamTwoWithAlpha(self):
+    batch_size = 1
+    beam_size = 2
+    vocab_size = 3
+    decode_length = 3
+
+    initial_ids = tf.constant([0] * batch_size)  # GO
+    # Probabilities for position * batch * beam * vocab
+    # Probabilities have been set such that with alpha = 3.5, the less
+    # probable but longer sequence will have a better score than the shorter
+    # sequence with higher log prob.
+    probabilities = tf.constant([[[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],
+                                 [[0.4, 0.5, 0.1], [0.2, 0.4, 0.4]],
+                                 [[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]]])
+
+    def symbols_to_logits(ids):
+      pos = tf.shape(ids)[1]
+      logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
+      return logits
+
+    final_ids, final_scores = beam_search.beam_search(
+        symbols_to_logits,
+        initial_ids,
+        beam_size,
+        decode_length,
+        vocab_size,
+        3.5,
+        eos_id=1)
+
+    with self.test_session():
+      ids = final_ids.eval()
+      scores = final_scores.eval()
+      self.assertAllClose([[
+          np.log(0.8 * 0.4 * 0.9) / (8. / 6.)**3.5,
+          np.log(0.8 * 0.5) / (7. / 6.)**3.5
+      ]], scores)
+      self.assertAllEqual([[[0, 2, 0, 1], [0, 2, 1, 0]]], ids)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py
new file mode 100644
index 000000000..eb8749b3f
--- /dev/null
+++ b/tensor2tensor/utils/bleu_hook.py
@@ -0,0 +1,123 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""BLEU metric util used during eval for MT."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+
+# Dependency imports
+
+import numpy as np
+# pylint: disable=redefined-builtin
+from six.moves import xrange
+from six.moves import zip
+# pylint: enable=redefined-builtin
+
+import tensorflow as tf
+
+
+def _get_ngrams(segment, max_order):
+  """Extracts all n-grams up to a given maximum order from an input segment.
+
+  Args:
+    segment: text segment from which n-grams will be extracted.
+    max_order: maximum length in tokens of the n-grams returned by this
+        method.
+
+  Returns:
+    The Counter containing all n-grams up to max_order in segment
+    with a count of how many times each n-gram occurred.
+  """
+  ngram_counts = collections.Counter()
+  for order in xrange(1, max_order + 1):
+    for i in xrange(0, len(segment) - order + 1):
+      ngram = tuple(segment[i:i + order])
+      ngram_counts[ngram] += 1
+  return ngram_counts
+
+
+def compute_bleu(reference_corpus,
+                 translation_corpus,
+                 max_order=4,
+                 use_bp=True):
+  """Computes BLEU score of translated segments against one or more references.
+
+  Args:
+    reference_corpus: list of references for each translation. Each
+        reference should be tokenized into a list of tokens.
+    translation_corpus: list of translations to score. Each translation
+        should be tokenized into a list of tokens.
+    max_order: Maximum n-gram order to use when computing BLEU score.
+    use_bp: boolean, whether to apply brevity penalty.
+ + Returns: + BLEU score. + """ + reference_length = 0 + translation_length = 0 + bp = 1.0 + geo_mean = 0 + + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + precisions = [] + + for (references, translations) in zip(reference_corpus, translation_corpus): + reference_length += len(references) + translation_length += len(translations) + ref_ngram_counts = _get_ngrams(references, max_order) + translation_ngram_counts = _get_ngrams(translations, max_order) + + overlap = dict((ngram, + min(count, translation_ngram_counts[ngram])) + for ngram, count in ref_ngram_counts.items()) + + for ngram in overlap: + matches_by_order[len(ngram) - 1] += overlap[ngram] + for ngram in translation_ngram_counts: + possible_matches_by_order[len(ngram)-1] += translation_ngram_counts[ngram] + + precisions = [0] * max_order + for i in xrange(0, max_order): + if possible_matches_by_order[i] > 0: + precisions[i] = matches_by_order[i] / possible_matches_by_order[i] + else: + precisions[i] = 0.0 + + if max(precisions) > 0: + p_log_sum = sum(math.log(p) for p in precisions if p) + geo_mean = math.exp(p_log_sum/max_order) + + if use_bp: + ratio = translation_length / reference_length + bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0 + + bleu = geo_mean * bp + return np.float32(bleu) + + +def padded_bleu_score(predictions, + labels, **unused_kwargs): + """Bleu score computation between labels and predictions on non-0s.""" + outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) + # Convert the outputs and labels to a [batch_size, input_length] tensor. + outputs = tf.squeeze(outputs) + labels = tf.squeeze(labels) + + bleu = tf.py_func(compute_bleu, (labels, outputs), tf.float32) + return bleu, tf.constant(1.0) diff --git a/tensor2tensor/utils/bleu_hook_test.py b/tensor2tensor/utils/bleu_hook_test.py new file mode 100644 index 000000000..1838affd6 --- /dev/null +++ b/tensor2tensor/utils/bleu_hook_test.py @@ -0,0 +1,59 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for tensor2tensor.utils.bleu_hook.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.utils import bleu_hook + +import tensorflow as tf + + +class BleuHookTest(tf.test.TestCase): + + def testComputeBleuEqual(self): + translation_corpus = [[1, 2, 3]] + reference_corpus = [[1, 2, 3]] + bleu = bleu_hook.compute_bleu(reference_corpus, translation_corpus) + actual_bleu = 1.0 + self.assertEqual(bleu, actual_bleu) + + def testComputeNotEqual(self): + translation_corpus = [[1, 2, 3, 4]] + reference_corpus = [[5, 6, 7, 8]] + bleu = bleu_hook.compute_bleu(reference_corpus, translation_corpus) + actual_bleu = 0.0 + self.assertEqual(bleu, actual_bleu) + + def testComputeMultipleBatch(self): + translation_corpus = [[1, 2, 3, 4], [5, 6, 7, 0]] + reference_corpus = [[1, 2, 3, 4], [5, 6, 7, 10]] + bleu = bleu_hook.compute_bleu(reference_corpus, translation_corpus) + actual_bleu = 0.7231 + self.assertAllClose(bleu, actual_bleu, atol=1e-03) + + def testComputeMultipleNgrams(self): + reference_corpus = [[1, 2, 1, 13], [12, 6, 7, 4, 8, 9, 10]] + translation_corpus = [[1, 2, 1, 3], [5, 6, 7, 4]] + bleu = bleu_hook.compute_bleu(reference_corpus, translation_corpus) + actual_bleu = 0.486 + self.assertAllClose(bleu, actual_bleu, atol=1e-03) + +if __name__ == '__main__': + tf.test.main() diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py new file mode 100644 index 000000000..0080ecaa6 --- /dev/null +++ b/tensor2tensor/utils/data_reader.py @@ -0,0 +1,346 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data reader module.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os + +# Dependency imports + +import six +from six.moves import zip # pylint: disable=redefined-builtin + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import common_layers + +import tensorflow as tf + + +def examples_queue(data_sources, + data_fields_to_features, + training, + capacity=32, + data_items_to_decoders=None, + data_items_to_decode=None): + """Contruct a queue of training or evaluation examples. + + This function will create a reader from files given by data_sources, + then enqueue the tf.Examples from these files, shuffling if training + is true, and finally parse these tf.Examples to tensors. 
+ + The dictionary data_fields_to_features for an image dataset can be this: + + data_fields_to_features = { + 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), + 'image/format': tf.FixedLenFeature((), tf.string, default_value='raw'), + 'image/class/label': tf.FixedLenFeature( + [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)), + } + + and for a simple algorithmic dataset with variable-length data it is this: + + data_fields_to_features = { + 'inputs': tf.VarLenFeature(tf.int64), + 'targets': tf.VarLenFeature(tf.int64), + } + + The data_items_to_decoders dictionary argument can be left as None if there + is no decoding to be performed. But, e.g. for images, it should be set so that + the images are decoded from the features, e.g., like this for MNIST: + + data_items_to_decoders = { + 'image': tfexample_decoder.Image( + image_key = 'image/encoded', + format_key = 'image/format', + shape=[28, 28], + channels=1), + 'label': tfexample_decoder.Tensor('image/class/label'), + } + + These arguments are compatible with the use of tf.contrib.slim.data module, + see there for more documentation. + + Args: + data_sources: a list or tuple of sources from which the data will be read, + for example [/path/to/train@128, /path/to/train2*, /tmp/.../train3*] + data_fields_to_features: a dictionary from data fields in the data sources + to features, such as tf.VarLenFeature(tf.int64), see above for examples. + training: a Boolean, whether to read for training or evaluation. + capacity: integer, queue capacity; set to 2 * max_batch_size or more. + data_items_to_decoders: a dictionary mapping data items (that will be + in the returned result) to decoders that will decode them using features + defined in data_fields_to_features; see above for examples. By default + (if this is None), we grab the tensor from every feature. + data_items_to_decode: a subset of data items that will be decoded; + by default (if this is None), we decode all items. + + Returns: + A dictionary mapping each data_field to a corresponding 1D int64 tensor + read from the created queue. + + Raises: + ValueError: if no files are found with the provided data_prefix or no data + fields were provided. + """ + with tf.name_scope("examples_queue"): + # Read serialized examples using slim parallel_reader. + num_epochs = None if training else 1 + _, example_serialized = tf.contrib.slim.parallel_reader.parallel_read( + data_sources, + tf.TFRecordReader, + num_epochs=num_epochs, + shuffle=training, + capacity=2 * capacity, + min_after_dequeue=capacity, + num_readers=4 if training else 1) + + if data_items_to_decoders is None: + data_items_to_decoders = { + field: tf.contrib.slim.tfexample_decoder.Tensor(field) + for field in data_fields_to_features + } + + decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder( + data_fields_to_features, data_items_to_decoders) + + if data_items_to_decode is None: + data_items_to_decode = list(data_items_to_decoders) + + decoded = decoder.decode(example_serialized, items=data_items_to_decode) + return { + field: tensor + for (field, tensor) in zip(data_items_to_decode, decoded) + } + + +def input_pipeline(data_file_pattern, capacity, mode): + """Input pipeline, returns a dictionary of tensors from queues.""" + # Read from image TFRecords if the file has "image" in its name. 
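+  # For example, a pattern like ".../image_mnist-train*" takes the image
+  # branch below (with channels=1 because of "mnist"), a pattern containing
+  # "audio" takes the audio branch, and anything else falls through to the
+  # plain inputs/targets VarLenFeature branch.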
+ if data_file_pattern and "image" in data_file_pattern: + data_fields = { + "image/encoded": tf.FixedLenFeature((), tf.string), + "image/format": tf.FixedLenFeature((), tf.string), + "image/class/label": tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = { + "inputs": + tf.contrib.slim.tfexample_decoder.Image( + image_key="image/encoded", + format_key="image/format", + channels=1 if "mnist" in data_file_pattern else 3), + "targets": + tf.contrib.slim.tfexample_decoder.Tensor("image/class/label"), + } + elif data_file_pattern and "audio" in data_file_pattern: + data_type = tf.int64 if "timit" in data_file_pattern else tf.float32 + data_fields = { + "inputs": tf.VarLenFeature(data_type), + "audio/sample_count": tf.FixedLenFeature((), tf.int64), + "audio/sample_width": tf.FixedLenFeature((), tf.int64), + "targets": tf.VarLenFeature(tf.int64), + } + data_items_to_decoders = None + else: + data_fields = { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = None + + # Create placeholders for input, rather than reading data from disk. + if data_file_pattern is None: + feature_map = {} + for (field, tp) in data_fields: + if field != "targets": + feature_map[field] = tf.placeholder( + dtype=tp, shape=[None] * 4, name=field) + return feature_map + + # Now the non-trivial case construction. + examples = examples_queue( + [data_file_pattern], + data_fields, + training=(mode == tf.contrib.learn.ModeKeys.TRAIN), + capacity=capacity, + data_items_to_decoders=data_items_to_decoders) + + if "image" in data_file_pattern: + # Small single-example pre-processing for images. + examples["inputs"] = tf.cast(examples["inputs"], tf.int64) + if ("image_imagenet" in data_file_pattern or + "image_mscoco" in data_file_pattern): + # For imagnet/coco, resize images to 299x299 as is standard. + def resize(img): + return tf.to_int64(tf.image.resize_images(img, [299, 299])) + + def preprocess(img): + img = tf.image.resize_images(img, [360, 360]) + img = common_layers.image_augmentation(tf.to_float(img) / 255.) + return tf.to_int64(img * 255.) + + inputs = examples["inputs"] + if mode == tf.contrib.learn.ModeKeys.TRAIN: + examples["inputs"] = tf.cond( # Preprocess 80% of the time. + tf.less(tf.random_uniform([]), 0.8), + lambda img=inputs: preprocess(img), + lambda img=inputs: resize(img)) + else: + examples["inputs"] = tf.to_int64(resize(inputs)) + elif "audio" in data_file_pattern: + # Reshape audio to proper shape + sample_count = tf.to_int32(examples.pop("audio/sample_count")) + sample_width = tf.to_int32(examples.pop("audio/sample_width")) + channel_count = 1 + examples["inputs"] = tf.reshape(examples["inputs"], + [sample_count, sample_width, channel_count]) + if "wsj" in data_file_pattern: + examples["inputs"] = tf.bitcast(examples["inputs"], tf.int32) + elif "a2q_20161229" in data_file_pattern: + # we forgot the EOS when we preprocessed this data. + examples["targets"] = tf.concat([examples["targets"], [1]], 0) + + # We do not want int64s as they do are not supported on GPUs. + return {k: tf.to_int32(v) for (k, v) in six.iteritems(examples)} + + +def batch_examples(examples, batching_scheme): + """Given a queue of examples, create batches of examples with similar lengths. + + We assume that examples is a dictionary with string keys and tensor values, + possibly coming from a queue, e.g., constructed by examples_queue above. + Each tensor in examples is assumed to be 1D. We will put tensors of similar + length into batches togeter. 
+  We return a dictionary with the same keys as examples, and with values
+  being batches of size batch_size. If elements have different lengths,
+  they are padded with 0s. This function is based on
+  tf.contrib.training.bucket_by_sequence_length so see there for details.
+
+  For example, if examples is a queue containing [1, 2, 3] and [4], then
+  this function with batch_size=2 will return a batch [[1, 2, 3], [4, 0, 0]].
+
+  Args:
+    examples: a dictionary with string keys and 1D tensor values.
+    batching_scheme: a dictionary containing
+      "boundaries": a list of integers for the boundaries that will be
+        used for bucketing; see tf.contrib.training.bucket_by_sequence_length
+        for more details.
+      "batch_sizes": a list of batch sizes corresponding to the buckets
+      "max_length": an integer; we drop sequences which are longer.
+
+  Returns:
+    A dictionary with the same keys as examples and with values being
+    batches of examples padded with 0s, i.e., [batch_size x length] tensors.
+  """
+  with tf.name_scope("batch_examples"):
+    # The queue to bucket on will be chosen based on maximum length.
+    max_length = 0
+    for v in examples.values():
+      # For images the sequence length is the size of the spatial dimensions.
+      sequence_length = (tf.shape(v)[0] if len(v.get_shape()) < 3 else
+                         tf.shape(v)[0] * tf.shape(v)[1])
+      max_length = tf.maximum(max_length, sequence_length)
+    (_, outputs) = tf.contrib.training.bucket_by_sequence_length(
+        max_length,
+        examples,
+        batching_scheme["batch_sizes"],
+        [b + 1 for b in batching_scheme["boundaries"]],
+        capacity=2,  # Number of full batches to store, we don't need many.
+        bucket_capacities=[2 * b for b in batching_scheme["batch_sizes"]],
+        dynamic_pad=True,
+        keep_input=(max_length <= batching_scheme["max_length"]))
+    return outputs
+
+
+def bucket_boundaries(max_length, min_length=8, mantissa_bits=2):
+  """A default set of length-bucket boundaries."""
+  x = min_length
+  boundaries = []
+  while x < max_length:
+    boundaries.append(x)
+    x += 2**max(0, int(math.log(x, 2)) - mantissa_bits)
+  return boundaries
+
+
+def hparams_to_batching_scheme(hparams,
+                               drop_long_sequences=False,
+                               shard_multiplier=1,
+                               length_multiplier=1):
+  """A batching scheme based on model hyperparameters.
+
+  Every batch contains a number of sequences divisible by `shard_multiplier`.
+
+  If `drop_long_sequences` is True, then sequences longer than
+  `hparams.batch_size` are dropped. This prevents generating batches with
+  more than the usual number of tokens, which can cause out-of-memory errors.
+
+  Args:
+    hparams: an HParams object.
+    drop_long_sequences: a boolean.
+    shard_multiplier: an integer increasing the batch_size to suit splitting
+      across datashards.
+    length_multiplier: an integer multiplier that is used to increase the
+      batch sizes and sequence length tolerance.
+
+  Returns:
+    a dictionary
+  """
+  max_length = hparams.max_length or hparams.batch_size
+  boundaries = bucket_boundaries(
+      max_length, mantissa_bits=hparams.batching_mantissa_bits)
+  batch_sizes = [
+      max(1, hparams.batch_size // length)
+      for length in boundaries + [max_length]
+  ]
+  batch_sizes = [b * shard_multiplier for b in batch_sizes]
+  max_length *= length_multiplier
+  boundaries = [boundary * length_multiplier for boundary in boundaries]
+  return {
+      "boundaries": boundaries,
+      "batch_sizes": batch_sizes,
+      "max_length": (max_length if drop_long_sequences else 10**9)
+  }
+
+
+def constant_batching_scheme(constant_batch_size_in_sequences):
+  """A batching scheme with constant batch size.
+ + Args: + constant_batch_size_in_sequences: an integer + + Returns: + a dictionary + """ + boundaries = bucket_boundaries(1024) + batch_sizes = [constant_batch_size_in_sequences] * (1 + len(boundaries)) + return { + "boundaries": boundaries, + "batch_sizes": batch_sizes, + "max_length": 10**9 + } + + +def get_datasets(problems, data_dir, mode): + """Return the location of a dataset for a given mode.""" + datasets = [] + for problem in problems.split("-"): + problem, _, _ = problem_hparams.parse_problem_name(problem) + path = os.path.join(data_dir, problem) + if mode == tf.contrib.learn.ModeKeys.TRAIN: + datasets.append("%s-train*" % path) + else: + datasets.append("%s-dev*" % path) + return datasets diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py new file mode 100644 index 000000000..883a3673a --- /dev/null +++ b/tensor2tensor/utils/data_reader_test.py @@ -0,0 +1,147 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data reader test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tempfile + +# Dependency imports + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.utils import data_reader + +import tensorflow as tf + + +class DataReaderTest(tf.test.TestCase): + + def testExamplesQueue(self): + tf.set_random_seed(1) + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + tmp_file_name = os.path.basename(tmp_file_path) + + # Generate a file with 100 examples. + def test_generator(): + for i in xrange(100): + yield {"inputs": [i], "targets": [i], "floats": [i + 0.5]} + + generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir) + self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001")) + + examples_train = data_reader.examples_queue( + [tmp_file_path + "*"], { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + }, + training=True) + examples_eval = data_reader.examples_queue( + [tmp_file_path + "*"], { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64), + "floats": tf.VarLenFeature(tf.float32) + }, + training=False) + with tf.train.MonitoredSession() as session: + # Evaluation data comes in the same order as in the file, check 10. + for i in xrange(10): + examples = session.run(examples_eval) + self.assertEqual(len(examples["inputs"]), 1) + self.assertEqual(len(examples["targets"]), 1) + self.assertEqual(examples["inputs"][0], i) + self.assertEqual(examples["targets"][0], i) + self.assertEqual(examples["floats"][0], i + 0.5) + # Training data is shuffled. 
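+      # Reading 10 examples is enough here: with training=True the queue
+      # dequeues from a shuffle buffer (min_after_dequeue=capacity, 32 by
+      # default), so an exactly in-order prefix 0..9 is vanishingly unlikely.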
+ is_shuffled = False + for i in xrange(10): + examples = session.run(examples_train) + self.assertEqual(len(examples["inputs"]), 1) + self.assertEqual(len(examples["targets"]), 1) + self.assertEqual(examples["inputs"][0], examples["targets"][0]) + if examples["inputs"][0] != i: + is_shuffled = True + self.assertTrue(is_shuffled) + + # Clean up. + os.remove(tmp_file_path + "-00000-of-00001") + os.remove(tmp_file_path) + + def testBatchExamples(self): + tf.set_random_seed(1) + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + tmp_file_name = os.path.basename(tmp_file_path) + + # Generate a file with 100 examples, n-th example of length n + 1. + def test_generator(): + for i in xrange(100): + yield {"inputs": [i + 1 for _ in xrange(i + 1)], "targets": [i + 1]} + + generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir) + self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001")) + + examples_train = data_reader.examples_queue([tmp_file_path + "*"], { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + }, True) + batch_train = data_reader.batch_examples(examples_train, 4) + examples_eval = data_reader.examples_queue([tmp_file_path + "*"], { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.VarLenFeature(tf.int64) + }, False) + batch_eval = data_reader.batch_examples(examples_eval, 2) + session, coord = tf.Session(), tf.train.Coordinator() + with session.as_default(): + tf.train.start_queue_runners(coord=coord) + + # Evaluation data comes in the same order as in the file. + # The first batch will be inputs=[[1, 0], [2, 2]], targets=[[1], [2]]. + examples = session.run(batch_eval) + self.assertAllClose(examples["inputs"], np.array([[1, 0], [2, 2]])) + self.assertAllClose(examples["targets"], np.array([[1], [2]])) + # Check the second batch too. + examples = session.run(batch_eval) + self.assertAllClose(examples["inputs"], + np.array([[3, 3, 3, 0], [4, 4, 4, 4]])) + self.assertAllClose(examples["targets"], np.array([[3], [4]])) + + # Training data is shuffled but shouldn't have too many pads. + for _ in xrange(10): + examples = session.run(batch_train) + inputs = examples["inputs"] + # Only 3 out of 4 examples in a batch have padding zeros at all. + pad_per_example = (inputs.size - np.count_nonzero(inputs)) // 3 + # Default bucketing is in steps of 8 until 64 and 32 later. + if int(max(examples["targets"])) < 64: + self.assertLess(pad_per_example, 8) + else: + self.assertLess(pad_per_example, 32) + + # Clean up. + coord.request_stop() + coord.join() + os.remove(tmp_file_path + "-00000-of-00001") + os.remove(tmp_file_path) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py new file mode 100644 index 000000000..8d3d1d50c --- /dev/null +++ b/tensor2tensor/utils/expert_utils.py @@ -0,0 +1,1284 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for creating Sparsely-Gated Mixture-of-Experts Layers. + +See the most recent draft of our ICLR paper: +https://openreview.net/pdf?id=B1ckMDqlg +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import zip # pylint: disable=redefined-builtin +import tensorflow as tf + +from tensorflow.python.framework import function + + +def NoisyTopKGatingParams(): + """Hyperparams defining NoisyTopK Gating Network. + + Returns: + a tf.contrib.training.HParams object + """ + return tf.contrib.training.HParams( + gating_class=NoisyTopKGating, + num_experts=16, # The number of experts + k=2, # 'The number of experts to use per example + input_size=None, # size of input to MoE. Set by MoE class + dtype=tf.float32, # floating point data type + initializer=tf.zeros_initializer(), # initializer for weight matrices + noisy_gating=True, # Add tunable noise (necessary for load-balancing) + noise_epsilon=1e-2, # Added to noise stddev for numerical stability + ) + + +def FeedForwardExpertParams(): + """Hyperparameters defining feed-forward expert networks. + + Returns: + a tf.contrib.training.HParams object + """ + return tf.contrib.training.HParams( + # The class that implements the expert network + expert_class=FeedForwardExpert, + input_size=None, # Size of input to MoE. Set by MoE class. + # List of hidden layer sizes, or None for no hidden layers. + # The length of this list determines the number of hidden layers + hidden_layer_sizes=None, + output_size=None, # Size of output from MoE. Set by MoE class. + dtype=tf.float32, # Floating point data type) + # Activation function applied at each hidden layer) + hidden_activation=tf.nn.relu, + initializer=None, # Optional initializer for weight matrices.) + # If autoscale=True, At each hidden/output layer, multiply by + # rsqrt(prev_layer_size / input_size). This scaling happens + # before application of hidden_activation) + autoscale=True,) + + +def _SetInputOutputSizes(hp, input_size, output_size): + """Fill in the input_size and output_size hyperparameters. + + This is used by LocalMixtureOfExperts and DistributedMixtureOfExperts to + fill in the input_size and output_size on the gating parameters and expert + parameters so that the user does not have to set them in multiple places. + + Args: + hp: a hyperparameters + input_size: an integer + output_size: an integer + """ + if hp.input_size is None: + hp.input_size = input_size + else: + assert hp.input_size == input_size + if output_size is not None: + if hp.output_size is None: + hp.output_size = output_size + else: + assert hp.output_size == output_size + + +class FeedForwardExpert(object): + """An object representing a feed forward network (used as an expert). + """ + + def __init__(self, hp, name): + """Creates a FeedForwardExpert. + + Args: + hp: hyperparameters. Call FeedForwardExpertParams() to create these. + name: a string. 
+ """ + self._hp = hp + hidden_layer_sizes = hp.hidden_layer_sizes or [] + num_layers = 1 + len(hidden_layer_sizes) + layer_sizes = [hp.input_size] + hidden_layer_sizes + [hp.output_size] + self._layer_sizes = layer_sizes + self._w = [] + for layer in range(num_layers): + shape = layer_sizes[layer:layer + 2] + self._w.append( + tf.get_variable('%s_layer_%d' % (name, layer), shape, hp.dtype, + hp.initializer)) + + def Eval(self, x): + """Evaluate the FeedForwardExpert on the given input. + + Args: + x: a `Tensor` of shape `[batch_size, hp.input_size]` + + Returns: + a `Tensor` of shape `[batch_size, hp.output_size]` + """ + hp = self._hp + num_layers = len(self._w) + for i in xrange(num_layers): + x = tf.matmul(x, self._w[i]) + if hp.autoscale and self._layer_sizes[i] != hp.input_size: + x *= (self._layer_sizes[i] / hp.input_size)**-0.5 + if i + 1 < num_layers and hp.hidden_activation: + x = hp.hidden_activation(x) + return x + + @property + def vars(self): + return self._w + + +@function.Defun( + python_grad_func=lambda x, dy: tf.convert_to_tensor(dy), + shape_func=lambda op: [op.inputs[0].get_shape()]) +def ConvertGradientToTensor(x): + """Identity operation whose gradient is converted to a `Tensor`. + + Currently, the gradient to `tf.concat` is particularly expensive to + compute if dy is an `IndexedSlices` (a lack of GPU implementation + forces the gradient operation onto CPU). This situation occurs when + the output of the `tf.concat` is eventually passed to `tf.gather`. + It is sometimes faster to convert the gradient to a `Tensor`, so as + to get the cheaper gradient for `tf.concat`. To do this, replace + `tf.concat(x)` with `ConvertGradientToTensor(tf.concat(x))`. + + Args: + x: A `Tensor`. + + Returns: + The input `Tensor`. + """ + return x + + +class Parallelism(object): + """Helper class for creating sets of parallel function calls. + + The purpose of this class is to replace this code: + + e = [] + f = [] + for i in xrange(len(devices)): + with tf.device(devices[i]): + e_, f_ = func(a[i], b[i], c) + e.append(e_) + f.append(f_) + + with this code: + + e, f = expert_utils.Parallelism(devices)(func, a, b, c) + """ + + def __init__(self, + device_names_or_functions, + reuse=None, + caching_devices=None, + daisy_chain_variables=False): + """Create a Parallelism. + + Args: + device_names_or_functions: A list of of length n, containing device names + or device functions (see `tf.device`) + reuse: True or None. Whether to reuse variables created in the first + replica in the subsequent replicas. + caching_devices: Either `None`, or a list of length n containing device + names. + daisy_chain_variables: a boolean - if true, then copies variables in a + daisy chain between devices. + + Returns: + a Parallelism. + """ + assert device_names_or_functions + self._devices = device_names_or_functions + self._n = len(device_names_or_functions) + self._reuse = reuse + self._caching_devices = self._MaybeRepeat(caching_devices) + self._daisy_chain_variables = daisy_chain_variables + + def __call__(self, fn, *args, **kwargs): + """A parallel set of function calls (using the specified devices). + + Args: + fn: a function or a list of n functions. + *args: additional args. Each arg should either be not a list, or a list + of length n. + **kwargs: additional keyword args. Each arg should either be not a + list, or a list of length n. + + Returns: + either a single list of length n (if fn does not return a tuple), or a + tuple of lists of length n (if fn returns a tuple). 
+ """ + # Construct lists or args and kwargs for each function. + if args: + my_args = TransposeListOfLists([self._MaybeRepeat(arg) for arg in args]) + else: + my_args = [[] for _ in xrange(self.n)] + my_kwargs = [{} for _ in xrange(self.n)] + for k, v in six.iteritems(kwargs): + vals = self._MaybeRepeat(v) + for i in xrange(self.n): + my_kwargs[i][k] = vals[i] + + # Construct lists of functions. + fns = self._MaybeRepeat(fn) + + # Now make the parallel call. + outputs = [] + cache = {} + for i in xrange(self.n): + + def DaisyChainGetter(getter, name, *args, **kwargs): + """Get a variable and cache in a daisy chain.""" + device_var_key = (self._devices[i], name) + if device_var_key in cache: + # if we have the variable on the correct device, return it. + return cache[device_var_key] + if name in cache: + # if we have it on a different device, copy it from the last device + v = tf.identity(cache[name]) + else: + var = getter(name, *args, **kwargs) + v = tf.identity(var._ref()) # pylint: disable=protected-access + # update the cache + cache[name] = v + cache[device_var_key] = v + return v + + # Variable scope will not reset caching_device on reused variables, + # so we make a custom getter that uses identity to cache the variable. + # pylint: disable=cell-var-from-loop + def CachingGetter(getter, name, *args, **kwargs): + v = getter(name, *args, **kwargs) + key = (self._caching_devices[i], name) + if key in cache: + return cache[key] + with tf.device(self._caching_devices[i]): + ret = tf.identity(v._ref()) # pylint: disable=protected-access + cache[key] = ret + return ret + + if self._daisy_chain_variables: + custom_getter = DaisyChainGetter + elif self._caching_devices: + custom_getter = CachingGetter + else: + custom_getter = None + # pylint: enable=cell-var-from-loop + with tf.name_scope('parallel_%d' % i): + with tf.variable_scope( + tf.get_variable_scope(), + reuse=True if i > 0 and self._reuse else None, + caching_device=self._caching_devices[i], + custom_getter=custom_getter): + with tf.device(self._devices[i]): + outputs.append(fns[i](*my_args[i], **my_kwargs[i])) + if isinstance(outputs[0], tuple): + outputs = list(zip(*outputs)) + outputs = tuple([list(o) for o in outputs]) + return outputs + + @property + def n(self): + return self._n + + @property + def devices(self): + return self._devices + + def _MaybeRepeat(self, x): + """Utility function for processing arguments that are singletons or lists. + + Args: + x: either a list of self.n elements, or not a list. + + Returns: + a list of self.n elements. + """ + if isinstance(x, list): + assert len(x) == self.n + return x + else: + return [x] * self.n + + +def Parallel(device_names_or_functions, fn, *args): + """Deprecated interface. + + Use `Parallelism(device_names_or_functions)(fn, *args)` instead. + + Args: + device_names_or_functions: A list of length n. + fn: a function or a list of n functions. + *args: additional args. Each arg should either be not a list, or a list + of length n. + + Returns: + either a single list of length n (if fn does not return a tuple), or a + tuple of lists of length n (if fn returns a tuple). + """ + return Parallelism(device_names_or_functions)(fn, *args) + + +def _RowwiseUnsortedSegmentSum(values, indices, n): + """UnsortedSegmentSum on each row. + + Args: + values: a `Tensor` with shape `[batch_size, k]`. + indices: an integer `Tensor` with shape `[batch_size, k]`. + n: an integer. + Returns: + A `Tensor` with the same type as `values` and shape `[batch_size, n]`. 
+ """ + batch, k = tf.unstack(tf.shape(indices), num=2) + indices_flat = tf.reshape(indices, [-1]) + tf.div(tf.range(batch * k), k) * n + ret_flat = tf.unsorted_segment_sum( + tf.reshape(values, [-1]), indices_flat, batch * n) + return tf.reshape(ret_flat, [batch, n]) + + +def _NormalDistributionCDF(x, stddev): + """Evaluates the CDF of the normal distribution. + + Normal distribution with mean 0 and standard deviation stddev, + evaluated at x=x. + + input and output `Tensor`s have matching shapes. + + Args: + x: a `Tensor` + stddev: a `Tensor` with the same shape as `x`. + + Returns: + a `Tensor` with the same shape as `x`. + + """ + return 0.5 * (1.0 + tf.erf(x / (math.sqrt(2) * stddev + 1e-20))) + + +def _ProbInTopK(clean_values, noisy_values, noise_stddev, noisy_top_values, k): + """Helper function to NoisyTopKGating. + + Computes the probability that value is in top k, given different random noise. + + This gives us a way of backpropagating from a loss that balances the number + of times each expert is in the top k experts per example. + + In the case of no noise, pass in None for noise_stddev, and the result will + not be differentiable. + + Args: + clean_values: a `Tensor` of shape [batch, n]. + noisy_values: a `Tensor` of shape [batch, n]. Equal to clean values plus + normally distributed noise with standard deviation noise_stddev. + noise_stddev: a `Tensor` of shape [batch, n], or None + noisy_top_values: a `Tensor` of shape [batch, m]. + 'values' Output of tf.top_k(noisy_top_values, m). m >= k+1 + k: an integer. + + Returns: + a `Tensor` of shape [batch, n]. + """ + batch = tf.shape(clean_values)[0] + m = tf.shape(noisy_top_values)[1] + top_values_flat = tf.reshape(noisy_top_values, [-1]) + # we want to compute the threshold that a particular value would have to + # exceed in order to make the top k. This computation differs depending + # on whether the value is already in the top k. + threshold_positions_if_in = tf.range(batch) * m + k + threshold_if_in = tf.expand_dims( + tf.gather(top_values_flat, threshold_positions_if_in), 1) + is_in = tf.greater(noisy_values, threshold_if_in) + if noise_stddev is None: + return tf.to_float(is_in) + threshold_positions_if_out = threshold_positions_if_in - 1 + threshold_if_out = tf.expand_dims( + tf.gather(top_values_flat, threshold_positions_if_out), 1) + # is each value currently in the top k. + prob_if_in = _NormalDistributionCDF(clean_values - threshold_if_in, + noise_stddev) + prob_if_out = _NormalDistributionCDF(clean_values - threshold_if_out, + noise_stddev) + prob = tf.where(is_in, prob_if_in, prob_if_out) + return prob + + +def CVSquared(x): + """The squared coefficient of variation of a sample. + + Useful as a loss to encourage a positive distribution to be more uniform. + Epsilons added for numerical stability. + Returns 0 for an empty Tensor. + + Args: + x: a `Tensor`. + + Returns: + a `Scalar`. + """ + epsilon = 1e-10 + float_size = tf.to_float(tf.size(x)) + epsilon + mean = tf.reduce_sum(x) / float_size + variance = tf.reduce_sum(tf.square(x - mean)) / float_size + return variance / (tf.square(mean) + epsilon) + + +def MaxOverload(load): + """The load of the hardest-hit device relative to average. + + This is useful for monitoring the performance of MoEs. + + The load of an expert is the number of examples assigned to that expert. + The load of a device is the sum of the loads of all experts on that device. 
+
+  The input to this function is generally the 'load' output of
+  DistributedMixtureOfExperts.Eval(), which is either a 1d or 2d `Tensor` of
+  per-expert loads. In either case, the first dimension corresponds to
+  devices.
+
+  This function sums over all dimensions other than dimension zero, then
+  computes the ratio of the maximum value to the mean value.
+
+  Args:
+    load: a 1d or 2d `Tensor`.
+
+  Returns:
+    a `Scalar`.
+  """
+  per_device_load = tf.reduce_sum(tf.reshape(load, [tf.shape(load)[0], -1]), 1)
+  return (tf.reduce_max(per_device_load) /
+          (tf.reduce_mean(per_device_load) + 1e-10))
+
+
+def _GatesToLoad(gates):
+  """Compute the true load per expert, given the gates.
+
+  The load is the number of examples for which the corresponding gate is >0.
+
+  Args:
+    gates: a `Tensor` of shape [batch_size, n]
+  Returns:
+    a float32 `Tensor` of shape [n]
+  """
+  return tf.reduce_sum(tf.to_float(gates > 0), 0)
+
+
+def _MyTopK(x, k):
+  """GPU-compatible version of top-k that works for very small constant k.
+
+  Calls argmax repeatedly.
+
+  Args:
+    x: a 2d Tensor.
+    k: a small integer.
+
+  Returns:
+    values: a Tensor of shape [batch_size, k]
+    indices: an int32 Tensor of shape [batch_size, k]
+  """
+  if k > 10:
+    return tf.nn.top_k(x, k)
+  values = []
+  indices = []
+  depth = tf.shape(x)[1]
+  for i in xrange(k):
+    values.append(tf.reduce_max(x, 1))
+    argmax = tf.argmax(x, 1)
+    indices.append(argmax)
+    if i + 1 < k:
+      x += tf.one_hot(argmax, depth, -1e9)
+  return tf.stack(values, axis=1), tf.to_int32(tf.stack(indices, axis=1))
+
+
+class NoisyTopKGating(object):
+  """Noisy top-k gating network.
+
+  See paper: https://arxiv.org/abs/1701.06538.
+  """
+
+  def __init__(self, hp, name):
+    """Create a NoisyTopKGating network.
+
+    Args:
+      hp: a hyperparameters created by NoisyTopKGatingParams()
+      name: a string
+    """
+    self._vars = []
+    self._hp = hp
+    self._w_gate = tf.get_variable('%s_gate' % name,
+                                   [hp.input_size, hp.num_experts], hp.dtype,
+                                   hp.initializer)
+    self._vars.append(self._w_gate)
+    if hp.noisy_gating:
+      self._w_noise = tf.get_variable('%s_noise' % name,
+                                      [hp.input_size, hp.num_experts], hp.dtype,
+                                      hp.initializer)
+      self._vars.append(self._w_noise)
+
+  def Eval(self, x, train=True, summaries=False):
+    """Compute noisy top-k gating.
+
+    Args:
+      x: a `Tensor` of shape `[batch_size, input_size]`.
+      train: a boolean `Scalar`. Setting this to false turns off noise.
+      summaries: a boolean. Whether to add summaries.
+    Returns:
+      gates: a `Tensor` of shape `[batch_size, n]`
+      load: a `Tensor` of shape `[n]`.
+        If we are using noise, this is a smooth approximation of the load,
+        and you can define a loss in terms of it to help with load-balancing.
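+
+    Example use (an illustrative sketch; `hp` is assumed to carry the fields
+    read by this class, e.g. as built by NoisyTopKGatingParams() with
+    input_size filled in):
+
+      gating = NoisyTopKGating(hp, 'gating')
+      gates, load = gating.Eval(x, train=True)
+      # Differentiable auxiliary loss encouraging balanced expert usage.
+      aux_loss = CVSquared(tf.reduce_sum(gates, 0)) + CVSquared(load)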
+ """ + with tf.variable_scope('NoisyTopKGating'): + hp = self._hp + clean_logits = tf.matmul(x, self._w_gate) + if hp.noisy_gating: + raw_noise_stddev = tf.matmul(x, self._w_noise) + noise_stddev = ((tf.nn.softplus(raw_noise_stddev) + hp.noise_epsilon) * + (tf.to_float(train))) + noisy_logits = clean_logits + ( + tf.random_normal(tf.shape(clean_logits)) * noise_stddev) + logits = noisy_logits + if summaries: + tf.summary.histogram('noisy_logits', noisy_logits) + tf.summary.histogram('noise_stddev', noise_stddev) + else: + logits = clean_logits + top_logits, top_indices = _MyTopK(logits, min(hp.k + 1, hp.num_experts)) + top_k_logits = tf.slice(top_logits, [0, 0], [-1, hp.k]) + top_k_indices = tf.slice(top_indices, [0, 0], [-1, hp.k]) + top_k_gates = tf.nn.softmax(top_k_logits) + # This will be a `Tensor` of shape `[batch_size, n]`, with zeros in the + # positions corresponding to all but the top k experts per example. + gates = _RowwiseUnsortedSegmentSum(top_k_gates, top_k_indices, + hp.num_experts) + if hp.noisy_gating and hp.k < hp.num_experts: + load = tf.reduce_sum( + _ProbInTopK(clean_logits, noisy_logits, noise_stddev, top_logits, + hp.k), 0) + else: + load = _GatesToLoad(gates) + if summaries: + tf.summary.histogram('importance', tf.reduce_sum(gates, 0)) + tf.summary.histogram('load', load) + return gates, load + + @property + def vars(self): + return self._vars + + +class LocalMixtureOfExperts(object): + """A MoE on a single device. + """ + + def __init__(self, gating_hp, expert_hp, input_size, output_size, name): + """Create a LocalMixtureOfExperts. + + Args: + gating_hp: hyperparameters for the gating network. + e.g. NoisyTopKGatingParams() + expert_hp: hyperparameters for the expert networks. + e.g. FeedForwardExpertParams() + input_size: an integer. + output_size: an integer. + name: a string. + """ + self._name = name + _SetInputOutputSizes(gating_hp, input_size, None) + _SetInputOutputSizes(expert_hp, input_size, output_size) + self._gating_hp = gating_hp + self._gating = gating_hp.gating_class(gating_hp, name + '_gating') + self._expert_hp = expert_hp + self._experts = [ + expert_hp.expert_class(expert_hp, name + '_%d' % i) + for i in xrange(gating_hp.num_experts) + ] + + def Eval(self, + x, + train=True, + per_example_multiplier=None, + summaries=False, + identifiers=None): + """Evaluate mixture of experts. + + We provide a convenient debugging tool for determining the set of examples + that we passed to each expert. The caller may provide a `Tensor` of + "identifiers", of any type whose first dimension matches the number of + input examples. The function will then return a list + "expert_to_identifiers", with one `Tensor` for each expert containing the + identifiers for all examples assigned to that expert. A parallel list of + `Tensor`s, "expert_to_gates", is also returned, containing the + corresponding gate values. + + Args: + x: a `Tensor` of shape `[batch_size, input_size]` + train: a boolean Scalar. Are we in training mode? + per_example_multiplier: an optional `Tensor` of shape `[batch_size]` which + gets multiplied into the gate values. If this LocalMixtureOfExperts + represents one secondary MoE in a hierarchical MoE, then we pass in + in the gate values from the primary gating function here. This causes + the computed values (`y`, `importance` and `expert_to_gates`) to also + reflect the primary gate values. + summaries: an boolean. Enable summaries. + identifiers: an optional `Tensor` whose first dimension is equal to + batch_size. 
+
+    Returns:
+      y: a `Tensor` of shape `[batch_size, output_size]`. Output of the MoE.
+      importance: a `Tensor` of shape `[n]`. Batchwise sum of gates.
+      load: a `Tensor` of shape `[n]`. Smooth estimator of the number of
+        examples passed to each expert. This is useful for load-balancing,
+        as any gradient on this `Tensor` will back-propagate to the gating
+        network.
+      expert_to_identifiers: if `identifiers` was passed in, a list of
+        length `num_experts`. Each element is a `Tensor` whose shape matches
+        that of `identifiers` in all but the first dimension. Contains the
+        slices of `identifiers` corresponding to the batch elements that were
+        dispatched to that expert.
+      expert_to_gates: A list of length `num_experts`. Each element contains
+        a 1-dimensional tensor.
+    """
+    gating_hp = self._gating_hp
+    gates, load = self._gating.Eval(x, train, summaries)
+    if per_example_multiplier is not None:
+      gates *= tf.expand_dims(per_example_multiplier, 1)
+    dispatcher = SparseDispatcher(gating_hp.num_experts, gates)
+    expert_input = dispatcher.Dispatch(x)
+    expert_output = [
+        self._experts[i].Eval(expert_input[i])
+        for i in xrange(gating_hp.num_experts)
+    ]
+    y = dispatcher.Combine(expert_output)
+    if identifiers is not None:
+      expert_to_identifiers = dispatcher.Dispatch(identifiers)
+    else:
+      expert_to_identifiers = None
+    return (y, tf.reduce_sum(gates, 0), load, expert_to_identifiers,
+            dispatcher.ExpertToGates())
+
+  @property
+  def vars(self):
+    ret = []
+    for x in self._experts:
+      ret.extend(x.vars)
+    ret.extend(self._gating.vars)
+    return ret
+
+
+class DistributedMixtureOfExperts(object):
+  """Distributed (optionally Hierarchical) Mixture of Experts.
+
+  This class implements the scheme described in our paper.
+  See link at the top of this file.
+
+  The model is trained synchronously using one large TF graph using
+  multiple devices.
+
+  The conventional (non-MoE) layers use data-parallelism, with each device
+  processing a subset of the training batch. We call these datashards.
+
+  The MoE layer (this object) uses model parallelism. Each expert is assigned
+  to a particular device, which hosts the expert parameters and performs the
+  expert computation for all examples assigned to that expert. In the case
+  of a hierarchical MoE, each second-level MoE is assigned to a device.
+  """
+
+  def __init__(self, primary_gating_hp, secondary_gating_hp, expert_hp,
+               input_size, output_size, expert_devices, name):
+    """Create a DistributedMixtureOfExperts.
+
+    If `secondary_gating_hp` is `None`, then this is a flat MoE with
+    `primary_gating_hp.num_experts` experts. Otherwise, this is a hierarchical
+    MoE with `primary_gating_hp.num_experts` groups of
+    `secondary_gating_hp.num_experts` experts.
+
+    The assignment of experts (or groups of experts) to devices is by
+    round-robin. So to make equal use of all the devices, one should set
+    `primary_gating_hp.num_experts` to the number of devices or a multiple
+    thereof.
+
+    Args:
+      primary_gating_hp: hyperparameters for the primary gating network.
+        e.g. NoisyTopKGatingParams().
+      secondary_gating_hp: hyperparameters for the secondary gating network.
+        e.g. NoisyTopKGatingParams(). None indicates a flat MoE.
+      expert_hp: hyperparameters for the expert networks.
+        e.g. FeedForwardExpertParams()
+      input_size: an integer.
+      output_size: an integer.
+      expert_devices: a list of device strings. The devices to be used for
+        the experts.
+      name: a string.
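+
+    Example (an illustrative sketch; sizes and devices are placeholders, and
+    the two *Params() helpers are the ones referenced above):
+
+      moe = DistributedMixtureOfExperts(
+          primary_gating_hp=NoisyTopKGatingParams(),
+          secondary_gating_hp=None,  # None -> flat MoE
+          expert_hp=FeedForwardExpertParams(),
+          input_size=512, output_size=512,
+          expert_devices=['/gpu:0', '/gpu:1'],
+          name='moe')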
+ """ + self._name = name + # fill in the missing values in the hyperparameters + _SetInputOutputSizes(primary_gating_hp, input_size, None) + _SetInputOutputSizes(expert_hp, input_size, output_size) + self._is_hierarchical = secondary_gating_hp is not None + self._primary_gating_hp = primary_gating_hp + self._primary_gating = primary_gating_hp.gating_class( + primary_gating_hp, name + '_primary_gating') + n1 = self._primary_gating_hp.num_experts + # round robin assignment of experts to devices. + expert_devices = [ + expert_devices[i % len(expert_devices)] for i in xrange(n1) + ] + self._expert_devices = expert_devices + self._all_vars = [] + self._all_vars.extend(self._primary_gating.vars) + if self._is_hierarchical: + # hierarchical MoE + self._secondary_moe = [] + for i in xrange(n1): + with tf.device(expert_devices[i]): + secondary_moe = LocalMixtureOfExperts(secondary_gating_hp, expert_hp, + input_size, output_size, + '%s_secondary_%d' % (name, i)) + self._secondary_moe.append(secondary_moe) + self._all_vars.extend(secondary_moe.vars) + else: + # flat MoE + self._experts = [] + for i in xrange(n1): + with tf.device(expert_devices[i]): + expert = expert_hp.expert_class(expert_hp, name + '_%d' % i) + self._experts.append(expert) + self._all_vars.extend(expert.vars) + + def Eval(self, + datashard_devices, + xs, + train=True, + summaries=False, + identifiers=None, + shadow_xs=None): + """Evaluate MoE on given inputs. + + This class is designed for the case where the rest of the model is using + data parallelism. We receive an array of input `Tensor`s, one per + datashard, and we produce a list of output Tensors, one per datashard. + + We provide a convenient debugging tool for determining the set of examples + that we passed to each expert. The caller may provide a `Tensor` of + "identifiers", of any type whose first dimension matches the number of + input examples. The function will then return a list + "expert_to_identifiers", with one `Tensor` for each expert containing the + identifiers for all examples assigned to that expert. A parallel list of + `Tensor`s, "expert_to_gates", is also returned, containing the + corresponding gate values. + + Args: + datashard_devices: a `list` of device strings of length `num_datashards`. + Which devices to use for the output tensors. + xs: A `list` of `Tensor`s of length `num_datashards`. Each has shape + `[batch_size[d], input_size]. + train: a boolean `Scalar`. When train=`True`, noise is added to the + gating function. + summaries: a boolean. Whether to write summaries. + identifiers: an optional list of tensors. + Each tensor has shape [, extra_dims] + shadow_xs: Optional `list` of `Tensor`s of length `num_datashards`. Each + has shape `[batch_size[d], input_size]. Shadow_xs is useful if you want + to dispatch a transformed version of xs to the experts, but you want + untransformed xs for the gating network. + + Returns: + ys: the output (a list of one tensor per datashard). Each has shape + `[batch_size[d], output_size]. + importance: a `Tensor` of shape `[n]` for a flat MoE or `[n1, n2]` for a + hierarchical MoE. Batchwise sum of gates. + load: a `Tensor` of shape `[n]` for a flat MoE or `[n1, n2]` for a + hierarchical MoE. Smooth estimator of the number of + examples passed to each expert. This is useful for load-balancing, + as any gradient on this `Tensor` will back-propagate to the gating + network. + expert_to_identifiers: if `identifiers` was passed in, a list of + length `num_experts`. 
Each element is a `Tensor` whose shape matches + that of `identifiers` in all but the first dimension. Contains the + slices of `identifiers` corresponding to the batch elements that were + dispatched to that expert. + expert_to_gates: a list of one tensor per expert. + Each tensor has shape [] + + """ + n1 = self._primary_gating_hp.num_experts + epsilon = 1e-10 + assert len(datashard_devices) == len(xs) + num_datashards = len(xs) + expert_devices = self._expert_devices + has_identifiers = identifiers is not None + # pylint: disable=unbalanced-tuple-unpacking + primary_gates, primary_smooth_load = Parallel( + datashard_devices, self._primary_gating.Eval, xs, train, + [summaries] + [False] * (num_datashards - 1)) + primary_importance = tf.add_n( + Parallel(datashard_devices, tf.reduce_sum, primary_gates, 0)) + primary_smooth_load = tf.add_n(primary_smooth_load) + primary_true_load = tf.add_n( + Parallel(datashard_devices, _GatesToLoad, primary_gates)) + primary_dispatcher = DistributedSparseDispatcher( + datashard_devices, expert_devices, primary_gates) + + if shadow_xs is None: + secondary_input = primary_dispatcher.Dispatch(xs) + else: + secondary_input = primary_dispatcher.Dispatch(shadow_xs) + + primary_expert_to_identifiers = (primary_dispatcher.Dispatch(identifiers) + if has_identifiers else None) + primary_expert_to_gates = primary_dispatcher.ExpertToGates() + if not self._is_hierarchical: + # one-level distributed mixture of experts + secondary_output = Parallel(expert_devices, lambda a, b: a.Eval(b), + self._experts, secondary_input) + ys = primary_dispatcher.Combine(secondary_output) + return (ys, primary_importance, primary_smooth_load, + primary_expert_to_identifiers, primary_expert_to_gates) + # two-level hierarchical MoE + (secondary_output, secondary_importance, secondary_load, + secondary_expert_to_identifiers, secondary_expert_to_gates) = (Parallel( + expert_devices, [m.Eval for m in self._secondary_moe], secondary_input, + train, primary_expert_to_gates, [summaries] + [False] * (n1 - 1), + primary_expert_to_identifiers)) + # pylint: enable=unbalanced-tuple-unpacking + ys = primary_dispatcher.Combine(secondary_output, multiply_by_gates=False) + importance = tf.stack(secondary_importance) + load = tf.stack(secondary_load) * tf.expand_dims(primary_smooth_load / ( + primary_true_load + epsilon), 1) + expert_to_identifiers = [] + if identifiers is not None: + for el in secondary_expert_to_identifiers: + expert_to_identifiers.extend(el) + expert_to_gates = [] + for el in secondary_expert_to_gates: + expert_to_gates.extend(el) + return (ys, importance, load, expert_to_identifiers, expert_to_gates) + + @property + def vars(self): + return self._all_vars + + +class SparseDispatcher(object): + """Helper for implementing a mixture of experts. + + Example use: + + gates: a float32 `Tensor` with shape `[batch_size, num_experts]` + inputs: a float32 `Tensor` with shape `[batch_size, input_size]` + experts: a list of length `num_experts` containing sub-networks. + + dispatcher = SparseDispatcher(num_experts, gates) + expert_inputs = dispatcher.Dispatch(inputs) + expert_outputs = [experts[i](expert_inputs[i]) for i in range(num_experts)] + outputs = dispatcher.Combine(expert_outputs) + + The preceding code sets the output for a particular example b to: + output[b] = Sum_i(gates[b, i] * experts[i](inputs[b])) + + This class takes advantage of sparsity in the gate matrix by including in the + `Tensor`s for expert i only the batch elements for which `gates[b, i] > 0`. 
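+
+  Concretely (an illustrative sketch with batch_size=3 and num_experts=2):
+
+    gates = [[0.7, 0.0],
+             [0.0, 1.0],
+             [0.5, 0.5]]
+
+  Dispatch sends examples 0 and 2 to expert 0 and examples 1 and 2 to
+  expert 1. If inputs = [[1.], [2.], [3.]] and each expert multiplies its
+  input by 10, then Combine returns [[7.], [20.], [30.]], since
+  output[2] = 0.5 * 30 + 0.5 * 30.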
+ """ + + def __init__(self, num_experts, gates): + """Create a SparseDispatcher. + + Args: + num_experts: an integer. + gates: a `Tensor` of shape `[batch_size, num_experts]`. + + Returns: + a SparseDispatcher + """ + self._gates = gates + self._num_experts = num_experts + + where = tf.to_int32(tf.where(tf.transpose(gates) > 0)) + self._expert_index, self._batch_index = tf.unstack(where, num=2, axis=1) + self._part_sizes_tensor = tf.reduce_sum(tf.to_int32(gates > 0), [0]) + self._nonzero_gates = tf.gather( + tf.reshape(self._gates, [-1]), + self._batch_index * num_experts + self._expert_index) + + def Dispatch(self, inp): + """Create one input Tensor for each expert. + + The `Tensor` for a expert `i` contains the slices of `inp` corresponding + to the batch elements `b` where `gates[b, i] > 0`. + + Args: + inp: a `Tensor` of shape '[batch_size, ]` + Returns: + a list of `num_experts` `Tensor`s with shapes + `[expert_batch_size_i, ]`. + """ + inp = tf.gather(inp, self._batch_index) + return tf.split(inp, self._part_sizes_tensor, 0) + + def Combine(self, expert_out, multiply_by_gates=True): + """Sum together the expert output, weighted by the gates. + + The slice corresponding to a particular batch element `b` is computed + as the sum over all experts `i` of the expert output, weighted by the + corresponding gate values. If `multiply_by_gates` is set to False, the + gate values are ignored. + + Args: + expert_out: a list of `num_experts` `Tensor`s, each with shape + `[expert_batch_size_i, ]`. + multiply_by_gates: a boolean + + Returns: + a `Tensor` with shape `[batch_size, ]`. + """ + # see comments on ConvertGradientToTensor + stitched = ConvertGradientToTensor(tf.concat(expert_out, 0)) + if multiply_by_gates: + stitched *= tf.expand_dims(self._nonzero_gates, 1) + combined = tf.unsorted_segment_sum(stitched, self._batch_index, + tf.shape(self._gates)[0]) + return combined + + def ExpertToGates(self): + """Gate values corresponding to the examples in the per-expert `Tensor`s. + + Returns: + a list of `num_experts` one-dimensional `Tensor`s with type `tf.float32` + and shapes `[expert_batch_size_i]` + """ + return tf.split(self._nonzero_gates, self._part_sizes_tensor, 0) + + @property + def part_sizes(self): + return self._part_sizes_tensor + + +class DistributedSparseDispatcher(object): + """A distributed version of SparseDispatcher. + + Instead of one batch of input examples, we simultaneously process + num_datashards batches of input examples. The per-expert `Tensor`s contain + a combination of examples from the different datashards. + + Each datashard is associated with a particular device and each expert is + associated with a particular device. All per-datashard and per-expert + `Tensor`s are created on those devices. There is no single-device bottleneck. + """ + + def __init__(self, datashard_devices, expert_devices, gates): + """Create a DistributedSparseDispatcher. + + Args: + datashard_devices: a list of num_datashards device strings. + expert_devices: a list of num_experts device strings. + gates: a list of num_datashards `Tensor`s of shapes + `[batch_size[d], num_experts]`. 
+
+    Returns:
+      a DistributedSparseDispatcher
+    """
+    self._gates = gates
+    self._num_experts = len(expert_devices)
+    assert len(gates) == len(datashard_devices)
+    self._num_datashards = len(gates)
+    self._datashard_devices = datashard_devices
+    self._expert_devices = expert_devices
+    self._dispatchers = Parallel(self._datashard_devices, SparseDispatcher,
+                                 self._num_experts, gates)
+
+  def Dispatch(self, inp):
+    """Create one input Tensor for each expert.
+
+    Args:
+      inp: a list of num_datashards `Tensor`s with shapes
+        `[batch_size[d], <extra_input_dims>]`.
+    Returns:
+      a list of `num_experts` `Tensor`s with shapes
+        `[num_examples[i], <extra_input_dims>]`.
+    """
+    dispatched = Parallel(self._datashard_devices, lambda a, b: a.Dispatch(b),
+                          self._dispatchers, inp)
+    ret = Parallel(self._expert_devices, tf.concat,
+                   TransposeListOfLists(dispatched), 0)
+    if ret[0].dtype == tf.float32:
+      # see comments on ConvertGradientToTensor
+      ret = Parallel(self._expert_devices, ConvertGradientToTensor, ret)
+    return ret
+
+  def Combine(self, expert_out, multiply_by_gates=True):
+    """Sum together the expert output, multiplied by the corresponding gates.
+
+    Args:
+      expert_out: a list of `num_experts` `Tensor`s, each with shape
+        `[expert_batch_size_i, <extra_output_dims>]`.
+      multiply_by_gates: a boolean.
+
+    Returns:
+      a list of num_datashards `Tensor`s with shapes
+        `[batch_size[d], <extra_output_dims>]`.
+    """
+    expert_part_sizes = tf.unstack(
+        tf.stack([
+            self._dispatchers[d].part_sizes
+            for d in xrange(self._num_datashards)
+        ]),
+        num=self._num_experts,
+        axis=1)
+    # list of lists of shape [num_experts][num_datashards]
+    expert_output_parts = Parallel(self._expert_devices, tf.split, expert_out,
+                                   expert_part_sizes)
+    expert_output_parts_t = TransposeListOfLists(expert_output_parts)
+    ret = []
+    for d in xrange(self._num_datashards):
+      with tf.device(self._datashard_devices[d]):
+        ret.append(self._dispatchers[d].Combine(
+            # see comments on ConvertGradientToTensor
+            ConvertGradientToTensor(tf.concat(expert_output_parts_t[d], 0)),
+            multiply_by_gates=multiply_by_gates))
+    return ret
+
+  def ExpertToGates(self):
+    """Gate values corresponding to the examples in the per-expert `Tensor`s.
+
+    Returns:
+      a list of `num_experts` one-dimensional `Tensor`s of type `tf.float32`.
+    """
+    return Parallel(self._expert_devices, tf.concat,
+                    TransposeListOfLists(
+                        Parallel(self._datashard_devices, [
+                            self._dispatchers[d].ExpertToGates
+                            for d in xrange(self._num_datashards)
+                        ])), 0)
+
+
+def TransposeListOfLists(lol):
+  """Transpose a list of equally-sized python lists.
+
+  Args:
+    lol: a list of lists
+  Returns:
+    a list of lists
+  """
+  assert lol, 'cannot pass the empty list'
+  return [list(x) for x in zip(*lol)]
+
+
+class DistributedSingleDispatcher(object):
+  """Dispatches to experts according to gates.
+
+  Each example goes to one expert.
+
+  Unlike SparseDispatcher, the gates are one-dimensional `Tensor`s of integer
+  expert ids. There are no weights.
+  """
+
+  def __init__(self, data_parallelism, model_parallelism, gates):
+    """Constructs a Dispatcher.
+
+    Args:
+      data_parallelism: a Parallelism object.
+      model_parallelism: a Parallelism object.
+      gates: a list of 1d integer `Tensor`s, one per datashard.
+        Says which expert to use for each batch element.
+
+    Returns:
+      a DistributedSingleDispatcher
+    """
+    gates = data_parallelism(tf.to_int32, gates)
+    self._gates = gates
+    self._data_parallelism = data_parallelism
+    self._model_parallelism = model_parallelism
+
+    # Compute the number of examples going from each datashard to each
+    # expert.
+ def _PartSizes(gates): + return tf.unsorted_segment_sum( + tf.ones_like(gates), gates, model_parallelism.n) + + part_sizes_by_datashard = data_parallelism(_PartSizes, gates) + self._part_sizes_by_expert = tf.unstack( + tf.stack(part_sizes_by_datashard), num=model_parallelism.n, axis=1) + + # These indices will be used to combine the output on the datashards. + def _StitchIndices(gates): + return tf.dynamic_partition( + tf.range(tf.size(gates)), gates, model_parallelism.n) + + self._stitch_indices = data_parallelism(_StitchIndices, gates) + + def Dispatch(self, d_tensors): + """Reshuffles input `Tensor`s to produce output `Tensor`s. + + The dimensions of all input and output `Tensor`s match, except for + dimension 0. In dimension 0, the input `Tensor`s match the corresponding + `gates` `Tensor`s which were passed to the constructor. + + Args: + d_tensors: a list of `Tensor`s, one per datashard. + + Returns: + a list of `Tensor`s, one per expert. + + """ + parts = self._data_parallelism(tf.dynamic_partition, d_tensors, self._gates, + self._model_parallelism.n) + parts_by_expert = TransposeListOfLists(parts) + x_tensors = self._model_parallelism(tf.concat, parts_by_expert, 0) + return x_tensors + + def Combine(self, x_tensors): + """Reshuffles per-expert `Tensor`s to produce per-datashard `Tensor`s. + + Dispatch must have been called at least once first. + + The dimensions of all input and output `Tensor`s match, except for + dimension 0. In dimension 0, the input `Tensor`s match the corresponding + outputs of `Dispatch`, and the output `Tensor`s match the corresponding + `gates` `Tensor`s which were passed to the constructor. + + Args: + x_tensors: a list of `Tensor`s, one per expert. + + Returns: + a list of `Tensor`s, one per datashard. + """ + parts = self._model_parallelism(tf.split, x_tensors, + self._part_sizes_by_expert) + d_tensors = self._data_parallelism(tf.dynamic_stitch, self._stitch_indices, + TransposeListOfLists(parts)) + return d_tensors + + +def ParallelEmbeddingLookup(params, ids, data_parallelism): + """Mod-sharded embedding lookup with multiple datashards. + + TODO(noam): does this work when vocab_size is not a multiple of `num_shards`? + + Args: + params: A list of `num_shards` `Tensors`, each with shapes + `[vocab_size / num_params, depth]`. + ids: A list of `num_datashards` one-dimensional ineger `Tensors`, + with shapes `[batch_size[i]]` + data_parallelism: A Parallelism object. + + Returns: + a list of `num_datashards` `Tensors`, each with shape + `[batch_size[i], depth]`. + """ + param_devices = [x.device for x in params] + model_parallelism = Parallelism(param_devices) + num_shards = len(param_devices) + # pylint: disable=unbalanced-tuple-unpacking + ids, unique_idx = data_parallelism(tf.unique, ids) + # pylint: enable=unbalanced-tuple-unpacking + gates = data_parallelism(tf.mod, ids, num_shards) + ids_div = data_parallelism(tf.div, ids, num_shards) + dispatcher = DistributedSingleDispatcher(data_parallelism, model_parallelism, + gates) + x_ids_div = dispatcher.Dispatch(ids_div) + params = model_parallelism(ConvertGradientToTensor, params) + x_emb = model_parallelism(tf.gather, params, x_ids_div) + r_emb = dispatcher.Combine(x_emb) + r_emb = data_parallelism(tf.gather, r_emb, unique_idx) + return r_emb + + +def SampledSoftmaxLoss(features, sampler, num_classes, target_classes, + target_params, sampled_classes, sampled_params): + """Loss for training softmax classifiers on large label vocabulary. 
+ + This function assumes that we have already chosen the sampled classes and + fetched the parameters for the target classes and the sampled classes. + + Args: + features: a Tensor with shape [batch_size, hidden_size] + sampler: a candidate sampler object + (see learning/brain/google/python/ops/candidate_sampling.py) + num_classes: an integer + target_classes: an integer Tensor with shape [batch_size] + target_params: a Tensor with shape [batch_size, hidden_size] + The parameters corresponding to the target classes. + sampled_classes: an integer tensor with shape [num_sampled_classes] + sampled_params: a Tensor with shape [num_sampled_classes, hidden_size] + The parameters corresponding to the sampled classes. + + Returns: + a Tensor with shape [batch_size] + """ + sampled_logits = (tf.matmul(features, sampled_params, transpose_b=True) - + sampler.log_expected_count(sampled_classes)) + target_logits = (tf.reduce_sum(target_params * features, 1) - + sampler.log_expected_count(target_classes)) + sampled_log_denominator = tf.reduce_logsumexp( + sampled_logits, [1], name='SampledLogDenominator') + sampled_classes_mask = tf.unsorted_segment_sum( + tf.fill(tf.shape(sampled_classes), float('-inf')), sampled_classes, + num_classes) + target_log_denominator = ( + target_logits + tf.gather(sampled_classes_mask, target_classes)) + combined_log_denominator = tf.reduce_logsumexp( + tf.stack([sampled_log_denominator, target_log_denominator]), [0]) + loss = combined_log_denominator - target_logits + return loss + + +def ParallelSampledSoftmaxLoss(params, + features, + target_classes, + sampler, + num_classes, + data_parallelism, + target_weights=None): + """Computes sampled softmax loss across many datashards. + + This is used during training to efficiently train a softmax classifier layer. + + Args: + params: A list of num_param_shards Tensors, each with shape + [num_classes / num_param_shards, num_features]. + The parameters are assumed to be mod-sharded by class. + features: a list of num_datashards Tensors, each with shape + [batch_size_i, num_features] + target_classes: A list of num_datashards integer Tensors each with shape + [batch_size_i] + sampler: a candidate sampler object + (see learning/brain/google/python/ops/candidate_sampling.py) + num_classes: an Integer + data_parallelism: a Parallelism object + target_weights: an optional list of num_datashards Tensors each with + shape [batch_size_i] + Returns: + a Scalar. + """ + sampled_classes = data_parallelism(sampler.sample) + sampled_params = ParallelEmbeddingLookup(params, sampled_classes, + data_parallelism) + target_params = ParallelEmbeddingLookup(params, target_classes, + data_parallelism) + ret = data_parallelism(SampledSoftmaxLoss, features, sampler, num_classes, + target_classes, target_params, sampled_classes, + sampled_params) + if target_weights is not None: + ret = data_parallelism(tf.multiply, ret, target_weights) + ret = data_parallelism(tf.reduce_sum, ret) + ret = tf.add_n(ret) + return ret diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py new file mode 100644 index 000000000..4dc952a08 --- /dev/null +++ b/tensor2tensor/utils/metrics.py @@ -0,0 +1,155 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utils for metrics used in eval."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+# Dependency imports
+
+import six
+
+from tensor2tensor.models import common_layers
+from tensor2tensor.utils import bleu_hook
+
+import tensorflow as tf
+
+flags = tf.flags
+FLAGS = flags.FLAGS
+
+
+def padded_accuracy_topk(predictions,
+                         labels,
+                         k,
+                         weights_fn=common_layers.weights_nonzero):
+  """Percentage of times that top-k predictions match labels on non-0s."""
+  with tf.variable_scope("padded_accuracy_topk", values=[predictions, labels]):
+    padded_labels = common_layers.pad_with_zeros(predictions, labels)
+    weights = weights_fn(padded_labels)
+    effective_k = tf.minimum(k, tf.shape(predictions)[-1])
+    _, outputs = tf.nn.top_k(predictions, k=effective_k)
+    outputs = tf.to_int32(outputs)
+    padded_labels = tf.expand_dims(padded_labels, axis=-1)
+    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
+    same = tf.to_float(tf.equal(outputs, padded_labels))
+    same_topk = tf.reduce_sum(same, axis=-1)
+    return same_topk, weights
+
+
+def padded_accuracy_top5(predictions,
+                         labels,
+                         weights_fn=common_layers.weights_nonzero):
+  return padded_accuracy_topk(predictions, labels, 5, weights_fn)
+
+
+def padded_sequence_accuracy(predictions,
+                             labels,
+                             weights_fn=common_layers.weights_nonzero):
+  """Percentage of times that predictions match labels everywhere (non-0)."""
+  with tf.variable_scope(
+      "padded_sequence_accuracy", values=[predictions, labels]):
+    padded_labels = common_layers.pad_with_zeros(predictions, labels)
+    weights = weights_fn(padded_labels)
+    outputs = tf.to_int32(tf.argmax(predictions, axis=-1))
+    not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights
+    axis = list(range(1, len(outputs.get_shape())))
+    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
+    return correct_seq, tf.constant(1.0)
+
+
+def padded_neg_log_perplexity(predictions,
+                              labels,
+                              weights_fn=common_layers.weights_nonzero):
+  """Average log-perplexity excluding padding 0s. No smoothing."""
+  num, den = common_layers.padded_cross_entropy(
+      predictions, labels, 0.0, weights_fn=weights_fn, reduce_sum=False)
+  return (-num, den)
+
+
+def padded_accuracy(predictions,
+                    labels,
+                    weights_fn=common_layers.weights_nonzero):
+  """Percentage of times that predictions match labels on non-0s."""
+  with tf.variable_scope("padded_accuracy", values=[predictions, labels]):
+    padded_labels = common_layers.pad_with_zeros(predictions, labels)
+    weights = weights_fn(padded_labels)
+    outputs = tf.to_int32(tf.argmax(predictions, axis=-1))
+    return tf.to_float(tf.equal(outputs, padded_labels)), weights
+
+
+def create_evaluation_metrics(problems):
+  """Creates the evaluation metrics for the model.
+
+  Args:
+    problems: List of strings containing the names of the problems.
+ + Returns: + A dictionary with keys that are strings naming the evaluation + metrics and values that are functions taking arguments of + (predictions, targets), returning a tuple of a tensor of the + metric's value together with an op to update the metric's value. + """ + + def append_metric_fns(metric_tup, eval_metrics): + """Append problem-specific and global metrics to eval_metrics.""" + metric_name, metric_function = metric_tup + def fn(predictions, labels, weights, idx, weights_fn): + # The 'weights' argument represents problem-choice here, + # we need to keep this name because MetricSpecs checks it. + problem_choice = weights + (scores, weights) = tf.cond( + tf.equal(idx, problem_choice), # pylint: disable=cell-var-from-loop + lambda: metric_function(predictions, labels, weights_fn=weights_fn), + lambda: (tf.constant(0.0), tf.constant(0.0))) + # The tf.metrics.mean function assures correct aggregation. + return tf.metrics.mean(scores, weights) + + for i, problem in enumerate(problems): + name = "metrics-%s/%s" % (problem, metric_name) + weights_fn = (common_layers.weights_concatenated + if "concat" in problem else common_layers.weights_nonzero) + eval_metrics[name] = functools.partial(fn, idx=i, weights_fn=weights_fn) + + def global_fn(predictions, labels, weights): + (scores, weights) = metric_function(predictions, labels) + return tf.metrics.mean(scores, weights) + + eval_metrics["metrics/%s" % metric_name] = global_fn + + eval_metrics = dict() + + # Metrics are functions that take predictions and labels and return + # a tensor of metrics and a tensor of weights. + # The results are passed to tf.metrics.mean to accumulate properly. + metrics_list = [("accuracy", padded_accuracy), ("accuracy_top5", + padded_accuracy_top5), + ("accuracy_per_sequence", padded_sequence_accuracy), + ("neg_log_perplexity", padded_neg_log_perplexity)] + + # TODO(nikip): Extend this to support use of custom metrics for problems. + for problem in problems: + if "wmt" in problem: + metrics_list.append(("bleu_score", bleu_hook.padded_bleu_score)) + + for metric in metrics_list: + append_metric_fns(metric, eval_metrics) + + return { + k: tf.contrib.learn.MetricSpec( + v, prediction_key="predictions", weight_key="problem_choice") + for (k, v) in six.iteritems(eval_metrics) + } diff --git a/tensor2tensor/utils/metrics_test.py b/tensor2tensor/utils/metrics_test.py new file mode 100644 index 000000000..0472d4f21 --- /dev/null +++ b/tensor2tensor/utils/metrics_test.py @@ -0,0 +1,88 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for tensor2tensor.utils.metrics.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +from tensor2tensor.utils import metrics + +import tensorflow as tf + + +class CommonLayersTest(tf.test.TestCase): + + def testAccuracyMetric(self): + predictions = np.random.randint(1, 5, size=(12, 12, 12, 1)) + targets = np.random.randint(1, 5, size=(12, 12, 12, 1)) + expected = np.mean((predictions == targets).astype(float)) + with self.test_session() as session: + scores, _ = metrics.padded_accuracy( + tf.one_hot(predictions, depth=5, dtype=tf.float32), + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + actual = session.run(a) + self.assertAlmostEqual(actual, expected) + + def testAccuracyTopKMetric(self): + predictions = np.random.randint(1, 5, size=(12, 12, 12, 1)) + targets = np.random.randint(1, 5, size=(12, 12, 12, 1)) + expected = np.mean((predictions == targets).astype(float)) + with self.test_session() as session: + predicted = tf.one_hot(predictions, depth=5, dtype=tf.float32) + scores1, _ = metrics.padded_accuracy_topk( + predicted, tf.constant(targets, dtype=tf.int32), k=1) + scores2, _ = metrics.padded_accuracy_topk( + predicted, tf.constant(targets, dtype=tf.int32), k=7) + a1 = tf.reduce_mean(scores1) + a2 = tf.reduce_mean(scores2) + session.run(tf.global_variables_initializer()) + actual1, actual2 = session.run([a1, a2]) + self.assertAlmostEqual(actual1, expected) + self.assertAlmostEqual(actual2, 1.0) + + def testSequenceAccuracyMetric(self): + predictions = np.random.randint(4, size=(12, 12, 12, 1)) + targets = np.random.randint(4, size=(12, 12, 12, 1)) + expected = np.mean( + np.prod((predictions == targets).astype(float), axis=(1, 2))) + with self.test_session() as session: + scores, _ = metrics.padded_sequence_accuracy( + tf.one_hot(predictions, depth=4, dtype=tf.float32), + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + actual = session.run(a) + self.assertEqual(actual, expected) + + def testNegativeLogPerplexity(self): + predictions = np.random.randint(4, size=(12, 12, 12, 1)) + targets = np.random.randint(4, size=(12, 12, 12, 1)) + with self.test_session() as session: + scores, _ = metrics.padded_neg_log_perplexity( + tf.one_hot(predictions, depth=4, dtype=tf.float32), + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + actual = session.run(a) + self.assertEqual(actual.shape, ()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py new file mode 100644 index 000000000..e6b1c9994 --- /dev/null +++ b/tensor2tensor/utils/modality.py @@ -0,0 +1,564 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Modalities define the bottom and top of the model (not the body).""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.models import common_layers +from tensor2tensor.utils import expert_utils as eu +import tensorflow as tf + + +class Modality(object): + """Abstract Modality class for data transformations. + + An abstract class representing modalities for transforming data to a space + interpretable by sequence models. It has 3 functions: + * inputs_bottom: called on inputs entering the model. + * targets_bottom: called on targets entering the model (e.g., the decoder). + * targets_top : called on targets to generate predictions. + + For example, think about a modality for images. The inputs_bottom function + represents the part of the model applied to an incoming image, e.g., an entry + flow of a convolutional network. The targets_top function represents the top + part of a model that is generating images, e.g., a PixelCNN network. The final + function targets_bottom represents the auto-regressive part of the network. + It is applied to the already-generated part of an image, which is given to + the decoder to generate the next part. In some cases, e.g., for text, it is + the same as the inputs_bottom function, as that is the default we use. But, + e.g., for images, a different function might be needed to regress properly. + + All 3 functions have simple and sharded versions. A sub-class only needs + to implement the simple version, the default sharding will be used then. + """ + + def __init__(self, model_hparams): + self._model_hparams = model_hparams + + @property + def name(self): + camelcase_name = type(self).__name__ # DeCamelCase for TF readability. + return re.sub("([A-Z]+)", r"_\1", camelcase_name).lower()[1:] + + @property + def targets_dimensionality(self): + """Integer, the last dimension of the predictions (vocab size).""" + raise NotImplementedError("Abstract Method") + + @property + def _body_input_depth(self): + return self._model_hparams.hidden_size + + def inputs_bottom_simple(self, x): + """Transform one shard of input. + + Args: + x: An int32 Tensor with shape [batch, p0, p1, input_channels] + Returns: + A float32 Tensor with shape [batch, p0, p1, body_input_depth] + """ + raise NotImplementedError("Abstract Method") + + def inputs_bottom_sharded(self, xs, data_parallelism): + """Transform the inputs. + + Args: + xs: A list of num_datashards Tensors (one per shard) + each with shape [batch, p0, p1, depth] + data_parallelism: a expert_utils.Parallelism object + Returns: + shaded_body_input: A list of num_datashards Tensors, each with shape + [batch, p0, p1, body_input_depth]. + """ + return data_parallelism(self.inputs_bottom_simple, xs) + + def targets_bottom_simple(self, x): + """Transform one shard of targets. + + Args: + x: An int32 Tensor with shape [batch, p0, p1, target_channels] + Returns: + A float32 Tensor with shape [batch, p0, p1, body_input_depth] + """ + with tf.variable_scope("targets_bottom_simple"): + return self.inputs_bottom_simple(x) + + def targets_bottom_sharded(self, xs, data_parallelism): + """Transform the targets. 
+ + Args: + xs: A list of num_datashards Tensors (one per shard) + each with shape [batch, p0, p1, target_channels] + data_parallelism: a expert_utils.Parallelism object + Returns: + shaded_body_input: A list of num_datashards Tensors, each with shape + [batch, p0, p1, body_input_depth]. + """ + return data_parallelism(self.targets_bottom_simple, xs) + + def targets_top_simple(self, body_output, targets): + """Transform one shard of output. + + Most classes will override this function. + + Args: + body_output: A Tensor with shape [batch, p0, p1, body_output_depth] + targets: A Tensor with shape [batch, p0, p1, targets_channels, + targets_dimensionality] + Returns: + A Tensor of class logits. + """ + raise NotImplementedError("Abstract Method") + + def targets_top_sharded(self, + sharded_body_output, + sharded_targets, + data_parallelism, + weights_fn=common_layers.weights_nonzero): + """Transform all shards of targets. + + Classes with cross-shard interaction will override this function. + + Args: + sharded_body_output: A list of Tensors. + sharded_targets: A list of Tensors. + data_parallelism: a expert_utils.Parallelism object. + weights_fn: function from targets to target weights. + Returns: + shaded_logits: A list of Tensors. + training_loss: a Scalar. + """ + sharded_logits = data_parallelism(self.targets_top_simple, + sharded_body_output, sharded_targets) + loss_num, loss_den = data_parallelism( + common_layers.padded_cross_entropy, + sharded_logits, + sharded_targets, + self._model_hparams.label_smoothing, + weights_fn=weights_fn) + loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den)) + return sharded_logits, loss + + +class SymbolModality(Modality): + """Modality for sets of discrete symbols. + + Input: + Embedding. + + Output: + Linear transformation + softmax. + """ + + def __init__(self, model_hparams, vocab_size): + super(SymbolModality, self).__init__(model_hparams) + self._vocab_size = vocab_size + self._datashard_device_to_embedding = None + self._datashard_device_to_softmax_weights = None + + @property + def name(self): + return "symbol_modality_%d_%d" % (self._vocab_size, self._body_input_depth) + + @property + def targets_dimensionality(self): + return self._vocab_size + + def _get_weights(self): + """Create or get concatenated embedding or softmax variable. + + Returns: + a list of self._num_shards Tensors. + """ + num_shards = self._model_hparams.symbol_modality_num_shards + shards = [] + for i in xrange(num_shards): + shard_size = (self._vocab_size // num_shards) + ( + 1 if i < self._vocab_size % num_shards else 0) + var_name = "weights_%d" % i + shards.append( + tf.get_variable( + var_name, [shard_size, self._body_input_depth], + initializer=tf.random_normal_initializer( + 0.0, self._body_input_depth**-0.5))) + if num_shards == 1: + ret = shards[0] + else: + ret = tf.concat(shards, 0) + ret = eu.ConvertGradientToTensor(ret) + return ret + + def bottom_simple(self, x, name, reuse): + with tf.variable_scope(name, reuse=reuse): + # Squeeze out the channels dimension. 
+ x = tf.squeeze(x, axis=3) + var = self._get_weights() + ret = tf.gather(var, x) + if self._model_hparams.multiply_embedding_mode == "sqrt_depth": + ret *= self._body_input_depth**0.5 + ret *= tf.expand_dims(tf.to_float(tf.not_equal(x, 0)), -1) + return ret + + def inputs_bottom_simple(self, x): + if self._model_hparams.shared_embedding_and_softmax_weights: + return self.bottom_simple(x, "shared", reuse=None) + else: + return self.bottom_simple(x, "input_emb", reuse=None) + + def targets_bottom_simple(self, x): + if self._model_hparams.shared_embedding_and_softmax_weights: + return self.bottom_simple(x, "shared", reuse=True) + else: + return self.bottom_simple(x, "target_emb", reuse=None) + + def targets_top_simple(self, body_output, targets): + """Generate logits. + + Args: + body_output: A Tensor with shape [batch, p0, p1, body_input_depth] + targets: A Tensor with shape [batch, p0, p1, 1] + Returns: + logits: A Tensor with shape [batch, p0, p1, ?, vocab_size]. + """ + if self._model_hparams.shared_embedding_and_softmax_weights: + scope_name = "shared" + reuse = True + else: + scope_name = "softmax" + reuse = False + with tf.variable_scope(scope_name, reuse=reuse): + var = self._get_weights() + shape = tf.shape(body_output)[:-1] + body_output = tf.reshape(body_output, [-1, self._body_input_depth]) + logits = tf.matmul(body_output, var, transpose_b=True) + logits = tf.reshape(logits, tf.concat([shape, [self._vocab_size]], 0)) + # insert a channels dimension + return tf.expand_dims(logits, 3) + + +class SmallImageModality(Modality): + """Performs strided conv compressions for small image data.""" + + def __init__(self, model_hparams): + super(SmallImageModality, self).__init__(model_hparams) + + @property + def targets_dimensionality(self): + return 256 + + def inputs_bottom_simple(self, inputs): + with tf.variable_scope(self.name): + inputs = common_layers.standardize_images(inputs) + # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet. + # tf.summary.image("inputs", inputs, max_outputs=2) + if self._model_hparams.compress_steps > 0: + strides = (2, 2) + else: + strides = (1, 1) + return common_layers.conv_block( + inputs, + self._body_input_depth, [((1, 1), (3, 3))], + first_relu=False, + strides=strides, + padding="SAME", + force2d=True, + name="small_image_conv") + + def targets_bottom_simple(self, inputs): + with tf.variable_scope(self.name): + inputs = common_layers.standardize_images(inputs) + if self._model_hparams.compress_steps > 0: + kernel, strides = (2, 2), (2, 2) # Crucial to not leak! + else: + kernel, strides = (1, 1), (1, 1) + return common_layers.conv_block( + inputs, + self._body_input_depth, [((1, 1), kernel)], + first_relu=False, + strides=strides, + force2d=True, + name="small_image_conv") + + def targets_top_simple(self, body_output, targets): + with tf.variable_scope(self.name): + if self._model_hparams.compress_steps == 0: + targets_shape = tf.shape(targets) + channels = targets.shape.as_list()[-1] + outputs = tf.layers.dense(body_output, 256 * channels) + return tf.reshape(outputs, [ + targets_shape[0], targets_shape[1], targets_shape[2], 3, 256 + ]) + dilations_kernels = [((1, 1), (3, 1)), ((2, 1), (3, 1)), ((4, 1), (3, 1))] + return common_layers.decompress_seqcnn( + body_output, targets, 256, dilations_kernels, 2, is_2d=True) + + def targets_top_sharded(self, + sharded_body_output, + sharded_targets, + data_parallelism, + weights_fn=common_layers.weights_all): + # Call the default implementation, but weight 1.0 on 0s by default. 
+ # (Since we're processing images and so have no padding and some pixel 0s.) + return super(SmallImageModality, self).targets_top_sharded( + sharded_body_output, + sharded_targets, + data_parallelism, + weights_fn=weights_fn) + + +class ImageModality(Modality): + """Performs embedding and strided conv compressions for large image data.""" + + def __init__(self, model_hparams): + super(ImageModality, self).__init__(model_hparams) + + @property + def targets_dimensionality(self): + return 256 + + def inputs_bottom_simple(self, inputs): + """Transform input from data space to model space. + + Perform the Xception "Entry flow", which consists of two convolutional + filter upscalings followed by three residually connected separable + convolution blocks. + + Args: + inputs: A Tensor with shape [batch, ...] + Returns: + body_input: A Tensor with shape [batch, ?, ?, body_input_depth]. + """ + with tf.variable_scope(self.name): + + def xnet_resblock(x, filters, res_relu, name): + with tf.variable_scope(name): + y = common_layers.separable_conv_block( + x, + filters, [((1, 1), (3, 3)), ((1, 1), (3, 3))], + first_relu=True, + padding="SAME", + force2d=True, + name="sep_conv_block") + y = common_layers.pool(y, (3, 3), "MAX", "SAME", strides=(2, 2)) + return y + common_layers.conv_block( + x, + filters, [((1, 1), (1, 1))], + padding="SAME", + strides=(2, 2), + first_relu=res_relu, + force2d=True, + name="res_conv0") + + inputs = common_layers.standardize_images(inputs) + # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet. + # tf.summary.image("inputs", inputs, max_outputs=2) + x = common_layers.conv_block( + inputs, + 32, [((1, 1), (3, 3))], + first_relu=False, + padding="SAME", + strides=(2, 2), + force2d=True, + name="conv0") + x = common_layers.conv_block( + x, 64, [((1, 1), (3, 3))], padding="SAME", force2d=True, name="conv1") + x = xnet_resblock(x, min(128, self._body_input_depth), True, "block0") + x = xnet_resblock(x, min(256, self._body_input_depth), False, "block1") + return xnet_resblock(x, self._body_input_depth, False, "block2") + + def targets_top_simple(self, body_output, _): + # TODO(lukaszkaiser): work on a better way to generate large images. + with tf.variable_scope(self.name): + decompressed_inputs = common_layers.deconv_stride2_multistep( + body_output, + self._model_hparams.compress_steps, + body_output.get_shape()[-1], + name="deconv") + return common_layers.conv( + decompressed_inputs, self._vocab_size, (1, 1), padding="SAME") + + +class AudioModality(Modality): + """Performs strided conv compressions for audio data.""" + + def __init__(self, model_hparams): + super(AudioModality, self).__init__(model_hparams) + + def inputs_bottom_simple(self, inputs): + """Transform input from data space to model space. + + Args: + inputs: A Tensor with shape [batch, ...] + Returns: + body_input: A Tensor with shape [batch, ?, ?, body_input_depth]. + """ + with tf.variable_scope(self.name): + # TODO(aidangomez): Will need to sort out a better audio pipeline + def xnet_resblock(x, filters, res_relu, name): + with tf.variable_scope(name): + # Typically audio samples are >100k samples in length and have a width + # of 2 or 4. Mono audio has a single channel while stereo has 2. 
+ y = common_layers.separable_conv_block( + x, + filters, [((1, 1), (3, 3)), ((1, 1), (3, 3))], + first_relu=True, + padding="SAME", + force2d=True, + name="sep_conv_block") + y = common_layers.pool(y, (3, 3), "MAX", "SAME", strides=(2, 2)) + return y + common_layers.conv_block( + x, + filters, [((1, 1), (1, 1))], + padding="SAME", + strides=(2, 2), + first_relu=res_relu, + force2d=True, + name="res_conv0") + + x = tf.to_float(inputs) / 255. + x.set_shape([None, None, None, 1]) + for i in xrange(self._model_hparams.audio_compression): + x = xnet_resblock(x, 2**(i + 1), True, "compress_block_%d" % i) + return xnet_resblock(x, self._body_input_depth, False, + "compress_block_final") + + +class AudioSpectralModality(Modality): + """Performs strided conv compressions for audio spectral data.""" + + def __init__(self, model_hparams): + super(AudioSpectralModality, self).__init__(model_hparams) + + def inputs_bottom_simple(self, inputs): + """Transform input from data space to model space. + + Args: + inputs: A Tensor with shape [batch, ...] + Returns: + body_input: A Tensor with shape [batch, ?, ?, body_input_depth]. + """ + with tf.variable_scope(self.name): + # TODO(aidangomez): Will need to sort out a better audio pipeline + def xnet_resblock(x, filters, res_relu, name): + with tf.variable_scope(name): + # We only stride along the length dimension to preserve the spectral + # bins (which are tiny in dimensionality relative to length) + y = common_layers.separable_conv_block( + x, + filters, [((1, 1), (3, 3)), ((1, 1), (3, 3))], + first_relu=True, + padding="SAME", + force2d=True, + name="sep_conv_block") + y = common_layers.pool(y, (3, 3), "MAX", "SAME", strides=(2, 1)) + return y + common_layers.conv_block( + x, + filters, [((1, 1), (1, 1))], + padding="SAME", + strides=(2, 1), + first_relu=res_relu, + force2d=True, + name="res_conv0") + + # Bitcast back from int32 + x = tf.bitcast(inputs, tf.float32) + x.set_shape([None, None, None, 1]) + for i in xrange(self._model_hparams.audio_compression): + x = xnet_resblock(x, 2**(i + 1), True, "compress_block_%d" % i) + return xnet_resblock(x, self._body_input_depth, False, + "compress_block_final") + + +class ClassLabelModality(Modality): + """Used for label data.""" + + def __init__(self, model_hparams, vocab_size, is2d=False): + super(ClassLabelModality, self).__init__(model_hparams) + self._vocab_size = vocab_size + self._is_2d = is2d + self._kernel = (3, 3) if is2d else (5, 1) + self._strides = (2, 2) if is2d else (4, 1) + self._padding = "SAME" if is2d else "LEFT" + + @property + def name(self): + return "class_label_modality_%d_%d" % (self._vocab_size, + self._body_input_depth) + + @property + def targets_dimensionality(self): + return self._vocab_size + + def inputs_bottom_simple(self, x): + with tf.variable_scope(self.name): + return common_layers.embedding( + x, + self._vocab_size, + self._body_input_depth, + multiplier=self._body_input_depth**0.5 if + self._model_hparams.multiply_embedding_mode == "sqrt_depth" else 1.0) + + def targets_bottom_simple(self, x): + with tf.variable_scope(self.name): + return tf.zeros([tf.shape(x)[0], 1, 1, self._body_input_depth]) + + def targets_top_simple(self, body_output, _): + """Transform inputs from model space to target space. + + Perform the Xception "Exit flow", consisting of a single residual block and + two separable convolutional upscalings followed by global spatial average + pooling. + + Args: + body_output: A Tensor with shape [batch, ?, ?, body_output_size]. 
+    Returns:
+      A Tensor with shape [batch_size, ?, ?, vocab_size]
+    """
+    with tf.variable_scope(self.name):
+      x = body_output
+
+      # Assume input is a square with self._body_input_depth channels.
+      if self._is_2d:
+        length_float = tf.to_float(tf.shape(x)[1])
+        spatial_dim_float = tf.sqrt(length_float)
+        spatial_dim = tf.to_int32(spatial_dim_float)
+        x = tf.reshape(x, [-1, spatial_dim, spatial_dim,
+                           self._body_input_depth])
+      x = common_layers.conv_block_downsample(x, self._kernel, self._strides,
+                                              self._padding)
+      x = tf.nn.relu(x)
+      x = tf.reduce_mean(x, axis=[1, 2], keep_dims=True)
+      res = common_layers.conv(x, self._vocab_size, (1, 1))
+      return tf.expand_dims(res, 3)
+
+  def targets_top_sharded(self,
+                          sharded_body_output,
+                          sharded_targets,
+                          data_parallelism,
+                          weights_fn=common_layers.weights_all):
+    # Call the default implementation, but weight 1.0 on 0s by default.
+    # (Since we're classifying, there is no padding, and some labels are 0.)
+    return super(ClassLabelModality, self).targets_top_sharded(
+        sharded_body_output,
+        sharded_targets,
+        data_parallelism,
+        weights_fn=weights_fn)
diff --git a/tensor2tensor/utils/modality_test.py b/tensor2tensor/utils/modality_test.py
new file mode 100644
index 000000000..0b22b4eff
--- /dev/null
+++ b/tensor2tensor/utils/modality_test.py
@@ -0,0 +1,88 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
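+
+# These tests exercise the sharded Modality API: inputs_bottom_sharded embeds
+# each datashard's integer ids into [batch, length, 1, hidden_size] tensors,
+# and targets_top_sharded maps body outputs to per-shard logits plus a scalar
+# training loss.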
+ +"""Tests for Modalities.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.utils import expert_utils +from tensor2tensor.utils import modality + +import tensorflow as tf + + +class ModalityTest(tf.test.TestCase): + + def testSymbolModalityInputs(self): + batch_size = 10 + num_datashards = 5 + length = 5 + vocab_size = 5000 + hidden_size = 9 + model_hparams = tf.contrib.training.HParams( + symbol_modality_num_shards=4, + hidden_size=hidden_size, + multiply_embedding_mode="sqrt_depth", + shared_embedding_and_softmax_weights=0) + x = -1 + np.random.random_integers(vocab_size, size=( + batch_size, length, 1, 1)) + m = modality.SymbolModality(model_hparams, vocab_size) + data_parallelism = expert_utils.Parallelism( + ["/device:CPU:0"] * num_datashards, reuse=True) + with self.test_session() as session: + xs = tf.split(x, num_datashards) + sharded_output = m.inputs_bottom_sharded(xs, data_parallelism) + output = tf.concat(sharded_output, 0) + session.run(tf.global_variables_initializer()) + res = session.run(output) + self.assertEqual(res.shape, (batch_size, length, 1, hidden_size)) + + def testSymbolModalityTargets(self): + batch_size = 10 + num_datashards = 5 + length = 6 + height = 7 + hidden_size = 9 + vocab_size = 11 + model_hparams = tf.contrib.training.HParams( + symbol_modality_num_shards=4, + hidden_size=hidden_size, + label_smoothing=0.2, + shared_embedding_and_softmax_weights=0) + body_output = -1 + np.random.random_integers( + 100, size=(batch_size, length, height, hidden_size)) + targets = -1 + np.random.random_integers( + vocab_size, size=(batch_size, length, height, 1)) + m = modality.SymbolModality(model_hparams, vocab_size) + data_parallelism = expert_utils.Parallelism( + ["/device:CPU:0"] * num_datashards, reuse=True) + with self.test_session() as session: + sharded_body_output = tf.split(tf.to_float(body_output), num_datashards) + sharded_targets = tf.split(targets, num_datashards) + sharded_logits, train_loss = m.targets_top_sharded( + sharded_body_output, sharded_targets, data_parallelism) + logits = tf.concat(sharded_logits, 0) + session.run(tf.global_variables_initializer()) + res1, res2 = session.run((logits, train_loss)) + self.assertEqual(res1.shape, (batch_size, length, height, 1, vocab_size)) + self.assertEqual(res2.shape, ()) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py new file mode 100644 index 000000000..7be75b919 --- /dev/null +++ b/tensor2tensor/utils/registry.py @@ -0,0 +1,184 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Registry for models, hyperparameter settings, problem types, and datasets. + +Define a new model by subclassing T2TModel and register it: + +``` +@registry.register_model +class MyModel(T2TModel): + ... +``` + +Access by snake-cased name: `registry.model("my_model")`. 
If you're using
+`trainer.py`, you can pass on the command-line: `--model=my_model`.
+
+See all the models registered: `registry.list_models()`.
+
+For hyperparameter sets:
+  * Register: `registry.register_hparams`
+  * List: `registry.list_hparams`
+  * Retrieve by name: `registry.hparams`
+  * Command-line flag in `trainer.py`: `--hparams_set=name`
+
+For hyperparameter ranges:
+  * Register: `registry.register_ranged_hparams`
+  * List: `registry.list_ranged_hparams`
+  * Retrieve by name: `registry.ranged_hparams`
+  * Command-line flag in `trainer.py`: `--hparams_range=name`
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect
+import re
+
+# Dependency imports
+
+from tensor2tensor.utils import t2t_model
+
+import tensorflow as tf
+
+_MODELS = {}
+_HPARAMS = {}
+_RANGED_HPARAMS = {}
+
+# Camel case to snake case utils
+_first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)")
+_all_cap_re = re.compile("([a-z])([A-Z])")
+
+
+def _convert_camel_to_snake(name):
+  s1 = _first_cap_re.sub(r"\1_\2", name)
+  return _all_cap_re.sub(r"\1_\2", s1).lower()
+
+
+def _reset():
+  for ctr in [_MODELS, _HPARAMS, _RANGED_HPARAMS]:
+    ctr.clear()
+
+
+def _default_name(obj):
+  return _convert_camel_to_snake(obj.__name__)
+
+
+def register_model(name=None):
+  """Register a model. name defaults to class name snake-cased."""
+
+  def decorator(model_cls, registration_name=None):
+    """Registers & returns model_cls with registration_name or default name."""
+    model_name = registration_name or _default_name(model_cls)
+    if model_name in _MODELS:
+      raise ValueError("Model %s already registered." % model_name)
+    if (not inspect.isclass(model_cls) or
+        not issubclass(model_cls, t2t_model.T2TModel)):
+      tf.logging.warning("Model %s does not subclass T2TModel. "
+                         "Object is expected to abide by its API.", model_name)
+    _MODELS[model_name] = model_cls
+    return model_cls
+
+  # Handle if decorator was used without parens
+  if callable(name):
+    model_cls = name
+    return decorator(model_cls, registration_name=_default_name(model_cls))
+
+  return lambda model_cls: decorator(model_cls, name)
+
+
+def model(name):
+  if name not in _MODELS:
+    raise ValueError("Model %s never registered." % name)
+  return _MODELS[name]
+
+
+def list_models():
+  return list(_MODELS)
+
+
+def register_hparams(name=None):
+  """Register an HParams set. name defaults to function name snake-cased."""
+
+  def decorator(hp_fn, registration_name=None):
+    """Registers & returns hp_fn with registration_name or default name."""
+    hp_name = registration_name or _default_name(hp_fn)
+    if hp_name in _HPARAMS:
+      raise ValueError("HParams set %s already registered." % hp_name)
+    _HPARAMS[hp_name] = hp_fn
+    return hp_fn
+
+  # Handle if decorator was used without parens
+  if callable(name):
+    hp_fn = name
+    return decorator(hp_fn, registration_name=_default_name(hp_fn))
+
+  return lambda hp_fn: decorator(hp_fn, name)
+
+
+def hparams(name):
+  if name not in _HPARAMS:
+    raise ValueError("HParams set %s never registered." % name)
+  return _HPARAMS[name]
+
+
+def list_hparams():
+  return list(_HPARAMS)
+
+
+def register_ranged_hparams(name=None):
+  """Register a RangedHParams set. name defaults to fn name snake-cased."""
+
+  def decorator(rhp_fn, registration_name=None):
+    """Registers & returns rhp_fn with registration_name or default name."""
+    rhp_name = registration_name or _default_name(rhp_fn)
+    if rhp_name in _RANGED_HPARAMS:
+      raise ValueError("RangedHParams set %s already registered."
% rhp_name) + # Check that the fn takes a single argument + args, varargs, keywords, _ = inspect.getargspec(rhp_fn) + if len(args) != 1 or varargs is not None or keywords is not None: + raise ValueError("RangedHParams set function must take a single " + "argument, the RangedHParams object.") + + _RANGED_HPARAMS[rhp_name] = rhp_fn + return rhp_fn + + # Handle if decorator was used without parens + if callable(name): + rhp_fn = name + return decorator(rhp_fn, registration_name=_default_name(rhp_fn)) + + return lambda rhp_fn: decorator(rhp_fn, name) + + +def ranged_hparams(name): + if name not in _RANGED_HPARAMS: + raise ValueError("RangedHParams set %s never registered." % name) + return _RANGED_HPARAMS[name] + + +def list_ranged_hparams(): + return list(_RANGED_HPARAMS) + + +def help_string(): + help_str = """Registry contents: + + Models: %s + + HParams: %s + + RangedHParams: %s + """ + return help_str % (list_models(), list_hparams(), list_ranged_hparams()) diff --git a/tensor2tensor/utils/registry_test.py b/tensor2tensor/utils/registry_test.py new file mode 100644 index 000000000..54ccca749 --- /dev/null +++ b/tensor2tensor/utils/registry_test.py @@ -0,0 +1,202 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for tensor2tensor.registry.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + +# pylint: disable=unused-variable + + +class ModelRegistryTest(tf.test.TestCase): + + def setUp(self): + registry._reset() + + def testT2TModelRegistration(self): + + @registry.register_model + class MyModel1(t2t_model.T2TModel): + pass + + model = registry.model("my_model1") + self.assertTrue(model is MyModel1) + + def testNamedRegistration(self): + + @registry.register_model("model2") + class MyModel1(t2t_model.T2TModel): + pass + + model = registry.model("model2") + self.assertTrue(model is MyModel1) + + def testNonT2TModelRegistration(self): + + @registry.register_model + def model_fn(): + pass + + model = registry.model("model_fn") + self.assertTrue(model is model_fn) + + def testUnknownModel(self): + with self.assertRaisesRegexp(ValueError, "never registered"): + registry.model("not_registered") + + def testDuplicateRegistration(self): + + @registry.register_model + def m1(): + pass + + with self.assertRaisesRegexp(ValueError, "already registered"): + + @registry.register_model("m1") + def m2(): + pass + + def testListModels(self): + + @registry.register_model + def m1(): + pass + + @registry.register_model + def m2(): + pass + + self.assertSetEqual(set(["m1", "m2"]), set(registry.list_models())) + + def testSnakeCase(self): + convert = registry._convert_camel_to_snake + + self.assertEqual("typical_camel_case", convert("TypicalCamelCase")) + self.assertEqual("numbers_fuse2gether", convert("NumbersFuse2Gether")) + self.assertEqual("lstm_seq2seq", 
convert("LSTMSeq2Seq")) + self.assertEqual("starts_lower", convert("startsLower")) + self.assertEqual("starts_lower_caps", convert("startsLowerCAPS")) + self.assertEqual("caps_fuse_together", convert("CapsFUSETogether")) + self.assertEqual("startscap", convert("Startscap")) + self.assertEqual("s_tartscap", convert("STartscap")) + + +class HParamRegistryTest(tf.test.TestCase): + + def setUp(self): + registry._reset() + + def testHParamSet(self): + + @registry.register_hparams + def my_hparams_set(): + pass + + @registry.register_ranged_hparams + def my_hparams_range(_): + pass + + self.assertTrue(registry.hparams("my_hparams_set") is my_hparams_set) + self.assertTrue( + registry.ranged_hparams("my_hparams_range") is my_hparams_range) + + def testNamedRegistration(self): + + @registry.register_hparams("a") + def my_hparams_set(): + pass + + @registry.register_ranged_hparams("a") + def my_hparams_range(_): + pass + + self.assertTrue(registry.hparams("a") is my_hparams_set) + self.assertTrue(registry.ranged_hparams("a") is my_hparams_range) + + def testUnknownHparams(self): + with self.assertRaisesRegexp(ValueError, "never registered"): + registry.hparams("not_registered") + with self.assertRaisesRegexp(ValueError, "never registered"): + registry.ranged_hparams("not_registered") + + def testDuplicateRegistration(self): + + @registry.register_hparams + def hp1(): + pass + + with self.assertRaisesRegexp(ValueError, "already registered"): + + @registry.register_hparams("hp1") + def hp2(): + pass + + @registry.register_ranged_hparams + def rhp1(_): + pass + + with self.assertRaisesRegexp(ValueError, "already registered"): + + @registry.register_ranged_hparams("rhp1") + def rhp2(_): + pass + + def testListHparams(self): + + @registry.register_hparams + def hp1(): + pass + + @registry.register_hparams("hp2_named") + def hp2(): + pass + + @registry.register_ranged_hparams + def rhp1(_): + pass + + @registry.register_ranged_hparams("rhp2_named") + def rhp2(_): + pass + + self.assertSetEqual(set(["hp1", "hp2_named"]), set(registry.list_hparams())) + self.assertSetEqual( + set(["rhp1", "rhp2_named"]), set(registry.list_ranged_hparams())) + + def testRangeSignatureCheck(self): + + with self.assertRaisesRegexp(ValueError, "must take a single argument"): + + @registry.register_ranged_hparams + def rhp_bad(): + pass + + with self.assertRaisesRegexp(ValueError, "must take a single argument"): + + @registry.register_ranged_hparams + def rhp_bad2(a, b): # pylint: disable=unused-argument + pass + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py new file mode 100644 index 000000000..80c06e347 --- /dev/null +++ b/tensor2tensor/utils/t2t_model.py @@ -0,0 +1,429 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""T2TModel Base Class.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +# Dependency imports + +import six +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.utils import beam_search +from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import modality + +import tensorflow as tf + + +def _with_timing(fn, msg): + + def fn_with_timing(*args, **kwargs): + start_time = time.time() + res = fn(*args, **kwargs) + tf.logging.info("Doing %s took %.3f sec." % (msg, time.time() - start_time)) + return res + + return fn_with_timing + + +class T2TModel(object): + """Abstract base class for models. + + Subclassess generally only need to override `build_model`. + """ + + def __init__(self, + hparams, + problem_hparams, + problem_idx=0, + data_parallelism=None, + ps_devices=None): + """Create a T2TModel. + + Args: + hparams: a hyperparameters object. + problem_hparams: a hyperparameters object. + problem_idx: an integer. + data_parallelism: a expert_utils.parallelism + (specifies devices for data parallelism). + ps_devices: a list of devices to be used for experts + + Returns: + a T2TModel + """ + if data_parallelism is None: + data_parallelism = eu.Parallelism([""]) + if ps_devices is None: + ps_devices = [""] + self._hparams = hparams + self._data_parallelism = data_parallelism + self._num_datashards = data_parallelism.n + self._ps_devices = ps_devices + self._problem_hparams = problem_hparams + self._problem_idx = problem_idx + + @property + def has_input(self): + return self._input_modality + + def infer(self, + features=None, + decode_length=50, + beam_size=1, + top_beams=1, + last_position_only=False, + alpha=0.0): + """A inference method. + + Quadratic time in decode_length. + + Args: + features: an map of string to `Tensor` + decode_length: an integer. How many additional timesteps to decode. + beam_size: number of beams. + top_beams: an integer. How many of the beams to return. + last_position_only: a boolean, speed-up by computing last position only. + alpha: Float that controls the length penalty. larger the alpha, stronger + the preference for slonger translations. + + Returns: + samples: an integer `Tensor`. + """ + if beam_size == 1: + tf.logging.info("Greedy Decoding") + return self._greedy_infer(features, decode_length, last_position_only) + else: + tf.logging.info("Beam Decoding with beam size %d" % beam_size) + return self._beam_decode(features, decode_length, beam_size, top_beams, + last_position_only, alpha) + + def _beam_decode(self, features, decode_length, beam_size, top_beams, + last_position_only, alpha): + """Beam search decoding. + + Args: + features: an map of string to `Tensor` + decode_length: an integer. How many additional timesteps to decode. + beam_size: number of beams. + top_beams: an integer. How many of the beams to return. + last_position_only: a boolean, speed-up by computing last position only. + alpha: Float that controls the length penalty. larger the alpha, stronger + the preference for slonger translations. + + Returns: + samples: an integer `Tensor`. 
Top samples from the beam search + """ + + def symbols_to_logits_fn(ids): + """Go from ids to logits.""" + ids = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3) + ids = tf.pad(ids[:, 1:], [[0, 0], [0, 1], [0, 0], [0, 0]]) + + features["targets"] = ids + self._coverage = None + sharded_logits, _, _ = self.model_fn( + features, False, last_position_only=last_position_only) + # now self._coverage is a coverage tensor for the first datashard. + # it has shape [batch_size] and contains floats between 0 and + # source_length. + logits = sharded_logits[0] # Assuming we have one shard. + if last_position_only: + return tf.squeeze(logits, axis=[1, 2, 3]) + current_output_position = tf.shape(ids)[1] - 1 # -1 due to the pad above. + logits = logits[:, current_output_position, :, :] + return tf.squeeze(logits, axis=[1, 2]) + + batch_size = tf.shape(features["inputs"])[0] + initial_ids = tf.zeros([batch_size], dtype=tf.int32) + + inputs_old = features["inputs"] + features["inputs"] = tf.expand_dims(features["inputs"], 1) + if len(features["inputs"].shape) < 5: + features["inputs"] = tf.expand_dims(features["inputs"], 4) + # Expand the inputs in to the beam size. + features["inputs"] = tf.tile(features["inputs"], [1, beam_size, 1, 1, 1]) + s = tf.shape(features["inputs"]) + features["inputs"] = tf.reshape(features["inputs"], + [s[0] * s[1], s[2], s[3], s[4]]) + + target_modality = self._hparams.problems[self._problem_idx].target_modality + vocab_size = target_modality.targets_dimensionality + # Setting decode length to input length + decode_length + decode_length = tf.shape(features["inputs"])[1] + tf.constant(decode_length) + ids, scores = beam_search.beam_search(symbols_to_logits_fn, initial_ids, + beam_size, decode_length, vocab_size, + alpha) + + # Set inputs back to the unexpanded inputs to not to confuse the Estimator! + features["inputs"] = inputs_old + + # Return `top_beams` decodings (also remove initial id from the beam search) + return_scores = False # TODO(lukaszkaiser): make it work multi-problem. + if top_beams == 1: + if return_scores: + return {"outputs": ids[:, 0, 1:], "scores": scores} + return ids[:, 0, 1:] + else: + if return_scores: + return {"outputs": ids[:, :top_beams, 1:], "scores": scores} + return ids[:, :top_beams, 1:] + + def _greedy_infer(self, features, decode_length, last_position_only): + """A slow greedy inference method. + + Quadratic time in decode_length. + + Args: + features: an map of string to `Tensor` + decode_length: an integer. How many additional timesteps to decode. + last_position_only: a boolean, speed-up by computing last position only. + + Returns: + samples: an integer `Tensor`. + """ + if not features: + features = {} + inputs_old = None + if "inputs" in features and len(features["inputs"].shape) < 4: + inputs_old = features["inputs"] + features["inputs"] = tf.expand_dims(features["inputs"], 2) + + def infer_step(recent_output, _): + """Inference step.""" + recent_output.set_shape([None, None, None, 1]) + padded = tf.pad(recent_output, [[0, 0], [0, 1], [0, 0], [0, 0]]) + features["targets"] = padded + # This is inefficient in that it generates samples at all timesteps, + # not just the last one, except if last_position_only is set (dangerous). + samples = self.sample(features, last_position_only=last_position_only) + # Concatenate the already-generated recent_output with last timestep + # of the newly-generated samples. 
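+      # With last_position_only the model only returned the newest position,
+      # so we take index -1; otherwise we index position
+      # tf.shape(recent_output)[1], the first timestep beyond the prefix.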
+      if last_position_only:
+        cur_sample = samples[:, -1, :, :]
+      else:
+        cur_sample = samples[:, tf.shape(recent_output)[1], :, :]
+      cur_sample = tf.to_int64(tf.expand_dims(cur_sample, axis=1))
+      samples = tf.concat([recent_output, cur_sample], axis=1)
+      samples.set_shape([None, None, None, 1])
+      return samples
+
+    # Create an initial output tensor. This will be passed
+    # to the infer_step, which adds one timestep at every iteration.
+    if "partial_targets" in features:
+      initial_output = tf.convert_to_tensor(features["partial_targets"])
+    else:
+      batch_size = tf.shape(features["inputs"])[0]
+      initial_output = tf.zeros((batch_size, 0, 1, 1), dtype=tf.int64)
+    # Hack: foldl complains when the output shape is less specified than the
+    # input shape, so we confuse it about the input shape.
+    initial_output = tf.slice(initial_output, [0, 0, 0, 0],
+                              tf.shape(initial_output))
+    if isinstance(self._hparams.problems[self._problem_idx].target_modality,
+                  modality.ClassLabelModality):
+      decode_length = 1
+    else:
+      decode_length = tf.shape(features["inputs"])[1] + decode_length
+    result = tf.foldl(
+        infer_step,
+        tf.range(decode_length),
+        initializer=initial_output,
+        back_prop=False,
+        parallel_iterations=1)
+    if inputs_old is not None:  # Restore to not confuse Estimator.
+      features["inputs"] = inputs_old
+    return result
+
+  def sample(self, features, last_position_only=False):
+    """Run the model and extract samples.
+
+    Args:
+      features: a map of string to `Tensor`.
+      last_position_only: a boolean, speed-up by computing last position only.
+
+    Returns:
+      samples: an integer `Tensor`.
+    """
+    sharded_logits, _, _ = self.model_fn(
+        features, False, last_position_only=last_position_only)
+    if self._hparams.sampling_method == "argmax":
+      sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4)
+    else:
+      assert self._hparams.sampling_method == "random"
+
+      def _multinomial_squeeze(logits):
+        reshaped_logits = tf.reshape(logits, [-1, tf.shape(logits)[-1]])
+        choices = tf.multinomial(reshaped_logits, 1)
+        choices = tf.reshape(choices,
+                             tf.shape(logits)[:logits.get_shape().ndims - 1])
+        return choices
+
+      sharded_samples = self._data_parallelism(_multinomial_squeeze,
+                                               sharded_logits)
+    return tf.concat(sharded_samples, 0)
+
+  def _shard_features(self, features):  # pylint: disable=missing-docstring
+    sharded_features = dict()
+    for k, v in six.iteritems(features):
+      v = tf.convert_to_tensor(v)
+      if not v.shape.as_list():
+        v = tf.expand_dims(v, axis=-1)
+        v = tf.tile(v, [self._num_datashards])
+      sharded_features[k] = self._data_parallelism(tf.identity,
+                                                   tf.split(
+                                                       v, self._num_datashards,
+                                                       0))
+    return sharded_features
+
+  def model_fn(self, features, train, skip=False, last_position_only=False):
+    """Computes the entire model and produces sharded logits and training loss.
+
+    Args:
+      features: A dictionary of feature name to tensor.
+      train: a boolean `Scalar` (whether we are in training mode).
+      skip: a boolean, if we're just dummy-calling and actually skip this model
+        (but we need to create variables to not confuse distributed training).
+      last_position_only: a boolean, compute logits for only the last position.
+
+    Returns:
+      sharded_logits: a list of `Tensor`s, one per datashard.
+      training_loss: a floating point `Scalar`.
+    """
+    start_time = time.time()
+    dp = self._data_parallelism
+
+    sharded_features = self._shard_features(features)
+
+    # Construct the model bottom for inputs.
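+    # Each input feature is transformed by its problem's input modality; a
+    # modality's variable scope is reused if an earlier problem already
+    # created variables under the same name.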
+ transformed_features = {} + all_previous_modalities = [] + + for key, input_modality in six.iteritems( + self._problem_hparams.input_modality): + previous_modalities = [ + self._hparams.problems[i].input_modality[key].name + for i in xrange(self._problem_idx) + ] + all_previous_modalities.extend(previous_modalities) + do_reuse = input_modality.name in all_previous_modalities + with tf.variable_scope(input_modality.name, reuse=do_reuse): + transformed_features[key] = input_modality.inputs_bottom_sharded( + sharded_features[key], dp) + all_previous_modalities.append(input_modality.name) + + # Target space id just gets copied to every shard. + if "target_space_id" in features: + transformed_features["target_space_id"] = [features["target_space_id"] + ] * self._num_datashards + + # Targets are transformed by the autoregressive part of the modality + previous_tgt_modalities = [ + self._hparams.problems[i].target_modality.name + for i in xrange(self._problem_idx) + ] + all_previous_modalities.extend(previous_tgt_modalities) + + target_modality = self._problem_hparams.target_modality + target_reuse = target_modality.name in previous_tgt_modalities + with tf.variable_scope(target_modality.name, reuse=target_reuse): + transformed_features["targets"] = target_modality.targets_bottom_sharded( + sharded_features["targets"], dp) + + # Construct the model body. + with tf.variable_scope("body", reuse=self._problem_idx > 0): + if skip: + body_outputs, extra_loss = transformed_features["targets"], 0.0 + else: + body_outputs, extra_loss = self.model_fn_body_sharded( + transformed_features, train) + + with tf.variable_scope(target_modality.name, reuse=target_reuse): + if not last_position_only: + sharded_logits, training_loss = (target_modality.targets_top_sharded( + body_outputs, sharded_features["targets"], self._data_parallelism)) + + training_loss *= self._problem_hparams.loss_multiplier + else: + # Take body outputs for the last position only, and targets too. + # TODO(lukaszkaiser): warning, this doesn't work for all modalities! + last_position_body_outputs = [ + tf.expand_dims(body_shard[:, -1, :, :], axis=[1]) + for body_shard in body_outputs + ] + last_position_targets = [ + tf.expand_dims(target_shard[:, -1:, :, :], axis=[1]) + for target_shard in sharded_features["targets"] + ] + sharded_logits, training_loss = (target_modality.targets_top_sharded( + last_position_body_outputs, last_position_targets, + self._data_parallelism)) + + training_loss = None + + tf.logging.info("This model_fn took %.3f sec." % (time.time() - start_time)) + return sharded_logits, training_loss, extra_loss + + def model_fn_body_sharded(self, sharded_features, train): + """Mixture-of-experts models will override this function. + + Compute model body on all datashards. + + Args: + sharded_features: map from string to list of Tensors each with shape + [batch, ?, ?, body_input_size] + train: A boolean `Scalar` (whether we are in training mode). + + Returns: + sharded_body_output: + a list of Tensors, each with shape [batch, O, P, body_output_size] + extra_loss: a Scalar. 
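+
+    For example, a subclass might override `model_fn_body` (which this method
+    runs on each datashard) along these lines (an illustrative sketch only;
+    `my_network` is hypothetical):
+
+      def model_fn_body(self, features, train):
+        return my_network(features["targets"], self._hparams)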
+ """ + with tf.name_scope("model"): + datashard_to_features = [{ + k: v[d] + for k, v in six.iteritems(sharded_features) + } for d in xrange(self._num_datashards)] + output = self._data_parallelism( + _with_timing(self.model_fn_body, "model_fn_body"), + datashard_to_features, train) + if isinstance(output, tuple): + loss = tf.reduce_mean(output[1]) + output = output[0] + else: + loss = 0.0 + return output, loss + + def model_fn_body(self, features, train): + """Most models will override this function. + + Compute label logits for one shard as a function of the transformed + features. + + Args: + features: A dictionary of key to Tensor. Each Tensor has shape + `[batch_size, ?, ?, hidden_size]`. + train: A boolean `Scalar` (whether we are in training mode). + + Returns: + a `Tensor` of logits with shape `[batch_size, O, P, body_output_size]`. + """ + raise NotImplementedError("Abstract Method") + + @property + def hparams(self): + return self._hparams diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py new file mode 100644 index 000000000..87f56f76c --- /dev/null +++ b/tensor2tensor/utils/trainer_utils.py @@ -0,0 +1,1302 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for trainer binary.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import operator +import os +import sys + +# Dependency imports + +import numpy as np +import six +# pylint: disable=redefined-builtin +from six.moves import input +from six.moves import xrange +from six.moves import zip +# pylint: enable=redefined-builtin + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import models # pylint: disable=unused-import +from tensor2tensor.utils import data_reader +from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import metrics +from tensor2tensor.utils import registry + +import tensorflow as tf +from tensorflow.contrib.learn.python.learn import learn_runner +from tensorflow.python.ops import init_ops + + +# Number of samples to draw for an image input (in such cases as captioning) +IMAGE_DECODE_LENGTH = 100 + +flags = tf.flags +FLAGS = flags.FLAGS + +flags.DEFINE_bool("registry_help", False, + "If True, logs the contents of the registry and exits.") +flags.DEFINE_string("master", "", "Address of TensorFlow master.") +flags.DEFINE_string("schedule", "local_run", + "Method of tf.contrib.learn.Experiment to run.") +flags.DEFINE_string("output_dir", "", "Base output directory for run.") +flags.DEFINE_string("model", "", "Which model to use.") +flags.DEFINE_string("hparams_set", "", "Which parameters to use.") +flags.DEFINE_string("hparams_range", "", "Parameters range.") +flags.DEFINE_string( + "hparams", "", + """A comma-separated list of `name=value` hyperparameter values. This flag + is used to override hyperparameter settings either when manually selecting + hyperparameters or when using Vizier. 
If a hyperparameter setting is
+    specified by this flag then it must be a valid hyperparameter name for
+    the model.""")
+flags.DEFINE_string("problems", "", "Dash-separated list of problems to "
+                    "solve.")
+flags.DEFINE_string("data_dir", "/tmp/data", "Directory with training data.")
+flags.DEFINE_string("worker_job", "/job:worker", "name of worker job")
+flags.DEFINE_integer("worker_gpu", 1, "How many GPUs to use.")
+flags.DEFINE_integer("worker_replicas", 1, "How many workers to use.")
+flags.DEFINE_integer("worker_id", 0, "Which worker task are we.")
+flags.DEFINE_integer("ps_gpu", 0, "How many GPUs to use per ps.")
+flags.DEFINE_string("gpu_order", "", "Optional order for daisy-chaining gpus."
+                    " e.g. \"1 3 2 4\"")
+flags.DEFINE_string("ps_job", "/job:ps", "name of ps job")
+flags.DEFINE_integer("ps_replicas", 0, "How many ps replicas.")
+flags.DEFINE_bool("experimental_optimize_placement", False,
+                  "Optimize ops placement with experimental session options.")
+flags.DEFINE_bool("sync", False, "Sync compute on PS.")
+flags.DEFINE_bool("infer_use_last_position_only", False,
+                  "In inference, use last position only for speedup.")
+flags.DEFINE_integer("train_steps", 250000,
+                     "The number of steps to run training for.")
+flags.DEFINE_integer("eval_steps", 10, "Number of steps in evaluation.")
+flags.DEFINE_integer("keep_checkpoint_max", 20,
+                     "How many recent checkpoints to keep.")
+flags.DEFINE_bool("interactive", False, "Interactive local inference mode.")
+flags.DEFINE_bool("endless_dec", False, "Run decoding endlessly. Temporary.")
+flags.DEFINE_bool("save_images", False, "Save inference input images.")
+flags.DEFINE_string("decode_from_file", None, "Path to decode file")
+flags.DEFINE_string("decode_to_file", None, "Path to inference output file")
+flags.DEFINE_integer("decode_shards", 1, "How many shards to decode.")
+flags.DEFINE_integer("decode_problem_id", 0, "Which problem to decode.")
+flags.DEFINE_integer("decode_extra_length", 50, "Added decode length.")
+flags.DEFINE_integer("decode_batch_size", 32, "Batch size for decoding. "
+                     "The decodes will be written to <filename>.decodes in "
+                     "format result\tinput")
+flags.DEFINE_integer("beam_size", 4, "The beam size for beam decoding")
+flags.DEFINE_float("alpha", 0.6, "Alpha for length penalty")
+flags.DEFINE_bool("return_beams", False,
+                  "Whether to return 1 (False) or all (True) beams. The \n "
+                  "output file will have the format "
+                  "<beam1_text>\t<beam2_text>..\t<beam_n_text>")
+flags.DEFINE_bool("daisy_chain_variables", True,
+                  "copy variables around in a daisy chain")
+
+
+def make_experiment_fn(data_dir, model_name, train_steps, eval_steps):
+  """Returns experiment_fn for learn_runner. Wraps create_experiment."""
+
+  def experiment_fn(output_dir):
+    return create_experiment(
+        output_dir=output_dir,
+        data_dir=data_dir,
+        model_name=model_name,
+        train_steps=train_steps,
+        eval_steps=eval_steps)
+
+  return experiment_fn
+
+
+def create_experiment(output_dir, data_dir, model_name, train_steps,
+                      eval_steps):
+  hparams = create_hparams(FLAGS.hparams_set, FLAGS.data_dir)
+  estimator, input_fns = create_experiment_components(
+      hparams=hparams,
+      output_dir=output_dir,
+      data_dir=data_dir,
+      model_name=model_name)
+  return tf.contrib.learn.Experiment(
+      estimator=estimator,
+      train_input_fn=input_fns["train"],
+      eval_input_fn=input_fns["eval"],
+      eval_metrics=metrics.create_evaluation_metrics(FLAGS.problems.split("-")),
+      train_steps=train_steps,
+      eval_steps=eval_steps,
+      train_monitors=[])
+
+
+def create_experiment_components(hparams, output_dir, data_dir, model_name):
+  """Constructs and returns Estimator and train/eval input functions."""
+  hparams.problems = [
+      problem_hparams.problem_hparams(problem, hparams)
+      for problem in FLAGS.problems.split("-")
+  ]
+
+  num_datashards = data_parallelism().n
+
+  tf.logging.info("Creating experiment, storing model files in %s", output_dir)
+
+  train_problems_data = get_datasets_for_mode(data_dir,
+                                              tf.contrib.learn.ModeKeys.TRAIN)
+  train_input_fn = get_input_fn(
+      mode=tf.contrib.learn.ModeKeys.TRAIN,
+      hparams=hparams,
+      data_file_patterns=train_problems_data,
+      num_datashards=num_datashards)
+
+  eval_problems_data = get_datasets_for_mode(data_dir,
+                                             tf.contrib.learn.ModeKeys.EVAL)
+  eval_input_fn = get_input_fn(
+      mode=tf.contrib.learn.ModeKeys.EVAL,
+      hparams=hparams,
+      data_file_patterns=eval_problems_data,
+      num_datashards=num_datashards)
+  estimator = tf.contrib.learn.Estimator(
+      model_fn=model_builder(model_name, hparams=hparams),
+      model_dir=output_dir,
+      config=tf.contrib.learn.RunConfig(
+          master=FLAGS.master,
+          model_dir=output_dir,
+          session_config=session_config(),
+          keep_checkpoint_max=20))
+  return estimator, {"train": train_input_fn, "eval": eval_input_fn}
+
+
+def log_registry():
+  tf.logging.info(registry.help_string())
+  if FLAGS.registry_help:
+    sys.exit(0)
+
+
+def create_hparams(params_id, data_dir):
+  """Returns hyperparameters, including any flag value overrides.
+
+  If the hparams FLAG is set, then it will use any values specified in
+  hparams to override any individually-set hyperparameter. This logic
+  allows tuners to override hyperparameter settings to find optimal values.
+
+  Args:
+    params_id: which set of parameters to choose (must be registered).
+    data_dir: the directory containing the training data.
+
+  Returns:
+    The hyperparameters as a tf.contrib.training.HParams object.
+  """
+  hparams = registry.hparams(params_id)()
+  hparams.add_hparam("data_dir", data_dir)
+  # Command line flags override any of the preceding hyperparameter values.
+  if FLAGS.hparams:
+    hparams = hparams.parse(FLAGS.hparams)
+  return hparams
+
+
+def run(data_dir, model, output_dir, train_steps, eval_steps, schedule):
+  """Runs an Estimator locally or distributed.
+
+  This function chooses one of two paths to execute:
+
+  1. Running locally if schedule=="local_run".
+  2. Distributed training/evaluation otherwise.
+
+  Args:
+    data_dir: The directory the data can be found in.
+    model: The name of the model to use.
+    output_dir: The directory to store outputs in.
+    train_steps: The number of steps to run training for.
+    eval_steps: The number of steps to run evaluation for.
+    schedule: (str) The schedule to run.
The value here must + be the name of one of Experiment's methods. + """ + if schedule == "local_run": + # Run the local demo. + run_locally( + data_dir=data_dir, + model=model, + output_dir=output_dir, + train_steps=train_steps, + eval_steps=eval_steps) + else: + # Perform distributed training/evaluation. + learn_runner.run( + experiment_fn=make_experiment_fn( + data_dir=data_dir, + model_name=model, + train_steps=train_steps, + eval_steps=eval_steps), + schedule=schedule, + output_dir=FLAGS.output_dir) + + +def validate_flags(): + if not FLAGS.model: + raise ValueError("Must specify a model with --model.") + if not FLAGS.problems: + raise ValueError("Must specify a set of problems with --problems.") + if not (FLAGS.hparams_set or FLAGS.hparams_range): + raise ValueError("Must specify either --hparams_set or --hparams_range.") + if not FLAGS.schedule: + raise ValueError("Must specify --schedule.") + if not FLAGS.output_dir: + FLAGS.output_dir = "/tmp/tensor2tensor" + tf.logging.warning("It is strongly recommended to specify --output_dir. " + "Using default output_dir=%s.", FLAGS.output_dir) + + +def session_config(): + """The TensorFlow Session config to use.""" + graph_options = tf.GraphOptions(optimizer_options=tf.OptimizerOptions( + opt_level=tf.OptimizerOptions.L1, do_function_inlining=False)) + if FLAGS.experimental_optimize_placement: + rewrite_options = tf.RewriterConfig(optimize_tensor_layout=True) + rewrite_options.optimizers.append("pruning") + rewrite_options.optimizers.append("constfold") + rewrite_options.optimizers.append("layout") + graph_options = tf.GraphOptions( + rewrite_options=rewrite_options, infer_shapes=True) + config = tf.ConfigProto( + allow_soft_placement=True, graph_options=graph_options) + + return config + + +def model_builder(model, hparams): + """Returns a function to build the model. + + Args: + model: The name of the model to use. + hparams: The hyperparameters. + + Returns: + A function to build the model's graph. This function is called by + the Estimator object to construct the graph. 
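+
+  For example, mirroring create_experiment_components elsewhere in this file
+  (the model name "my_model" is a stand-in):
+
+    estimator = tf.contrib.learn.Estimator(
+        model_fn=model_builder("my_model", hparams=hparams),
+        model_dir=output_dir)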
+ """ + + def initializer(): + if hparams.initializer == "orthogonal": + return tf.orthogonal_initializer(gain=hparams.initializer_gain) + elif hparams.initializer == "uniform": + max_val = 0.1 * hparams.initializer_gain + return tf.random_uniform_initializer(-max_val, max_val) + elif hparams.initializer == "normal_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="normal") + elif hparams.initializer == "uniform_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="uniform") + else: + raise ValueError("Unrecognized initializer: %s" % hparams.initializer) + + def learning_rate_decay(): + """Inverse-decay learning rate until warmup_steps, then decay.""" + warmup_steps = tf.to_float( + hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) + step = tf.to_float(tf.contrib.framework.get_global_step()) + if hparams.learning_rate_decay_scheme == "noam": + return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( + (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) + elif hparams.learning_rate_decay_scheme == "exp100k": + return 0.94**(step // 100000) + + inv_base = tf.exp(tf.log(0.01) / warmup_steps) + inv_decay = inv_base**(warmup_steps - step) + if hparams.learning_rate_decay_scheme == "sqrt": + decay = _sqrt_decay(step - warmup_steps) + elif hparams.learning_rate_decay_scheme == "exp10k": + decay = _exp_decay_after(step - warmup_steps, 0.9995, + FLAGS.train_steps - warmup_steps - 10000) + elif hparams.learning_rate_decay_scheme == "exp50k": + decay = _exp_decay_after(step - warmup_steps, 0.99995, + FLAGS.train_steps - warmup_steps - 50000) + elif hparams.learning_rate_decay_scheme == "exp500k": + decay = _exp_decay_after(step - warmup_steps, 0.9999955, + FLAGS.train_steps - warmup_steps - 500000) + elif hparams.learning_rate_decay_scheme == "none": + decay = tf.constant(1.0) + else: + raise ValueError("Unrecognized learning rate decay scheme: %s" % + hparams.learning_rate_decay_scheme) + return tf.cond( + step < warmup_steps, + lambda: inv_decay, + lambda: decay, + name="learning_rate_decay_warump_cond") + + def model_fn(features, targets, mode): + """Creates the prediction, loss, and train ops. + + Args: + features: A dictionary of tensors keyed by the feature name. + targets: A tensor representing the labels (targets). + mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + + Returns: + A tuple consisting of the prediction, loss, and train_op. + """ + if mode == tf.contrib.learn.ModeKeys.INFER and FLAGS.interactive: + features = _interactive_input_tensor_to_features_dict(features, hparams) + if mode == tf.contrib.learn.ModeKeys.INFER and FLAGS.decode_from_file: + features = _decode_input_tensor_to_features_dict(features, hparams) + # A dictionary containing: + # - problem_choice: A Tensor containing an integer indicating which problem + # was selected for this run. + # - predictions: A Tensor containing the model's output predictions. + run_info = dict() + run_info["problem_choice"] = features["problem_choice"] + + if targets is not None: + features["targets"] = targets + + dp = data_parallelism() + + # Add input statistics for incoming features. 
+    with tf.name_scope("input_stats"):
+      for (k, v) in six.iteritems(features):
+        if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
+          tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
+          tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
+          nonpadding = tf.to_float(tf.not_equal(v, 0))
+          tf.summary.scalar("%s_nonpadding_tokens" % k,
+                            tf.reduce_sum(nonpadding))
+          tf.summary.scalar("%s_nonpadding_fraction" % k,
+                            tf.reduce_mean(nonpadding))
+
+    tf.get_variable_scope().set_initializer(initializer())
+    train = mode == tf.contrib.learn.ModeKeys.TRAIN
+
+    # Get multi-problem logits and loss based on features["problem_choice"].
+    def nth_model(n):
+      """Build the model for the n-th problem, plus some added variables."""
+      model_class = registry.model(model)(
+          hparams, hparams.problems[n], n, dp, _ps_devices(all_workers=True))
+      if mode == tf.contrib.learn.ModeKeys.INFER:
+        return model_class.infer(
+            features,
+            beam_size=FLAGS.beam_size,
+            top_beams=FLAGS.beam_size if FLAGS.return_beams else 1,
+            last_position_only=FLAGS.infer_use_last_position_only,
+            alpha=FLAGS.alpha,
+            decode_length=FLAGS.decode_extra_length)
+      # In distributed mode, we build graph for problem=0 and problem=worker_id.
+      skipping_is_on = hparams.problem_choice == "distributed" and train
+      problem_worker_id = FLAGS.worker_id % len(hparams.problems)
+      skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id
+      # On worker 0 also build graph for problems <= 1.
+      # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
+      skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1)
+      sharded_logits, training_loss, extra_loss = model_class.model_fn(
+          features, train, skip=(skipping_is_on and skip_this_one))
+      with tf.variable_scope("losses_avg", reuse=True):
+        loss_moving_avg = tf.get_variable("problem_%d/training_loss" % n)
+        o1 = loss_moving_avg.assign(loss_moving_avg * 0.9 + training_loss * 0.1)
+        loss_moving_avg = tf.get_variable("problem_%d/extra_loss" % n)
+        o2 = loss_moving_avg.assign(loss_moving_avg * 0.9 + extra_loss * 0.1)
+        loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
+        total_loss = training_loss + extra_loss
+        o3 = loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1)
+      with tf.variable_scope("train_stats"):  # Count steps for this problem.
+        problem_steps = tf.get_variable(
+            "problem_%d_steps" % n, initializer=0, trainable=False)
+        o4 = problem_steps.assign_add(1)
+      with tf.control_dependencies([o1, o2, o3, o4]):  # Make sure the ops run.
+        total_loss = tf.identity(total_loss)
+      return [total_loss] + sharded_logits  # Need to flatten for cond later.
+
+    result_list = _cond_on_index(nth_model, features["problem_choice"], 0,
+                                 len(hparams.problems) - 1)
+
+    if mode == tf.contrib.learn.ModeKeys.INFER:
+      # Beam search in the sequence model returns both decodes with the key
+      # "outputs" and scores with the key "scores". If the return value is a
+      # dict, we expect that it will have keys "outputs", a tensor of int32,
+      # and "scores", a tensor of floats. 
This is useful if we want to return scores from + # estimator.predict + if not isinstance(result_list, dict): + ret = {"outputs": result_list}, None, None + else: + ret = { + "outputs": result_list["outputs"], + "scores": result_list["scores"] + }, None, None + if "inputs" in features: + ret[0]["inputs"] = features["inputs"] + if "infer_targets" in features: + ret[0]["targets"] = features["infer_targets"] + return ret + + sharded_logits, total_loss = result_list[1:], result_list[0] + if mode == tf.contrib.learn.ModeKeys.EVAL: + logits = tf.concat(sharded_logits, 0) + # For evaluation, return the logits layer as our predictions. + run_info["predictions"] = logits + train_op = None + return run_info, total_loss, None + + assert mode == tf.contrib.learn.ModeKeys.TRAIN + + # Some training statistics. + with tf.name_scope("training_stats"): + learning_rate = hparams.learning_rate * learning_rate_decay() + learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) + tf.summary.scalar("learning_rate", learning_rate) + global_step = tf.to_float(tf.contrib.framework.get_global_step()) + for n in xrange(len(hparams.problems)): + with tf.variable_scope("losses_avg", reuse=True): + total_loss_var = tf.get_variable("problem_%d/total_loss" % n) + training_loss_var = tf.get_variable("problem_%d/training_loss" % n) + extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) + tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) + tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) + tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) + with tf.variable_scope("train_stats", reuse=True): + nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) + tf.summary.scalar("problem_%d_frequency" % n, + tf.to_float(nth_steps) / (global_step + 1.0)) + + # Log trainable weights and add decay. + total_size, total_embedding, weight_decay_loss = 0, 0, 0.0 + all_weights = {v.name: v for v in tf.trainable_variables()} + for v_name in sorted(list(all_weights)): + v = all_weights[v_name] + v_size = int(np.prod(np.array(v.shape.as_list()))) + tf.logging.info("Weight %s\tshape %s\tsize %d", + v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) + if "embedding" in v_name: + total_embedding += v_size + total_size += v_size + if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1: + # Add weight regularization if set and the weight is not a bias (dim>1). + with tf.device(v._ref().device): # pylint: disable=protected-access + v_loss = tf.nn.l2_loss(v) / v_size + weight_decay_loss += v_loss + is_body = len(v_name) > 5 and v_name[:5] == "body/" + if hparams.weight_noise > 0.0 and is_body: + # Add weight noise if set in hparams. + with tf.device(v._ref().device): # pylint: disable=protected-access + scale = learning_rate * 0.001 + noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale + noise_op = v.assign_add(noise) + with tf.control_dependencies([noise_op]): + total_loss = tf.identity(total_loss) + tf.logging.info("Total trainable variables size: %d", total_size) + tf.logging.info("Total embedding variables size: %d", total_embedding) + tf.logging.info("Total non-embedding variables size: %d", + total_size - total_embedding) + total_loss += weight_decay_loss * hparams.weight_decay + + # Define the train_op for the TRAIN mode. 
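+    # Gradients are computed once for the whole multi-problem graph, clipped
+    # to hparams.clip_grad_norm, and colocated with their forward ops.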
+    opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams)
+    tf.logging.info("Computing gradients for global model_fn.")
+    train_op = tf.contrib.layers.optimize_loss(
+        name="training",
+        loss=total_loss,
+        global_step=tf.contrib.framework.get_global_step(),
+        learning_rate=learning_rate,
+        clip_gradients=hparams.clip_grad_norm or None,
+        optimizer=opt,
+        colocate_gradients_with_ops=True)
+
+    tf.logging.info("Global model_fn finished.")
+    return run_info, total_loss, train_op
+
+  return model_fn
+
+
+def run_locally(data_dir, model, output_dir, train_steps, eval_steps):
+  """Runs an Estimator locally.
+
+  This function demonstrates model training, evaluation, and inference locally.
+
+  Args:
+    data_dir: The directory the data can be found in.
+    model: The name of the model to use.
+    output_dir: The directory to store outputs in.
+    train_steps: The number of steps to run training for.
+    eval_steps: The number of steps to run evaluation for.
+  """
+  train_problems_data = get_datasets_for_mode(data_dir,
+                                              tf.contrib.learn.ModeKeys.TRAIN)
+
+  # For a local run, we can train, evaluate, and predict.
+  hparams = create_hparams(FLAGS.hparams_set, FLAGS.data_dir)
+  hparams.problems = [
+      problem_hparams.problem_hparams(problem, hparams)
+      for problem in FLAGS.problems.split("-")
+  ]
+
+  estimator = tf.contrib.learn.Estimator(
+      model_fn=model_builder(model, hparams=hparams),
+      model_dir=output_dir,
+      config=tf.contrib.learn.RunConfig(
+          session_config=session_config(),
+          keep_checkpoint_max=FLAGS.keep_checkpoint_max))
+
+  num_datashards = data_parallelism().n
+
+  if train_steps > 0:
+    # Train.
+    tf.logging.info("Performing local training.")
+    estimator.fit(
+        input_fn=get_input_fn(
+            mode=tf.contrib.learn.ModeKeys.TRAIN,
+            hparams=hparams,
+            data_file_patterns=train_problems_data,
+            num_datashards=num_datashards),
+        steps=train_steps,
+        monitors=[])
+
+  if eval_steps > 0:
+    # Evaluate.
+    tf.logging.info("Performing local evaluation.")
+    eval_problems_data = get_datasets_for_mode(data_dir,
+                                               tf.contrib.learn.ModeKeys.EVAL)
+    eval_input_fn = get_input_fn(
+        mode=tf.contrib.learn.ModeKeys.EVAL,
+        hparams=hparams,
+        data_file_patterns=eval_problems_data,
+        num_datashards=num_datashards)
+    unused_metrics = estimator.evaluate(
+        input_fn=eval_input_fn,
+        steps=eval_steps,
+        metrics=metrics.create_evaluation_metrics(FLAGS.problems.split("-")))
+
+  # Predict.
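+  # Inference runs in one of three modes below: an interactive read-eval
+  # loop, decoding a file of inputs, or decoding the dataset for each problem.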
+  if FLAGS.interactive:
+    infer_input_fn = _interactive_input_fn(hparams)
+    for problem_idx, example in infer_input_fn:
+      targets_vocab = hparams.problems[problem_idx].vocabulary["targets"]
+      result_iter = estimator.predict(input_fn=lambda e=example: e)
+      for result in result_iter:
+        if FLAGS.return_beams:
+          beams = np.split(result["outputs"], FLAGS.beam_size, axis=0)
+          scores = None
+          if "scores" in result:
+            scores = np.split(result["scores"], FLAGS.beam_size, axis=0)
+          for k, beam in enumerate(beams):
+            tf.logging.info("BEAM %d:" % k)
+            if scores is not None:
+              tf.logging.info("%s\tScore:%f" %
+                              (targets_vocab.decode(beam.flatten()), scores[k]))
+            else:
+              tf.logging.info(targets_vocab.decode(beam.flatten()))
+        else:
+          tf.logging.info(targets_vocab.decode(result["outputs"].flatten()))
+  # Predict from file
+  elif FLAGS.decode_from_file is not None:
+    problem_id = FLAGS.decode_problem_id
+    inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"]
+    targets_vocab = hparams.problems[problem_id].vocabulary["targets"]
+    tf.logging.info("Performing decoding from a file.")
+    sorted_inputs, sorted_keys = _get_sorted_inputs()
+    num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1
+    input_fn = _decode_batch_input_fn(problem_id, num_decode_batches,
+                                      sorted_inputs, inputs_vocab)
+
+    # Strips everything after the first <EOS> id, which is assumed to be 1.
+    def _save_until_eos(hyp):  # pylint: disable=missing-docstring
+      ret = []
+      index = 0
+      # Until you reach the <EOS> id.
+      while index < len(hyp) and hyp[index] != 1:
+        ret.append(hyp[index])
+        index += 1
+      return np.array(ret)
+
+    decodes = []
+    for _ in range(num_decode_batches):
+      result_iter = estimator.predict(input_fn=input_fn.next, as_iterable=True)
+      for result in result_iter:
+
+        def log_fn(inputs, outputs):
+          decoded_inputs = inputs_vocab.decode(
+              _save_until_eos(inputs.flatten()))
+          tf.logging.info("Inference results INPUT: %s" % decoded_inputs)
+
+          decoded_outputs = targets_vocab.decode(
+              _save_until_eos(outputs.flatten()))
+          tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs)
+          return decoded_outputs
+
+        if FLAGS.return_beams:
+          beam_decodes = []
+          output_beams = np.split(result["outputs"], FLAGS.beam_size, axis=0)
+          for k, beam in enumerate(output_beams):
+            tf.logging.info("BEAM %d:" % k)
+            beam_decodes.append(log_fn(result["inputs"], beam))
+          decodes.append(str.join("\t", beam_decodes))
+
+        else:
+          decodes.append(log_fn(result["inputs"], result["outputs"]))
+
+    # Reversing the decoded inputs and outputs because they were reversed in
+    # _decode_batch_input_fn
+    sorted_inputs.reverse()
+    decodes.reverse()
+    # Dumping inputs and outputs to file FLAGS.decode_from_file.decodes in
+    # format result\tinput in the same order as original inputs
+    if FLAGS.decode_shards > 1:
+      base_filename = FLAGS.decode_from_file + ("%.2d" % FLAGS.worker_id)
+    else:
+      base_filename = FLAGS.decode_from_file
+    decode_filename = (
+        base_filename + "." + FLAGS.model + "." + FLAGS.hparams_set + ".beam" +
+        str(FLAGS.beam_size) + ".alpha" + str(FLAGS.alpha) + ".decodes")
+    tf.logging.info("Writing decodes into %s" % decode_filename)
+    outfile = tf.gfile.Open(decode_filename, "w")
+    for index in range(len(sorted_inputs)):
+      outfile.write("%s\t%s\n" % (decodes[sorted_keys[index]],
+                                  sorted_inputs[sorted_keys[index]]))
+  else:
+    for i, problem in enumerate(FLAGS.problems.split("-")):
+      inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None)
+      targets_vocab = hparams.problems[i].vocabulary["targets"]
+      tf.logging.info("Performing local inference.")
+      infer_problems_data = get_datasets_for_mode(
+          data_dir, tf.contrib.learn.ModeKeys.INFER)
+      infer_input_fn = get_input_fn(
+          mode=tf.contrib.learn.ModeKeys.INFER,
+          hparams=hparams,
+          data_file_patterns=infer_problems_data,
+          num_datashards=num_datashards,
+          fixed_problem=i)
+      result_iter = estimator.predict(
+          input_fn=infer_input_fn, as_iterable=FLAGS.endless_dec)
+
+      def log_fn(inputs, targets, outputs, problem, j):
+        """Log inference results."""
+        if "image" in problem and FLAGS.save_images:
+          save_path = os.path.join(FLAGS.output_dir,
+                                   "%s_prediction_%d.jpg" % (problem, j))
+          show_and_save_image(inputs / 255., save_path)
+        elif inputs_vocab:
+          decoded_inputs = inputs_vocab.decode(inputs.flatten())
+          tf.logging.info("Inference results INPUT: %s" % decoded_inputs)
+
+        decoded_outputs = targets_vocab.decode(outputs.flatten())
+        decoded_targets = targets_vocab.decode(targets.flatten())
+        tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs)
+        if FLAGS.decode_to_file:
+          output_filepath = FLAGS.decode_to_file + ".outputs." + problem
+          output_file = tf.gfile.Open(output_filepath, "a")
+          output_file.write(decoded_outputs + "\n")
+          target_filepath = FLAGS.decode_to_file + ".targets." + problem
+          target_file = tf.gfile.Open(target_filepath, "a")
+          target_file.write(decoded_targets + "\n")
+
+      # The function predict() returns an iterable over the network's
+      # predictions from the test input. If FLAGS.endless_dec is set, it will
+      # decode over the dev set endlessly, looping over it. We use the returned
+      # iterator to log inputs and decodes.
+
+      # The function predict() returns an iterable over the network's
+      # predictions from the test input. If FLAGS.endless_dec is set, it will
+      # decode over the dev set endlessly, looping over it. We use the
+      # returned iterator to log inputs and decodes.
+      if FLAGS.endless_dec:
+        tf.logging.info("Warning: Decoding endlessly")
+        for j, result in enumerate(result_iter):
+          inputs, targets, outputs = (result["inputs"], result["targets"],
+                                      result["outputs"])
+          if FLAGS.return_beams:
+            output_beams = np.split(outputs, FLAGS.beam_size, axis=0)
+            for k, beam in enumerate(output_beams):
+              tf.logging.info("BEAM %d:" % k)
+              log_fn(inputs, targets, beam, problem, j)
+          else:
+            log_fn(inputs, targets, outputs, problem, j)
+      else:
+        for j, (inputs, targets, outputs) in enumerate(
+            zip(result_iter["inputs"], result_iter["targets"],
+                result_iter["outputs"])):
+          if FLAGS.return_beams:
+            output_beams = np.split(outputs, FLAGS.beam_size, axis=0)
+            for k, beam in enumerate(output_beams):
+              tf.logging.info("BEAM %d:" % k)
+              log_fn(inputs, targets, beam, problem, j)
+          else:
+            log_fn(inputs, targets, outputs, problem, j)
+
+
+def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs,
+                           vocabulary):
+  tf.logging.info("Decoding %d batches" % num_decode_batches)
+  # First reverse all the input sentences so that if you're going to get OOMs,
+  # you'll see it in the first batch.
+  sorted_inputs.reverse()
+  for b in range(num_decode_batches):
+    tf.logging.info("Decoding batch %d" % b)
+    batch_length = 0
+    batch_inputs = []
+    for inputs in sorted_inputs[b * FLAGS.decode_batch_size:(
+        b + 1) * FLAGS.decode_batch_size]:
+      input_ids = vocabulary.encode(inputs)
+      input_ids.append(1)  # Assuming EOS=1.
+      batch_inputs.append(input_ids)
+      if len(input_ids) > batch_length:
+        batch_length = len(input_ids)
+    final_batch_inputs = []
+    for input_ids in batch_inputs:
+      assert len(input_ids) <= batch_length
+      x = input_ids + [0] * (batch_length - len(input_ids))
+      final_batch_inputs.append(x)
+    yield {
+        "inputs": np.array(final_batch_inputs),
+        "problem_choice": np.array(problem_id)
+    }
+
+
+def get_datasets_for_mode(data_dir, mode):
+  return data_reader.get_datasets(FLAGS.problems, data_dir, mode)
+
+
+def _cond_on_index(fn, index_tensor, cur_idx, max_idx):
+  """Call fn(index_tensor) using tf.cond for indices in [cur_idx, max_idx]."""
+  if cur_idx == max_idx:
+    return fn(cur_idx)
+  return tf.cond(
+      tf.equal(index_tensor, cur_idx), lambda: fn(cur_idx),
+      lambda: _cond_on_index(fn, index_tensor, cur_idx + 1, max_idx))
+
+
+def _interactive_input_fn(hparams):
+  """Generator that reads from the terminal and yields "interactive inputs".
+
+  Due to temporary limitations in tf.learn, if we don't want to reload the
+  whole graph, then we are stuck encoding all of the input as one fixed-size
+  numpy array.
+
+  We yield int64 arrays with shape [const_array_size]. The format is:
+  [num_samples, decode_length, len(input ids), <input ids>, <padding zeros>]
+
+  Args:
+    hparams: model hparams
+  Yields:
+    numpy arrays
+
+  Raises:
+    Exception: when `input_type` is invalid.
+  """
+  num_samples = 3
+  decode_length = 100
+  input_type = "text"
+  problem_id = 0
+  p_hparams = hparams.problems[problem_id]
+  has_input = "inputs" in p_hparams.input_modality
+  vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"]
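+  # For example (illustrative ids): encoding "hello" to [42, 43] with
+  # num_samples=3 and decode_length=100 (and EOS appended as id 1) yields
+  # [3, 100, 3, 42, 43, 1, 0, 0, ...], zero-padded up to const_array_size.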
+  # This should be longer than the longest input.
+  const_array_size = 10000
+  while True:
+    prompt = ("INTERACTIVE MODE  num_samples=%d  decode_length=%d\n"
+              "  it=<input_type>     ('text' or 'image')\n"
+              "  pr=<problem_num>    (set the problem number)\n"
+              "  in=<input_problem>  (set the input problem number)\n"
+              "  ou=<output_problem> (set the output problem number)\n"
+              "  ns=<num_samples>    (changes number of samples)\n"
+              "  dl=<decode_length>  (changes decode length)\n"
+              "  <%s>                (decode)\n"
+              "  q                   (quit)\n"
+              ">" % (num_samples, decode_length,
+                     "source_string" if has_input else "target_prefix"))
+    input_string = input(prompt)
+    if input_string == "q":
+      return
+    elif input_string[:3] == "pr=":
+      problem_id = int(input_string[3:])
+      p_hparams = hparams.problems[problem_id]
+      has_input = "inputs" in p_hparams.input_modality
+      vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"]
+    elif input_string[:3] == "in=":
+      problem = int(input_string[3:])
+      p_hparams.input_modality = hparams.problems[problem].input_modality
+      p_hparams.input_space_id = hparams.problems[problem].input_space_id
+    elif input_string[:3] == "ou=":
+      problem = int(input_string[3:])
+      p_hparams.target_modality = hparams.problems[problem].target_modality
+      p_hparams.target_space_id = hparams.problems[problem].target_space_id
+    elif input_string[:3] == "ns=":
+      num_samples = int(input_string[3:])
+    elif input_string[:3] == "dl=":
+      decode_length = int(input_string[3:])
+    elif input_string[:3] == "it=":
+      input_type = input_string[3:]
+    else:
+      if input_type == "text":
+        input_ids = vocabulary.encode(input_string)
+        if has_input:
+          input_ids.append(1)  # Assume 1 means end-of-source.
+        x = [num_samples, decode_length, len(input_ids)] + input_ids
+        assert len(x) < const_array_size
+        x += [0] * (const_array_size - len(x))
+        yield problem_id, {
+            "inputs": np.array(x),
+            "problem_choice": np.array(problem_id)
+        }
+      elif input_type == "image":
+        input_path = input_string
+        img = read_image(input_path)
+        yield problem_id, {
+            "inputs": img,
+            "problem_choice": np.array(problem_id)
+        }
+      else:
+        raise Exception("Unsupported input type.")
+
+
+def read_image(path):
+  try:
+    import matplotlib.image as im  # pylint: disable=g-import-not-at-top
+  except ImportError as e:
+    tf.logging.warning(
+        "Reading an image requires matplotlib to be installed: %s", e)
+    raise NotImplementedError("Image reading not implemented.")
+  return im.imread(path)
+
+
+def show_and_save_image(img, save_path):
+  try:
+    import matplotlib.pyplot as plt  # pylint: disable=g-import-not-at-top
+  except ImportError as e:
+    tf.logging.warning("Showing and saving an image requires matplotlib to be "
+                       "installed: %s", e)
+    raise NotImplementedError("Image display and save not implemented.")
+  plt.imshow(img)
+  plt.savefig(save_path)
+
+
+def _get_sorted_inputs():
+  """Returns inputs sorted according to length.
+
+  Returns:
+    a list of inputs sorted by length, and a dict mapping each input's
+    original index to its position in the sorted list
+  """
+  tf.logging.info("Getting sorted inputs")
+  # Read the file and sort the inputs by token length.
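+  # For example (illustrative): inputs ["a b c", "a", "a b"] sort to
+  # ["a", "a b", "a b c"] with sorted_keys == {0: 2, 1: 0, 2: 1}, so the
+  # decode for original input i is later found at position sorted_keys[i].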
+  if FLAGS.decode_shards > 1:
+    decode_filename = FLAGS.decode_from_file + ("%.2d" % FLAGS.worker_id)
+  else:
+    decode_filename = FLAGS.decode_from_file
+  inputs = [line.strip() for line in tf.gfile.Open(decode_filename)]
+  input_lens = [(i, len(line.strip().split())) for i, line in enumerate(inputs)]
+  sorted_input_lens = sorted(input_lens, key=operator.itemgetter(1))
+  # We'll need the keys to rearrange the inputs back into their original order.
+  sorted_keys = {}
+  sorted_inputs = []
+  for i, (index, _) in enumerate(sorted_input_lens):
+    sorted_inputs.append(inputs[index])
+    sorted_keys[index] = i
+  return sorted_inputs, sorted_keys
+
+
+def _interactive_input_tensor_to_features_dict(feature_map, hparams):
+  """Convert the interactive input format (see above) to a dictionary.
+
+  Args:
+    feature_map: a dictionary with keys `problem_choice` and `inputs`
+      containing Tensors.
+    hparams: model hyperparameters
+
+  Returns:
+    a features dictionary, as expected by the decoder.
+  """
+  inputs = tf.constant(feature_map["inputs"])
+  input_is_image = len(inputs.shape) >= 3
+
+  def input_fn(problem_choice, x=inputs):  # pylint: disable=missing-docstring
+    p_hparams = hparams.problems[problem_choice]
+    if not input_is_image:
+      # Remove the batch dimension.
+      num_samples = x[0]
+      length = x[2]
+      x = tf.slice(x, [3], tf.to_int32([length]))
+      x = tf.reshape(x, [1, -1, 1, 1])
+      # Transform into a batch of size num_samples to get that many random
+      # decodes.
+      x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1]))
+    else:
+      x = tf.image.resize_images(x, [299, 299])
+      x = tf.reshape(x, [1, 299, 299, -1])
+      x = tf.to_int32(x)
+    return (tf.constant(p_hparams.input_space_id),
+            tf.constant(p_hparams.target_space_id), x)
+
+  input_space_id, target_space_id, x = _cond_on_index(
+      input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1)
+
+  features = {}
+  features["problem_choice"] = tf.constant(feature_map["problem_choice"])
+  features["input_space_id"] = input_space_id
+  features["target_space_id"] = target_space_id
+  features["decode_length"] = (IMAGE_DECODE_LENGTH
+                               if input_is_image else inputs[1])
+  features["inputs"] = x
+  return features
+
+
+def _decode_input_tensor_to_features_dict(feature_map, hparams):
+  """Convert the file-decode input format to a features dictionary.
+
+  Args:
+    feature_map: a dictionary with keys `problem_choice` and `inputs`
+      containing Tensors.
+    hparams: model hyperparameters
+
+  Returns:
+    a features dictionary, as expected by the decoder.
+  """
+  inputs = tf.constant(feature_map["inputs"])
+  input_is_image = False
+
+  def input_fn(problem_choice, x=inputs):  # pylint: disable=missing-docstring
+    p_hparams = hparams.problems[problem_choice]
+    # Add a third, empty dimension.
+    x = tf.expand_dims(x, axis=2)
+    x = tf.to_int32(x)
+    return (tf.constant(p_hparams.input_space_id),
+            tf.constant(p_hparams.target_space_id), x)
+
+  input_space_id, target_space_id, x = _cond_on_index(
+      input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1)
+
+  features = {}
+  features["problem_choice"] = feature_map["problem_choice"]
+  features["input_space_id"] = input_space_id
+  features["target_space_id"] = target_space_id
+  features["decode_length"] = (IMAGE_DECODE_LENGTH
+                               if input_is_image else tf.shape(x)[1] + 50)
+  features["inputs"] = x
+  return features
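+
+
+# Editor's illustrative sketch (hypothetical helper, not part of the original
+# patch): demonstrates how _cond_on_index, used by both feature-dict builders
+# above, unrolls a scalar tensor index into a chain of tf.cond ops. It is
+# defined for documentation only and never called.
+def _example_cond_on_index():
+  index_tensor = tf.placeholder(tf.int32, [], name="example_index")
+  # Each branch must return tensors with matching dtypes and structure.
+  out = _cond_on_index(lambda i: tf.constant(float(i) * 10.0),
+                       index_tensor, 0, 2)
+  with tf.Session() as sess:
+    # Feeding index 1 selects the middle branch and returns 10.0.
+    return sess.run(out, feed_dict={index_tensor: 1})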
+
+
+def get_input_fn(mode,
+                 hparams,
+                 data_file_patterns=None,
+                 num_datashards=None,
+                 fixed_problem=None):
+  """Provides input to the graph, either from disk or via a placeholder.
+
+  This function produces an input function that will feed data into
+  the network. There are two modes of operation:
+
+  1. If data_file_patterns and all subsequent arguments are None, then
+     it creates a placeholder for a serialized tf.Example proto.
+  2. If data_file_patterns is defined, it will read the data from the
+     files at the given location. Use this mode for training,
+     evaluation, and prediction.
+
+  Args:
+    mode: The execution mode, as defined in tf.contrib.learn.ModeKeys.
+    hparams: HParams object.
+    data_file_patterns: The list of file patterns to use to read in data. Set
+      to `None` if you want to create a placeholder for the input data. The
+      `problems` flag is a list of problem names joined by the `-` character.
+      The flag's string is then split along the `-` and each problem gets its
+      own example queue.
+    num_datashards: An integer.
+    fixed_problem: An integer indicating the problem to fetch data for, or
+      None if the input is to be randomly selected.
+
+  Returns:
+    A function that returns a dictionary of features and the target labels.
+  """
+
+  def input_fn():
+    """Supplies input to our model.
+
+    This function supplies input to our model, where this input is a
+    function of the mode. For example, we supply different data if
+    we're performing training versus evaluation.
+
+    Returns:
+      A tuple consisting of 1) a dictionary of tensors whose keys are
+      the feature names, and 2) a tensor of target labels if the mode
+      is not INFER (and None, otherwise).
+
+    Raises:
+      ValueError: if one of the parameters has an unsupported value.
+    """
+    problem_count, batches = len(data_file_patterns), []
+    with tf.name_scope("input_queues"):
+      for n in xrange(problem_count):
+        if fixed_problem is not None and n != fixed_problem:
+          continue
+        with tf.name_scope("problem_%d" % n):
+          with tf.device("/cpu:0"):  # Input queues are on CPU.
+            capacity = hparams.problems[n].max_expected_batch_size_per_shard
+            capacity *= num_datashards
+            examples = data_reader.input_pipeline(data_file_patterns[n],
+                                                  capacity, mode)
+            drop_long_sequences = mode == tf.contrib.learn.ModeKeys.TRAIN
+            batch_size_multiplier = hparams.problems[n].batch_size_multiplier
+            feature_map = data_reader.batch_examples(
+                examples,
+                data_reader.hparams_to_batching_scheme(
+                    hparams,
+                    shard_multiplier=num_datashards,
+                    drop_long_sequences=drop_long_sequences,
+                    length_multiplier=batch_size_multiplier))
+
+            # Reverse inputs and targets features if the problem was reversed.
+            if hparams.problems[n].was_reversed:
+              inputs = feature_map["inputs"]
+              targets = feature_map["targets"]
+              feature_map["inputs"] = targets
+              feature_map["targets"] = inputs
+
+            # Use the inputs as the targets if the problem is a copy problem.
+            if hparams.problems[n].was_copy:
+              feature_map["targets"] = feature_map["inputs"]
+
+            # Ensure inputs and targets are proper rank.
+            while len(feature_map["inputs"].get_shape()) != 4:
+              feature_map["inputs"] = tf.expand_dims(
+                  feature_map["inputs"], axis=-1)
+            while len(feature_map["targets"].get_shape()) != 4:
+              feature_map["targets"] = tf.expand_dims(
+                  feature_map["targets"], axis=-1)
+
+            batches.append(
+                (feature_map["inputs"], feature_map["targets"], tf.constant(n),
+                 tf.constant(hparams.problems[n].input_space_id),
+                 tf.constant(hparams.problems[n].target_space_id)))
+
+    # We choose which problem to process.
+    loss_moving_avgs = []  # Need loss moving averages for that.
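+    # For example (illustrative): with problems "wmt_ende_tokens_32k-lm1b_32k"
+    # and problem_choice="adaptive", the moving-average losses below act as
+    # logits for tf.multinomial, so the problem with the higher average loss
+    # is sampled more often for the next training batch.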
+    for n in xrange(problem_count):
+      with tf.variable_scope("losses_avg"):
+        loss_moving_avgs.append(
+            tf.get_variable(
+                "problem_%d/total_loss" % n, initializer=100.0,
+                trainable=False))
+        tf.get_variable(
+            "problem_%d/training_loss" % n, initializer=100.0, trainable=False)
+        tf.get_variable(
+            "problem_%d/extra_loss" % n, initializer=100.0, trainable=False)
+    if fixed_problem is None:
+      if (hparams.problem_choice == "uniform" or
+          mode != tf.contrib.learn.ModeKeys.TRAIN):
+        problem_choice = tf.random_uniform(
+            [], maxval=problem_count, dtype=tf.int32)
+      elif hparams.problem_choice == "adaptive":
+        loss_moving_avgs = tf.stack(loss_moving_avgs)
+        problem_choice = tf.multinomial(
+            tf.reshape(loss_moving_avgs, [1, -1]), 1)
+        problem_choice = tf.to_int32(tf.squeeze(problem_choice))
+      elif hparams.problem_choice == "distributed":
+        assert FLAGS.worker_replicas >= problem_count
+        assert FLAGS.worker_replicas % problem_count == 0
+        problem_choice = tf.to_int32(FLAGS.worker_id % problem_count)
+      else:
+        raise ValueError("Value of hparams.problem_choice is %s and must be "
+                         "one of [uniform, adaptive, distributed]" %
+                         hparams.problem_choice)
+
+      # Inputs and targets conditional on problem_choice.
+      rand_inputs, rand_target, choice, inp_id, tgt_id = _cond_on_index(
+          lambda n: batches[n], problem_choice, 0, problem_count - 1)
+    else:
+      problem_choice = tf.constant(fixed_problem)
+      # Take the only constructed batch, which is the fixed_problem.
+      rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0]
+
+    # Set shapes so the ranks are clear.
+    rand_inputs.set_shape([None, None, None, None])
+    rand_target.set_shape([None, None, None, None])
+    choice.set_shape([])
+    inp_id.set_shape([])
+    tgt_id.set_shape([])
+    # Forced shape obfuscation is necessary for inference.
+    if mode == tf.contrib.learn.ModeKeys.INFER:
+      rand_inputs._shape = tf.TensorShape([None, None, None, None])  # pylint: disable=protected-access
+      rand_target._shape = tf.TensorShape([None, None, None, None])  # pylint: disable=protected-access
+
+    # Final feature map.
+    rand_feature_map = {
+        "inputs": rand_inputs,
+        "problem_choice": choice,
+        "input_space_id": inp_id,
+        "target_space_id": tgt_id
+    }
+    if mode == tf.contrib.learn.ModeKeys.INFER:
+      rand_feature_map["infer_targets"] = rand_target
+      rand_target = None
+    return rand_feature_map, rand_target
+
+  return input_fn
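+
+
+# Illustrative usage sketch (editor's addition; `hparams`, `data_dir` and
+# `num_datashards` are assumed to be in scope, as in the surrounding training
+# setup):
+#
+#   train_input_fn = get_input_fn(
+#       mode=tf.contrib.learn.ModeKeys.TRAIN,
+#       hparams=hparams,
+#       data_file_patterns=get_datasets_for_mode(
+#           data_dir, tf.contrib.learn.ModeKeys.TRAIN),
+#       num_datashards=num_datashards)
+#   features, targets = train_input_fn()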
+
+
+class _ConditionalOptimizer(tf.train.Optimizer):
+  """Conditional optimizer."""
+
+  def __init__(self, optimizer_name, lr, hparams, skip_condition_tensor=False):
+    # `skip_condition_tensor` is either False (always apply gradients) or a
+    # boolean scalar tensor; when it evaluates to True, the gradient update
+    # is skipped for that step.
+    self._skip_condition = skip_condition_tensor
+    if optimizer_name == "Adam":
+      # We change the default epsilon for Adam and re-scale lr.
+      # Using LazyAdam as it's much faster for large vocabulary embeddings.
+      self._opt = tf.contrib.opt.LazyAdamOptimizer(
+          lr / 500.0,
+          beta1=hparams.optimizer_adam_beta1,
+          beta2=hparams.optimizer_adam_beta2,
+          epsilon=hparams.optimizer_adam_epsilon)
+    elif optimizer_name == "Momentum":
+      self._opt = tf.train.MomentumOptimizer(
+          lr, momentum=hparams.optimizer_momentum_momentum)
+    else:
+      self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)
+
+  def compute_gradients(self, loss, var_list, colocate_gradients_with_ops):
+    return self._opt.compute_gradients(
+        loss, var_list, colocate_gradients_with_ops=colocate_gradients_with_ops)
+
+  def apply_gradients(self, gradients, global_step=None, name=None):
+
+    def opt_gradients():
+      return self._opt.apply_gradients(
+          gradients, global_step=global_step, name=name)
+
+    # `is False` distinguishes the literal default from a boolean tensor.
+    if self._skip_condition is False:
+      return opt_gradients()
+    return tf.cond(
+        self._skip_condition,
+        tf.no_op,
+        opt_gradients,
+        name="conditional_optimizer_gradients_skip_cond")
+
+
+def _sqrt_decay(step):
+  """Decay like 1 / sqrt(step), multiplied by 500 to normalize."""
+  return 500.0 / tf.sqrt(tf.maximum(step, 1.0))
+
+
+def _exp_decay_after(step, rate, from_which_step):
+  """Decay exponentially by rate (per step) starting at from_which_step."""
+  return tf.cond(
+      step < from_which_step,
+      lambda: tf.constant(1.0),
+      lambda: rate**(step - from_which_step),
+      name="exponential_decay_step_cond")
+
+
+def _ps_replicas(all_workers=False):
+  if all_workers:
+    return list(range(FLAGS.ps_replicas))
+  # Worker K will be using replicas {0,...n-1} + K*n if we have n replicas.
+  num_replicas = FLAGS.ps_replicas // FLAGS.worker_replicas
+  return [d + FLAGS.worker_id * num_replicas for d in xrange(num_replicas)]
+
+
+def _gpu_order(num_gpus):
+  if FLAGS.gpu_order:
+    ret = [int(s) for s in FLAGS.gpu_order.split(" ")]
+    if len(ret) == num_gpus:
+      return ret
+  return list(range(num_gpus))
+
+
+def _ps_gpus(all_workers=False):
+  ps_gpus = []
+  for d in _ps_replicas(all_workers=all_workers):
+    ps_gpus.extend([(d, gpu) for gpu in _gpu_order(FLAGS.ps_gpu)])
+  return ps_gpus
+
+
+def _ps_devices(all_workers=False):
+  """List of ps devices (where to put the experts).
+
+  Args:
+    all_workers: whether the list is for all async workers or just this one.
+
+  Returns:
+    a list of device names
+  """
+  if FLAGS.ps_replicas > 0:
+    if FLAGS.ps_gpu > 0:
+      return [
+          FLAGS.ps_job + "/task:%d/GPU:%d" % (d, gpu)
+          for (d, gpu) in _ps_gpus(all_workers=all_workers)
+      ]
+    else:
+      return [
+          FLAGS.ps_job + "/task:%d" % d
+          for d in _ps_replicas(all_workers=all_workers)
+      ]
+  else:
+    if FLAGS.worker_gpu > 0:
+      return ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)]
+    else:
+      return [""]
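+
+
+# For example (illustrative, assuming FLAGS.ps_job="/job:ps" and an unset
+# FLAGS.gpu_order): with ps_replicas=2 and ps_gpu=2,
+# _ps_devices(all_workers=True) returns
+# ["/job:ps/task:0/GPU:0", "/job:ps/task:0/GPU:1",
+#  "/job:ps/task:1/GPU:0", "/job:ps/task:1/GPU:1"].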
+ """ + + def _replica_device_setter(worker_device): + if FLAGS.ps_replicas == 0: + return worker_device + return tf.train.replica_device_setter( + worker_device=worker_device, + ps_tasks=FLAGS.ps_replicas, + ps_device=FLAGS.ps_job + "/GPU:0" if FLAGS.ps_gpu > 0 else FLAGS.ps_job) + + if FLAGS.schedule == "local_run": + assert not FLAGS.sync + datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] + caching_devices = None + elif FLAGS.sync: + assert FLAGS.ps_replicas > 0 + datashard_devices = [ + _replica_device_setter(d) for d in _ps_devices(all_workers=all_workers) + ] + if FLAGS.ps_gpu > 0 and FLAGS.ps_replicas > 1: + caching_devices = [ + FLAGS.ps_job + "/task:%d/cpu:0" % d + for (d, _) in _ps_gpus(all_workers=all_workers) + ] + else: + caching_devices = None + else: + # old fashioned async - compute on worker + if FLAGS.worker_gpu > 1: + datashard_devices = [ + _replica_device_setter(FLAGS.worker_job + "/GPU:%d" % d) + for d in _gpu_order(FLAGS.worker_gpu) + ] + caching_devices = [FLAGS.worker_job + "/GPU:0"] * FLAGS.worker_gpu + else: + datashard_devices = [_replica_device_setter(FLAGS.worker_job)] + caching_devices = None + tf.logging.info("datashard_devices: %s", datashard_devices) + tf.logging.info("caching_devices: %s", caching_devices) + return eu.Parallelism( + datashard_devices, + reuse=True, + caching_devices=caching_devices, + daisy_chain_variables=FLAGS.daisy_chain_variables) diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py new file mode 100644 index 000000000..4e0807d4e --- /dev/null +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -0,0 +1,41 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for trainer_utils.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.utils import registry +from tensor2tensor.utils import trainer_utils as utils # pylint: disable=unused-import + +import tensorflow as tf + + +class TrainerUtilsTest(tf.test.TestCase): + + def testModelsImported(self): + models = registry.list_models() + self.assertTrue("baseline_lstm_seq2seq" in models) + + def testHParamsImported(self): + hparams = registry.list_hparams() + self.assertTrue("transformer_base" in hparams) + + +if __name__ == "__main__": + tf.test.main()