From b4d475bf0966ac148f61d7f42bd9b46155bb04f6 Mon Sep 17 00:00:00 2001 From: "Joshua V. Dillon" Date: Sun, 12 Feb 2017 16:22:57 -0800 Subject: [PATCH] Misc cleanups for style and consistency. Change: 147297316 --- .../dirichlet_multinomial_test.py | 7 +- .../python/kernel_tests/multinomial_test.py | 7 +- .../kernel_tests/vector_student_t_test.py | 14 +- .../distributions/python/ops/bernoulli.py | 10 +- .../contrib/distributions/python/ops/beta.py | 10 +- .../distributions/python/ops/bijector.py | 188 +++++++++--------- .../distributions/python/ops/binomial.py | 12 +- .../distributions/python/ops/categorical.py | 15 +- .../contrib/distributions/python/ops/chi2.py | 10 +- .../distributions/python/ops/dirichlet.py | 22 +- .../python/ops/dirichlet_multinomial.py | 28 +-- .../distributions/python/ops/distribution.py | 90 +++++---- .../python/ops/distribution_util.py | 79 ++++---- .../distributions/python/ops/exponential.py | 14 +- .../contrib/distributions/python/ops/gamma.py | 10 +- .../distributions/python/ops/gumbel.py | 12 +- .../distributions/python/ops/inverse_gamma.py | 10 +- .../python/ops/kullback_leibler.py | 6 +- .../distributions/python/ops/laplace.py | 12 +- .../distributions/python/ops/logistic.py | 10 +- .../distributions/python/ops/mixture.py | 20 +- .../distributions/python/ops/multinomial.py | 25 ++- .../distributions/python/ops/mvn_diag.py | 6 +- .../python/ops/mvn_diag_plus_low_rank.py | 8 +- .../python/ops/mvn_linear_operator.py | 8 +- .../distributions/python/ops/mvn_tril.py | 6 +- .../distributions/python/ops/normal.py | 12 +- .../python/ops/normal_conjugate_posteriors.py | 34 ++-- .../python/ops/onehot_categorical.py | 20 +- .../distributions/python/ops/poisson.py | 8 +- .../python/ops/quantized_distribution.py | 17 +- .../python/ops/relaxed_bernoulli.py | 8 +- .../python/ops/relaxed_onehot_categorical.py | 22 +- .../contrib/distributions/python/ops/shape.py | 73 ++++--- .../distributions/python/ops/special_math.py | 20 +- .../distributions/python/ops/student_t.py | 35 ++-- .../python/ops/transformed_distribution.py | 18 +- .../distributions/python/ops/uniform.py | 10 +- .../python/ops/vector_student_t.py | 59 +++--- .../distributions/python/ops/wishart.py | 100 +++++----- 40 files changed, 542 insertions(+), 533 deletions(-) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py index 235ce209458041..60703e6997c90c 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py @@ -222,9 +222,10 @@ def testCovarianceFromSampling(self): dist = ds.DirichletMultinomial(n, alpha) x = dist.sample(int(250e3), seed=1) sample_mean = math_ops.reduce_mean(x, 0) - x_centered = x - sample_mean[None, ...] + x_centered = x - sample_mean[array_ops.newaxis, ...] 
sample_cov = math_ops.reduce_mean(math_ops.matmul( - x_centered[..., None], x_centered[..., None, :]), 0) + x_centered[..., array_ops.newaxis], + x_centered[..., array_ops.newaxis, :]), 0) sample_var = array_ops.matrix_diag_part(sample_cov) sample_stddev = math_ops.sqrt(sample_var) [ @@ -317,7 +318,7 @@ def testCovarianceNAlphaBroadcast(self): dist = ds.DirichletMultinomial(ns, alpha) covariance = dist.covariance() expected_covariance = shared_matrix * ( - ns * (ns + alpha_0) / (1 + alpha_0))[..., None] + ns * (ns + alpha_0) / (1 + alpha_0))[..., array_ops.newaxis] self.assertEqual([4, 3, 3], covariance.get_shape()) self.assertAllClose(expected_covariance, covariance.eval()) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py index ded12c9c4d4ebd..06ea27d8860681 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py @@ -239,7 +239,7 @@ def testCovarianceFromSampling(self): # via broadcast between alpha, n. theta = np.array([[1., 2, 3], [2.5, 4, 0.01]], dtype=np.float32) - theta /= np.sum(theta, 1)[..., None] + theta /= np.sum(theta, 1)[..., array_ops.newaxis] # Ideally we'd be able to test broadcasting but, the multinomial sampler # doesn't support different total counts. n = np.float32(5) @@ -247,9 +247,10 @@ def testCovarianceFromSampling(self): dist = ds.Multinomial(n, theta) # batch_shape=[2], event_shape=[3] x = dist.sample(int(250e3), seed=1) sample_mean = math_ops.reduce_mean(x, 0) - x_centered = x - sample_mean[None, ...] + x_centered = x - sample_mean[array_ops.newaxis, ...] sample_cov = math_ops.reduce_mean(math_ops.matmul( - x_centered[..., None], x_centered[..., None, :]), 0) + x_centered[..., array_ops.newaxis], + x_centered[..., array_ops.newaxis, :]), 0) sample_var = array_ops.matrix_diag_part(sample_cov) sample_stddev = math_ops.sqrt(sample_var) [ diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py index 0a4e7fb5b5b512..9d0ffd63763329 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py @@ -176,7 +176,7 @@ def testProbScalarBaseDistributionNonScalarTransform(self): x = 2. * self._rng.rand(4, 3, 3).astype(np.float32) - 1. expected_mst = _FakeVectorStudentT( - df=np.tile(df, len(scale_diag)), + df=np.tile(df, reps=len(scale_diag)), loc=loc, scale_tril=scale_tril) @@ -207,7 +207,7 @@ def testProbScalarBaseDistributionNonScalarTransformDynamic(self): x = 2. * self._rng.rand(4, 3, 3).astype(np.float32) - 1. 
expected_mst = _FakeVectorStudentT( - df=np.tile(df, len(scale_diag)), + df=np.tile(df, reps=len(scale_diag)), loc=loc, scale_tril=scale_tril) @@ -236,8 +236,9 @@ def testProbNonScalarBaseDistributionScalarTransform(self): expected_mst = _FakeVectorStudentT( df=df, - loc=np.tile(loc[None, :], [len(df), 1]), - scale_tril=np.tile(scale_tril[None, :, :], [len(df), 1, 1])) + loc=np.tile(loc[array_ops.newaxis, :], reps=[len(df), 1]), + scale_tril=np.tile(scale_tril[array_ops.newaxis, :, :], + reps=[len(df), 1, 1])) with self.test_session(): actual_mst = _VectorStudentT(df=df, loc=loc, scale_diag=scale_diag, @@ -261,8 +262,9 @@ def testProbNonScalarBaseDistributionScalarTransformDynamic(self): expected_mst = _FakeVectorStudentT( df=df, - loc=np.tile(loc[None, :], [len(df), 1]), - scale_tril=np.tile(scale_tril[None, :, :], [len(df), 1, 1])) + loc=np.tile(loc[array_ops.newaxis, :], reps=[len(df), 1]), + scale_tril=np.tile(scale_tril[array_ops.newaxis, :, :], + reps=[len(df), 1, 1])) with self.test_session(): df_pl = array_ops.placeholder(dtypes.float32, name="df") diff --git a/tensorflow/contrib/distributions/python/ops/bernoulli.py b/tensorflow/contrib/distributions/python/ops/bernoulli.py index 60f8c114d8e7f5..7e984c4881ecc4 100644 --- a/tensorflow/contrib/distributions/python/ops/bernoulli.py +++ b/tensorflow/contrib/distributions/python/ops/bernoulli.py @@ -57,15 +57,15 @@ def __init__(self, Bernoulli distribution. Only one of `logits` or `probs` should be passed in. dtype: The type of the event samples. Default: `int32`. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to - indicate the result is undefined. When `False`, an exception is raised + indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: ValueError: If p and logits are passed, or if neither are passed. @@ -114,7 +114,7 @@ def _event_shape(self): return tensor_shape.scalar() def _sample_n(self, n, seed=None): - new_shape = array_ops.concat(([n], self.batch_shape_tensor()), 0) + new_shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) uniform = random_ops.random_uniform( new_shape, seed=seed, dtype=self.probs.dtype) sample = math_ops.less(uniform, self.probs) diff --git a/tensorflow/contrib/distributions/python/ops/beta.py b/tensorflow/contrib/distributions/python/ops/beta.py index 53149b3acd25fe..4a59c6ccf4a3e3 100644 --- a/tensorflow/contrib/distributions/python/ops/beta.py +++ b/tensorflow/contrib/distributions/python/ops/beta.py @@ -139,15 +139,15 @@ def __init__(self, concentration0: Positive floating-point `Tensor` indicating mean number of failures; aka "beta". Otherwise has same semantics as `concentration1`. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. 
When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() with ops.name_scope(name, values=[concentration1, @@ -267,7 +267,7 @@ def _variance(self): @distribution_util.AppendDocstring( """Note: The mode is undefined when `concentration1 <= 1` or `concentration0 <= 1`. If `self.allow_nan_stats` is `True`, `NaN` - is used for undefined modes. If `self.allow_nan_stats` is `False` an + is used for undefined modes. If `self.allow_nan_stats` is `False` an exception is raised when one or more modes are undefined.""") def _mode(self): mode = (self.concentration1 - 1.) / (self.total_concentration - 2.) diff --git a/tensorflow/contrib/distributions/python/ops/bijector.py b/tensorflow/contrib/distributions/python/ops/bijector.py index adb5a4722862f0..d3242af066f640 100644 --- a/tensorflow/contrib/distributions/python/ops/bijector.py +++ b/tensorflow/contrib/distributions/python/ops/bijector.py @@ -20,7 +20,7 @@ Differentiable, bijective transformations of continuous random variables alter the calculations made in the cumulative/probability distribution functions and -sample function. This module provides a standard interface for making these +sample function. This module provides a standard interface for making these manipulations. For more details and examples, see the `Bijector` docstring. @@ -180,7 +180,7 @@ class Bijector(object): [diffeomorphism](https://en.wikipedia.org/wiki/Diffeomorphism), i.e., a bijective, differentiable function. A `Bijector` is used by `TransformedDistribution` but can be generally used for transforming a - `Distribution` generated `Tensor`. A `Bijector` is characterized by three + `Distribution` generated `Tensor`. A `Bijector` is characterized by three operations: 1. Forward Evaluation @@ -198,7 +198,7 @@ class Bijector(object): "The log of the determinant of the matrix of all first-order partial derivatives of the inverse function." Useful for inverting a transformation to compute one probability in terms - of another. Geometrically, the det(Jacobian) is the volume of the + of another. Geometrically, the det(Jacobian) is the volume of the transformation and is used to scale the probability. By convention, transformations of random variables are named in terms of the @@ -210,7 +210,7 @@ class Bijector(object): - Basic properties: ```python - x = ... # A tensor. + x = ... # A tensor. # Evaluate forward transformation. fwd_x = my_bijector.forward(x) x == my_bijector.inverse(fwd_x) @@ -267,7 +267,7 @@ def _forward_log_det_jacobian(self, x): if self.event_ndims is None: raise ValueError("Jacobian requires known event_ndims.") event_dims = array_ops.shape(x)[-self.event_ndims:] - return math_ops.reduce_sum(x, reduction_indices=event_dims) + return math_ops.reduce_sum(x, axis=event_dims) ``` - "Affine" @@ -292,8 +292,8 @@ def _forward_log_det_jacobian(self, x): partitioning: - Consider the `Exp` `Bijector` applied to a `Tensor` which has sample, batch, - and event (S, B, E) shape semantics. 
Suppose - the `Tensor`'s partitioned-shape is `(S=[4], B=[2], E=[3, 3])`. + and event (S, B, E) shape semantics. Suppose the `Tensor`'s + partitioned-shape is `(S=[4], B=[2], E=[3, 3])`. For `Exp`, the shape of the `Tensor` returned by `forward` and `inverse` is unchanged, i.e., `[4, 2, 3, 3]`. However the shape returned by @@ -308,7 +308,7 @@ def _forward_log_det_jacobian(self, x): - If the `Bijector`'s use is limited to `TransformedDistribution` (or friends like `QuantizedDistribution`) then depending on your use, you may not need - to implement all of `_forward` and `_inverse` functions. Examples: + to implement all of `_forward` and `_inverse` functions. Examples: 1. Sampling (e.g., `sample`) only requires `_forward`. 2. Probability functions (e.g., `prob`, `cdf`, `survival`) only require `_inverse` (and related). @@ -316,7 +316,7 @@ def _forward_log_det_jacobian(self, x): `_inverse` can be implemented as a cache lookup. See `Example Use` [above] which shows how these functions are used to - transform a distribution. (Note: `_forward` could theoretically be + transform a distribution. (Note: `_forward` could theoretically be implemented as a cache lookup but this would require controlling the underlying sample generation mechanism.) @@ -334,7 +334,7 @@ def _forward_log_det_jacobian(self, x): - Subclasses should implement `_forward_event_shape`, `_forward_event_shape_tensor` (and `inverse` counterparts) if the - transformation is shape-changing. By default the event-shape is assumed + transformation is shape-changing. By default the event-shape is assumed unchanged from input. Tips for implementing `_inverse` and `_inverse_log_det_jacobian`: @@ -343,14 +343,14 @@ def _forward_log_det_jacobian(self, x): can be implemented as a cache lookup. - The inverse `log o det o Jacobian` can be implemented as the negative of the - forward `log o det o Jacobian`. This is useful if the `inverse` is + forward `log o det o Jacobian`. This is useful if the `inverse` is implemented as a cache or the inverse Jacobian is computationally more expensive (e.g., `CholeskyOuterProduct` `Bijector`). The following demonstrates the suggested implementation. ```python def _inverse_and_log_det_jacobian(self, y): - x = # ... implement inverse, possibly via cache. + x = ... # implement inverse, possibly via cache. return x, -self._forward_log_det_jac(x) # Note negation. ``` @@ -414,10 +414,10 @@ def __init__(self, Args: event_ndims: number of dimensions associated with event coordinates. graph_parents: Python list of graph prerequisites of this `Bijector`. - is_constant_jacobian: `Boolean` indicating that the Jacobian is not a + is_constant_jacobian: Python `bool` indicating that the Jacobian is not a function of the input. - validate_args: `Boolean`, default `False`. Whether to validate input with - asserts. If `validate_args` is `False`, and the inputs are invalid, + validate_args: Python `bool`, default `False`. Whether to validate input + with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. dtype: `tf.dtype` supported by this `Bijector`. `None` means dtype is not enforced. @@ -462,7 +462,7 @@ def is_constant_jacobian(self): Note: Jacobian is either constant for both forward and inverse or neither. Returns: - `Boolean`. + is_constant_jacobian: Python `bool`. 
""" return self._is_constant_jacobian @@ -733,7 +733,7 @@ def _call_inverse_and_inverse_log_det_jacobian(self, y, name, **kwargs): elif self.is_constant_jacobian: self._constant_ildj = ildj # We use the mapped version of x, even if we re-computed x above with a - # call to self._inverse_and_inverse_log_det_jacobian. This prevents + # call to self._inverse_and_inverse_log_det_jacobian. This prevents # re-evaluation of the inverse in a common case. x = x if mapping.x is None else mapping.x mapping = mapping.merge(x=x, ildj=ildj) @@ -886,7 +886,7 @@ class Inline(Bijector): forward_fn=tf.exp, inverse_fn=tf.log, inverse_log_det_jacobian_fn=( - lambda y: -tf.reduce_sum(tf.log(y), reduction_indices=-1)), + lambda y: -tf.reduce_sum(tf.log(y), axis=-1)), name="exp") ``` @@ -922,11 +922,11 @@ def __init__(self, static event shape changes. Default: shape is assumed unchanged. inverse_event_shape_tensor_fn: Python callable implementing non-identical event shape changes. Default: shape is assumed unchanged. - is_constant_jacobian: `Boolean` indicating that the Jacobian is constant - for all input arguments. - validate_args: `Boolean` indicating whether arguments should be checked - for correctness. - name: `String`, name given to ops managed by this object. + is_constant_jacobian: Python `bool` indicating that the Jacobian is + constant for all input arguments. + validate_args: Python `bool` indicating whether arguments should be + checked for correctness. + name: Python `str`, name given to ops managed by this object. """ super(Inline, self).__init__( event_ndims=0, @@ -1021,9 +1021,9 @@ def __init__(self, bijector, validate_args=False, name=None): Args: bijector: Bijector instance. - validate_args: `Boolean` indicating whether arguments should be checked - for correctness. - name: `String`, name given to ops managed by this object. + validate_args: Python `bool` indicating whether arguments should be + checked for correctness. + name: Python `str`, name given to ops managed by this object. """ self._bijector = bijector @@ -1103,10 +1103,10 @@ def __init__(self, bijectors=(), validate_args=False, name=None): Args: bijectors: Python list of bijector instances. An empty list makes this bijector equivalent to the `Identity` bijector. - validate_args: `Boolean` indicating whether arguments should be checked - for correctness. - name: `String`, name given to ops managed by this object. Default: E.g., - `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`. + validate_args: Python `bool` indicating whether arguments should be + checked for correctness. + name: Python `str`, name given to ops managed by this object. Default: + E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`. Raises: ValueError: if bijectors have different dtypes. @@ -1246,9 +1246,9 @@ def __init__(self, `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`. event_ndims: Python scalar indicating the number of dimensions associated with a particular draw from the distribution. - validate_args: `Boolean` indicating whether arguments should be checked - for correctness. - name: `String` name given to ops managed by this object. + validate_args: Python `bool` indicating whether arguments should be + checked for correctness. + name: Python `str` name given to ops managed by this object. Raises: ValueError: if `power < 0` or is not known statically. 
@@ -1285,24 +1285,24 @@ def _inverse_and_inverse_log_det_jacobian(self, y): event_dims = self._event_dims_tensor(y) if self.power == 0.: x = math_ops.log(y) - ildj = -math_ops.reduce_sum(x, reduction_indices=event_dims) + ildj = -math_ops.reduce_sum(x, axis=event_dims) return x, ildj # TODO(jvdillon): If large y accuracy is an issue, consider using # (y**self.power - 1.) / self.power when y >> 1. x = math_ops.expm1(math_ops.log(y) * self.power) / self.power ildj = (self.power - 1.) * math_ops.reduce_sum( math_ops.log(y), - reduction_indices=event_dims) + axis=event_dims) return x, ildj def _forward_log_det_jacobian(self, x): x = self._maybe_assert_valid_x(x) event_dims = self._event_dims_tensor(x) if self.power == 0.: - return math_ops.reduce_sum(x, reduction_indices=event_dims) + return math_ops.reduce_sum(x, axis=event_dims) return (1. / self.power - 1.) * math_ops.reduce_sum( math_ops.log1p(x * self.power), - reduction_indices=event_dims) + axis=event_dims) def _maybe_assert_valid_x(self, x): if not self.validate_args or self.power == 0.: @@ -1351,9 +1351,9 @@ def __init__(self, Args: event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions associated with a particular draw from the distribution. - validate_args: `Boolean` indicating whether arguments should be checked - for correctness. - name: `String` name given to ops managed by this object. + validate_args: Python `bool` indicating whether arguments should be + checked for correctness. + name: Python `str` name given to ops managed by this object. """ super(Exp, self).__init__( event_ndims=event_ndims, @@ -1376,8 +1376,8 @@ def __init__(self, tril, v, diag=None, validate_args=False): tril: `Tensor` of shape `[B1,..,Bb, d, d]`. v: `Tensor` of shape `[B1,...,Bb, d, k]`. diag: `Tensor` of shape `[B1,...,Bb, k, k]` or None - validate_args: `Boolean` indicating whether arguments should be checked - for correctness. + validate_args: Python `bool` indicating whether arguments should be + checked for correctness. """ self._m = tril self._v = v @@ -1477,7 +1477,7 @@ def sqrt_log_abs_det(self): linalg_ops.matrix_determinant(self._woodbury_sandwiched_term()))) # Reduction is ok because we always prepad inputs to this class. log_det_m = math_ops.reduce_sum(math_ops.log(math_ops.abs( - array_ops.matrix_diag_part(self._m))), reduction_indices=[-1]) + array_ops.matrix_diag_part(self._m))), axis=[-1]) return log_det_c + 2. * self._d.sqrt_log_abs_det() + log_det_m def _woodbury_sandwiched_term(self): @@ -1591,34 +1591,34 @@ def __init__(self, `scale_diag != None` means `scale += tf.diag(scale_diag)`. Args: - shift: Numeric `Tensor`. If this is set to `None`, no shift is applied. + shift: Floating-point `Tensor`. If this is set to `None`, no shift is + applied. scale_identity_multiplier: floating point rank 0 `Tensor` representing a scaling done to the identity matrix. When `scale_identity_multiplier = scale_diag = scale_tril = None` then `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added to `scale`. - scale_diag: Numeric `Tensor` representing the diagonal matrix. - `scale_diag` has shape [N1, N2, ... k], which represents a k x k + scale_diag: Floating-point `Tensor` representing the diagonal matrix. + `scale_diag` has shape [N1, N2, ... k], which represents a k x k diagonal matrix. When `None` no diagonal term is added to `scale`. - scale_tril: Numeric `Tensor` representing the diagonal matrix. - `scale_diag` has shape [N1, N2, ... 
k, k], which represents a k x k
+    scale_tril: Floating-point `Tensor` representing the lower triangular matrix.
+      `scale_tril` has shape [N1, N2, ... k, k], which represents a k x k
       lower triangular matrix. When `None` no `scale_tril` term is added to
       `scale`. The upper triangular elements above the diagonal are ignored.
-    scale_perturb_factor: Numeric `Tensor` representing factor matrix with
-      last two dimensions of shape `(k, r)`.
-      When `None`, no rank-r update is added to `scale`.
-    scale_perturb_diag: Numeric `Tensor` representing the diagonal matrix.
-      `scale_perturb_diag` has shape [N1, N2, ... r], which represents an
-      r x r Diagonal matrix.
-      When `None` low rank updates will take the form `scale_perturb_factor *
-      scale_perturb_factor.T`.
+    scale_perturb_factor: Floating-point `Tensor` representing factor matrix
+      with last two dimensions of shape `(k, r)`. When `None`, no rank-r
+      update is added to `scale`.
+    scale_perturb_diag: Floating-point `Tensor` representing the diagonal
+      matrix. `scale_perturb_diag` has shape [N1, N2, ... r], which
+      represents an `r x r` diagonal matrix. When `None` low rank updates will
+      take the form `scale_perturb_factor * scale_perturb_factor.T`.
     event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
       associated with a particular draw from the distribution. Must be 0 or 1.
-    validate_args: `Boolean` indicating whether arguments should be checked
-      for correctness.
-    name: `String` name given to ops managed by this object.
+    validate_args: Python `bool` indicating whether arguments should be
+      checked for correctness.
+    name: Python `str` name given to ops managed by this object.

   Raises:
     ValueError: if `perturb_diag` is specified but not `perturb_factor`.

@@ -1692,17 +1692,19 @@ def _create_scale_operator(self, identity_multiplier, diag, tril,

     Args:
       identity_multiplier: floating point rank 0 `Tensor` representing a
         scaling done to the identity matrix.
-      diag: Numeric `Tensor` representing the diagonal matrix. `scale_diag` has
-        shape [N1, N2, ... k], which represents a k x k diagonal matrix.
-      tril: Numeric `Tensor` representing the diagonal matrix. `scale_tril` has
-        shape [N1, N2, ... k], which represents a k x k lower triangular matrix.
-      perturb_diag: Numeric `Tensor` representing the diagonal matrix of the
-        low rank update.
-      perturb_factor: Numeric `Tensor` representing factor matrix.
+      diag: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ... k], which represents a k x k
+        diagonal matrix.
+      tril: Floating-point `Tensor` representing the lower triangular matrix.
+        `scale_tril` has shape [N1, N2, ... k, k], which represents a k x k
+        lower triangular matrix.
+      perturb_diag: Floating-point `Tensor` representing the diagonal matrix of
+        the low rank update.
+      perturb_factor: Floating-point `Tensor` representing factor matrix.
       event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1
-      validate_args: `Boolean` indicating whether arguments should be checked
-        for correctness.
+        associated with a particular draw from the distribution. Must be 0 or 1.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.

     Returns:
       scale. In the case of scaling by a constant, scale is a
@@ -1759,7 +1761,7 @@ def _create_scale_operator(self, identity_multiplier, diag, tril,
       return identity_multiplier
     # Infer the shape from the V and D.
v_shape = array_ops.shape(perturb_factor) - identity_shape = array_ops.concat((v_shape[:-1], (v_shape[-2],)), 0) + identity_shape = array_ops.concat([v_shape[:-1], [v_shape[-2]]], 0) scaled_identity = operator_pd_identity.OperatorPDIdentity( identity_shape, perturb_factor.dtype.base_dtype, @@ -1807,7 +1809,7 @@ def _preprocess_diag(self, identity_multiplier, diag, event_ndims): def _process_matrix(self, matrix, min_rank, event_ndims): """Helper to __init__ which gets matrix in batch-ready form.""" # Pad the matrix so that matmul works in the case of a matrix and vector - # input. Keep track if the matrix was padded, to distinguish between a + # input. Keep track if the matrix was padded, to distinguish between a # rank 3 tensor and a padded rank 2 tensor. # TODO(srvasude): Remove side-effects from functions. Its currently unbroken # but error-prone since the function call order may change in the future. @@ -1895,7 +1897,7 @@ class AffineLinearOperator(Bijector): where `*` denotes the scalar product. Note: we don't always simply transpose `X` (but write it this way for - brevity). Actually the input `X` undergoes the following transformation + brevity). Actually the input `X` undergoes the following transformation before being premultiplied by `scale`: 1. If there are no sample dims, we call `X = tf.expand_dims(X, 0)`, i.e., @@ -1910,8 +1912,8 @@ class AffineLinearOperator(Bijector): (For more details see `shape.make_batch_of_event_sample_matrices`.) The result of the above transformation is that `X` can be regarded as a batch - of matrices where each column is a draw from the distribution. After - premultiplying by `scale`, we take the inverse of this procedure. The input + of matrices where each column is a draw from the distribution. After + premultiplying by `scale`, we take the inverse of this procedure. The input `Y` also undergoes the same transformation before/after premultiplying by `inv(scale)`. @@ -1952,14 +1954,14 @@ def __init__(self, """Instantiates the `AffineLinearOperator` bijector. Args: - shift: Numeric `Tensor`. - scale: Subclass of `LinearOperator`. Represents the (batch) positive + shift: Floating-point `Tensor`. + scale: Subclass of `LinearOperator`. Represents the (batch) positive definite matrix `M` in `R^{k x k}`. event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions associated with a particular draw from the distribution. Must be 0 or 1. - validate_args: `Boolean` indicating whether arguments should be checked - for correctness. - name: `String` name given to ops managed by this object. + validate_args: Python `bool` indicating whether arguments should be + checked for correctness. + name: Python `str` name given to ops managed by this object. Raises: ValueError: if `event_ndims` is not 0 or 1. @@ -2126,21 +2128,21 @@ def _inverse_and_inverse_log_det_jacobian(self, y): event_dims = self._event_dims_tensor(y) # Could also do: # ildj = math_ops.reduce_sum(y - distribution_util.softplus_inverse(y), - # reduction_indices=event_dims) + # axis=event_dims) # but the following is more numerically stable. Ie, # Y = Log[1 + exp{X}] ==> X = Log[exp{Y} - 1] # ==> dX/dY = exp{Y} / (exp{Y} - 1) # = 1 / (1 - exp{-Y}), - # which is the most stable for large Y > 0. For small Y, we use + # which is the most stable for large Y > 0. For small Y, we use # 1 - exp{-Y} approx Y. 
ildj = -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)), - reduction_indices=event_dims) + axis=event_dims) return distribution_util.softplus_inverse(y), ildj def _forward_log_det_jacobian(self, x): # pylint: disable=unused-argument event_dims = self._event_dims_tensor(x) return -math_ops.reduce_sum( - nn_ops.softplus(-x), reduction_indices=event_dims) + nn_ops.softplus(-x), axis=event_dims) class SoftmaxCentered(Bijector): @@ -2148,7 +2150,7 @@ class SoftmaxCentered(Bijector): To implement [softmax](https://en.wikipedia.org/wiki/Softmax_function) as a bijection, the forward transformation appends a value to the input and the - inverse removes this coordinate. The appended coordinate represents a pivot, + inverse removes this coordinate. The appended coordinate represents a pivot, e.g., `softmax(x) = exp(x-c) / sum(exp(x-c))` where `c` is the implicit last coordinate. @@ -2169,7 +2171,7 @@ class SoftmaxCentered(Bijector): At first blush it may seem like the [Invariance of domain]( https://en.wikipedia.org/wiki/Invariance_of_domain) theorem implies this - implementation is not a bijection. However, the appended dimension + implementation is not a bijection. However, the appended dimension makes the (forward) image non-open and the theorem does not directly apply. """ @@ -2287,12 +2289,12 @@ def _inverse(self, y): depth=ndims, on_value=shape[-1]-np.array(1, dtype=shape.dtype), dtype=shape.dtype) - size = array_ops.concat((shape[:-1], np.asarray([1], dtype=shape.dtype)), 0) + size = array_ops.concat([shape[:-1], np.asarray([1], dtype=shape.dtype)], 0) log_normalization = -array_ops.strided_slice(x, begin, begin + size) # Here we slice out all but the last coordinate; see above for idea. begin = array_ops.zeros_like(shape) - size = array_ops.concat((shape[:-1], [shape[-1] - 1]), 0) + size = array_ops.concat([shape[:-1], [shape[-1] - 1]], 0) x = array_ops.strided_slice(x, begin, begin + size) x += log_normalization @@ -2330,7 +2332,7 @@ def _inverse_log_det_jacobian(self, y): # or by noting that det{ dX/dY } = 1 / det{ dY/dX } from Bijector # docstring "Tip". # (2) - https://en.wikipedia.org/wiki/Matrix_determinant_lemma - return -math_ops.reduce_sum(math_ops.log(y), reduction_indices=-1) + return -math_ops.reduce_sum(math_ops.log(y), axis=-1) def _forward_log_det_jacobian(self, x): if self._static_event_ndims == 0: @@ -2343,10 +2345,10 @@ def _forward_log_det_jacobian(self, x): # log_normalization = 1 + reduce_sum(exp(logits)) # -log_normalization + reduce_sum(logits - log_normalization) log_normalization = nn_ops.softplus( - math_ops.reduce_logsumexp(x, reduction_indices=-1, keep_dims=True)) + math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True)) fldj = (-log_normalization + math_ops.reduce_sum(x - log_normalization, - reduction_indices=-1, + axis=-1, keep_dims=True)) return array_ops.squeeze(fldj, squeeze_dims=-1) @@ -2391,9 +2393,9 @@ def __init__(self, event_ndims=2, validate_args=False, event_ndims: `constant` `int32` scalar `Tensor` indicating the number of dimensions associated with a particular draw from the distribution. Must be 0 or 2. - validate_args: `Boolean` indicating whether arguments should be checked - for correctness. - name: `String` name given to ops managed by this object. + validate_args: Python `bool` indicating whether arguments should be + checked for correctness. + name: Python `str` name given to ops managed by this object. Raises: ValueError: if event_ndims is neither 0 or 2. 
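Note: the `SoftmaxCentered` changes above hinge on the pivot construction described in its docstring: appending an implicit zero coordinate before the softmax makes the map invertible. A small NumPy sketch under that reading; the helper names are illustrative only:

```python
import numpy as np

def softmax_centered_forward(x):
  """Appends an implicit zero pivot then softmaxes: R^k -> (k+1)-simplex."""
  z = np.concatenate([x, np.zeros(x.shape[:-1] + (1,))], axis=-1)
  z -= z.max(axis=-1, keepdims=True)  # for numerical stability
  e = np.exp(z)
  return e / e.sum(axis=-1, keepdims=True)

def softmax_centered_inverse(y):
  """Recovers x: log(y) recentered so the dropped pivot coordinate is 0."""
  log_y = np.log(y)
  return (log_y - log_y[..., -1:])[..., :-1]

x = np.array([1.0, -2.0, 0.3])
y = softmax_centered_forward(x)  # shape [4]; sums to 1
np.testing.assert_allclose(softmax_centered_inverse(y), x, rtol=1e-12)
# Inverse log-det-Jacobian, matching _inverse_log_det_jacobian above:
ildj = -np.log(y).sum(axis=-1)
```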
diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py index 273b7620314c1f..4b29aa17e080a9 100644 --- a/tensorflow/contrib/distributions/python/ops/binomial.py +++ b/tensorflow/contrib/distributions/python/ops/binomial.py @@ -120,7 +120,7 @@ def __init__(self, Args: total_count: Non-negative floating point tensor with shape broadcastable to `[N1,..., Nm]` with `m >= 0` and the same dtype as `probs` or - `logits`. Defines this as a batch of `N1 x ... x Nm` different Binomial + `logits`. Defines this as a batch of `N1 x ... x Nm` different Binomial distributions. Its components should be equal to integer values. logits: Floating point tensor representing the log-odds of a positive event with shape broadcastable to `[N1,..., Nm]` `m >= 0`, and @@ -131,15 +131,15 @@ def __init__(self, `[N1,..., Nm]` `m >= 0`, `probs in [0, 1]`. Each entry represents the probability of success for independent Binomial distributions. Only one of `logits` or `probs` should be passed in. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() with ops.name_scope(name, values=[total_count, logits, probs]) as ns: @@ -221,7 +221,7 @@ def _variance(self): @distribution_util.AppendDocstring( """Note that when `(1 + total_count) * probs` is an integer, there are - actually two modes. Namely, `(1 + total_count) * probs` and + actually two modes. Namely, `(1 + total_count) * probs` and `(1 + total_count) * probs - 1` are both modes. Here we return only the larger of the two modes.""") def _mode(self): diff --git a/tensorflow/contrib/distributions/python/ops/categorical.py b/tensorflow/contrib/distributions/python/ops/categorical.py index 67f3a1cc9360e1..6908faa5ad6718 100644 --- a/tensorflow/contrib/distributions/python/ops/categorical.py +++ b/tensorflow/contrib/distributions/python/ops/categorical.py @@ -97,15 +97,15 @@ def __init__( represents a vector of probabilities for each class. Only one of `logits` or `probs` should be passed in. dtype: The type of the event samples (default: int32). - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. 
When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() with ops.name_scope(name, values=[logits, probs]) as ns: @@ -133,9 +133,8 @@ def __init__( dtype=dtypes.int32, name="event_size") else: - self._event_size = array_ops.gather(logits_shape, - self._batch_rank, - name="event_size") + with ops.name_scope(name="event_size"): + self._event_size = logits_shape[self._batch_rank] if logits_shape_static[:-1].is_fully_defined(): self._batch_shape_val = constant_op.constant( @@ -192,7 +191,7 @@ def _sample_n(self, n, seed=None): samples = math_ops.cast(samples, self.dtype) ret = array_ops.reshape( array_ops.transpose(samples), - array_ops.concat(([n], self.batch_shape_tensor()), 0)) + array_ops.concat([[n], self.batch_shape_tensor()], 0)) return ret def _log_prob(self, k): diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py index d980c705f14f3c..6f3e3700c8f5d0 100644 --- a/tensorflow/contrib/distributions/python/ops/chi2.py +++ b/tensorflow/contrib/distributions/python/ops/chi2.py @@ -70,16 +70,16 @@ def __init__(self, Args: df: Floating point tensor, the degrees of freedom of the - distribution(s). `df` must contain only positive values. - validate_args: Python `Boolean`, default `False`. When `True` distribution + distribution(s). `df` must contain only positive values. + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() # Even though all stats of chi2 are defined for valid parameters, this is diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet.py b/tensorflow/contrib/distributions/python/ops/dirichlet.py index bd713cf08f2bfd..8c95bb3ce6c282 100644 --- a/tensorflow/contrib/distributions/python/ops/dirichlet.py +++ b/tensorflow/contrib/distributions/python/ops/dirichlet.py @@ -142,15 +142,15 @@ def __init__(self, `concentration.shape = [N1, N2, ..., Nm, k]` then `batch_shape = [N1, N2, ..., Nm]` and `event_shape = [k]`. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. 
When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() with ops.name_scope(name, values=[concentration]) as ns: @@ -225,12 +225,13 @@ def _entropy(self): axis=-1)) def _mean(self): - return self.concentration / self.total_concentration[..., None] + return self.concentration / self.total_concentration[..., array_ops.newaxis] def _covariance(self): x = self._variance_scale_term() * self._mean() return array_ops.matrix_set_diag( - -math_ops.matmul(x[..., None], x[..., None, :]), # outer prod + -math_ops.matmul(x[..., array_ops.newaxis], + x[..., array_ops.newaxis, :]), # outer prod self._variance()) def _variance(self): @@ -240,16 +241,17 @@ def _variance(self): def _variance_scale_term(self): """Helper to `_covariance` and `_variance` which computes a shared scale.""" - return math_ops.rsqrt(1. + self.total_concentration[..., None]) + return math_ops.rsqrt(1. + self.total_concentration[..., array_ops.newaxis]) @distribution_util.AppendDocstring( """Note: The mode is undefined when any `concentration <= 1`. If - `self.allow_nan_stats` is `True`, `NaN` is used for undefined modes. If + `self.allow_nan_stats` is `True`, `NaN` is used for undefined modes. If `self.allow_nan_stats` is `False` an exception is raised when one or more modes are undefined.""") def _mode(self): k = math_ops.cast(self.event_shape_tensor()[0], self.dtype) - mode = (self.concentration - 1.) / (self.total_concentration[..., None] - k) + mode = (self.concentration - 1.) / ( + self.total_concentration[..., array_ops.newaxis] - k) if self.allow_nan_stats: nan = array_ops.fill( array_ops.shape(mode), @@ -290,7 +292,7 @@ def _maybe_assert_valid_sample(self, x): x, message="samples must be positive"), distribution_util.assert_close( - array_ops.ones((), dtype=self.dtype), + array_ops.ones([], dtype=self.dtype), math_ops.reduce_sum(x, -1), message="sample last-dimension must sum to `1`"), ], x) diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py b/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py index 11b6826c1ab024..8a8b500331737c 100644 --- a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py +++ b/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py @@ -36,10 +36,10 @@ _dirichlet_multinomial_sample_note = """For each batch of counts, -`value = [n_0, ... ,n_{k-1}]`, `P[value]` is the probability that after sampling -`self.total_count` draws from this Dirichlet-Multinomial distribution, the -number of draws falling in class `j` is `n_j`. Since this definition is -[exchangeable]( https://en.wikipedia.org/wiki/Exchangeable_random_variables); +`value = [n_0, ..., n_{k-1}]`, `P[value]` is the probability that after +sampling `self.total_count` draws from this Dirichlet-Multinomial distribution, +the number of draws falling in class `j` is `n_j`. Since this definition is +[exchangeable](https://en.wikipedia.org/wiki/Exchangeable_random_variables); different sequences have the same counts so the probability includes a combinatorial coefficient. @@ -153,22 +153,22 @@ def __init__(self, Args: total_count: Non-negative floating point tensor, whose dtype is the same as `concentration`. The shape is broadcastable to `[N1,..., Nm]` with - `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different + `m >= 0`. Defines this as a batch of `N1 x ... 
x Nm` different Dirichlet multinomial distributions. Its components should be equal to integer values. concentration: Positive floating point tensor, whose dtype is the same as `n` with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different `k` class Dirichlet multinomial distributions. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() with ops.name_scope(name, values=[total_count, concentration]) as ns: @@ -177,8 +177,7 @@ def __init__(self, # we use the last dimension for the distribution, whereas # the batch dimensions are the leading dimensions, which forces the # distribution dimension to be defined explicitly (i.e. it cannot be - # created automatically by prepending). This forces enough - # explicitness. + # created automatically by prepending). This forces enough explicitness. # * All calls involving `counts` eventually require a broadcast between # `counts` and concentration. self._total_count = self._maybe_assert_valid_total_count( @@ -261,7 +260,7 @@ def _prob(self, counts): def _mean(self): return self.total_count * (self.concentration / - self.total_concentration[..., None]) + self.total_concentration[..., array_ops.newaxis]) @distribution_util.AppendDocstring( """The covariance for each batch member is defined as the following: @@ -284,7 +283,8 @@ def _mean(self): def _covariance(self): x = self._variance_scale_term() * self._mean() return array_ops.matrix_set_diag( - -math_ops.matmul(x[..., None], x[..., None, :]), # outer prod + -math_ops.matmul(x[..., array_ops.newaxis], + x[..., array_ops.newaxis, :]), # outer prod self._variance()) def _variance(self): @@ -296,7 +296,7 @@ def _variance_scale_term(self): """Helper to `_covariance` and `_variance` which computes a shared scale.""" # We must take care to expand back the last dim whenever we use the # total_concentration. - c0 = self.total_concentration[..., None] + c0 = self.total_concentration[..., array_ops.newaxis] return math_ops.sqrt((1. + c0 / self.total_count) / (1. + c0)) def _maybe_assert_valid_concentration(self, concentration, validate_args): diff --git a/tensorflow/contrib/distributions/python/ops/distribution.py b/tensorflow/contrib/distributions/python/ops/distribution.py index 31b34cb1b51fe2..7f2e83f61493e5 100644 --- a/tensorflow/contrib/distributions/python/ops/distribution.py +++ b/tensorflow/contrib/distributions/python/ops/distribution.py @@ -63,8 +63,8 @@ def _copy_fn(fn): """ if not callable(fn): raise TypeError("fn is not callable: %s" % fn) - # The blessed way to copy a function. copy.deepcopy fails to create - # a non-reference copy. Since: + # The blessed way to copy a function. copy.deepcopy fails to create a + # non-reference copy. 
Since: # types.FunctionType == type(lambda: None), # and the docstring for the function type states: # @@ -129,7 +129,7 @@ def __new__(mcs, classname, baseclasses, attrs): ValueError: If a `Distribution` public method lacks a docstring. """ if not baseclasses: # Nothing to be done for Distribution - raise TypeError("Expected non-empty baseclass. Does Distribution " + raise TypeError("Expected non-empty baseclass. Does Distribution " "not subclass _BaseDistribution?") which_base = [ base for base in baseclasses @@ -185,7 +185,7 @@ class ReparameterizationType(object): `NOT_REPARAMETERIZED`: Samples from the distribution are not fully reparameterized, and straight-through gradients are either partially - unsupported or are not supported at all. In this case, for purposes of + unsupported or are not supported at all. In this case, for purposes of e.g. RL or variational inference, it is generally safest to wrap the sample results in a `stop_gradients` call and instead use policy gradients / surrogate loss instead. @@ -234,8 +234,8 @@ class Distribution(_BaseDistribution): ### Subclassing Subclasses are expected to implement a leading-underscore version of the - same-named function. The argument signature should be identical except for - the omission of `name="..."`. For example, to enable `log_prob(value, + same-named function. The argument signature should be identical except for + the omission of `name="..."`. For example, to enable `log_prob(value, name="log_prob")` a subclass should implement `_log_prob(value)`. Subclasses can append to public-level docstrings by providing @@ -248,7 +248,7 @@ def _log_prob(self, value): ``` would add the string "Some other details." to the `log_prob` function - docstring. This is implemented as a simple decorator to avoid python + docstring. This is implemented as a simple decorator to avoid python linter complaining about missing Args/Returns/Raises sections in the partial docstrings. @@ -261,7 +261,7 @@ def _log_prob(self, value): `log_prob` reflect this broadcasting, as does the return value of `sample` and `sample_n`. - `sample_n_shape = (n,) + batch_shape + event_shape`, where `sample_n_shape` is + `sample_n_shape = [n] + batch_shape + event_shape`, where `sample_n_shape` is the shape of the `Tensor` returned from `sample_n`, `n` is the number of samples, `batch_shape` defines how many independent distributions there are, and `event_shape` defines the shape of samples from each of those independent @@ -286,19 +286,19 @@ def _log_prob(self, value): # `event_shape_t` is a `Tensor` which will evaluate to []. event_shape_t = u.event_shape_tensor() - # Sampling returns a sample per distribution. `samples` has shape - # (5, 2, 2), which is (n,) + batch_shape + event_shape, where n=5, - # batch_shape=(2, 2), and event_shape=(). + # Sampling returns a sample per distribution. `samples` has shape + # [5, 2, 2], which is [n] + batch_shape + event_shape, where n=5, + # batch_shape=[2, 2], and event_shape=[]. samples = u.sample_n(5) # The broadcasting holds across methods. Here we use `cdf` as an example. The # same holds for `log_cdf` and the likelihood functions. - # `cum_prob` has shape (2, 2) as the `value` argument was broadcasted to the + # `cum_prob` has shape [2, 2] as the `value` argument was broadcasted to the # shape of the `Uniform` instance. cum_prob_broadcast = u.cdf(4.0) - # `cum_prob`'s shape is (2, 2), one per distribution. No broadcasting + # `cum_prob`'s shape is [2, 2], one per distribution. No broadcasting # occurred. 
cum_prob_per_dist = u.cdf([[4.0, 5.0], [6.0, 7.0]]) @@ -311,9 +311,9 @@ def _log_prob(self, value): ### Parameter values leading to undefined statistics or distributions. Some distributions do not have well-defined statistics for all initialization - parameter values. For example, the beta distribution is parameterized by - positive real numbers `a` and `b`, and does not have well-defined mode if - `a < 1` or `b < 1`. + parameter values. For example, the beta distribution is parameterized by + positive real numbers `concentration1` and `concentration0`, and does not have + well-defined mode if `concentration1 < 1` or `concentration0 < 1`. The user is given the option of raising an exception or returning `NaN`. @@ -356,25 +356,28 @@ def __init__(self, Args: dtype: The type of the event samples. `None` implies no type-enforcement. - is_continuous: Python boolean. If `True` this - `Distribution` is continuous over its supported domain. + is_continuous: Python `bool`. If `True` this `Distribution` is continuous + over its supported domain. reparameterization_type: Instance of `ReparameterizationType`. If `distributions.FULLY_REPARAMETERIZED`, this `Distribution` can be reparameterized in terms of some standard distribution with a function whose Jacobian is constant for the support - of the standard distribution. If `distributions.NOT_REPARAMETERIZED`, + of the standard distribution. If `distributions.NOT_REPARAMETERIZED`, then no such reparameterization is available. - validate_args: Python boolean. Whether to validate input with asserts. - If `validate_args` is `False`, and the inputs are invalid, - correct behavior is not guaranteed. - allow_nan_stats: Python boolean. If `False`, raise an - exception if a statistic (e.g., mean, mode) is undefined for any batch - member. If True, batch members with valid parameters leading to - undefined statistics will return `NaN` for this statistic. - parameters: Python dictionary of parameters used to instantiate this + validate_args: Python `bool`, default `False`. When `True` distribution + parameters are checked for validity despite possibly degrading runtime + performance. When `False` invalid inputs may silently render incorrect + outputs. + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics + (e.g., mean, mode, variance) use the value "`NaN`" to indicate the + result is undefined. When `False`, an exception is raised if one or + more of the statistic's batch members are undefined. + parameters: Python `dict` of parameters used to instantiate this `Distribution`. - graph_parents: Python list of graph prerequisites of this `Distribution`. - name: A name for this distribution. Default: subclass name. + graph_parents: Python `list` of graph prerequisites of this + `Distribution`. + name: Python `str` name prefixed to Ops created by this class. Default: + subclass name. Raises: ValueError: if any member of graph_parents is `None` or not a `Tensor`. @@ -419,8 +422,8 @@ def param_static_shapes(cls, sample_shape): This is a class method that describes what key/value arguments are required to instantiate the given `Distribution` so that a particular shape is - returned for that instance's call to `sample()`. Assumes that - the sample's shape is known statically. + returned for that instance's call to `sample()`. Assumes that the sample's + shape is known statically. Subclasses should override class method `_param_shapes` to return constant-valued tensors when constant values are fed. 
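Note: the shape semantics restated in this docstring can be checked directly. A minimal sketch, assuming the contrib API of this era (positional `Uniform` lower/upper bounds; `sample_n` as used in the docstring above):

```python
import tensorflow as tf

ds = tf.contrib.distributions

# batch_shape=[2, 2] of independent scalar Uniforms (event_shape=[]).
u = ds.Uniform(3.0, [[5.0, 6.0], [7.0, 8.0]])

samples = u.sample_n(5)  # shape [5, 2, 2] == [n] + batch_shape + event_shape
cum_prob = u.cdf(4.0)    # scalar value broadcast against batch: shape [2, 2]

with tf.Session() as sess:
  print(sess.run([tf.shape(samples), tf.shape(cum_prob)]))
  # => [array([5, 2, 2], ...), array([2, 2], ...)]
```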
@@ -493,25 +496,24 @@ def reparameterization_type(self): @property def allow_nan_stats(self): - """Python boolean describing behavior when a stat is undefined. + """Python `bool` describing behavior when a stat is undefined. - Stats return +/- infinity when it makes sense. E.g., the variance - of a Cauchy distribution is infinity. However, sometimes the - statistic is undefined, e.g., if a distribution's pdf does not achieve a - maximum within the support of the distribution, the mode is undefined. - If the mean is undefined, then by definition the variance is undefined. - E.g. the mean for Student's T for df = 1 is undefined (no clear way to say - it is either + or - infinity), so the variance = E[(X - mean)^2] is also - undefined. + Stats return +/- infinity when it makes sense. E.g., the variance of a + Cauchy distribution is infinity. However, sometimes the statistic is + undefined, e.g., if a distribution's pdf does not achieve a maximum within + the support of the distribution, the mode is undefined. If the mean is + undefined, then by definition the variance is undefined. E.g. the mean for + Student's T for df = 1 is undefined (no clear way to say it is either + or - + infinity), so the variance = E[(X - mean)**2] is also undefined. Returns: - allow_nan_stats: Python boolean. + allow_nan_stats: Python `bool`. """ return self._allow_nan_stats @property def validate_args(self): - """Python boolean indicated possibly expensive checks are enabled.""" + """Python `bool` indicating possibly expensive checks are enabled.""" return self._validate_args def copy(self, **override_parameters_kwargs): @@ -611,7 +613,7 @@ def is_scalar_event(self, name="is_scalar_event"): name: The name to give this op. Returns: - is_scalar_event: `Boolean` `scalar` `Tensor`. + is_scalar_event: `bool` scalar `Tensor`. """ with self._name_scope(name): return ops.convert_to_tensor( @@ -625,7 +627,7 @@ def is_scalar_batch(self, name="is_scalar_batch"): name: The name to give this op. Returns: - is_scalar_batch: `Boolean` `scalar` `Tensor`. + is_scalar_batch: `bool` scalar `Tensor`. """ with self._name_scope(name): return ops.convert_to_tensor( diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py index 10b4a6ceabde05..0acbb957e2c8a9 100644 --- a/tensorflow/contrib/distributions/python/ops/distribution_util.py +++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py @@ -42,8 +42,8 @@ def assert_close( """Assert that that x and y are within machine epsilon of each other. Args: - x: Numeric `Tensor` - y: Numeric `Tensor` + x: Floating-point `Tensor` + y: Floating-point `Tensor` data: The tensors to print out if the condition is `False`. Defaults to error message and first few entries of `x` and `y`. summarize: Print this many entries of each tensor. @@ -80,7 +80,7 @@ def assert_integer_form( """Assert that x has integer components (or floats equal to integers). Args: - x: Numeric `Tensor` + x: Floating-point `Tensor` data: The tensors to print out if the condition is `False`. Defaults to error message and first few entries of `x` and `y`. summarize: Print this many entries of each tensor. @@ -113,7 +113,7 @@ def same_dynamic_shape(a, b): b: `Tensor` Returns: - `Boolean` `Tensor` representing if both tensors have the same shape. + `bool` `Tensor` representing if both tensors have the same shape. 
""" a = ops.convert_to_tensor(a, name="a") b = ops.convert_to_tensor(b, name="b") @@ -142,15 +142,15 @@ def get_logits_and_probs(logits=None, """Converts logit to probabilities (or vice-versa), and returns both. Args: - logits: Numeric `Tensor` representing log-odds. - probs: Numeric `Tensor` representing probabilities. - multidimensional: `Boolean`, default `False`. + logits: Floating-point `Tensor` representing log-odds. + probs: Floating-point `Tensor` representing probabilities. + multidimensional: Python `bool`, default `False`. If `True`, represents whether the last dimension of `logits` or `probs`, - a `[N1, N2, ... k]` dimensional tensor, representing the + a `[N1, N2, ... k]` dimensional tensor, representing the logit or probability of `shape[-1]` classes. - validate_args: `Boolean`, default `False`. When `True`, either assert `0 <= - probs <= 1` (if not `multidimensional`) or that the last dimension of - `probs` sums to one. + validate_args: Python `bool`, default `False`. When `True`, either assert + `0 <= probs <= 1` (if not `multidimensional`) or that the last dimension + of `probs` sums to one. name: A name for this operation (optional). Returns: @@ -189,7 +189,7 @@ def get_logits_and_probs(logits=None, # Here we don't compute the multidimensional case, in a manner # consistent with respect to the unidimensional case. We do so # following the TF convention. Typically, you might expect to see - # logits = log(probs) - log(gather(probs, pivot)). A side-effect of + # logits = log(probs) - log(probs[pivot]). A side-effect of # being consistent with the TF approach is that the unidimensional case # implicitly handles the second dimension but the multidimensional case # explicitly keeps the pivot dimension. @@ -208,10 +208,10 @@ def log_combinations(n, counts, name="log_combinations"): where `i` runs over all `k` classes. Args: - n: Numeric `Tensor` broadcastable with `counts`. This represents `n` + n: Floating-point `Tensor` broadcastable with `counts`. This represents `n` outcomes. - counts: Numeric `Tensor` broadcastable with `n`. This represents counts - in `k` classes, where `k` is the last dimension of the tensor. + counts: Floating-point `Tensor` broadcastable with `n`. This represents + counts in `k` classes, where `k` is the last dimension of the tensor. name: A name for this operation (optional). Returns: @@ -220,15 +220,14 @@ def log_combinations(n, counts, name="log_combinations"): # First a bit about the number of ways counts could have come in: # E.g. if counts = [1, 2], then this is 3 choose 2. # In general, this is (sum counts)! / sum(counts!) - # The sum should be along the last dimension of counts. This is the + # The sum should be along the last dimension of counts. This is the # "distribution" dimension. Here n a priori represents the sum of counts. with ops.name_scope(name, values=[n, counts]): n = ops.convert_to_tensor(n, name="n") counts = ops.convert_to_tensor(counts, name="counts") total_permutations = math_ops.lgamma(n + 1) counts_factorial = math_ops.lgamma(counts + 1) - redundant_permutations = math_ops.reduce_sum(counts_factorial, - reduction_indices=[-1]) + redundant_permutations = math_ops.reduce_sum(counts_factorial, axis=[-1]) return total_permutations - redundant_permutations @@ -242,7 +241,7 @@ def matrix_diag_transform(matrix, transform=None, name=None): matrix_values = tf.contrib.layers.fully_connected(activations, 4) matrix = tf.reshape(matrix_values, (batch_size, 2, 2)) - # Make the diagonal positive. 
If the upper triangle was zero, this would be a + # Make the diagonal positive. If the upper triangle was zero, this would be a # valid Cholesky factor. chol = matrix_diag_transform(matrix, transform=tf.nn.softplus) @@ -264,7 +263,7 @@ def matrix_diag_transform(matrix, transform=None, name=None): # This is a fully trainable multivariate normal! dist = tf.contrib.distributions.MVNCholesky(mu, chol) - # Standard log loss. Minimizing this will "train" mu and chol, and then dist + # Standard log loss. Minimizing this will "train" mu and chol, and then dist # will be a distribution predicting labels as multivariate Gaussians. loss = -1 * tf.reduce_mean(dist.log_prob(labels)) ``` @@ -272,9 +271,9 @@ def matrix_diag_transform(matrix, transform=None, name=None): Args: matrix: Rank `R` `Tensor`, `R >= 2`, where the last two dimensions are equal. - transform: Element-wise function mapping `Tensors` to `Tensors`. To - be applied to the diagonal of `matrix`. If `None`, `matrix` is returned - unchanged. Defaults to `None`. + transform: Element-wise function mapping `Tensors` to `Tensors`. To + be applied to the diagonal of `matrix`. If `None`, `matrix` is returned + unchanged. Defaults to `None`. name: A name to give created ops. Defaults to "matrix_diag_transform". @@ -308,7 +307,7 @@ def rotate_transpose(x, shift, name="rotate_transpose"): Example: ```python - x = ... # Tensor of shape [1, 2, 3, 4]. + x = ... # Tensor of shape [1, 2, 3, 4]. rotate_transpose(x, -1) # result shape: [2, 3, 4, 1] rotate_transpose(x, -2) # result shape: [3, 4, 1, 2] rotate_transpose(x, 1) # result shape: [4, 1, 2, 3] @@ -321,7 +320,7 @@ def rotate_transpose(x, shift, name="rotate_transpose"): x: `Tensor`. shift: `Tensor`. Number of dimensions to transpose left (shift<0) or transpose right (shift>0). - name: `String`. The name to give this op. + name: Python `str`. The name to give this op. Returns: rotated_x: Input `Tensor` with dimensions circularly rotated by shift. @@ -363,7 +362,7 @@ def rotate_transpose(x, shift, name="rotate_transpose"): ndims - math_ops.mod(shift, ndims)) first = math_ops.range(0, shift) last = math_ops.range(shift, ndims) - perm = array_ops.concat((last, first), 0) + perm = array_ops.concat([last, first], 0) return array_ops.transpose(x, perm=perm) @@ -383,7 +382,7 @@ def pick_vector(cond, cond: `Tensor`. Must have `dtype=tf.bool` and be scalar. true_vector: `Tensor` of one dimension. Returned when cond is `True`. false_vector: `Tensor` of one dimension. Returned when cond is `False`. - name: `String`. The name to give this op. + name: Python `str`. The name to give this op. Example: @@ -419,7 +418,7 @@ def pick_vector(cond, false_vector.name, false_vector.dtype)) n = array_ops.shape(true_vector)[0] return array_ops.slice( - array_ops.concat((true_vector, false_vector), 0), + array_ops.concat([true_vector, false_vector], 0), [array_ops.where(cond, 0, n)], [array_ops.where(cond, n, -1)]) @@ -438,13 +437,13 @@ def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"): b2, ..., bK, n, n]` where `n` is such that `d = n(n+1)/2`, i.e., `n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))`. - Although the non-batch complexity is O(n^2), large constants and sub-optimal + Although the non-batch complexity is O(n**2), large constants and sub-optimal vectorization means the complexity of this function is 5x slower than zeroing - out the upper triangular, i.e., `tf.matrix_band_part(X, -1, 0)`. This + out the upper triangular, i.e., `tf.matrix_band_part(X, -1, 0)`. 
This
   function becomes competitive only when several matmul/cholesky/etc ops can be
-  ellided in constructing the input.  Example: wiring a fully connected layer as
+  elided in constructing the input. Example: wiring a fully connected layer as
   a covariance matrix; this function reduces the final layer by 2x and possibly
-  reduces the network arch complexity considerably.  In most cases it is better
+  reduces the network arch complexity considerably. In most cases it is better
   to simply build a full matrix and zero out the upper triangular elements,
   e.g., `tril = tf.matrix_band_part(full, -1, 0)`, rather than directly
   construct a lower triangular.
@@ -463,10 +462,10 @@ def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
 
   Args:
     x: `Tensor` representing lower triangular elements.
-    validate_args: `Boolean`, default `False`.  Whether to ensure the shape of
-      `x` can be mapped to a lower triangular matrix (controls non-static checks
-      only).
-    name: `String`. The name to give this op.
+    validate_args: Python `bool`, default `False`. Whether to ensure the shape
+      of `x` can be mapped to a lower triangular matrix (controls non-static
+      checks only).
+    name: Python `str`. The name to give this op.
 
   Returns:
     tril: `Tensor` with lower triangular elements filled from `x`.
@@ -476,7 +475,7 @@ def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
     lower triangular matrix.
   """
   # TODO(jvdillon): Replace this code with dedicated op when it exists.
-  with ops.name_scope(name, values=(x,)):
+  with ops.name_scope(name, values=[x]):
     x = ops.convert_to_tensor(x, name="x")
     if (x.get_shape().ndims is not None and
         x.get_shape()[-1].value is not None):
@@ -509,7 +508,7 @@ def tril_ids(n):
       ids = np.arange(n**2, dtype=np.int32)
       rows = (ids / n).astype(np.int32)  # Implicit floor.
       # We need to stop incrementing the index when we encounter
-      # upper-triangular elements.  The idea here is to compute the
+      # upper-triangular elements. The idea here is to compute the
       # lower-right number of zeros then by "symmetry" subtract this from the
       # total number of zeros, n(n-1)/2.
       # Then we note that: n(n-1)/2 - (n-r)*(n-r-1)/2 = r(2n-r-1)/2
@@ -586,7 +585,7 @@ def softplus_inverse(x, name=None):
  #               = Log[1 - exp{-x}] + x                           (3)
  # (2) is the "obvious" inverse, but (3) is more stable than (2) for large x.
  # For small x (e.g. x = 1e-10), (3) will become -inf since 1 - exp{-x} will
-  # be zero.  To fix this, we use 1 - exp{-x} approx x for small x > 0.
+  # be zero. To fix this, we use 1 - exp{-x} approx x for small x > 0.
  #
  # In addition to the numerically stable derivation above, we clamp
  # small/large values to be congruent with the logic in:
@@ -598,7 +597,7 @@ def softplus_inverse(x, name=None):
  # gradient of `where` behaves like `pred*pred_true + (1-pred)*pred_false`
  # thus an `inf` in an unselected path results in `0*inf=nan`. We are careful
  # to overwrite `x` with ones only when we will never actually use this
-  # value.  Note that we use ones and not zeros since `log(expm1(0.)) = -inf`.
+  # value. Note that we use ones and not zeros since `log(expm1(0.)) = -inf`.
  threshold = np.log(np.finfo(x.dtype.as_numpy_dtype).eps) + 2.
is_too_small = math_ops.less(x, np.exp(threshold)) is_too_large = math_ops.greater(x, -threshold) diff --git a/tensorflow/contrib/distributions/python/ops/exponential.py b/tensorflow/contrib/distributions/python/ops/exponential.py index 6bff48c31787de..0d49721e7eeea6 100644 --- a/tensorflow/contrib/distributions/python/ops/exponential.py +++ b/tensorflow/contrib/distributions/python/ops/exponential.py @@ -78,15 +78,15 @@ def __init__(self, Args: rate: Floating point tensor, equivalent to `1 / mean`. Must contain only positive values. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() # Even though all statistics of are defined for valid inputs, this is not @@ -96,7 +96,7 @@ def __init__(self, with ops.name_scope(name, values=[rate]) as ns: self._rate = ops.convert_to_tensor(rate, name="rate") super(Exponential, self).__init__( - concentration=array_ops.ones((), dtype=self._rate.dtype), + concentration=array_ops.ones([], dtype=self._rate.dtype), rate=self._rate, allow_nan_stats=allow_nan_stats, validate_args=validate_args, @@ -116,13 +116,13 @@ def rate(self): return self._rate def _sample_n(self, n, seed=None): - shape = array_ops.concat(([n], array_ops.shape(self._rate)), 0) + shape = array_ops.concat([[n], array_ops.shape(self._rate)], 0) # Sample uniformly-at-random from the open-interval (0, 1). sampled = random_ops.random_uniform( shape, minval=np.nextafter(self.dtype.as_numpy_dtype(0.), self.dtype.as_numpy_dtype(1.)), - maxval=array_ops.ones((), dtype=self.dtype), + maxval=array_ops.ones([], dtype=self.dtype), seed=seed, dtype=self.dtype) return -math_ops.log(sampled) / self._rate diff --git a/tensorflow/contrib/distributions/python/ops/gamma.py b/tensorflow/contrib/distributions/python/ops/gamma.py index cec72f1ec665f0..a0c64b47aaf2b8 100644 --- a/tensorflow/contrib/distributions/python/ops/gamma.py +++ b/tensorflow/contrib/distributions/python/ops/gamma.py @@ -112,15 +112,15 @@ def __init__(self, distribution(s). Must contain only positive values. rate: Floating point tensor, the inverse scale params of the distribution(s). Must contain only positive values. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. 
When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: TypeError: if `concentration` and `rate` are different dtypes. @@ -231,7 +231,7 @@ def _stddev(self): @distribution_util.AppendDocstring( """The mode of a gamma distribution is `(shape - 1) / rate` when - `shape > 1`, and `NaN` otherwise. If `self.allow_nan_stats` is `False`, + `shape > 1`, and `NaN` otherwise. If `self.allow_nan_stats` is `False`, an exception will be raised rather than returning `NaN`.""") def _mode(self): mode = (self.concentration - 1.) / self.rate diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py index 8a445f87f13d0d..704a0021ba7f11 100644 --- a/tensorflow/contrib/distributions/python/ops/gumbel.py +++ b/tensorflow/contrib/distributions/python/ops/gumbel.py @@ -110,15 +110,15 @@ def __init__(self, loc: Floating point tensor, the means of the distribution(s). scale: Floating point tensor, the scales of the distribution(s). scale must contain only positive values. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to - indicate the result is undefined. When `False`, an exception is raised + indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: TypeError: if loc and scale are different dtypes. @@ -129,7 +129,7 @@ def __init__(self, validate_args else []): self._loc = array_ops.identity(loc, name="loc") self._scale = array_ops.identity(scale, name="scale") - contrib_tensor_util.assert_same_float_dtype((self._loc, self._scale)) + contrib_tensor_util.assert_same_float_dtype([self._loc, self._scale]) super(_Gumbel, self).__init__( dtype=self._scale.dtype, is_continuous=True, @@ -171,7 +171,7 @@ def _event_shape(self): return tensor_shape.scalar() def _sample_n(self, n, seed=None): - shape = array_ops.concat(([n], array_ops.shape(self.mean())), 0) + shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) np_dtype = self.dtype.as_numpy_dtype # Uniform variates must be sampled from the interval (0,1] rather than # [0,1], as they are passed through log() to compute Gumbel variates. diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py index 7b6700341ebb67..a74fb350d1706d 100644 --- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py +++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py @@ -111,15 +111,15 @@ def __init__(self, distribution(s). Must contain only positive values. rate: Floating point tensor, the inverse scale params of the distribution(s). Must contain only positive values. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. 
When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: @@ -223,7 +223,7 @@ def _entropy(self): @distribution_util.AppendDocstring( """The mean of an inverse gamma distribution is `rate / (concentration - 1)`, when `concentration > 1`, and `NaN` - otherwise. If `self.allow_nan_stats` is `False`, an exception will be + otherwise. If `self.allow_nan_stats` is `False`, an exception will be raised rather than returning `NaN`""") def _mean(self): mean = self.rate / (self.concentration - 1.) diff --git a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py b/tensorflow/contrib/distributions/python/ops/kullback_leibler.py index 47411817bfe165..f24f01235a2072 100644 --- a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py +++ b/tensorflow/contrib/distributions/python/ops/kullback_leibler.py @@ -66,7 +66,7 @@ def kl(dist_a, dist_b, allow_nan=False, name=None): dist_b: The second distribution. allow_nan: If `False` (default), a runtime error is raised if the KL returns NaN values for any batch entry of the given - distributions. If `True`, the KL may return a NaN for the given entry. + distributions. If `True`, the KL may return a NaN for the given entry. name: (optional) Name scope to use for created operations. Returns: @@ -80,7 +80,7 @@ def kl(dist_a, dist_b, allow_nan=False, name=None): if kl_fn is None: raise NotImplementedError( "No KL(dist_a || dist_b) registered for dist_a type %s and dist_b " - "type %s" % ((type(dist_a).__name__, type(dist_b).__name__))) + "type %s" % (type(dist_a).__name__, type(dist_b).__name__)) with ops.name_scope("KullbackLeibler"): kl_t = kl_fn(dist_a, dist_b, name=name) @@ -95,7 +95,7 @@ def kl(dist_a, dist_b, allow_nan=False, name=None): math_ops.logical_not( math_ops.reduce_any(math_ops.is_nan(kl_t))), ["KL calculation between %s and %s returned NaN values " - "(and was called with allow_nan=False). Values:" + "(and was called with allow_nan=False). Values:" % (dist_a.name, dist_b.name), kl_t])]): return array_ops.identity(kl_t, name="checked_kl") diff --git a/tensorflow/contrib/distributions/python/ops/laplace.py b/tensorflow/contrib/distributions/python/ops/laplace.py index c47b66620c723e..4951a41fd5509d 100644 --- a/tensorflow/contrib/distributions/python/ops/laplace.py +++ b/tensorflow/contrib/distributions/python/ops/laplace.py @@ -86,15 +86,15 @@ def __init__(self, of the distribution. scale: Positive floating point tensor which characterizes the spread of the distribution. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. 
When `True`, + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to - indicate the result is undefined. When `False`, an exception is raised + indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: TypeError: if `loc` and `scale` are of different dtype. @@ -105,7 +105,7 @@ def __init__(self, validate_args else []): self._loc = array_ops.identity(loc, name="loc") self._scale = array_ops.identity(scale, name="scale") - contrib_tensor_util.assert_same_float_dtype((self._loc, self._scale)) + contrib_tensor_util.assert_same_float_dtype([self._loc, self._scale]) super(Laplace, self).__init__( dtype=self._loc.dtype, is_continuous=True, @@ -147,7 +147,7 @@ def _event_shape(self): return tensor_shape.scalar() def _sample_n(self, n, seed=None): - shape = array_ops.concat(([n], self.batch_shape_tensor()), 0) + shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) # Sample uniformly-at-random from the open-interval (-1, 1). uniform_samples = random_ops.random_uniform( shape=shape, diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py index 2fb00dbc581732..c9aab3dde117f4 100644 --- a/tensorflow/contrib/distributions/python/ops/logistic.py +++ b/tensorflow/contrib/distributions/python/ops/logistic.py @@ -108,13 +108,13 @@ def __init__(self, loc: Floating point tensor, the means of the distribution(s). scale: Floating point tensor, the scales of the distribution(s). Must contain only positive values. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. name: The name to give Ops created by the initializer. 
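For reference, a minimal sketch of the list-valued `concat` convention these hunks adopt when assembling sample shapes (the tensors here are stand-ins, not code from this patch):

```python
import tensorflow as tf

n = 3
batch_shape = tf.constant([2, 4], dtype=tf.int32)  # stand-in for batch_shape_tensor()
# Build [n] + batch_shape as a single 1-D int32 Tensor; a Python list of
# values (rather than a tuple) is the style used throughout this change.
sample_shape = tf.concat([[n], batch_shape], 0)  # => [3, 2, 4]
```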
@@ -127,7 +127,7 @@ def __init__(self, validate_args else []): self._loc = array_ops.identity(loc, name="loc") self._scale = array_ops.identity(scale, name="scale") - contrib_tensor_util.assert_same_float_dtype((self._loc, self._scale)) + contrib_tensor_util.assert_same_float_dtype([self._loc, self._scale]) super(Logistic, self).__init__( dtype=self._scale.dtype, is_continuous=True, @@ -169,7 +169,7 @@ def _event_shape(self): return tensor_shape.scalar() def _sample_n(self, n, seed=None): - shape = array_ops.concat(([n], array_ops.shape(self.mean())), 0) + shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) np_dtype = self.dtype.as_numpy_dtype minval = np.nextafter(np_dtype(0), np_dtype(1)) uniform = random_ops.random_uniform(shape=shape, diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py index 2ba3d2546d4730..1c421c8fa09d98 100644 --- a/tensorflow/contrib/distributions/python/ops/mixture.py +++ b/tensorflow/contrib/distributions/python/ops/mixture.py @@ -66,13 +66,13 @@ def __init__(self, components: A list or tuple of `Distribution` instances. Each instance must have the same type, be defined on the same domain, and have matching `event_shape` and `batch_shape`. - validate_args: `Boolean`, default `False`. If `True`, raise a runtime + validate_args: Python `bool`, default `False`. If `True`, raise a runtime error if batch or event ranks are inconsistent between cat and any of - the distributions. This is only checked if the ranks cannot be + the distributions. This is only checked if the ranks cannot be determined statically at graph construction time. - allow_nan_stats: Boolean, default `True`. If `False`, raise an + allow_nan_stats: Boolean, default `True`. If `False`, raise an exception if a statistic (e.g. mean/mode/etc...) is undefined for any - batch member. If `True`, batch members with valid parameters leading to + batch member. If `True`, batch members with valid parameters leading to undefined statistics will return NaN for this statistic. name: A name for this distribution (optional). @@ -265,7 +265,7 @@ def _sample_n(self, n, seed=None): else: event_shape = self.event_shape_tensor() - # Get indices into the raw cat sampling tensor. We will + # Get indices into the raw cat sampling tensor. We will # need these to stitch sample values back out after sampling # within the component partitions. samples_raw_indices = array_ops.reshape( @@ -315,7 +315,7 @@ def _sample_n(self, n, seed=None): # For sample s, batch element b of component c, we get the # partitioned batch indices from # partitioned_batch_indices[c]; and shift each element by - # the sample index. The final lookup can be thought of as + # the sample index. The final lookup can be thought of as # a matrix gather along locations (s, b) in # samples_class_c where the n_class rows correspond to # samples within this component and the batch_size columns @@ -329,7 +329,7 @@ def _sample_n(self, n, seed=None): partitioned_batch_indices[c]) samples_class_c = array_ops.reshape( samples_class_c, - array_ops.concat(([n_class * batch_size], event_shape), 0)) + array_ops.concat([[n_class * batch_size], event_shape], 0)) samples_class_c = array_ops.gather( samples_class_c, lookup_partitioned_batch_indices, name="samples_class_c_gather") @@ -340,8 +340,8 @@ def _sample_n(self, n, seed=None): indices=partitioned_samples_indices, data=samples_class) # Reshape back to proper sample, batch, and event shape. 
ret = array_ops.reshape(lhs_flat_ret, - array_ops.concat((samples_shape, - self.event_shape_tensor()), 0)) + array_ops.concat([samples_shape, + self.event_shape_tensor()], 0)) ret.set_shape( tensor_shape.TensorShape(static_samples_shape).concatenate( self.event_shape)) @@ -361,7 +361,7 @@ def entropy_lower_bound(self, name="entropy_lower_bound"): \\) where \\( p \\) is the prior distribution, \\( q \\) is the variational, - and \\( H[q] \\) is the entropy of \\( q \\). If there is a lower bound + and \\( H[q] \\) is the entropy of \\( q \\). If there is a lower bound \\( G[q] \\) such that \\( H[q] \geq G[q] \\) then it can be used in place of \\( H[q] \\). diff --git a/tensorflow/contrib/distributions/python/ops/multinomial.py b/tensorflow/contrib/distributions/python/ops/multinomial.py index bcf2acb715288a..de8a96a6dcc60b 100644 --- a/tensorflow/contrib/distributions/python/ops/multinomial.py +++ b/tensorflow/contrib/distributions/python/ops/multinomial.py @@ -136,7 +136,7 @@ def __init__(self, Args: total_count: Non-negative floating point tensor with shape broadcastable to `[N1,..., Nm]` with `m >= 0`. Defines this as a batch of - `N1 x ... x Nm` different Multinomial distributions. Its components + `N1 x ... x Nm` different Multinomial distributions. Its components should be equal to integer values. logits: Floating point tensor representing the log-odds of a positive event with shape broadcastable to `[N1,..., Nm, k], m >= 0`, @@ -144,19 +144,19 @@ def __init__(self, `N1 x ... x Nm` different `k` class Multinomial distributions. Only one of `logits` or `probs` should be passed in. probs: Positive floating point tensor with shape broadcastable to - `[N1,..., Nm, k]` `m >= 0` and same dtype as `total_count`. Defines + `[N1,..., Nm, k]` `m >= 0` and same dtype as `total_count`. Defines this as a batch of `N1 x ... x Nm` different `k` class Multinomial distributions. `probs`'s components in the last portion of its shape should sum to `1`. Only one of `logits` or `probs` should be passed in. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. 
""" parameters = locals() with ops.name_scope(name, values=[total_count, logits, probs]) as ns: @@ -169,7 +169,7 @@ def __init__(self, multidimensional=True, validate_args=validate_args, name=name) - self._mean_val = self._total_count[..., None] * self._probs + self._mean_val = self._total_count[..., array_ops.newaxis] * self._probs super(Multinomial, self).__init__( dtype=self._probs.dtype, is_continuous=False, @@ -229,7 +229,7 @@ def _sample_n(self, n, seed=None): seed=seed) draws = array_ops.reshape(draws, shape=[-1, n, n_draws]) x = math_ops.reduce_sum(array_ops.one_hot(draws, depth=k), - reduction_indices=-2) # shape: [B, n, k] + axis=-2) # shape: [B, n, k] x = array_ops.transpose(x, perm=[1, 0, 2]) final_shape = array_ops.concat([[n], self.batch_shape_tensor(), [k]], 0) return array_ops.reshape(x, final_shape) @@ -254,13 +254,16 @@ def _mean(self): return array_ops.identity(self._mean_val) def _covariance(self): - p = self.probs * array_ops.ones_like(self.total_count)[..., None] + p = self.probs * array_ops.ones_like( + self.total_count)[..., array_ops.newaxis] return array_ops.matrix_set_diag( - -math_ops.matmul(self._mean_val[..., None], p[..., None, :]), + -math_ops.matmul(self._mean_val[..., array_ops.newaxis], + p[..., array_ops.newaxis, :]), # outer product self._variance()) def _variance(self): - p = self.probs * array_ops.ones_like(self.total_count)[..., None] + p = self.probs * array_ops.ones_like( + self.total_count)[..., array_ops.newaxis] return self._mean_val - self._mean_val * p def _maybe_assert_valid_total_count(self, total_count, validate_args): diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py index edc52517696945..d409f35fa52965 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py @@ -180,15 +180,15 @@ def __init__(self, `k x k` identity matrices added to `scale`. When both `scale_identity_multiplier` and `scale_diag` are `None` then `scale` is the `Identity`. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: ValueError: if at most `scale_identity_multiplier` is specified. diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py index 51487cf3a32fee..9806839106ad08 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py @@ -199,18 +199,18 @@ def __init__(self, scale_perturb_diag: Floating-point `Tensor` representing a diagonal matrix inside the rank-`r` perturbation added to `scale`. 
May have shape `[B1, ..., Bb, r]`, `b >= 0`, and characterizes `b`-batches of `r x r` - diagonal matrices inside the perturbation added to `scale`. When + diagonal matrices inside the perturbation added to `scale`. When `None`, an identity matrix is used inside the perturbation. Can only be specified if `scale_perturb_factor` is also specified. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: ValueError: if at most `scale_identity_multiplier` is specified. diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py index f6f26a0b1d22fe..d16d4aa2fbae89 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py @@ -160,10 +160,10 @@ def __init__(self, `b >= 0` and `k` is the event size. scale: Instance of `LinearOperator` with same `dtype` as `loc` and shape `[B1, ..., Bb, k, k]`. - validate_args: `Boolean`, default `False`. Whether to validate input + validate_args: Python `bool`, default `False`. Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: `Boolean`, default `True`. If `False`, raise an + allow_nan_stats: Python `bool`, default `True`. If `False`, raise an exception if a statistic (e.g. mean/mode/etc...) is undefined for any batch member If `True`, batch members with valid parameters leading to undefined statistics will return NaN for this statistic. @@ -324,7 +324,7 @@ def _kl_brute_force(a, b, name=None): This `Op` computes the trace by solving `C_b^{-1} C_a`. Although efficient methods for solving systems with `C_b` may be available, a dense version of - (the square root of) `C_a` is used, so performance is `O(B s k^2)` where `B` + (the square root of) `C_a` is used, so performance is `O(B s k**2)` where `B` is the batch size, and `s` is the cost of solving `C_b x = y` for vectors `x` and `y`. @@ -362,7 +362,7 @@ def is_diagonal(x): # tr[inv(Cb) Ca] = tr[inv(B)' inv(B) A A'] # = tr[inv(B) A A' inv(B)'] # = tr[(inv(B) A) (inv(B) A)'] - # = sum_{ij} (inv(B) A)_{ij}^2 + # = sum_{ij} (inv(B) A)_{ij}**2 # = ||inv(B) A||_F**2 # where ||.||_F is the Frobenius norm and the second equality follows from # the cyclic permutation property. diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py index 8fdc0822c49def..f0ba05b0a8b3d2 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py @@ -161,15 +161,15 @@ def __init__(self, scale_tril: Floating-point, lower-triangular `Tensor` with non-zero diagonal elements. 
`scale_tril` has shape `[B1, ..., Bb, k, k]` where `b >= 0` and `k` is the event size. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: ValueError: if neither `loc` nor `scale_tril` are specified. diff --git a/tensorflow/contrib/distributions/python/ops/normal.py b/tensorflow/contrib/distributions/python/ops/normal.py index 52634d2ff4c9dd..770a81cf1af04a 100644 --- a/tensorflow/contrib/distributions/python/ops/normal.py +++ b/tensorflow/contrib/distributions/python/ops/normal.py @@ -117,15 +117,15 @@ def __init__(self, loc: Floating point tensor; the means of the distribution(s). scale: Floating point tensor; the stddevs of the distribution(s). Must contain only positive values. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to - indicate the result is undefined. When `False`, an exception is raised + indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: TypeError: if `loc` and `scale` have different `dtype`. 
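For reference, a minimal sketch of the location-scale sampling that `_sample_n` performs below, assuming the `Normal` exported by `tf.contrib.distributions` (values are illustrative):

```python
import tensorflow as tf

ds = tf.contrib.distributions

dist = ds.Normal(loc=[0., 10.], scale=[1., 2.])  # batch_shape == [2]
x = dist.sample(4)  # shape [4, 2]; standard normal draws times scale plus loc
```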
@@ -136,7 +136,7 @@ def __init__(self, validate_args else []): self._loc = array_ops.identity(loc, name="loc") self._scale = array_ops.identity(scale, name="scale") - contrib_tensor_util.assert_same_float_dtype((self._loc, self._scale)) + contrib_tensor_util.assert_same_float_dtype([self._loc, self._scale]) super(Normal, self).__init__( dtype=self._scale.dtype, is_continuous=True, @@ -180,7 +180,7 @@ def _event_shape(self): return tensor_shape.scalar() def _sample_n(self, n, seed=None): - shape = array_ops.concat(([n], array_ops.shape(self.mean())), 0) + shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) sampled = random_ops.random_normal( shape=shape, mean=0., stddev=1., dtype=self.loc.dtype, seed=seed) return sampled * self.scale + self.loc diff --git a/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py b/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py index bb4970ae908a9e..10e934326a1f09 100644 --- a/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py +++ b/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py @@ -28,7 +28,7 @@ def normal_conjugates_known_scale_posterior(prior, scale, s, n): This model assumes that `n` observations (with sum `s`) come from a Normal with unknown mean `loc` (described by the Normal `prior`) - and known variance `scale^2`. The "known scale posterior" is + and known variance `scale**2`. The "known scale posterior" is the distribution of the unknown `loc`. Accepts a prior Normal distribution object, having parameters @@ -38,12 +38,12 @@ def normal_conjugates_known_scale_posterior(prior, scale, s, n): `n` (the number(s) of observations). Returns a posterior (also Normal) distribution object, with parameters - `(loc', scale'^2)`, where: + `(loc', scale'**2)`, where: ``` - mu ~ N(mu', sigma'^2) - sigma'^2 = 1/(1/sigma0^2 + n/sigma^2), - mu' = (mu0/sigma0^2 + s/sigma^2) * sigma'^2. + mu ~ N(mu', sigma'**2) + sigma'**2 = 1/(1/sigma0**2 + n/sigma**2), + mu' = (mu0/sigma0**2 + s/sigma**2) * sigma'**2. ``` Distribution parameters from `prior`, as well as `scale`, `s`, and `n`. @@ -54,8 +54,8 @@ def normal_conjugates_known_scale_posterior(prior, scale, s, n): the prior distribution having parameters `(loc0, scale0)`. scale: tensor of type `dtype`, taking values `scale > 0`. The known stddev parameter(s). - s: Tensor of type `dtype`. The sum(s) of observations. - n: Tensor of type `int`. The number(s) of observations. + s: Tensor of type `dtype`. The sum(s) of observations. + n: Tensor of type `int`. The number(s) of observations. Returns: A new Normal posterior distribution object for the unknown observation @@ -87,7 +87,7 @@ def normal_conjugates_known_scale_predictive(prior, scale, s, n): This model assumes that `n` observations (with sum `s`) come from a Normal with unknown mean `loc` (described by the Normal `prior`) - and known variance `scale^2`. The "known scale predictive" + and known variance `scale**2`. The "known scale predictive" is the distribution of new observations, conditioned on the existing observations and our prior. @@ -97,20 +97,20 @@ def normal_conjugates_known_scale_predictive(prior, scale, s, n): and statistical estimates `s` (the sum(s) of the observations) and `n` (the number(s) of observations). 
- Calculates the Normal distribution(s) `p(x | sigma^2)`: + Calculates the Normal distribution(s) `p(x | sigma**2)`: ``` - p(x | sigma^2) = int N(x | mu, sigma^2) N(mu | prior.loc, prior.scale**2) dmu - = N(x | prior.loc, 1/(sigma^2 + prior.scale**2)) + p(x | sigma**2) = int N(x | mu, sigma**2)N(mu | prior.loc, prior.scale**2) dmu + = N(x | prior.loc, 1 / (sigma**2 + prior.scale**2)) ``` Returns the predictive posterior distribution object, with parameters - `(loc', scale'^2)`, where: + `(loc', scale'**2)`, where: ``` - sigma_n^2 = 1/(1/sigma0^2 + n/sigma^2), - mu' = (mu0/sigma0^2 + s/sigma^2) * sigma_n^2. - sigma'^2 = sigma_n^2 + sigma^2, + sigma_n**2 = 1/(1/sigma0**2 + n/sigma**2), + mu' = (mu0/sigma0**2 + s/sigma**2) * sigma_n**2. + sigma'**2 = sigma_n**2 + sigma**2, ``` Distribution parameters from `prior`, as well as `scale`, `s`, and `n`. @@ -121,8 +121,8 @@ def normal_conjugates_known_scale_predictive(prior, scale, s, n): the prior distribution having parameters `(loc0, scale0)`. scale: tensor of type `dtype`, taking values `scale > 0`. The known stddev parameter(s). - s: Tensor of type `dtype`. The sum(s) of observations. - n: Tensor of type `int`. The number(s) of observations. + s: Tensor of type `dtype`. The sum(s) of observations. + n: Tensor of type `int`. The number(s) of observations. Returns: A new Normal predictive distribution object. diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py index 22f0d6d35f7924..7ebc48f004665c 100644 --- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py +++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py @@ -43,11 +43,11 @@ class OneHotCategorical(distribution.Distribution): K is the number of classes. This class provides methods to create indexed batches of OneHotCategorical - distributions. If the provided `logits` or `probs` is rank 2 or higher, for + distributions. If the provided `logits` or `probs` is rank 2 or higher, for every fixed set of leading dimensions, the last dimension represents one - single OneHotCategorical distribution. When calling distribution + single OneHotCategorical distribution. When calling distribution functions (e.g. `dist.prob(x)`), `logits` and `x` are broadcast to the - same shape (if possible). In all cases, the last dimension of `logits,x` + same shape (if possible). In all cases, the last dimension of `logits,x` represents single OneHotCategorical distributions. #### Examples @@ -105,15 +105,15 @@ def __init__( vector of probabilities for each class. Only one of `logits` or `probs` should be passed in. dtype: The type of the event samples (default: int32). - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. 
+ name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() with ops.name_scope(name, values=[logits, probs]) as ns: @@ -173,7 +173,7 @@ def _event_shape(self): return self.logits.get_shape().with_rank_at_least(1)[-1:] def _sample_n(self, n, seed=None): - sample_shape = array_ops.concat(([n], array_ops.shape(self.logits)), 0) + sample_shape = array_ops.concat([[n], array_ops.shape(self.logits)], 0) logits = self.logits if logits.get_shape().ndims == 2: logits_2d = logits @@ -231,8 +231,8 @@ def _assert_valid_sample(self, x): return control_flow_ops.with_dependencies([ check_ops.assert_non_positive(x), distribution_util.assert_close( - array_ops.zeros((), dtype=self.dtype), - math_ops.reduce_logsumexp(x, reduction_indices=[-1])), + array_ops.zeros([], dtype=self.dtype), + math_ops.reduce_logsumexp(x, axis=[-1])), ], x) diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py index e1ddc9a0e18b9d..64904824c1df77 100644 --- a/tensorflow/contrib/distributions/python/ops/poisson.py +++ b/tensorflow/contrib/distributions/python/ops/poisson.py @@ -70,15 +70,15 @@ def __init__(self, Args: rate: Floating point tensor, the rate parameter of the distribution(s). `rate` must be positive. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() with ops.name_scope(name, values=[rate]) as ns: diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py index 7466b1c22091ad..1ee77c05e4b4a2 100644 --- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py +++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py @@ -38,8 +38,9 @@ def _logsum_expbig_minus_expsmall(big, small): To work correctly, we should have the pointwise relation: `small <= big`. Args: - big: Numeric `Tensor` - small: Numeric `Tensor` with same `dtype` as `big` and broadcastable shape. + big: Floating-point `Tensor` + small: Floating-point `Tensor` with same `dtype` as `big` and broadcastable + shape. Returns: `Tensor` of same `dtype` of `big` and broadcast shape. @@ -61,14 +62,14 @@ def _logsum_expbig_minus_expsmall(big, small): """ _prob_note = _prob_base_note + """ -The base distribution's `cdf` method must be defined on `y - 1`. If the +The base distribution's `cdf` method must be defined on `y - 1`. If the base distribution has a `survival_function` method, results will be more accurate for large values of `y`, and in this case the `survival_function` must also be defined on `y - 1`. """ _log_prob_note = _prob_base_note + """ -The base distribution's `log_cdf` method must be defined on `y - 1`. 
If the +The base distribution's `log_cdf` method must be defined on `y - 1`. If the base distribution has a `log_survival_function` method results will be more accurate for large values of `y`, and in this case the `log_survival_function` must also be defined on `y - 1`. @@ -194,19 +195,19 @@ def __init__(self, distribution: The base distribution class to transform. Typically an instance of `Distribution`. low: `Tensor` with same `dtype` as this distribution and shape - able to be added to samples. Should be a whole number. Default `None`. + able to be added to samples. Should be a whole number. Default `None`. If provided, base distribution's `prob` should be defined at `low`. high: `Tensor` with same `dtype` as this distribution and shape - able to be added to samples. Should be a whole number. Default `None`. + able to be added to samples. Should be a whole number. Default `None`. If provided, base distribution's `prob` should be defined at `high - 1`. `high` must be strictly greater than `low`. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: TypeError: If `dist_cls` is not a subclass of diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py index b257b64c7e908a..55e87fda152133 100644 --- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py +++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py @@ -151,15 +151,15 @@ def __init__(self, probs: An N-D `Tensor` representing the probability of a positive event. Each entry in the `Tensor` parameterizes an independent Bernoulli distribution. Only one of `logits` or `probs` should be passed in. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: ValueError: If both `probs` and `logits` are passed, or if neither. diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py index bd22b4d25ac5ac..0e52743e1ce9cd 100644 --- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py +++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py @@ -151,15 +151,15 @@ def __init__( the last dimension represents a vector of probabilities for each class. Only one of `logits` or `probs` should be passed in. 
dtype: The type of the event samples (default: int32). - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. """ parameters = locals() with ops.name_scope(name, values=[logits, probs, temperature]) as ns: @@ -230,7 +230,7 @@ def _event_shape(self): return self.logits.get_shape().with_rank_at_least(1)[-1:] def _sample_n(self, n, seed=None): - sample_shape = array_ops.concat(([n], array_ops.shape(self.logits)), 0) + sample_shape = array_ops.concat([[n], array_ops.shape(self.logits)], 0) logits = self.logits * array_ops.ones(sample_shape) logits_2d = array_ops.reshape(logits, [-1, self.event_size]) np_dtype = self.dtype.as_numpy_dtype @@ -238,8 +238,8 @@ def _sample_n(self, n, seed=None): # Uniform variates must be sampled from the interval (0,1] rather than # [0,1], as they are passed through log() to compute Gumbel variates. # We need to use np.finfo(np_dtype).tiny because it is the smallest, - # positive, "normal" number. A "normal" number is such that the mantissa - # has an implicit leading 1. Normal, positive numbers x, y have the + # positive, "normal" number. A "normal" number is such that the mantissa + # has an implicit leading 1. Normal, positive numbers x, y have the # reasonable property that: x + y >= max(x, y). # minval=np.nextafter(np.float32(0),1)) can cause # tf.random_uniform(dtype=tf.float32) to sample 0. @@ -290,8 +290,8 @@ def _assert_valid_sample(self, x): return control_flow_ops.with_dependencies([ check_ops.assert_non_positive(x), distribution_util.assert_close( - array_ops.zeros((), dtype=self.dtype), - math_ops.reduce_logsumexp(x, reduction_indices=[-1])), + array_ops.zeros([], dtype=self.dtype), + math_ops.reduce_logsumexp(x, axis=[-1])), ], x) @@ -394,9 +394,9 @@ def __init__( of `logits` or `probs` should be passed in. dtype: The type of the event samples (default: int32). validate_args: Unused in this distribution. - allow_nan_stats: `Boolean`, default `True`. If `False`, raise an + allow_nan_stats: Python `bool`, default `True`. If `False`, raise an exception if a statistic (e.g. mean/mode/etc...) is undefined for any - batch member. If `True`, batch members with valid parameters leading to + batch member. If `True`, batch members with valid parameters leading to undefined statistics will return NaN for this statistic. name: A name for this distribution (optional). """ diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py index 6e47df34c49aa4..90acd14a1f0045 100644 --- a/tensorflow/contrib/distributions/python/ops/shape.py +++ b/tensorflow/contrib/distributions/python/ops/shape.py @@ -39,7 +39,7 @@ class _DistributionShape(object): - `dims`: indexes into `shape`; useful for transpose, reduce. 
`Tensor`s sampled from a `Distribution` can be partitioned by `sample_dims`, - `batch_dims`, and `event_dims`. To understand the semantics of these + `batch_dims`, and `event_dims`. To understand the semantics of these dimensions, consider when two of the three are fixed and the remaining is varied: - `sample_dims`: indexes independent draws from identical @@ -78,7 +78,7 @@ class _DistributionShape(object): ```python sample_dims = [0] tf.reduce_mean(Normal(loc=1.3, scale=1.).sample_n(1000), - reduction_indices=sample_dims) # ~= 1.3 + axis=sample_dims) # ~= 1.3 ``` - Batch dimensions: @@ -93,13 +93,13 @@ class _DistributionShape(object): ~= 1/n sum_{i=1}^n P(X=x|y_i), y_i ~iid Laplace(0,1) = tf.reduce_mean(Normal(loc=Laplace(0., 1.).sample_n(n=1000), scale=tf.ones(1000)).prob(x), - reduction_indices=batch_dims) + axis=batch_dims) ``` The `Laplace` distribution generates a `Tensor` of shape `[1000]`. When fed to a `Normal`, this is interpreted as 1000 different locations, i.e., - 1000 non-identical Normals. Therefore a single call to `prob(x)` yields - 1000 probabilities, one for every location. The average over this batch + 1000 non-identical Normals. Therefore a single call to `prob(x)` yields + 1000 probabilities, one for every location. The average over this batch yields the marginal. - Event dimensions: @@ -139,8 +139,8 @@ class _DistributionShape(object): # E = [2, 2] # 100 iid samples from two, non-identical trivariate Normal distributions. - mu = ... # shape(2, 3) - sigma = ... # shape(2, 3, 3) + mu = ... # shape(2, 3) + sigma = ... # shape(2, 3, 3) X = MultivariateNormal(mu, sigma).sample(shape=[4, 25]) # S = [4, 25] # B = [2] @@ -154,7 +154,7 @@ class _DistributionShape(object): For example, when `validate_args=False` and `event_ndims` is a non-constant `Tensor`, it is checked to be a non-negative integer at graph - execution. (Same for `batch_ndims`). Constant `Tensor`s and non-`Tensor` + execution. (Same for `batch_ndims`). Constant `Tensor`s and non-`Tensor` arguments are always checked for correctness since this can be done for "free," i.e., during graph construction. """ @@ -167,7 +167,7 @@ def __init__(self, """Construct `DistributionShape` with fixed `batch_ndims`, `event_ndims`. `batch_ndims` and `event_ndims` are fixed throughout the lifetime of a - `Distribution`. They may only be known at graph execution. + `Distribution`. They may only be known at graph execution. If both `batch_ndims` and `event_ndims` are python scalars (rather than either being a `Tensor`), functions in this class automatically perform @@ -175,16 +175,16 @@ def __init__(self, Args: batch_ndims: `Tensor`. Number of `dims` (`rank`) of the batch portion of - indexes of a `Tensor`. A "batch" is a non-identical distribution, i.e, + indexes of a `Tensor`. A "batch" is a non-identical distribution, i.e, Normal with different parameters. event_ndims: `Tensor`. Number of `dims` (`rank`) of the event portion of indexes of a `Tensor`. An "event" is what is sampled from a distribution, i.e., a trivariate Normal has an event shape of [3] and a 4 dimensional Wishart has an event shape of [4, 4]. - validate_args: `Boolean`, default `False`. When `True`, non-`tf.constant` - `Tensor` arguments are checked for correctness. (`tf.constant` - arguments are always checked.) - name: `String`. The name prepended to Ops created by this class. + validate_args: Python `bool`, default `False`. When `True`, + non-`tf.constant` `Tensor` arguments are checked for correctness. + (`tf.constant` arguments are always checked.) 
+ name: Python `str`. The name prepended to Ops created by this class. Raises: ValueError: if either `batch_ndims` or `event_ndims` are: `None`, @@ -234,7 +234,7 @@ def get_ndims(self, x, name="get_ndims"): Args: x: `Tensor`. - name: `String`. The name to give this op. + name: Python `str`. The name to give this op. Returns: ndims: Scalar number of dimensions associated with a `Tensor`. @@ -251,7 +251,7 @@ def get_sample_ndims(self, x, name="get_sample_ndims"): Args: x: `Tensor`. - name: `String`. The name to give this op. + name: Python `str`. The name to give this op. Returns: sample_ndims: `Tensor` (0D, `int32`). @@ -285,7 +285,7 @@ def get_dims(self, x, name="get_dims"): Example: ```python - x = ... # Tensor with shape [4, 3, 2, 1] + x = ... # Tensor with shape [4, 3, 2, 1] sample_dims, batch_dims, event_dims = _DistributionShape( batch_ndims=2, event_ndims=1).get_dims(x) # sample_dims == [0] @@ -296,7 +296,7 @@ def get_dims(self, x, name="get_dims"): Args: x: `Tensor`. - name: `String`. The name to give this op. + name: Python `str`. The name to give this op. Returns: sample_dims: `Tensor` (1D, `int32`). @@ -306,8 +306,8 @@ def get_dims(self, x, name="get_dims"): with self._name_scope(name, values=[x]): def make_dims(start_sum, size, name): """Closure to make dims range.""" - start_sum = start_sum if start_sum else ( - array_ops.zeros((), dtype=dtypes.int32, name="zero"),) + start_sum = start_sum if start_sum else [ + array_ops.zeros([], dtype=dtypes.int32, name="zero")] if self._is_all_constant_helper(size, *start_sum): start = sum(tensor_util.constant_value(s) for s in start_sum) stop = start + tensor_util.constant_value(size) @@ -317,9 +317,9 @@ def make_dims(start_sum, size, name): start = sum(start_sum) return math_ops.range(start, start + size) sample_ndims = self.get_sample_ndims(x, name=name) - return (make_dims((), sample_ndims, name="sample_dims"), - make_dims((sample_ndims,), self.batch_ndims, name="batch_dims"), - make_dims((sample_ndims, self.batch_ndims), + return (make_dims([], sample_ndims, name="sample_dims"), + make_dims([sample_ndims], self.batch_ndims, name="batch_dims"), + make_dims([sample_ndims, self.batch_ndims], self.event_ndims, name="event_dims")) def get_shape(self, x, name="get_shape"): @@ -327,7 +327,7 @@ def get_shape(self, x, name="get_shape"): Args: x: `Tensor`. - name: `String`. The name to give this op. + name: Python `str`. The name to give this op. Returns: sample_shape: `Tensor` (1D, `int32`). @@ -338,8 +338,8 @@ def get_shape(self, x, name="get_shape"): x = ops.convert_to_tensor(x, name="x") def slice_shape(start_sum, size, name): """Closure to slice out shape.""" - start_sum = start_sum if start_sum else ( - array_ops.zeros((), dtype=dtypes.int32, name="zero"),) + start_sum = start_sum if start_sum else [ + array_ops.zeros([], dtype=dtypes.int32, name="zero")] if (x.get_shape().ndims is not None and self._is_all_constant_helper(size, *start_sum)): start = sum(tensor_util.constant_value(s) for s in start_sum) @@ -347,14 +347,13 @@ def slice_shape(start_sum, size, name): slice_ = x.get_shape()[start:stop].as_list() if all(s is not None for s in slice_): return ops.convert_to_tensor(slice_, dtype=dtypes.int32, name=name) - # Fall-through intended. 
- return array_ops.slice(array_ops.shape(x), (sum(start_sum),), (size,)) + return array_ops.slice(array_ops.shape(x), [sum(start_sum)], [size]) sample_ndims = self.get_sample_ndims(x, name=name) - return (slice_shape((), sample_ndims, + return (slice_shape([], sample_ndims, name="sample_shape"), - slice_shape((sample_ndims,), self.batch_ndims, + slice_shape([sample_ndims], self.batch_ndims, name="batch_shape"), - slice_shape((sample_ndims, self.batch_ndims), self.event_ndims, + slice_shape([sample_ndims, self.batch_ndims], self.event_ndims, name="event_shape")) # TODO(jvdillon): Make remove expand_batch_dim and make expand_batch_dim=False @@ -371,9 +370,9 @@ def make_batch_of_event_sample_matrices( Args: x: `Tensor`. - expand_batch_dim: Python `Boolean` scalar. If `True` the batch dims will - be expanded such that batch_ndims>=1. - name: `String`. The name to give this op. + expand_batch_dim: Python `bool`. If `True` the batch dims will be expanded + such that `batch_ndims >= 1`. + name: Python `str`. The name to give this op. Returns: x: `Tensor`. Input transposed/reshaped to `B_+E_+S_`. @@ -412,9 +411,9 @@ def undo_make_batch_of_event_sample_matrices( Args: x: `Tensor` of shape `B_+E_+S_`. sample_shape: `Tensor` (1D, `int32`). - expand_batch_dim: Python `Boolean` scalar. If `True` the batch dims will - be expanded such that batch_ndims>=1. - name: `String`. The name to give this op. + expand_batch_dim: Python `bool`. If `True` the batch dims will be expanded + such that `batch_ndims>=1`. + name: Python `str`. The name to give this op. Returns: x: `Tensor`. Input transposed/reshaped to `S+B+E`. diff --git a/tensorflow/contrib/distributions/python/ops/special_math.py b/tensorflow/contrib/distributions/python/ops/special_math.py index bc0a8c2b273bf8..e5e5e1963e063f 100644 --- a/tensorflow/contrib/distributions/python/ops/special_math.py +++ b/tensorflow/contrib/distributions/python/ops/special_math.py @@ -45,7 +45,7 @@ # Upper bound values were chosen by examining for which values of 'x' # Log[cdf(x)] is 0, after which point we need to use the approximation -# Log[cdf(x)] = Log[1 - cdf(-x)] approx -cdf(-x). We chose a value slightly +# Log[cdf(x)] = Log[1 - cdf(-x)] approx -cdf(-x). We chose a value slightly # conservative, meaning we use the approximation earlier than needed. LOGNDTR_FLOAT64_UPPER = 8 LOGNDTR_FLOAT32_UPPER = 5 @@ -59,7 +59,7 @@ def ndtr(x, name="ndtr"): ``` 1 / x - ndtr(x) = ---------- | exp(-0.5 t^2) dt + ndtr(x) = ---------- | exp(-0.5 t**2) dt sqrt(2 pi) /-inf = 0.5 (1 + erf(x / sqrt(2))) @@ -106,7 +106,7 @@ def log_ndtr(x, series_order=3, name="log_ndtr"): For details of the Normal distribution function see `ndtr`. This function calculates `(log o ndtr)(x)` by either calling `log(ndtr(x))` or - using an asymptotic series. Specifically: + using an asymptotic series. Specifically: - For `x > upper_segment`, use the approximation `-ndtr(-x)` based on `log(1-x) ~= -x, x << 1`. - For `lower_segment < x <= upper_segment`, use the existing `ndtr` technique @@ -127,19 +127,19 @@ def log_ndtr(x, series_order=3, name="log_ndtr"): ``` ndtr(x) = scale * (1 + sum) + R_N - scale = exp(-0.5 x^2) / (-x sqrt(2 pi)) - sum = Sum{(-1)^n (2n-1)!! / (x^2)^n, n=1:N} - R_N = O(exp(-0.5 x^2) (2N+1)!! / |x|^{2N+3}) + scale = exp(-0.5 x**2) / (-x sqrt(2 pi)) + sum = Sum{(-1)^n (2n-1)!! / (x**2)^n, n=1:N} + R_N = O(exp(-0.5 x**2) (2N+1)!! / |x|^{2N+3}) ``` - where `(2n-1)!! = (2n-1) (2n-3) (2n-5) ... (3) (1)` is a + where `(2n-1)!! = (2n-1) (2n-3) (2n-5) ... 
(3) (1)` is a
   [double-factorial](https://en.wikipedia.org/wiki/Double_factorial).


   Args:
     x: `Tensor` of type `float32`, `float64`.
     series_order: Positive Python `integer`. Maximum depth to
-      evaluate the asymptotic expansion. This is the `N` above.
+      evaluate the asymptotic expansion. This is the `N` above.
     name: Python string. A name for the operation (default="log_ndtr").

   Returns:
@@ -176,7 +176,7 @@ def log_ndtr(x, series_order=3, name="log_ndtr"):
   # which extends the range of validity of this function.
   # * We use one fixed series_order for all of 'x', rather than adaptive.
   # * Our docstring properly reflects that this is an asymptotic series, not a
-  #   Tayor series. We also provided a correct bound on the remainder.
+  #   Taylor series. We also provided a correct bound on the remainder.
   # * We need to use the max/min in the _log_ndtr_lower arg to avoid nan when
   #   x=0. This happens even though the branch is unchosen because when x=0
   #   the gradient of a select involves the calculation 1*dy+0*(-inf)=nan
@@ -262,7 +262,7 @@ def log_cdf_laplace(x, name="log_cdf_laplace"):
   # exp{-x} --> inf, for x << -1
   safe_exp_neg_x = math_ops.exp(-math_ops.abs(x))

-  # log1p(z) = log(1 + z) approx z for |z| << 1. This approxmation is used
+  # log1p(z) = log(1 + z) approx z for |z| << 1. This approximation is used
   # internally by log1p, rather than being done explicitly here.
   upper_solution = math_ops.log1p(-0.5 * safe_exp_neg_x)

diff --git a/tensorflow/contrib/distributions/python/ops/student_t.py b/tensorflow/contrib/distributions/python/ops/student_t.py
index 2d097ff95fff80..24db27a2f5233e 100644
--- a/tensorflow/contrib/distributions/python/ops/student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/student_t.py
@@ -74,7 +74,7 @@ class StudentT(distribution.Distribution):
   ```

   Notice that `scale` has semantics more similar to standard deviation than
-  variance. However it is not actually the std. deviation; the Student's
+  variance. However, it is not actually the std. deviation; the Student's
   t-distribution std. dev. is `scale sqrt(df / (df - 2))` when `df > 2`.

   #### Examples
@@ -134,22 +134,22 @@ def __init__(self,
     supports broadcasting (e.g. `df + loc + scale` is a valid operation).

     Args:
-      df: Numeric `Tensor`. The degrees of freedom of the distribution(s).
-        `df` must contain only positive values.
-      loc: Numeric `Tensor`. The mean(s) of the distribution(s).
-      scale: Numeric `Tensor`. The scaling factor(s) for the distribution(s).
-        Note that `scale` is not technically the standard deviation of this
-        distribution but has semantics more similar to standard deviation than
-        variance.
-      validate_args: Python `Boolean`, default `False`. When `True` distribution
+      df: Floating-point `Tensor`. The degrees of freedom of the
+        distribution(s). `df` must contain only positive values.
+      loc: Floating-point `Tensor`. The mean(s) of the distribution(s).
+      scale: Floating-point `Tensor`. The scaling factor(s) for the
+        distribution(s). Note that `scale` is not technically the standard
+        deviation of this distribution but has semantics more similar to
+        standard deviation than variance.
+      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
-      allow_nan_stats: Python `Boolean`, default `True`. When `True`,
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
-        indicate the result is undefined. When `False`, an exception is raised
+        indicate the result is undefined. When `False`, an exception is raised
        if one or more of the statistic's batch members are undefined.
-      name: `String` name prefixed to Ops created by this class.
+      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      TypeError: if loc and scale are different dtypes.
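
The `scale`-vs-standard-deviation caveat above is easy to check numerically. A minimal sketch (not part of the patch), assuming the `tf.contrib.distributions.StudentT` API shown in this diff; the parameter values are hypothetical:

```python
import tensorflow as tf

ds = tf.contrib.distributions

df, scale = 5., 2.
dist = ds.StudentT(df=df, loc=0., scale=scale)

# For df > 2, the std. dev. is scale * sqrt(df / (df - 2)), not scale itself.
with tf.Session() as sess:
  print(sess.run(dist.stddev()))                    # ~= 2.582
  print(sess.run(scale * tf.sqrt(df / (df - 2.))))  # same value
```
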
@@ -257,8 +257,9 @@ def _cdf(self, x):
     return array_ops.where(math_ops.less(y, 0.), neg_cdf, 1. - neg_cdf)

   def _entropy(self):
-    v = array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype)[..., None]
-    u = v * self.df[..., None]
+    v = array_ops.ones(self.batch_shape_tensor(),
+                       dtype=self.dtype)[..., array_ops.newaxis]
+    u = v * self.df[..., array_ops.newaxis]
     beta_arg = array_ops.concat([u, v], -1) / 2.
     return (math_ops.log(math_ops.abs(self.scale)) +
             0.5 * math_ops.log(self.df) +
@@ -269,7 +270,7 @@ def _entropy(self):

   @distribution_util.AppendDocstring(
       """The mean of Student's T equals `loc` if `df > 1`, otherwise it is
-      `NaN`. If `self.allow_nan_stats=True`, then an exception will be raised
+      `NaN`. If `self.allow_nan_stats=False`, then an exception will be raised
       rather than returning `NaN`.""")
   def _mean(self):
     mean = self.loc * array_ops.ones(self.batch_shape_tensor(),
@@ -286,7 +287,7 @@ def _mean(self):
       return control_flow_ops.with_dependencies(
           [
               check_ops.assert_less(
-                  array_ops.ones((), dtype=self.dtype),
+                  array_ops.ones([], dtype=self.dtype),
                   self.df,
                   message="mean not defined for components of df <= 1"),
           ],
@@ -329,7 +330,7 @@ def _variance(self):
       return control_flow_ops.with_dependencies(
           [
               check_ops.assert_less(
-                  array_ops.ones((), dtype=self.dtype),
+                  array_ops.ones([], dtype=self.dtype),
                   self.df,
                   message="variance not defined for components of df <= 1"),
           ],
diff --git a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/transformed_distribution.py
index 058eaa0ade0cfc..067e96a18cf242 100644
--- a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/transformed_distribution.py
@@ -139,7 +139,7 @@ class TransformedDistribution(distributions.Distribution):
   Write `cdf(Y=y)` for an absolutely continuous cumulative distribution
   function of random variable `Y`; write the probability density function
   `pdf(Y=y) := d^k / (dy_1,...,dy_k) cdf(Y=y)` for its derivative wrt to `Y` evaluated at
-  `y`. Assume that `Y = g(X)` where `g` is a deterministic diffeomorphism,
+  `y`. Assume that `Y = g(X)` where `g` is a deterministic diffeomorphism,
   i.e., a non-random, continuous, differentiable, and invertible function.
   Write the inverse of `g` as `X = g^{-1}(Y)` and `(J o g)(x)` for the Jacobian
   of `g` evaluated at `x`.
@@ -214,7 +214,7 @@ class TransformedDistribution(distributions.Distribution):
         forward_fn=tf.exp,
         inverse_fn=tf.log,
         inverse_log_det_jacobian_fn=(
-            lambda y: -tf.reduce_sum(tf.log(y), reduction_indices=-1)),
+            lambda y: -tf.reduce_sum(tf.log(y), axis=-1)),
         name="LogNormalTransformedDistribution")
   ```

@@ -230,7 +230,7 @@ class TransformedDistribution(distributions.Distribution):
   A `TransformedDistribution`'s batch- and event-shape are implied by the base
   distribution unless explicitly overridden by `batch_shape` or `event_shape`
-  arguments. 
Specifying an overriding `batch_shape` (`event_shape`) is permitted only if the base distribution has scalar batch-shape (event-shape). The bijector is applied to the distribution as if the distribution possessed the overridden shape(s). The following example demonstrates how to construct a @@ -275,11 +275,11 @@ def __init__(self, `batch_shape`; valid only if `distribution.is_scalar_batch()`. event_shape: `integer` vector `Tensor` which overrides `distribution` `event_shape`; valid only if `distribution.is_scalar_event()`. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - name: `String` name prefixed to Ops created by this class. Default: + name: Python `str` name prefixed to Ops created by this class. Default: `bijector.name + distribution.name`. """ parameters = locals() @@ -318,7 +318,7 @@ def __init__(self, # To convert a scalar distribution into a multivariate distribution we # will draw dims from the sample dims, which are otherwise iid. This is # easy to do except in the case that the base distribution has batch dims - # and we're overriding event shape. When that case happens the event dims + # and we're overriding event shape. When that case happens the event dims # will incorrectly be to the left of the batch dims. In this case we'll # cyclically permute left the new dims. self._needs_rotation = _logical_and( @@ -367,7 +367,7 @@ def _event_shape_tensor(self): def _event_shape(self): # If there's a chance that the event_shape has been overriden, we return - # what we statically know about the `event_shape_override`. This works + # what we statically know about the `event_shape_override`. This works # because: `_is_maybe_event_override` means `static_override` is `None` or a # non-empty list, i.e., we don't statically know the `event_shape` or we do. # @@ -388,12 +388,12 @@ def _batch_shape_tensor(self): def _batch_shape(self): # If there's a chance that the batch_shape has been overriden, we return - # what we statically know about the `batch_shape_override`. This works + # what we statically know about the `batch_shape_override`. This works # because: `_is_maybe_batch_override` means `static_override` is `None` or a # non-empty list, i.e., we don't statically know the `batch_shape` or we do. # # Notice that this implementation parallels the `_event_shape` except that - # the `bijector` doesn't get to alter the `batch_shape`. Recall that + # the `bijector` doesn't get to alter the `batch_shape`. Recall that # `batch_shape` is a property of a distribution while `event_shape` is # shared between both the `distribution` instance and the `bijector`. static_override = tensor_util.constant_value(self._override_batch_shape) diff --git a/tensorflow/contrib/distributions/python/ops/uniform.py b/tensorflow/contrib/distributions/python/ops/uniform.py index 80cc9c30155c00..1465bd81fb0fa2 100644 --- a/tensorflow/contrib/distributions/python/ops/uniform.py +++ b/tensorflow/contrib/distributions/python/ops/uniform.py @@ -87,15 +87,15 @@ def __init__(self, have `low < high`. high: Floating point tensor, upper boundary of the output interval. Must have `low < high`. - validate_args: Python `Boolean`, default `False`. When `True` distribution + validate_args: Python `bool`, default `False`. 
When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. - allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the - result is undefined. When `False`, an exception is raised if one or + result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. - name: `String` name prefixed to Ops created by this class. + name: Python `str` name prefixed to Ops created by this class. Raises: InvalidArgumentError: if `low >= high` and `validate_args=False`. @@ -158,7 +158,7 @@ def _event_shape(self): return tensor_shape.scalar() def _sample_n(self, n, seed=None): - shape = array_ops.concat(([n], self.batch_shape_tensor()), 0) + shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) samples = random_ops.random_uniform(shape=shape, dtype=self.dtype, seed=seed) diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py index b7df4285a196b0..89128e66b7df3c 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py +++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py @@ -141,7 +141,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution): [Student's t-distributions]( https://en.wikipedia.org/wiki/Student%27s_t-distribution) and should not be confused with the [Multivate Student's t-distribution]( - https://en.wikipedia.org/wiki/Multivariate_t-distribution). The + https://en.wikipedia.org/wiki/Multivariate_t-distribution). The traditional Multivariate Student's t-distribution is type of [elliptical distribution]( https://en.wikipedia.org/wiki/Elliptical_distribution); it has PDF: @@ -215,42 +215,39 @@ def __init__(self, The `event_shape` is the event shape of `Affine.event_shape`. Args: - df: Numeric `Tensor`. The degrees of freedom of the distribution(s). - `df` must contain only positive values. - Must be scalar if `loc`, `scale_*` imply non-scalar batch_shape or - must have the same `batch_shape` implied by `loc`, `scale_*`. - loc: Numeric `Tensor`. If this is set to `None`, no `loc` is applied. + df: Floating-point `Tensor`. The degrees of freedom of the + distribution(s). `df` must contain only positive values. Must be + scalar if `loc`, `scale_*` imply non-scalar batch_shape or must have the + same `batch_shape` implied by `loc`, `scale_*`. + loc: Floating-point `Tensor`. If this is set to `None`, no `loc` is + applied. scale_identity_multiplier: floating point rank 0 `Tensor` representing a - scaling done to the identity matrix. - When `scale_identity_multiplier = scale_diag=scale_tril = None` then - `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added - to `scale`. - scale_diag: Numeric `Tensor` representing the diagonal matrix. - `scale_diag` has shape [N1, N2, ... k], which represents a k x k - diagonal matrix. - When `None` no diagonal term is added to `scale`. - scale_tril: Numeric `Tensor` representing the diagonal matrix. - `scale_diag` has shape [N1, N2, ... k, k], which represents a k x k - lower triangular matrix. - When `None` no `scale_tril` term is added to `scale`. - The upper triangular elements above the diagonal are ignored. 
-      scale_perturb_factor: Numeric `Tensor` representing factor matrix with
-        last two dimensions of shape `(k, r)`.
-        When `None`, no rank-r update is added to `scale`.
-      scale_perturb_diag: Numeric `Tensor` representing the diagonal matrix.
-        `scale_perturb_diag` has shape [N1, N2, ... r], which represents an
-        r x r Diagonal matrix.
-        When `None` low rank updates will take the form `scale_perturb_factor *
-        scale_perturb_factor.T`.
-      validate_args: Python `Boolean`, default `False`. When `True` distribution
+      scaling done to the identity matrix. When `scale_identity_multiplier =
+        scale_diag = scale_tril = None` then `scale += IdentityMatrix`. Otherwise
+        no scaled-identity-matrix is added to `scale`.
+      scale_diag: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ..., k], which represents a k x k
+        diagonal matrix. When `None` no diagonal term is added to `scale`.
+      scale_tril: Floating-point `Tensor` representing the lower triangular
+        matrix. `scale_tril` has shape [N1, N2, ..., k, k], which represents a
+        k x k lower triangular matrix. When `None` no `scale_tril` term is
+        added to `scale`. The upper triangular elements above the diagonal are
+        ignored.
+      scale_perturb_factor: Floating-point `Tensor` representing factor matrix
+        with last two dimensions of shape `(k, r)`. When `None`, no rank-r
+        update is added to `scale`.
+      scale_perturb_diag: Floating-point `Tensor` representing the diagonal
+        matrix. `scale_perturb_diag` has shape [N1, N2, ..., r], which
+        represents an r x r Diagonal matrix. When `None` low rank updates will
+        take the form `scale_perturb_factor * scale_perturb_factor.T`.
+      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
-      allow_nan_stats: Python `Boolean`, default `True`. When `True`,
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
-        indicate the result is undefined. When `False`, an exception is raised
+        indicate the result is undefined. When `False`, an exception is raised
        if one or more of the statistic's batch members are undefined.
-      name: `String` name prefixed to Ops created by this class.
+      name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
     graph_parents = [df, loc, scale_identity_multiplier, scale_diag,
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index 911c951668f076..aec84a073b60ba 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -87,20 +87,20 @@ def __init__(self,
     df: `float` or `double` tensor, the degrees of freedom of the
       distribution(s). `df` must be greater than or equal to `k`.
     scale_operator_pd: `float` or `double` instance of `OperatorPDBase`.
-      cholesky_input_output_matrices: `Boolean`. Any function which whose input
-        or output is a matrix assumes the input is Cholesky and returns a
+      cholesky_input_output_matrices: Python `bool`. Any function whose input
+        or output is a matrix assumes the input is Cholesky and returns a
       Cholesky factored matrix. Example `log_prob` input takes a Cholesky and
       `sample_n` returns a Cholesky when
       `cholesky_input_output_matrices=True`.
-      validate_args: Python `Boolean`, default `False`. When `True` distribution
+      validate_args: Python `bool`, default `False`. When `True` distribution
       parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
-      allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
-        result is undefined. When `False`, an exception is raised if one or
+        result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
-      name: `String` name prefixed to Ops created by this class.
+      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      TypeError: if scale is not floating-type
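
A minimal sketch (not part of the patch) of the `cholesky_input_output_matrices` contract documented above, using the public `WishartCholesky` wrapper that appears later in this file; the TF 1.x `tf.contrib.distributions` API is assumed and the values are hypothetical:

```python
import tensorflow as tf

ds = tf.contrib.distributions

scale = tf.constant([[2., 0.5], [0.5, 1.]])  # symmetric positive definite
dist = ds.WishartCholesky(df=4., scale=tf.cholesky(scale),
                          cholesky_input_output_matrices=True)

# With the flag set, matrix outputs are Cholesky factors (`sample` returns L
# rather than L L') and matrix inputs such as the argument of `log_prob` are
# interpreted the same way.
chol_sample = dist.sample(seed=42)     # shape [2, 2], lower triangular
log_prob = dist.log_prob(chol_sample)  # scalar
```
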
@@ -210,9 +210,9 @@ def _sample_n(self, n, seed):
     batch_ndims = array_ops.shape(batch_shape)[0]

     ndims = batch_ndims + 3  # sample_ndims=1, event_ndims=2
-    shape = array_ops.concat(((n,), batch_shape, event_shape), 0)
+    shape = array_ops.concat([[n], batch_shape, event_shape], 0)

-    # Complexity: O(nbk^2)
+    # Complexity: O(nbk**2)
     x = random_ops.random_normal(shape=shape,
                                  mean=0.,
                                  stddev=1.,
@@ -222,7 +222,7 @@ def _sample_n(self, n, seed):
     # Complexity: O(nbk)
     # This parametrization is equivalent to Chi2, i.e.,
     # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2)
-    g = random_ops.random_gamma(shape=(n,),
+    g = random_ops.random_gamma(shape=[n],
                                 alpha=self._multi_gamma_sequence(
                                     0.5 * self.df, self.dimension),
                                 beta=0.5,
                                 dtype=self.dtype,
                                 seed=distribution_util.gen_new_seed(
                                     seed, "wishart"))

-    # Complexity: O(nbk^2)
+    # Complexity: O(nbk**2)
     x = array_ops.matrix_band_part(x, -1, 0)  # Tri-lower.

     # Complexity: O(nbk)
     x = array_ops.matrix_set_diag(x, math_ops.sqrt(g))

     # Make batch-op ready.
-    # Complexity: O(nbk^2)
-    perm = array_ops.concat((math_ops.range(1, ndims), (0,)), 0)
+    # Complexity: O(nbk**2)
+    perm = array_ops.concat([math_ops.range(1, ndims), [0]], 0)
     x = array_ops.transpose(x, perm)
-    shape = array_ops.concat((batch_shape, (event_shape[0], -1)), 0)
+    shape = array_ops.concat([batch_shape, [event_shape[0]], [-1]], 0)
     x = array_ops.reshape(x, shape)

     # Complexity: O(nbM) where M is the complexity of the operator solving a
-    # vector system. E.g., for OperatorPDDiag, each matmul is O(k^2), so
-    # this complexity is O(nbk^2). For OperatorPDCholesky, each matmul is
+    # vector system. E.g., for OperatorPDDiag, each matmul is O(k**2), so
+    # this complexity is O(nbk**2). For OperatorPDCholesky, each matmul is
     # O(k^3) so this step has complexity O(nbk^3).
     x = self.scale_operator_pd.sqrt_matmul(x)

     # Undo make batch-op ready.
-    # Complexity: O(nbk^2)
-    shape = array_ops.concat((batch_shape, event_shape, (n,)), 0)
+    # Complexity: O(nbk**2)
+    shape = array_ops.concat([batch_shape, event_shape, [n]], 0)
     x = array_ops.reshape(x, shape)
-    perm = array_ops.concat(((ndims - 1,), math_ops.range(0, ndims - 1)), 0)
+    perm = array_ops.concat([[ndims - 1], math_ops.range(0, ndims - 1)], 0)
     x = array_ops.transpose(x, perm)

     if not self.cholesky_input_output_matrices:
@@ -278,7 +278,7 @@ def _log_prob(self, x):
         array_ops.shape(x_sqrt), [0], [sample_ndims])

     # We need to be able to pre-multiply each matrix by its corresponding
-    # batch scale matrix. 
Since a Distribution Tensor supports multiple # samples per batch, this means we need to reshape the input matrix `x` # so that the first b dimensions are batch dimensions and the last two # are of shape [dimension, dimensions*number_of_samples]. Doing these @@ -288,10 +288,10 @@ def _log_prob(self, x): # this reshaping so what we're left with is a Tensor partitionable by # sample, batch, event dimensions. - # Complexity: O(nbk^2) since transpose must access every element. + # Complexity: O(nbk**2) since transpose must access every element. scale_sqrt_inv_x_sqrt = x_sqrt - perm = array_ops.concat((math_ops.range(sample_ndims, ndims), - math_ops.range(0, sample_ndims)), 0) + perm = array_ops.concat([math_ops.range(sample_ndims, ndims), + math_ops.range(0, sample_ndims)], 0) scale_sqrt_inv_x_sqrt = array_ops.transpose(scale_sqrt_inv_x_sqrt, perm) shape = array_ops.concat( (batch_shape, (math_ops.cast( @@ -300,37 +300,37 @@ def _log_prob(self, x): scale_sqrt_inv_x_sqrt = array_ops.reshape(scale_sqrt_inv_x_sqrt, shape) # Complexity: O(nbM*k) where M is the complexity of the operator solving - # a vector system. E.g., for OperatorPDDiag, each solve is O(k), so - # this complexity is O(nbk^2). For OperatorPDCholesky, each solve is - # O(k^2) so this step has complexity O(nbk^3). + # a vector system. E.g., for OperatorPDDiag, each solve is O(k), so + # this complexity is O(nbk**2). For OperatorPDCholesky, each solve is + # O(k**2) so this step has complexity O(nbk^3). scale_sqrt_inv_x_sqrt = self.scale_operator_pd.sqrt_solve( scale_sqrt_inv_x_sqrt) # Undo make batch-op ready. - # Complexity: O(nbk^2) - shape = array_ops.concat((batch_shape, event_shape, sample_shape), 0) + # Complexity: O(nbk**2) + shape = array_ops.concat([batch_shape, event_shape, sample_shape], 0) scale_sqrt_inv_x_sqrt = array_ops.reshape(scale_sqrt_inv_x_sqrt, shape) - perm = array_ops.concat((math_ops.range(ndims - sample_ndims, ndims), - math_ops.range(0, ndims - sample_ndims)), 0) + perm = array_ops.concat([math_ops.range(ndims - sample_ndims, ndims), + math_ops.range(0, ndims - sample_ndims)], 0) scale_sqrt_inv_x_sqrt = array_ops.transpose(scale_sqrt_inv_x_sqrt, perm) # Write V = SS', X = LL'. Then: # tr[inv(V) X] = tr[inv(S)' inv(S) L L'] # = tr[inv(S) L L' inv(S)'] # = tr[(inv(S) L) (inv(S) L)'] - # = sum_{ik} (inv(S) L)_{ik}^2 + # = sum_{ik} (inv(S) L)_{ik}**2 # The second equality follows from the cyclic permutation property. - # Complexity: O(nbk^2) + # Complexity: O(nbk**2) trace_scale_inv_x = math_ops.reduce_sum( math_ops.square(scale_sqrt_inv_x_sqrt), - reduction_indices=[-2, -1]) + axis=[-2, -1]) # Complexity: O(nbk) half_log_det_x = math_ops.reduce_sum( math_ops.log(array_ops.matrix_diag_part(x_sqrt)), - reduction_indices=[-1]) + axis=[-1]) - # Complexity: O(nbk^2) + # Complexity: O(nbk**2) log_prob = ((self.df - self.dimension - 1.) * half_log_det_x - 0.5 * trace_scale_inv_x - self.log_normalization()) @@ -420,14 +420,14 @@ def _multi_lgamma(self, a, p, name="multi_lgamma"): seq = self._multi_gamma_sequence(a, p) return (0.25 * p * (p - 1.) 
* math.log(math.pi) +
             math_ops.reduce_sum(math_ops.lgamma(seq),
-                                reduction_indices=(-1,)))
+                                axis=[-1]))

   def _multi_digamma(self, a, p, name="multi_digamma"):
     """Computes the multivariate digamma function; Psi_p(a)."""
     with self._name_scope(name, values=[a, p]):
       seq = self._multi_gamma_sequence(a, p)
       return math_ops.reduce_sum(math_ops.digamma(seq),
-                                 reduction_indices=(-1,))
+                                 axis=[-1])


 class WishartCholesky(_WishartOperatorPD):
@@ -469,7 +469,7 @@ class WishartCholesky(_WishartOperatorPD):
   dist = tf.contrib.distributions.WishartCholesky(df=df, scale=chol_scale)

   # Evaluate this on an observation in R^3, returning a scalar.
-  x = ...  # A 3x3 positive definite matrix.
+  x = ...  # A 3x3 positive definite matrix.
   dist.prob(x)  # Shape is [], a scalar.

   # Evaluate this on a two observations, each in R^{3x3}, returning a length two
@@ -506,20 +506,20 @@ def __init__(self,
       or equal to dimension of the scale matrix.
     scale: `float` or `double` `Tensor`. The Cholesky factorization of the
       symmetric positive definite scale matrix of the distribution.
-      cholesky_input_output_matrices: `Boolean`. Any function which whose input
-        or output is a matrix assumes the input is Cholesky and returns a
+      cholesky_input_output_matrices: Python `bool`. Any function whose input
+        or output is a matrix assumes the input is Cholesky and returns a
      Cholesky factored matrix. Example `log_prob` input takes a Cholesky and
      `sample_n` returns a Cholesky when
      `cholesky_input_output_matrices=True`.
-      validate_args: Python `Boolean`, default `False`. When `True` distribution
+      validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
-      allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
-        result is undefined. When `False`, an exception is raised if one or
+        result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
-      name: `String` name prefixed to Ops created by this class.
+      name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
     with ops.name_scope(name, values=[scale]) as ns:
@@ -569,7 +569,7 @@ class WishartFull(_WishartOperatorPD):
   dist = tf.contrib.distributions.WishartFull(df=df, scale=scale)

   # Evaluate this on an observation in R^3, returning a scalar.
-  x = ...  # A 3x3 positive definite matrix.
+  x = ...  # A 3x3 positive definite matrix.
   dist.prob(x)  # Shape is [], a scalar.

   # Evaluate this on a two observations, each in R^{3x3}, returning a length two
@@ -606,20 +606,20 @@ def __init__(self,
       or equal to dimension of the scale matrix.
     scale: `float` or `double` `Tensor`. The symmetric positive definite
       scale matrix of the distribution.
-      cholesky_input_output_matrices: `Boolean`. Any function which whose input
-        or output is a matrix assumes the input is Cholesky and returns a
+      cholesky_input_output_matrices: Python `bool`. Any function whose input
+        or output is a matrix assumes the input is Cholesky and returns a
      Cholesky factored matrix. Example `log_prob` input takes a Cholesky and
      `sample_n` returns a Cholesky when
      `cholesky_input_output_matrices=True`.
-      validate_args: Python `Boolean`, default `False`. When `True` distribution
+      validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
-      allow_nan_stats: Python `Boolean`, default `True`. When `True`, statistics
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
-        result is undefined. When `False`, an exception is raised if one or
+        result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
-      name: `String` name prefixed to Ops created by this class.
+      name: Python `str` name prefixed to Ops created by this class.
     """
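
As a reading aid (not part of the patch): `WishartFull` and `WishartCholesky` describe the same distribution and differ only in how `scale` is supplied. A sketch under the same TF 1.x `tf.contrib.distributions` assumptions as above, with hypothetical values:

```python
import tensorflow as tf

ds = tf.contrib.distributions

scale = tf.constant([[2., 0.5], [0.5, 1.]])  # symmetric positive definite
x = tf.constant([[1.5, 0.2], [0.2, 0.9]])    # an SPD observation

# WishartFull takes the SPD scale matrix itself; WishartCholesky takes its
# Cholesky factor. Their log_probs agree up to numerical error.
lp_full = ds.WishartFull(df=4., scale=scale).log_prob(x)
lp_chol = ds.WishartCholesky(df=4., scale=tf.cholesky(scale)).log_prob(x)
```
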
    parameters = locals()
    with ops.name_scope(name, values=[scale]) as ns: