Add an op for singular value decomposition (SVD) of a dense matrix or batches of dense matrices. This calls Eigen::JacobiSVD<Matrix, Eigen::HouseholderQRPreconditioner>, which is known to be rather slow. This change is primarily intended to get the TensorFlow interfaces and functionality in place. We intend to swap out the "backend" with a higher-performance algorithm implementation in the future.

This CL also contains a small refactoring of the LinearAlgebraOp base class:

1. I moved the initial processing of inputs and outputs into separate helper functions so Compute() is not so long.

2. Derived classes are now allowed to return fewer output matrix shapes (n) than the number of op outputs (m), in which case empty (shape [0]) tensors are returned for the last m-n outputs.

Fixed a few Python linter errors that were blocking presubmit.
Change: 128990912
tensorflower-gardener committed Aug 1, 2016
1 parent 48e869f commit c0944a3
Showing 12 changed files with 653 additions and 195 deletions.
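For context, the following standalone sketch (illustrative only, not part of this commit) shows the Eigen::JacobiSVD call that the new SvdOp kernel wraps. It assumes the Eigen 3 headers are on the include path; the matrix and variable names are made up for the example.

#include <iostream>
#include "Eigen/Dense"
#include "Eigen/SVD"

int main() {
  // Illustrative only -- not TensorFlow code. An m-by-n input (m = 4, n = 3).
  Eigen::MatrixXd a = Eigen::MatrixXd::Random(4, 3);

  // JacobiSVD with the Householder QR preconditioner, as used by the kernel.
  Eigen::JacobiSVD<Eigen::MatrixXd, Eigen::HouseholderQRPreconditioner> svd;
  // Thin U/V corresponds to full_matrices = false; ComputeFullU/ComputeFullV
  // corresponds to full_matrices = true. Omitting the flags computes singular
  // values only (compute_uv = false).
  svd.compute(a, Eigen::ComputeThinU | Eigen::ComputeThinV);

  std::cout << "s:\n" << svd.singularValues() << "\n";  // shape [min(m, n)] = [3]
  std::cout << "U:\n" << svd.matrixU() << "\n";         // shape [m, min(m, n)] = [4, 3]
  std::cout << "V:\n" << svd.matrixV() << "\n";         // shape [n, min(m, n)] = [3, 3]
  return 0;
}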
1 change: 1 addition & 0 deletions tensorflow/core/kernels/BUILD
@@ -1023,6 +1023,7 @@ tf_kernel_libraries(
"matrix_solve_ls_op",
"matrix_solve_op",
"matrix_triangular_solve_op",
"svd_op",
],
deps = [
":linalg_ops_common",
126 changes: 72 additions & 54 deletions tensorflow/core/kernels/linalg_ops_common.cc
@@ -90,19 +90,35 @@ void LinearAlgebraOp<Scalar, SupportsBatchOperation>::Compute(
TensorInputs inputs;
TensorShapes input_matrix_shapes;
TensorShape batch_shape;
AnalyzeInputs(context, &inputs, &input_matrix_shapes, &batch_shape);

TensorShapes output_matrix_shapes;
TensorOutputs outputs;
PrepareOutputs(context, input_matrix_shapes, batch_shape, &outputs,
&output_matrix_shapes);

// Process the individual matrix problems in parallel using a threadpool.
auto shard = [this, &inputs, &input_matrix_shapes, &outputs,
&output_matrix_shapes, context](int64 begin, int64 end) {
for (int64 i = begin; i < end; ++i) {
ComputeTensorSlice(context, i, inputs, input_matrix_shapes, outputs,
output_matrix_shapes);
}
};
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
Shard(worker_threads.num_threads, worker_threads.workers,
batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard);
}

template <typename Scalar, bool SupportsBatchOperation>
void LinearAlgebraOp<Scalar, SupportsBatchOperation>::AnalyzeInputs(
OpKernelContext* context, TensorInputs* inputs,
TensorShapes* input_matrix_shapes, TensorShape* batch_shape) {
int input_rank = -1;
int num_batch_matrices = 1;
for (int i = 0; i < NumMatrixInputs(context); ++i) {
const Tensor& in = context->input(i);
if (i == 0) {
// If the tensor rank is greater than 2, we consider the inner-most
// dimensions as matrices, and loop over all the other outer ("batch")
// dimensions to compute the results.
input_rank = in.dims();
for (int dim = 0; dim < input_rank - 2; ++dim) {
num_batch_matrices *= in.dim_size(dim);
batch_shape.AddDim(in.dim_size(dim));
}
if (SupportsBatchOperation) {
OP_REQUIRES(
context, input_rank >= 2,
@@ -114,14 +130,21 @@ void LinearAlgebraOp<Scalar, SupportsBatchOperation>::Compute(
errors::InvalidArgument("Input tensor ", i,
" must have rank == 2, got", input_rank));
}

// If the tensor rank is greater than 2, we consider the inner-most
// dimensions as matrices, and loop over all the other outer ("batch")
// dimensions to compute the results.
for (int dim = 0; dim < input_rank - 2; ++dim) {
batch_shape->AddDim(in.dim_size(dim));
}
} else {
// Make sure that all inputs have the same rank and outer dimensions.
OP_REQUIRES(context, input_rank == in.dims(),
errors::InvalidArgument(
"All input tensors must have the same rank."));
for (int dim = 0; dim < input_rank - 2; ++dim) {
OP_REQUIRES(
context, in.dim_size(dim) == batch_shape.dim_size(dim),
context, in.dim_size(dim) == batch_shape->dim_size(dim),
errors::InvalidArgument(
"All input tensors must have the same outer dimensions."));
}
@@ -131,64 +154,59 @@ void LinearAlgebraOp<Scalar, SupportsBatchOperation>::Compute(
const int col_dimension = input_rank - 1;
const int64 num_rows = in.dim_size(row_dimension);
const int64 num_cols = in.dim_size(col_dimension);
input_matrix_shapes.push_back(TensorShape({num_rows, num_cols}));
inputs.push_back(in);
// TODO(rmlarsen): Use emplace_back when it is added to InlinedVector. Same
// in several places below.
input_matrix_shapes->push_back(TensorShape({num_rows, num_cols}));
inputs->push_back(in);
}
// Have the derived class validate that the inputs are as expected.
ValidateInputMatrixShapes(context, input_matrix_shapes);
ValidateInputMatrixShapes(context, *input_matrix_shapes);
}

// Get shape for each of the matrix outputs.
const TensorShapes output_matrix_shapes =
GetOutputMatrixShapes(input_matrix_shapes);
// Make sure the number of outputs is what the derived class expects.
template <typename Scalar, bool SupportsBatchOperation>
void LinearAlgebraOp<Scalar, SupportsBatchOperation>::PrepareOutputs(
OpKernelContext* context, const TensorShapes& input_matrix_shapes,
const TensorShape& batch_shape, TensorOutputs* outputs,
TensorShapes* output_matrix_shapes) {
// Get shape for each of the matrix outputs produced by the derived class.
*output_matrix_shapes = GetOutputMatrixShapes(input_matrix_shapes);
const int num_outputs = output_matrix_shapes->size();

// Make sure the number of op outputs is what the derived class expects.
OP_REQUIRES(
context, output_matrix_shapes.size() == context->num_outputs(),
context, num_outputs <= context->num_outputs(),
errors::Internal(
"Derived class expected (%d) output matrices for op, got (%d).",
output_matrix_shapes.size(), context->num_outputs()));
"Derived class expected more outputs (%d) that the op has (%d).",
num_outputs, context->num_outputs()));

// Allocate outputs.
TensorShapes output_shapes;
TensorOutputs outputs;
for (int i = 0; i < context->num_outputs(); ++i) {
OP_REQUIRES(context, output_matrix_shapes[i].dims() <= 2,
errors::InvalidArgument(
"Rank of matrix output no. %d must be 0, 1 or 2, got %d.",
i, output_matrix_shapes[i].dims()));

// The final output has the shape of the outer batch dimensions concatenated
// with the output_matrix_shape (if the output is not scalar).
TensorShape output_shape;
if (input_rank == 2) {
output_shape = output_matrix_shapes[i];
} else {
output_shape = batch_shape;
// Add the inner dimensions that depend on the operation implemented by
// the derived class.
for (int dim = 0; dim < output_matrix_shapes[i].dims(); ++dim) {
output_shape.AddDim(output_matrix_shapes[i].dim_size(dim));
TensorShape output_tensor_shape({0});
if (i < num_outputs) {
// This output is used, set up output shape and allocate it.
const TensorShape& output_matrix_shape = output_matrix_shapes->at(i);
OP_REQUIRES(context, output_matrix_shape.dims() <= 2,
errors::InvalidArgument(
"Rank of matrix output no. %d must be 0, 1 or 2, got %d.",
i, output_matrix_shape.dims()));

// The final output has the shape of the outer batch dimensions
// concatenated with the output_matrix_shape (if the output is not
// scalar).
output_tensor_shape = batch_shape;
for (int dim = 0; dim < output_matrix_shape.dims(); ++dim) {
output_tensor_shape.AddDim(output_matrix_shape.dim_size(dim));
}
}
output_shapes.push_back(output_shape);
Tensor* out = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(i, output_shape, &out));
outputs.push_back(out);
OP_REQUIRES_OK(context,
context->allocate_output(i, output_tensor_shape, &out));
outputs->push_back(out);
}

auto shard = [this, &inputs, &input_matrix_shapes, &outputs,
&output_matrix_shapes, context](int64 begin, int64 end) {
for (int64 i = begin; i < end; ++i) {
ComputeTensorSlice(context, i, inputs, input_matrix_shapes, outputs,
output_matrix_shapes);
}
};
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
Shard(worker_threads.num_threads, worker_threads.workers, num_batch_matrices,
GetCostPerUnit(input_matrix_shapes), shard);
}

template <typename Scalar, bool SupportsBatchOperationT>
void LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ComputeTensorSlice(
template <typename Scalar, bool SupportsBatchOperation>
void LinearAlgebraOp<Scalar, SupportsBatchOperation>::ComputeTensorSlice(
OpKernelContext* context, int64 matrix_index, const TensorInputs& inputs,
const TensorShapes& input_matrix_shapes, const TensorOutputs& outputs,
const TensorShapes& output_matrix_shapes) {
@@ -204,7 +222,7 @@ void LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ComputeTensorSlice(
}

MatrixMaps matrix_outputs;
for (int i = 0; i < outputs.size(); ++i) {
for (int i = 0; i < output_matrix_shapes.size(); ++i) {
// The output matrix shape may not be a matrix.
int num_output_rows = output_matrix_shapes[i].dims() >= 1
? output_matrix_shapes[i].dim_size(0)
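As a side note on the parallelization above: Compute() hands the batch of matrix problems to Shard() from core/util/work_sharder.h, which splits the index range across the CPU worker threads using the cost estimate from GetCostPerUnit(). The sketch below is an illustrative, simplified stand-in for that pattern (ShardSketch and its fixed thread count are hypothetical, not TensorFlow code).

#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

// Hypothetical, simplified stand-in for Shard(): split [0, num_matrices)
// into contiguous blocks and run one block per thread. The real Shard()
// additionally uses the per-matrix cost estimate to decide how many threads
// are actually worth using.
void ShardSketch(int num_threads, int64_t num_matrices,
                 const std::function<void(int64_t, int64_t)>& work) {
  std::vector<std::thread> workers;
  const int64_t block =
      (num_matrices + num_threads - 1) / std::max(num_threads, 1);
  for (int64_t begin = 0; begin < num_matrices; begin += block) {
    workers.emplace_back(work, begin, std::min(begin + block, num_matrices));
  }
  for (auto& t : workers) t.join();
}

// Usage, analogous to the shard lambda in Compute():
//   ShardSketch(4, batch_shape.num_elements(),
//               [&](int64_t begin, int64_t end) {
//                 for (int64_t i = begin; i < end; ++i) {
//                   // one matrix problem, cf. ComputeTensorSlice()
//                 }
//               });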
36 changes: 27 additions & 9 deletions tensorflow/core/kernels/linalg_ops_common.h
@@ -43,7 +43,7 @@ template <typename Scalar, bool SupportsBatchOperationT>
class LinearAlgebraOp : public OpKernel {
public:
explicit LinearAlgebraOp(OpKernelConstruction* context) : OpKernel(context) {}
~LinearAlgebraOp() override {}

void Compute(OpKernelContext* context) override;

protected:
@@ -80,19 +80,26 @@ class LinearAlgebraOp : public OpKernel {
const TensorShapes& input_matrix_shapes);

// Returns the output shapes of each individual matrix operation. Output
// matrices shapes must be rank 0, 1, or 2. Scalar outputs are rank 0.
// For many ops the output dimensions are the same as the input dimensions,
// matrices shapes must be rank 0, 1, or 2. Scalar outputs are rank 0.
//
// The derived class may return a number of shapes (N) less than
// context->num_outputs() (M) to indicate that only a leading subset of
// the outputs will be populated. In this case, an empty (shape [0]) tensor
// will be returned for each of the last M-N outputs.
//
// For many ops, the output dimensions are the same as the input dimensions,
// so we provide that as a default implementation for convenience.
virtual TensorShapes GetOutputMatrixShapes(
const TensorShapes& input_matrix_shapes) const {
return input_matrix_shapes;
}

// Returns the cost per matrix operation. Cost per unit is assumed to be
// roughly 1ns, based on comments in core/util/work_sharder.cc.
// Many linear algebra ops take roughly max(m,n) * min(m,n)^2, where the first
// input matrix is m-by-n. We provide that as a default implementation for
// convenience.
// Returns the cost per matrix operation. This is used to determine the
// number of threads to use for parallelizing calls to ComputeMatrix in
// batch mode. Cost per unit is assumed to be roughly 1ns, based on comments
// in core/util/work_sharder.cc. Many linear algebra ops take roughly max(m,n)
// * min(m,n)^2, where the first input matrix is m-by-n. We provide that as a
// default implementation for convenience.
virtual int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const {
double m = static_cast<double>(input_matrix_shapes[0].dim_size(0));
double n = static_cast<double>(input_matrix_shapes[0].dim_size(1));
@@ -111,7 +118,9 @@ class LinearAlgebraOp : public OpKernel {
// Performs a single matrix computation given input matrices, and
// stores the result in outputs. For batch operations, this will be called
// repeatedly for a single call to Compute() when multiple matrices exist in
// input Tensors with rank > 2.
// input Tensors with rank > 2. In this case the calls to ComputeMatrix are
// parallelized. The number of threads used is determined by a cost model from
// the value returned by GetCostPerUnit().
virtual void ComputeMatrix(OpKernelContext* context,
const ConstMatrixMaps& inputs,
MatrixMaps* outputs) = 0;
@@ -142,6 +151,15 @@ class LinearAlgebraOp : public OpKernel {
const TensorShapes& input_matrix_shapes,
const TensorOutputs& outputs,
const TensorShapes& output_matrix_shapes);

void AnalyzeInputs(OpKernelContext* context, TensorInputs* inputs,
TensorShapes* input_matrix_shapes,
TensorShape* batch_shape);

void PrepareOutputs(OpKernelContext* context,
const TensorShapes& input_matrix_shapes,
const TensorShape& batch_shape, TensorOutputs* outputs,
TensorShapes* output_matrix_shapes);
};

// Declare that LinearAlgebraOp is explicitly instantiated in
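As a rough worked example of the default cost model above (illustrative arithmetic only, not TensorFlow code): at roughly 1 ns per cost unit, a single 1000 x 500 matrix is estimated at max(m, n) * min(m, n)^2 = 1000 * 500^2 = 2.5e8 units, i.e. about a quarter of a second of work, which is what the work sharder weighs against the overhead of extra threads.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Illustrative only: mirrors the default cost formula described above.
int64_t DefaultCostPerUnitSketch(int64_t m, int64_t n) {
  const double cost = static_cast<double>(std::max(m, n)) *
                      static_cast<double>(std::min(m, n)) *
                      static_cast<double>(std::min(m, n));
  return static_cast<int64_t>(cost);
}

int main() {
  // 1000 x 500 -> 1000 * 500^2 = 250000000 units (~0.25 s at ~1 ns/unit).
  std::cout << DefaultCostPerUnitSketch(1000, 500) << "\n";
  return 0;
}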
105 changes: 105 additions & 0 deletions tensorflow/core/kernels/svd_op.cc
@@ -0,0 +1,105 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/linalg_ops.cc.
#include <algorithm>

#include "third_party/eigen3/Eigen/SVD"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/linalg_ops_common.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

template <class Scalar, bool SupportsBatchOperation>
class SvdOp : public LinearAlgebraOp<Scalar, SupportsBatchOperation> {
public:
typedef LinearAlgebraOp<Scalar, SupportsBatchOperation> Base;

explicit SvdOp(OpKernelConstruction* context) : Base(context) {
OP_REQUIRES_OK(context, context->GetAttr("compute_uv", &compute_uv_));
OP_REQUIRES_OK(context, context->GetAttr("full_matrices", &full_matrices_));
}

using TensorShapes = typename Base::TensorShapes;

void ValidateInputMatrixShapes(
OpKernelContext* context,
const TensorShapes& input_matrix_shapes) const final {
Base::ValidateSingleMatrix(context, input_matrix_shapes);
}

TensorShapes GetOutputMatrixShapes(
const TensorShapes& input_matrix_shapes) const final {
int64 m = input_matrix_shapes[0].dim_size(0);
int64 n = input_matrix_shapes[0].dim_size(1);
int64 min_size = std::min(m, n);
if (compute_uv_) {
return TensorShapes({TensorShape({min_size}),
TensorShape({m, full_matrices_ ? m : min_size}),
TensorShape({n, full_matrices_ ? n : min_size})});
} else {
return TensorShapes({TensorShape({min_size})});
}
}

// TODO(rmlarsen): This should depend on compute_uv. See b/30409375.
int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
double m = static_cast<double>(input_matrix_shapes[0].dim_size(0));
double n = static_cast<double>(input_matrix_shapes[0].dim_size(1));
double cost = 12 * std::max(m, n) * std::min(m, n) * std::min(m, n);
return cost >= static_cast<double>(kint64max) ? kint64max
: static_cast<int64>(cost);
}

using Matrix = typename Base::Matrix;
using MatrixMaps = typename Base::MatrixMaps;
using ConstMatrixMap = typename Base::ConstMatrixMap;
using ConstMatrixMaps = typename Base::ConstMatrixMaps;

void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
MatrixMaps* outputs) final {
Eigen::JacobiSVD<Matrix, Eigen::HouseholderQRPreconditioner> svd;
if (compute_uv_) {
svd.compute(inputs[0],
(full_matrices_ ? Eigen::ComputeFullU | Eigen::ComputeFullV
: Eigen::ComputeThinU | Eigen::ComputeThinV));
outputs->at(0) = svd.singularValues();
outputs->at(1) = svd.matrixU();
outputs->at(2) = svd.matrixV();
} else {
svd.compute(inputs[0]);
outputs->at(0) = svd.singularValues();
}
}

private:
bool compute_uv_;
bool full_matrices_;

TF_DISALLOW_COPY_AND_ASSIGN(SvdOp);
};

REGISTER_LINALG_OP("Svd", (SvdOp<float, false>), float);
REGISTER_LINALG_OP("Svd", (SvdOp<double, false>), double);
REGISTER_LINALG_OP("BatchSvd", (SvdOp<float, true>), float);
REGISTER_LINALG_OP("BatchSvd", (SvdOp<double, true>), double);

} // namespace tensorflow
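To make the shape convention implemented by GetOutputMatrixShapes() above concrete, here is an illustrative sketch (not TensorFlow code) that reproduces the per-matrix output shapes for a 4 x 3 input under the attribute combinations. With compute_uv = false only the first shape is returned, and per the new convention the op's remaining outputs get empty (shape [0]) tensors; for BatchSvd, PrepareOutputs() prepends the batch dimensions to each shape.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Illustrative only: mirrors the shape logic of SvdOp::GetOutputMatrixShapes
// for a single m-by-n matrix.
void PrintSvdShapes(int64_t m, int64_t n, bool compute_uv, bool full_matrices) {
  const int64_t p = std::min(m, n);
  std::cout << "s: [" << p << "]";
  if (compute_uv) {
    std::cout << "  u: [" << m << ", " << (full_matrices ? m : p) << "]"
              << "  v: [" << n << ", " << (full_matrices ? n : p) << "]";
  }
  std::cout << "\n";
}

int main() {
  PrintSvdShapes(4, 3, /*compute_uv=*/false, /*full_matrices=*/false);  // s: [3]
  PrintSvdShapes(4, 3, true, false);  // s: [3]  u: [4, 3]  v: [3, 3]
  PrintSvdShapes(4, 3, true, true);   // s: [3]  u: [4, 4]  v: [3, 3]
  return 0;
}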