From 938902bade73fb90ecc178a6c46cdf2b30d3a69a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 29 Jan 2016 10:20:54 -0800
Subject: [PATCH] Sped up construction of the BCast helper class for the very
 common case where both shapes are the same by using more straightforward
 code to achieve the same initialization of the instance variables. Added a
 benchmark for this to bcast_test.cc.

Speeds up the same-shape case by 65% (67 ns to 23 ns for a two-dimensional
shape) with no real effect on the different-shape case.

Run on machine with (40 X 2801 MHz CPUs); 2016/01/28-11:12:26
CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB

Benchmark          Base (ns)  New (ns)  Improvement
------------------------------------------------------------------
BM_BCastSetup/0          122       122        +0.0%
BM_BCastSetup/1           67        23       +65.7%

Change: 113374076
---
 tensorflow/core/util/bcast.cc      | 220 ++++++++++++++++-------------
 tensorflow/core/util/bcast.h       |   3 +-
 tensorflow/core/util/bcast_test.cc |  25 ++++
 3 files changed, 145 insertions(+), 103 deletions(-)

diff --git a/tensorflow/core/util/bcast.cc b/tensorflow/core/util/bcast.cc
index 46cf2dfafb35c1..c045ee902b1a08 100644
--- a/tensorflow/core/util/bcast.cc
+++ b/tensorflow/core/util/bcast.cc
@@ -22,114 +22,132 @@ namespace tensorflow {
 void BCast::Reverse(Vec* shape) { std::reverse(shape->begin(), shape->end()); }
 
 BCast::BCast(const Vec& sx, const Vec& sy) {
-  // Reverse the shape of x and y for convenience.
-  // After the reverse, 0-th is the inner-most dimension.
-  Vec x = sx;
-  Reverse(&x);
-  Vec y = sy;
-  Reverse(&y);
-
-  // 1-extend and align x and y so that they are the same size.
-  if (x.size() > y.size()) {
-    y.resize(x.size(), 1);
+  if (sx == sy) {
+    // Fast path for the common case of identical shapes for sx and sy.
+    int64 elements = 1;
+    const int n = sx.size();
+    output_.resize(n);
+    for (int i = 0; i < n; i++) {
+      int64 dim = sx[i];
+      elements *= dim;
+      output_[i] = dim;
+    }
+    x_reshape_.push_back(elements);
+    y_reshape_.push_back(elements);
+    x_bcast_.push_back(1);
+    y_bcast_.push_back(1);
+    result_.push_back(elements);
+    // grad_x_reduce_idx_ and grad_y_reduce_idx_ are left empty.
   } else {
-    x.resize(y.size(), 1);
-  }
+    // Reverse the shape of x and y for convenience.
+    // After the reverse, 0-th is the inner-most dimension.
+    Vec x = sx;
+    Reverse(&x);
+    Vec y = sy;
+    Reverse(&y);
 
-  // Going through each dimension starting from the inner-most
-  // dimension, compares dimension of x and y. They are compatible if
-  // they are equal or either is 1.
-  enum State {
-    UNKNOWN,
-    SAME,
-    X_ONE,
-    Y_ONE,
-  };
-  State prev = UNKNOWN;
-  const int64 n = x.size();
-  for (int i = 0; i < n; ++i) {
-    // Output shape.
-    State curr = UNKNOWN;
-    const int64 x_i = x[i];  // i-th dimension of x.
-    CHECK_GE(x_i, 0);
-    const int64 y_i = y[i];  // i-th dimension of y.
-    CHECK_GE(y_i, 0);
-    int64 o_i;   // i-th dimension of the output.
-    int64 bx_i;  // i-th broadcast for x.
-    int64 by_i;  // i-th broadcast for y.
-    // Invariant:
-    //   o_i = x_i * bx_i = y_i * by_i
-    if (x_i == y_i) {
-      // No broadcast.
-      o_i = x_i;
-      bx_i = 1;
-      by_i = 1;
-      curr = SAME;
-    } else if (x_i == 1) {
-      // x broadcast to y on this dimension.
-      o_i = y_i;
-      bx_i = y_i;
-      by_i = 1;
-      grad_x_reduce_idx_.push_back(n - 1 - i);
-      curr = X_ONE;
-    } else if (y_i == 1) {
-      // y broadcast to x on this dimension.
-      o_i = x_i;
-      bx_i = 1;
-      by_i = x_i;
-      grad_y_reduce_idx_.push_back(n - 1 - i);
-      curr = Y_ONE;
+    // 1-extend and align x and y so that they are the same size.
+    if (x.size() > y.size()) {
+      y.resize(x.size(), 1);
     } else {
-      valid_ = false;
-      return;
+      x.resize(y.size(), 1);
     }
-    output_.push_back(o_i);
-    // Reshape/broadcast.
-    // Invariant:
-    //   result[i] == x_reshape[i] * x_bcast[i] == y_reshape_[i] * y_bcast_[i]
-    if (curr == SAME && x_i == 1) {
-      // Both side are 1s.
-      grad_x_reduce_idx_.push_back(n - 1 - i);
-      grad_y_reduce_idx_.push_back(n - 1 - i);
-      continue;
-    } else if (prev == curr) {
-      // It is a run of the same cases (no broadcast, x broadcast to
-      // y, y broadcast to x). We can reshape the input so that fewer
-      // dimensions are involved in the intermediate computation.
-      result_.back() *= o_i;
-      x_reshape_.back() *= x_i;
-      x_bcast_.back() *= bx_i;
-      y_reshape_.back() *= y_i;
-      y_bcast_.back() *= by_i;
-    } else {
-      result_.push_back(o_i);
-      x_reshape_.push_back(x_i);
-      x_bcast_.push_back(bx_i);
-      y_reshape_.push_back(y_i);
-      y_bcast_.push_back(by_i);
+
+    // Go through each dimension starting from the inner-most
+    // dimension, and compare the dimensions of x and y. They are
+    // compatible if they are equal or either is 1.
+    enum State {
+      UNKNOWN,
+      SAME,
+      X_ONE,
+      Y_ONE,
+    };
+    State prev = UNKNOWN;
+    const int64 n = x.size();
+    for (int i = 0; i < n; ++i) {
+      // Output shape.
+      State curr = UNKNOWN;
+      const int64 x_i = x[i];  // i-th dimension of x.
+      CHECK_GE(x_i, 0);
+      const int64 y_i = y[i];  // i-th dimension of y.
+      CHECK_GE(y_i, 0);
+      int64 o_i;   // i-th dimension of the output.
+      int64 bx_i;  // i-th broadcast for x.
+      int64 by_i;  // i-th broadcast for y.
+      // Invariant:
+      //   o_i = x_i * bx_i = y_i * by_i
+      if (x_i == y_i) {
+        // No broadcast.
+        o_i = x_i;
+        bx_i = 1;
+        by_i = 1;
+        curr = SAME;
+      } else if (x_i == 1) {
+        // x broadcast to y on this dimension.
+        o_i = y_i;
+        bx_i = y_i;
+        by_i = 1;
+        grad_x_reduce_idx_.push_back(n - 1 - i);
+        curr = X_ONE;
+      } else if (y_i == 1) {
+        // y broadcast to x on this dimension.
+        o_i = x_i;
+        bx_i = 1;
+        by_i = x_i;
+        grad_y_reduce_idx_.push_back(n - 1 - i);
+        curr = Y_ONE;
+      } else {
+        valid_ = false;
+        return;
+      }
+      output_.push_back(o_i);
+      // Reshape/broadcast.
+      // Invariant:
+      //   result_[i] == x_reshape_[i] * x_bcast_[i] == y_reshape_[i] * y_bcast_[i]
+      if (curr == SAME && x_i == 1) {
+        // Both sides are 1s.
+        grad_x_reduce_idx_.push_back(n - 1 - i);
+        grad_y_reduce_idx_.push_back(n - 1 - i);
+        continue;
+      } else if (prev == curr) {
+        // It is a run of the same case (no broadcast, x broadcast to
+        // y, or y broadcast to x). We can reshape the input so that
+        // fewer dimensions are involved in the intermediate computation.
+        result_.back() *= o_i;
+        x_reshape_.back() *= x_i;
+        x_bcast_.back() *= bx_i;
+        y_reshape_.back() *= y_i;
+        y_bcast_.back() *= by_i;
+      } else {
+        result_.push_back(o_i);
+        x_reshape_.push_back(x_i);
+        x_bcast_.push_back(bx_i);
+        y_reshape_.push_back(y_i);
+        y_bcast_.push_back(by_i);
+      }
+      prev = curr;
     }
-    prev = curr;
-  }
-  if (result_.empty()) {
-    // Can happen when both x and y are effectively scalar.
-    result_.push_back(1);
-    x_reshape_.push_back(1);
-    x_bcast_.push_back(1);
-    y_reshape_.push_back(1);
-    y_bcast_.push_back(1);
-  }
+    if (result_.empty()) {
+      // Can happen when both x and y are effectively scalar.
+      result_.push_back(1);
+      x_reshape_.push_back(1);
+      x_bcast_.push_back(1);
+      y_reshape_.push_back(1);
+      y_bcast_.push_back(1);
+    }
 
-  // Reverse all vectors since x and y were reversed at very
-  // beginning.
-  Reverse(&x_reshape_);
-  Reverse(&x_bcast_);
-  Reverse(&y_reshape_);
-  Reverse(&y_bcast_);
-  Reverse(&result_);
-  Reverse(&output_);
-  Reverse(&grad_x_reduce_idx_);
-  Reverse(&grad_y_reduce_idx_);
+    // Reverse all vectors since x and y were reversed at the very
+    // beginning.
+    Reverse(&x_reshape_);
+    Reverse(&x_bcast_);
+    Reverse(&y_reshape_);
+    Reverse(&y_bcast_);
+    Reverse(&result_);
+    Reverse(&output_);
+    Reverse(&grad_x_reduce_idx_);
+    Reverse(&grad_y_reduce_idx_);
+  }
 }
 
 }  // end namespace tensorflow

diff --git a/tensorflow/core/util/bcast.h b/tensorflow/core/util/bcast.h
index 1d01ce809dab9c..1eeb97778f10a5 100644
--- a/tensorflow/core/util/bcast.h
+++ b/tensorflow/core/util/bcast.h
@@ -67,7 +67,7 @@ namespace tensorflow {
 // TODO(zhifengc): Adds support for n-ary (n >= 2).
 class BCast {
  public:
-  // A vector of int32 representing the shape of tensor. The 0-th
+  // A vector of int64 representing the shape of a tensor. The 0-th
   // element is the outer-most dimension and the last element is the
   // inner-most dimension. Note that we do not use TensorShape since
   // it's more convenient to manipulate Vec directly for this module.
@@ -104,7 +104,6 @@ class BCast {
   Vec grad_y_reduce_idx_;
 
   static void Reverse(Vec* shape);
-  static bool HasZero(const Vec& shape);
 
   TF_DISALLOW_COPY_AND_ASSIGN(BCast);
 };

diff --git a/tensorflow/core/util/bcast_test.cc b/tensorflow/core/util/bcast_test.cc
index f2c095097578eb..a9602e6db8d015 100644
--- a/tensorflow/core/util/bcast_test.cc
+++ b/tensorflow/core/util/bcast_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 namespace {
@@ -57,6 +58,15 @@ TEST(BCastTest, Basic_SameShape) {
             "[][]");
 }
 
+TEST(BCastTest, Basic_SameShapeWithZeroDim) {
+  // Effectively no broadcast needed.
+  EXPECT_EQ(BCast({11, 7, 0, 3, 2}, {11, 7, 0, 3, 2}),
+            "[0][1][0][1]"
+            "[0]"
+            "[11,7,0,3,2]"
+            "[][]");
+}
+
 TEST(BCastTest, Basic_Scalar_Scalar) {
   // Effectively it's a scalar and a scalar.
   // [1, 1] [1]
@@ -237,5 +247,20 @@ TEST(BCastTest, TestZeroDimensionShape) {
             "[0,1,3][]");
 }
 
+static void BM_BCastSetup(int iters, int same_shape) {
+  if (same_shape) {
+    testing::SetLabel("same_shapes");
+    while (--iters > 0) {
+      BCast b({1000, 100}, {1000, 100});
+    }
+  } else {
+    testing::SetLabel("different_shapes");
+    while (--iters > 0) {
+      BCast b({3, 1, 5}, {2, 0, 3, 0, 5});
+    }
+  }
+}
+BENCHMARK(BM_BCastSetup)->Arg(0)->Arg(1);
+
 }  // namespace
 }  // namespace tensorflow
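
A minimal usage sketch (illustration only, not part of the patch) of what the
two constructor paths compute. It assumes the accessor methods declared in
bcast.h (IsValid(), x_reshape(), x_bcast(), output_shape()); the shapes and
printed values below are just examples:

  // Illustration only; assumes the BCast accessors declared in bcast.h.
  #include <iostream>

  #include "tensorflow/core/util/bcast.h"

  int main() {
    // Identical shapes take the new fast path: both inputs collapse to a
    // single flattened dimension of 1000 * 100 = 100000 elements, with a
    // broadcast factor of 1 on each side (no broadcasting needed).
    tensorflow::BCast same({1000, 100}, {1000, 100});
    std::cout << same.IsValid() << "\n";       // 1
    std::cout << same.x_reshape()[0] << "\n";  // 100000
    std::cout << same.x_bcast()[0] << "\n";    // 1

    // Different shapes still go through the general per-dimension analysis:
    // here x's middle dimension of 1 is broadcast up to y's 5, so the
    // output shape is [3, 5, 7].
    tensorflow::BCast diff({3, 1, 7}, {3, 5, 7});
    std::cout << diff.IsValid() << "\n";          // 1
    std::cout << diff.output_shape()[1] << "\n";  // 5
    return 0;
  }

Note that the fast path still fills in output_ dimension by dimension, so
output_shape() keeps the original rank, while the reshape/bcast vectors
collapse to a single flattened dimension.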