[XLA] Avoid implicit layout change for non-layout-changing instructions.
Add routine InstructionRequiresInputLayoutEqualToOutputLayout to layout
assignment and use it to decide whether the output layout of an instruction
needs to be propagated to the operands of the instruction during layout
assignment. This also fixes the performance issue exposed in b/112646847, where
a kSlice instruction with an implicit layout change is fused into a
multi-output reduction fusion and causes memory throttling.

Add test cases.

PiperOrigin-RevId: 210622658
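
For illustration, the kind of implicit layout change this avoids, in a hypothetical HLO fragment that mirrors the new test cases below (not itself part of the commit):

  par1 = f32[4,5]{0,1} parameter(1)
  slice0 = f32[3,4]{1,0} slice(par1), slice={[1:4],[1:5]}

Previously, layout assignment could leave slice0 reading par1 in the {0,1} layout while producing a {1,0} result, an implicit layout change inside the slice. With this change, the {1,0} output layout is propagated to the slice operand; since par1 is pinned to {0,1} by the entry layout, a copy of par1 in the {1,0} layout is inserted as the slice operand (the copy.1 instruction that the new tests look for).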
bixia1 authored and tensorflower-gardener committed Aug 28, 2018
1 parent 6729cd7 commit bf7b20e
Showing 3 changed files with 227 additions and 10 deletions.
122 changes: 112 additions & 10 deletions tensorflow/compiler/xla/service/layout_assignment.cc
@@ -980,16 +980,17 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
   CHECK(ShapeUtil::IsArray(instruction->shape()));
   CHECK(ShapeUtil::IsArray(operand->shape()));

-  if (instruction->IsElementwiseOnOperand(operand_no) &&
-      !ShapeUtil::IsScalar(operand->shape()) &&
+  if (!ShapeUtil::IsScalar(operand->shape()) &&
       ShapeUtil::Rank(operand->shape()) ==
-          ShapeUtil::Rank(instruction->shape())) {
-    // Assign operands the same layout as the instruction, so that
+          ShapeUtil::Rank(instruction->shape()) &&
+      InstructionRequiresInputLayoutEqualToOutputLayout(instruction)) {
+    // Propagate the result layout to the operand layout if the instruction
+    // requires the same layout for the result and the operand.
+    //
+    // For elementwise operations, using the same layout for the operands and
+    // the result also has the following benefits:
     // 1) the elementwise operation can reuse its operand's buffer, and
     // 2) the input and output elements can reuse the same linear index.
-    //
-    // TODO(jingyue): Other operations, such as kSlice and kConcat, can benefit
-    // from assigning the same layout to input and output.
     return absl::make_unique<Layout>(output_layout);
   }

@@ -1058,9 +1059,9 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
   CHECK(ShapeUtil::IsArray(user->shape()) &&
         ShapeUtil::IsArray(operand->shape()));

-  if (user->IsElementwiseOnOperand(operand_no) &&
-      !ShapeUtil::IsScalar(operand->shape()) &&
-      ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(user->shape())) {
+  if (!ShapeUtil::IsScalar(operand->shape()) &&
+      ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(user->shape()) &&
+      InstructionRequiresInputLayoutEqualToOutputLayout(user)) {
     // Assign users the same layout as the operand.
     return absl::make_unique<Layout>(operand_layout);
   }
@@ -1803,6 +1804,107 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
return true;
}

bool LayoutAssignment::InstructionRequiresInputLayoutEqualToOutputLayout(
const HloInstruction* instruction) {
switch (instruction->opcode()) {
case HloOpcode::kAbs:
case HloOpcode::kAdd:
case HloOpcode::kAnd:
case HloOpcode::kAtan2:
case HloOpcode::kBitcastConvert:
case HloOpcode::kCeil:
case HloOpcode::kClamp:
case HloOpcode::kClz:
case HloOpcode::kComplex:
case HloOpcode::kConcatenate:
case HloOpcode::kConditional:
case HloOpcode::kConvert:
case HloOpcode::kCos:
case HloOpcode::kCrossReplicaSum:
case HloOpcode::kAllToAll:
case HloOpcode::kCollectivePermute:
case HloOpcode::kCustomCall:
case HloOpcode::kDivide:
case HloOpcode::kDynamicSlice:
case HloOpcode::kDynamicUpdateSlice:
case HloOpcode::kEq:
case HloOpcode::kExp:
case HloOpcode::kExpm1:
case HloOpcode::kFft:
case HloOpcode::kFloor:
case HloOpcode::kGe:
case HloOpcode::kGt:
case HloOpcode::kImag:
case HloOpcode::kIsFinite:
case HloOpcode::kLe:
case HloOpcode::kLog:
case HloOpcode::kLog1p:
case HloOpcode::kLt:
case HloOpcode::kMap:
case HloOpcode::kMaximum:
case HloOpcode::kMinimum:
case HloOpcode::kMultiply:
case HloOpcode::kNe:
case HloOpcode::kNegate:
case HloOpcode::kNot:
case HloOpcode::kOr:
case HloOpcode::kXor:
case HloOpcode::kPad:
case HloOpcode::kPower:
case HloOpcode::kReal:
case HloOpcode::kReducePrecision:
case HloOpcode::kReduceWindow:
case HloOpcode::kRemainder:
case HloOpcode::kReverse:
case HloOpcode::kRoundNearestAfz:
case HloOpcode::kSelect:
case HloOpcode::kSelectAndScatter:
case HloOpcode::kShiftLeft:
case HloOpcode::kShiftRightArithmetic:
case HloOpcode::kShiftRightLogical:
case HloOpcode::kSign:
case HloOpcode::kSin:
case HloOpcode::kSlice:
case HloOpcode::kSort:
case HloOpcode::kSubtract:
case HloOpcode::kTanh:
case HloOpcode::kTupleSelect:
case HloOpcode::kWhile:
return true;
case HloOpcode::kBatchNormGrad:
case HloOpcode::kBatchNormInference:
case HloOpcode::kBatchNormTraining:
case HloOpcode::kBitcast:
case HloOpcode::kBroadcast:
case HloOpcode::kCall:
case HloOpcode::kConstant:
case HloOpcode::kConvolution:
case HloOpcode::kCopy:
case HloOpcode::kDomain:
case HloOpcode::kDot:
case HloOpcode::kFusion:
case HloOpcode::kGather:
case HloOpcode::kGetTupleElement:
case HloOpcode::kInfeed:
case HloOpcode::kIota:
case HloOpcode::kOutfeed:
case HloOpcode::kParameter:
case HloOpcode::kRecv:
case HloOpcode::kRecvDone:
case HloOpcode::kReduce:
case HloOpcode::kReshape:
case HloOpcode::kRng:
case HloOpcode::kScatter:
case HloOpcode::kSend:
case HloOpcode::kSendDone:
case HloOpcode::kAfterAll:
case HloOpcode::kTrace:
case HloOpcode::kTranspose:
case HloOpcode::kTuple:
return false;
}
}

Status LayoutAssignment::Init() {
computation_layouts_.clear();
*entry_computation_layout_ = saved_entry_computation_layout_;
5 changes: 5 additions & 0 deletions tensorflow/compiler/xla/service/layout_assignment.h
@@ -303,6 +303,11 @@ class LayoutAssignment : public HloPassInterface {
// (any layouts were changed).
StatusOr<bool> Run(HloModule* module) override;

// Returns true if the instruction requires operands with the same rank as
// the output to have the same layout as the output.
virtual bool InstructionRequiresInputLayoutEqualToOutputLayout(
const HloInstruction* instruction);

protected:
// These methods, invoked by PropagateConstraints, propagate a layout
// constraint to its neighbors (i.e. operands and users) in order to minimize
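Because the new predicate is declared virtual, a backend-specific layout assignment pass can refine it. A minimal hypothetical sketch of such an override (illustration only, not part of this commit):

  #include "tensorflow/compiler/xla/service/layout_assignment.h"

  namespace xla {

  // Hypothetical backend pass: treat kConcatenate as layout-flexible on this
  // backend, and defer to the base class for every other opcode.
  class ExampleBackendLayoutAssignment : public LayoutAssignment {
   public:
    using LayoutAssignment::LayoutAssignment;

    bool InstructionRequiresInputLayoutEqualToOutputLayout(
        const HloInstruction* instruction) override {
      if (instruction->opcode() == HloOpcode::kConcatenate) {
        // Operands may keep layouts that differ from the output layout.
        return false;
      }
      return LayoutAssignment::InstructionRequiresInputLayoutEqualToOutputLayout(
          instruction);
    }
  };

  }  // namespace xla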
110 changes: 110 additions & 0 deletions tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -861,5 +861,115 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) {
ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0})));
}

TEST_F(LayoutAssignmentTest, CopySliceOperandToAvoidImplicitLayoutChange) {
const char* module_str = R"(
HloModule CopySliceOperandToAvoidImplicitLayoutChange
ENTRY CopySliceOperandToAvoidImplicitLayoutChange {
par0 = f32[3,4]{1,0} parameter(0)
par1 = f32[4,5]{0,1} parameter(1)
slice0 = f32[3,4] slice(par1), slice={[1:4],[1:5]}
ROOT add0 = f32[3,4]{1,0} add(par0,slice0)
}
)";

auto module = ParseHloString(module_str).ValueOrDie();
module =
backend()
.compiler()
->RunHloPasses(std::move(module), backend().default_stream_executor(),
/*device_allocator=*/nullptr)
.ConsumeValueOrDie();

auto copy = FindInstruction(module.get(), "copy.1");
auto slice = FindInstruction(module.get(), "slice0");
EXPECT_EQ(slice->operand(0), copy);
EXPECT_TRUE(
LayoutUtil::Equal(slice->shape().layout(), copy->shape().layout()));
}

TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
const char* module_str = R"(
HloModule CopyDSliceOperandToAvoidImplicitLayoutChange
ENTRY CopyDSliceOperandToAvoidImplicitLayoutChange {
par0 = f32[3,4]{1,0} parameter(0)
par1 = f32[4,5]{0,1} parameter(1)
par2 = s32[2] parameter(2)
dslice0 = f32[3,4] dynamic-slice(par1, par2), dynamic_slice_sizes={3,4}
ROOT add0 = f32[3,4]{1,0} add(par0,dslice0)
}
)";

auto module = ParseHloString(module_str).ValueOrDie();
module =
backend()
.compiler()
->RunHloPasses(std::move(module), backend().default_stream_executor(),
/*device_allocator=*/nullptr)
.ConsumeValueOrDie();

auto copy = FindInstruction(module.get(), "copy.1");
auto dslice = FindInstruction(module.get(), "dslice0");
EXPECT_EQ(dslice->operand(0), copy);
EXPECT_TRUE(
LayoutUtil::Equal(dslice->shape().layout(), copy->shape().layout()));
}

TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) {
const char* module_str = R"(
HloModule CopyConcatOperandToAvoidImplicitLayoutChange
ENTRY CopyConcatOperandToAvoidImplicitLayoutChange {
par0 = f32[3,8]{1,0} parameter(0)
par1 = f32[3,5]{0,1} parameter(1)
par2 = f32[3,3]{1,0} parameter(2)
concat0 = f32[3,8] concatenate(f32[3,5] par1, f32[3,3] par2),
dimensions={1}
ROOT add0 = f32[3,8]{1,0} add(par0,concat0)
}
)";

auto module = ParseHloString(module_str).ValueOrDie();
module =
backend()
.compiler()
->RunHloPasses(std::move(module), backend().default_stream_executor(),
/*device_allocator=*/nullptr)
.ConsumeValueOrDie();

auto copy = FindInstruction(module.get(), "copy.1");
auto concat = FindInstruction(module.get(), "concat0");
EXPECT_EQ(concat->operand(0), copy);
EXPECT_TRUE(
LayoutUtil::Equal(concat->shape().layout(), copy->shape().layout()));
}

TEST_F(LayoutAssignmentTest,
ConvolutionOperandWithImplicitLayoutChangeNotCopied) {
const char* module_str = R"(
HloModule ConvolutionOperandWithImplicitLayoutChangeNotCopied
ENTRY ConvolutionOperandWithImplicitLayoutChangeNotCopied {
par0 = f32[128,3,230,230]{2,3,1,0} parameter(0)
par1 = f32[7,7,3,64]{3,2,0,1} parameter(1)
ROOT convolution0 = f32[128,64,112,112]{3,2,1,0} convolution(par0, par1),
window={size=7x7 stride=2x2}, dim_labels=bf01_01io->bf01,
feature_group_count=1
}
)";

auto module = ParseHloString(module_str).ValueOrDie();
module =
backend()
.compiler()
->RunHloPasses(std::move(module), backend().default_stream_executor(),
/*device_allocator=*/nullptr)
.ConsumeValueOrDie();

auto copy = FindInstruction(module.get(), "copy.1");
EXPECT_EQ(copy, nullptr);
}

} // namespace
} // namespace xla
