Skip to content

Commit

Permalink
Faster unsigned division by constants (dotnet#49585)
Browse files Browse the repository at this point in the history
* Faster unsigned division by constants

* Fix Arm build and add some tests.

* Improve register allocation

* Fix ARM64 codegen

* Fix MULHI flags

* Remove ARM32 codegen

* Widen 32bit UDIV to 64bit MULHI when possible. Improve register allocation.

* Always widen 32bit UDIV to 64bit MUL/MULHI

* Cleanup

* Final optimization

* Fix typo
  • Loading branch information
pentp authored May 18, 2021
1 parent 63ea9c6 commit e4b4807
Show file tree
Hide file tree
Showing 15 changed files with 328 additions and 120 deletions.
10 changes: 10 additions & 0 deletions THIRD-PARTY-NOTICES.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -942,3 +942,13 @@ OF SUCH DAMAGES.
You acknowledge that this software is not designed, licensed or
intended for use in the design, construction, operation or
maintenance of any nuclear facility.


License notice for "Faster Unsigned Division by Constants"
------------------------------

Reference implementations of computing and using the "magic number" approach to dividing
by constants, including codegen instructions. The unsigned division incorporates the
"round down" optimization per ridiculous_fish.

This is free and unencumbered software. Any copyright is dedicated to the Public Domain.
3 changes: 2 additions & 1 deletion src/coreclr/jit/assertionprop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5152,8 +5152,9 @@ Compiler::fgWalkResult Compiler::optVNConstantPropCurStmt(BasicBlock* block, Sta
case GT_INTRINSIC:
break;

case GT_INC_SATURATE:
case GT_MULHI:
assert(false && "Unexpected GT_MULHI node encountered before lowering");
assert(false && "Unexpected GT_INC_SATURATE/GT_MULHI node encountered before lowering");
break;

case GT_JTRUE:
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

void genCodeForDivMod(GenTreeOp* treeNode);
void genCodeForMul(GenTreeOp* treeNode);
void genCodeForIncSaturate(GenTree* treeNode);
void genCodeForMulHi(GenTreeOp* treeNode);
void genLeaInstruction(GenTreeAddrMode* lea);
void genSetRegToCond(regNumber dstReg, GenTree* tree);
Expand Down
22 changes: 22 additions & 0 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1753,6 +1753,28 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
}
}

//------------------------------------------------------------------------
// genCodeForIncSaturate: Produce code for a GT_INC_SATURATE node.
//
// Arguments:
//    tree - the GT_INC_SATURATE node; its single operand must be in a register.
//
// Notes:
//    Emits "adds dst, src, #1" followed by "cinv dst, dst, hs".
//    If the unsigned increment wraps around, the sum is zero and the carry
//    flag is set (condition HS), so the conditional invert turns the zero
//    into all ones, i.e. the result saturates at the maximum unsigned value.
//
void CodeGen::genCodeForIncSaturate(GenTree* tree)
{
    regNumber targetReg = tree->GetRegNum();

    // The arithmetic node must be sitting in a register (since it's not contained)
    assert(!tree->isContained());
    // The dst can only be a register.
    assert(targetReg != REG_NA);

    GenTree* operand = tree->gtGetOp1();
    assert(!operand->isContained());
    // The src must be a register.
    regNumber operandReg = genConsumeReg(operand);

    // Increment and set the flags; the carry flag records unsigned overflow.
    GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1);
    // On overflow (carry set) replace the wrapped-around result with its bitwise inverse.
    GetEmitter()->emitIns_R_R_COND(INS_cinv, emitActualTypeSize(tree), targetReg, targetReg, INS_COND_HS);

    genProduceReg(tree);
}

// Generate code to get the high N bits of a N*N=2N bit multiplication result
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
{
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)

#ifdef TARGET_ARM64

case GT_INC_SATURATE:
genCodeForIncSaturate(treeNode);
break;

case GT_MULHI:
genCodeForMulHi(treeNode->AsOp());
break;
Expand Down
25 changes: 25 additions & 0 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,27 @@ void CodeGen::genCodeForBswap(GenTree* tree)
genProduceReg(tree);
}

//------------------------------------------------------------------------
// genCodeForIncSaturate: Produce code for a GT_INC_SATURATE node.
//
// Arguments:
//    tree - the GT_INC_SATURATE node; its operand must be used from a register.
//
// Notes:
//    The operand is copied into the target register when the two differ, then
//    "add dst, 1" / "sbb dst, 0" is emitted. If the add carries out (the value
//    wrapped to zero), the sbb subtracts the borrow and leaves all ones.
//
void CodeGen::genCodeForIncSaturate(GenTree* tree)
{
    const regNumber dstReg  = tree->GetRegNum();
    const var_types dstType = tree->TypeGet();

    GenTree* const src = tree->gtGetOp1();
    assert(src->isUsedFromReg());
    const regNumber srcReg = genConsumeReg(src);

    // Bring the operand into the destination register first, if it is not already there.
    if (dstReg != srcReg)
    {
        inst_RV_RV(INS_mov, dstReg, srcReg, dstType);
    }

    // dst += 1, then dst -= CF: an overflowing increment ends up as all ones.
    inst_RV_IV(INS_add, dstReg, 1, emitActualTypeSize(dstType));
    inst_RV_IV(INS_sbb, dstReg, 0, emitActualTypeSize(dstType));

    genProduceReg(tree);
}

// Generate code to get the high N bits of a N*N=2N bit multiplication result
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
{
Expand Down Expand Up @@ -1608,6 +1629,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
genCodeForIndir(treeNode->AsIndir());
break;

case GT_INC_SATURATE:
genCodeForIncSaturate(treeNode);
break;

case GT_MULHI:
#ifdef TARGET_X86
case GT_MUL_LONG:
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -10793,6 +10793,7 @@ class GenTreeVisitor
case GT_RETFILT:
case GT_RUNTIMELOOKUP:
case GT_KEEPALIVE:
case GT_INC_SATURATE:
{
GenTreeUnOp* const unOp = node->AsUnOp();
if (unOp->gtOp1 != nullptr)
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4328,6 +4328,7 @@ void GenTree::VisitOperands(TVisitor visitor)
#endif // FEATURE_ARG_SPLIT
case GT_RETURNTRAP:
case GT_KEEPALIVE:
case GT_INC_SATURATE:
visitor(this->AsUnOp()->gtOp1);
return;

Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5217,6 +5217,7 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use)
case GT_BSWAP:
case GT_BSWAP16:
case GT_KEEPALIVE:
case GT_INC_SATURATE:
if (def == this->AsUnOp()->gtOp1)
{
*use = &this->AsUnOp()->gtOp1;
Expand Down Expand Up @@ -9315,6 +9316,7 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node)
case GT_BSWAP:
case GT_BSWAP16:
case GT_KEEPALIVE:
case GT_INC_SATURATE:
#if FEATURE_ARG_SPLIT
case GT_PUTARG_SPLIT:
#endif // FEATURE_ARG_SPLIT
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/gtlist.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ GTNODE(RSH , GenTreeOp ,0,GTK_BINOP)
GTNODE(RSZ , GenTreeOp ,0,GTK_BINOP)
GTNODE(ROL , GenTreeOp ,0,GTK_BINOP)
GTNODE(ROR , GenTreeOp ,0,GTK_BINOP)
GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod)
GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply)
// GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn
// the div into a MULHI + some adjustments. In codegen, we only use the
Expand Down
172 changes: 117 additions & 55 deletions src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5171,31 +5171,48 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
if (!comp->opts.MinOpts() && (divisorValue >= 3))
{
size_t magic;
bool add;
int shift;
bool increment;
int preShift;
int postShift;
bool simpleMul = false;

if (type == TYP_INT)
{
magic = MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &add, &shift);
magic =
MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &increment, &preShift, &postShift);

#ifdef TARGET_64BIT
// avoid inc_saturate/multiple shifts by widening to 32x64 MULHI
if (increment || (preShift
#ifdef TARGET_XARCH
// IMUL reg,reg,imm32 can't be used if magic<0 because of sign-extension
&& static_cast<int32_t>(magic) < 0
#endif
))
{
magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift,
&postShift, 32);
}
// otherwise just widen to regular multiplication
else
{
postShift += 32;
simpleMul = true;
}
#endif
}
else
{
#ifdef TARGET_64BIT
magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &add, &shift);
magic =
MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift, &postShift);
#else
unreached();
#endif
}
assert(divMod->MarkedDivideByConstOptimized());

// Depending on the "add" flag returned by GetUnsignedMagicNumberForDivide we need to generate:
// add == false (when divisor == 3 for example):
// div = (dividend MULHI magic) RSZ shift
// add == true (when divisor == 7 for example):
// mulhi = dividend MULHI magic
// div = (((dividend SUB mulhi) RSZ 1) ADD mulhi)) RSZ (shift - 1)
const bool requiresAdjustment = add;
const bool requiresDividendMultiuse = requiresAdjustment || !isDiv;
const bool requiresDividendMultiuse = !isDiv;
const BasicBlock::weight_t curBBWeight = m_block->getBBWeight(comp);

if (requiresDividendMultiuse)
Expand All @@ -5204,62 +5221,107 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
dividend = ReplaceWithLclVar(dividendUse);
}

// Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
// The existing node will later be transformed into a GT_RSZ/GT_SUB that
// computes the final result. This way we don't need to find and change the use
// of the existing node.
GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, dividend, divisor);
mulhi->gtFlags |= GTF_UNSIGNED;
divisor->AsIntCon()->SetIconValue(magic);
BlockRange().InsertBefore(divMod, mulhi);
GenTree* firstNode = mulhi;
GenTree* firstNode = nullptr;
GenTree* adjustedDividend = dividend;

if (requiresAdjustment)
// If "increment" flag is returned by GetUnsignedMagic we need to do Saturating Increment first
if (increment)
{
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
GenTree* sub = comp->gtNewOperNode(GT_SUB, type, dividend, mulhi);
BlockRange().InsertBefore(divMod, dividend, sub);

GenTree* one = comp->gtNewIconNode(1, TYP_INT);
GenTree* rsz = comp->gtNewOperNode(GT_RSZ, type, sub, one);
BlockRange().InsertBefore(divMod, one, rsz);

LIR::Use mulhiUse(BlockRange(), &sub->AsOp()->gtOp2, sub);
mulhi = ReplaceWithLclVar(mulhiUse);

mulhi = comp->gtNewLclvNode(mulhi->AsLclVar()->GetLclNum(), mulhi->TypeGet());
GenTree* add = comp->gtNewOperNode(GT_ADD, type, rsz, mulhi);
BlockRange().InsertBefore(divMod, mulhi, add);

mulhi = add;
shift -= 1;
adjustedDividend = comp->gtNewOperNode(GT_INC_SATURATE, type, adjustedDividend);
BlockRange().InsertBefore(divMod, adjustedDividend);
firstNode = adjustedDividend;
assert(!preShift);
}
// if "preShift" is required, then do a right shift before
else if (preShift)
{
GenTree* preShiftBy = comp->gtNewIconNode(preShift, TYP_INT);
adjustedDividend = comp->gtNewOperNode(GT_RSZ, type, adjustedDividend, preShiftBy);
BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend);
firstNode = preShiftBy;
}
else if (type != TYP_I_IMPL)
{
adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_U_IMPL);
BlockRange().InsertBefore(divMod, adjustedDividend);
firstNode = adjustedDividend;
}

#ifdef TARGET_XARCH
// force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes
// redundant copies otherwise
if (firstNode && !simpleMul)
adjustedDividend->SetRegNum(REG_RAX);
#endif

GenTree* shiftBy = comp->gtNewIconNode(shift, TYP_INT);
BlockRange().InsertBefore(divMod, shiftBy);
divisor->gtType = TYP_I_IMPL;
divisor->AsIntCon()->SetIconValue(magic);

if (isDiv)
if (isDiv && !postShift && type == TYP_I_IMPL)
{
divMod->SetOper(GT_RSZ);
divMod->gtOp1 = mulhi;
divMod->gtOp2 = shiftBy;
divMod->SetOper(GT_MULHI);
divMod->gtOp1 = adjustedDividend;
divMod->gtFlags |= GTF_UNSIGNED;
}
else
{
GenTree* div = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy);
// Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
// The existing node will later be transformed into a GT_RSZ/GT_SUB that
// computes the final result. This way we don't need to find and change the use
// of the existing node.
GenTree* mulhi = comp->gtNewOperNode(simpleMul ? GT_MUL : GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor);
mulhi->gtFlags |= GTF_UNSIGNED;
BlockRange().InsertBefore(divMod, mulhi);
if (!firstNode)
firstNode = mulhi;

if (postShift)
{
GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT);
BlockRange().InsertBefore(divMod, shiftBy);

// dividend UMOD divisor = dividend SUB (div MUL divisor)
GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor);
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
if (isDiv && type == TYP_I_IMPL)
{
divMod->SetOper(GT_RSZ);
divMod->gtOp1 = mulhi;
divMod->gtOp2 = shiftBy;
}
else
{
mulhi = comp->gtNewOperNode(GT_RSZ, TYP_I_IMPL, mulhi, shiftBy);
BlockRange().InsertBefore(divMod, mulhi);
}
}

if (!isDiv)
{
// dividend UMOD divisor = dividend SUB (div MUL divisor)
GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
GenTree* mul = comp->gtNewOperNode(GT_MUL, type, mulhi, divisor);
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());

divMod->SetOper(GT_SUB);
divMod->gtOp1 = dividend;
divMod->gtOp2 = mul;
divMod->SetOper(GT_SUB);
divMod->gtOp1 = dividend;
divMod->gtOp2 = mul;

BlockRange().InsertBefore(divMod, div, divisor, mul, dividend);
BlockRange().InsertBefore(divMod, divisor, mul, dividend);
}
else if (type != TYP_I_IMPL)
{
#ifdef TARGET_ARMARCH
divMod->SetOper(GT_CAST);
divMod->gtFlags |= GTF_UNSIGNED;
divMod->AsCast()->gtCastType = TYP_UINT;
#else
divMod->SetOper(GT_BITCAST);
#endif
divMod->gtOp1 = mulhi;
divMod->gtOp2 = nullptr;
}
}
ContainCheckRange(firstNode, divMod);

if (firstNode)
ContainCheckRange(firstNode, divMod);
return true;
}
#endif
Expand Down
Loading

0 comments on commit e4b4807

Please sign in to comment.