From 78c32477682376b2e33263e0c73666f57bd2756b Mon Sep 17 00:00:00 2001 From: Will Smith Date: Wed, 12 Oct 2022 19:20:18 -0700 Subject: [PATCH 01/53] Using 3 instruction sequence for x64 multiply --- src/coreclr/jit/morph.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index d4eee4b3d21c30..69717abcc20f99 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -12624,6 +12624,19 @@ GenTree* Compiler::fgOptimizeMultiply(GenTreeOp* mul) return mul; } +#ifdef TARGET_AMD64 + if (op1->OperIs(GT_LCL_VAR)) + { + ssize_t multPlusOne = mult + 1; + + if (isPow2(multPlusOne)) + { + op2->AsIntConCommon()->SetIconValue(multPlusOne); + return fgMorphTree(gtNewOperNode(GT_SUB, mul->gtType, gtCloneExpr(mul), gtCloneExpr(op1))); + } + } +#endif + #ifdef TARGET_XARCH // Should we try to replace integer multiplication with lea/add/shift sequences? bool mulShiftOpt = compCodeOpt() != SMALL_CODE; From 06a60e1dd1a16571e8abdd21a3e5188827c26c01 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Wed, 12 Oct 2022 20:20:19 -0700 Subject: [PATCH 02/53] Do not do this in morph. Do it in codegen now. --- src/coreclr/jit/codegenxarch.cpp | 18 ++++++++++++++++++ src/coreclr/jit/morph.cpp | 13 ------------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index b634ceab5ff0df..f2bf6d91ce9450 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1106,6 +1106,24 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) inst_RV_SH(INS_shl, size, targetReg, shiftAmount); } +#ifdef TARGET_AMD64 + else if (!requiresOverflowCheck && rmOp->isUsedFromReg()) + { + ssize_t immPlusOne = imm + 1; + + if (isPow2(immPlusOne)) + { + uint64_t zextImm = static_cast(static_cast(immPlusOne)); + unsigned int shiftAmount = genLog2(zextImm ); + + inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); + + inst_RV_SH(INS_shl, size, targetReg, shiftAmount); + + GetEmitter()->emitIns_R_R(INS_sub, emitTypeSize(treeNode->gtType), targetReg, rmOp->GetRegNum()); + } + } +#endif else { // use the 3-op form with immediate diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 69717abcc20f99..d4eee4b3d21c30 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -12624,19 +12624,6 @@ GenTree* Compiler::fgOptimizeMultiply(GenTreeOp* mul) return mul; } -#ifdef TARGET_AMD64 - if (op1->OperIs(GT_LCL_VAR)) - { - ssize_t multPlusOne = mult + 1; - - if (isPow2(multPlusOne)) - { - op2->AsIntConCommon()->SetIconValue(multPlusOne); - return fgMorphTree(gtNewOperNode(GT_SUB, mul->gtType, gtCloneExpr(mul), gtCloneExpr(op1))); - } - } -#endif - #ifdef TARGET_XARCH // Should we try to replace integer multiplication with lea/add/shift sequences? bool mulShiftOpt = compCodeOpt() != SMALL_CODE; From 7fb0095416e751ca41aa9ec0747ea05793fe9f8c Mon Sep 17 00:00:00 2001 From: Will Smith Date: Wed, 12 Oct 2022 21:19:45 -0700 Subject: [PATCH 03/53] Fixing codegen --- src/coreclr/jit/codegenxarch.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index f2bf6d91ce9450..e90d0763e0316e 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1107,21 +1107,16 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) inst_RV_SH(INS_shl, size, targetReg, shiftAmount); } #ifdef TARGET_AMD64 - else if (!requiresOverflowCheck && rmOp->isUsedFromReg()) + else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && isPow2(imm + 1)) { - ssize_t immPlusOne = imm + 1; + uint64_t zextImm = static_cast(static_cast(imm + 1)); + unsigned int shiftAmount = genLog2(zextImm ); - if (isPow2(immPlusOne)) - { - uint64_t zextImm = static_cast(static_cast(immPlusOne)); - unsigned int shiftAmount = genLog2(zextImm ); - - inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); + inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); - inst_RV_SH(INS_shl, size, targetReg, shiftAmount); + inst_RV_SH(INS_shl, size, targetReg, shiftAmount); - GetEmitter()->emitIns_R_R(INS_sub, emitTypeSize(treeNode->gtType), targetReg, rmOp->GetRegNum()); - } + GetEmitter()->emitIns_R_R(INS_sub, emitTypeSize(treeNode->gtType), targetReg, rmOp->GetRegNum()); } #endif else From 86edb6c3946aad91ce5440ef450e6ba947f10ad9 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Thu, 13 Oct 2022 09:06:39 -0700 Subject: [PATCH 04/53] Only allow values under 127 and do not skip mov - correctness testing --- src/coreclr/jit/codegenxarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e90d0763e0316e..593ae5e790e927 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1107,12 +1107,12 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) inst_RV_SH(INS_shl, size, targetReg, shiftAmount); } #ifdef TARGET_AMD64 - else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && isPow2(imm + 1)) + else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm <= 127) && isPow2(imm + 1)) { uint64_t zextImm = static_cast(static_cast(imm + 1)); unsigned int shiftAmount = genLog2(zextImm ); - inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); + inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ false); inst_RV_SH(INS_shl, size, targetReg, shiftAmount); From 0955d117b212209026888b607f8f291d248df380 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Thu, 13 Oct 2022 12:12:18 -0700 Subject: [PATCH 05/53] Try to fix tests --- src/coreclr/jit/codegenxarch.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 593ae5e790e927..977f27e3d2d554 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1107,11 +1107,13 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) inst_RV_SH(INS_shl, size, targetReg, shiftAmount); } #ifdef TARGET_AMD64 - else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm <= 127) && isPow2(imm + 1)) + else if (!requiresOverflowCheck && isUnsignedMultiply && rmOp->isUsedFromReg() && (imm == 7) && isPow2(imm + 1)) { + // Use shift for constant multiply when legal uint64_t zextImm = static_cast(static_cast(imm + 1)); unsigned int shiftAmount = genLog2(zextImm ); + // Copy reg src to dest register inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ false); inst_RV_SH(INS_shl, size, targetReg, shiftAmount); From 865cf95f02dd2e6efd4e76f676f4ea09e8f7f647 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Thu, 13 Oct 2022 15:04:47 -0700 Subject: [PATCH 06/53] cleanup --- src/coreclr/jit/codegenxarch.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 977f27e3d2d554..e5f3ae2ed56973 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1107,18 +1107,18 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) inst_RV_SH(INS_shl, size, targetReg, shiftAmount); } #ifdef TARGET_AMD64 - else if (!requiresOverflowCheck && isUnsignedMultiply && rmOp->isUsedFromReg() && (imm == 7) && isPow2(imm + 1)) + else if (!requiresOverflowCheck && isUnsignedMultiply && rmOp->isUsedFromReg() && isPow2(imm + 1)) { // Use shift for constant multiply when legal uint64_t zextImm = static_cast(static_cast(imm + 1)); - unsigned int shiftAmount = genLog2(zextImm ); + unsigned int shiftAmount = genLog2(zextImm); // Copy reg src to dest register - inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ false); + inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); inst_RV_SH(INS_shl, size, targetReg, shiftAmount); - GetEmitter()->emitIns_R_R(INS_sub, emitTypeSize(treeNode->gtType), targetReg, rmOp->GetRegNum()); + GetEmitter()->emitIns_R_R(INS_sub, size, targetReg, rmOp->GetRegNum()); } #endif else From 999eac0de3282ec3cb54ec9d6343a9ca08b39579 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Thu, 13 Oct 2022 22:01:30 -0700 Subject: [PATCH 07/53] Moving to Lowering --- src/coreclr/jit/codegenxarch.cpp | 15 -------- src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lowerxarch.cpp | 60 ++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e5f3ae2ed56973..b634ceab5ff0df 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1106,21 +1106,6 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) inst_RV_SH(INS_shl, size, targetReg, shiftAmount); } -#ifdef TARGET_AMD64 - else if (!requiresOverflowCheck && isUnsignedMultiply && rmOp->isUsedFromReg() && isPow2(imm + 1)) - { - // Use shift for constant multiply when legal - uint64_t zextImm = static_cast(static_cast(imm + 1)); - unsigned int shiftAmount = genLog2(zextImm); - - // Copy reg src to dest register - inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); - - inst_RV_SH(INS_shl, size, targetReg, shiftAmount); - - GetEmitter()->emitIns_R_R(INS_sub, size, targetReg, rmOp->GetRegNum()); - } -#endif else { // use the 3-op form with immediate diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index f37aa6d2be5e03..0562edf420ec12 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -361,6 +361,7 @@ class Lowering final : public Phase GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode); GenTree* TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode); + GenTree* TryLowerMulOpToLshSub(GenTreeOp* mulOp); void LowerBswapOp(GenTreeOp* node); #elif defined(TARGET_ARM64) bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 048faf2e1843a3..5a9c31715b30b1 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -120,6 +120,12 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul) { assert(mul->OperIsMul()); + GenTree* replacementNode = TryLowerMulOpToLshSub(mul); + if (replacementNode != nullptr) + { + return replacementNode->gtNext; + } + ContainCheckMul(mul); return mul->gtNext; @@ -4147,6 +4153,60 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) return blsmskNode; } +//---------------------------------------------------------------------------------------------- +// Lowering::TryLowerAndOpToAndNot: Lowers a tree AND(X, NOT(Y)) to HWIntrinsic::AndNot +// +// Arguments: +// andNode - GT_AND node of integral type +// +// Return Value: +// Returns the replacement node if one is created else nullptr indicating no replacement +// +// Notes: +// Performs containment checks on the replacement node if one is created +GenTree* Lowering::TryLowerMulOpToLshSub(GenTreeOp* mulOp) +{ + assert(mulOp->OperIs(GT_MUL)); + + if (!varTypeIsLong(mulOp)) + return nullptr; + + if (!mulOp->gtOverflow()) + return nullptr; + + GenTree* op1 = mulOp->gtGetOp1(); + GenTree* op2 = mulOp->gtGetOp2(); + + if (op1->isContained() || op2->isContained()) + return nullptr; + + if (!op1->OperIs(GT_LCL_VAR)) + return nullptr; + + if (!op2->IsCnsIntOrI()) + return nullptr; + + GenTreeIntConCommon* cns = op2->AsIntConCommon(); + + ssize_t cnsVal = cns->IconValue() + 1; + + if (!isPow2(cnsVal)) + return nullptr; + + mulOp->ChangeOper(GT_SUB); + cns->SetIconValue(genLog2((unsigned int)cnsVal)); + + mulOp->gtOp1 = comp->gtNewOperNode(GT_LSH, mulOp->gtType, op1, cns); + mulOp->gtOp2 = comp->gtClone(op1); + + BlockRange().InsertBefore(mulOp, mulOp->gtOp2); + BlockRange().InsertBefore(mulOp, cns); + BlockRange().InsertBefore(mulOp, op1); + BlockRange().InsertBefore(mulOp, mulOp->gtOp1); + + return LowerNode(mulOp); +} + //---------------------------------------------------------------------------------------------- // Lowering::LowerBswapOp: Tries to contain GT_BSWAP node when possible // From 6ebf58e1880bfefce857bd225d444039ce1e6420 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Thu, 13 Oct 2022 22:17:26 -0700 Subject: [PATCH 08/53] Quick fix --- src/coreclr/jit/lowerxarch.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 5a9c31715b30b1..f048d108d36235 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -120,10 +120,13 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul) { assert(mul->OperIsMul()); - GenTree* replacementNode = TryLowerMulOpToLshSub(mul); - if (replacementNode != nullptr) + if (mul->OperIs(GT_MUL)) { - return replacementNode->gtNext; + GenTree* replacementNode = TryLowerMulOpToLshSub(mul); + if (replacementNode != nullptr) + { + return replacementNode->gtNext; + } } ContainCheckMul(mul); @@ -4171,7 +4174,7 @@ GenTree* Lowering::TryLowerMulOpToLshSub(GenTreeOp* mulOp) if (!varTypeIsLong(mulOp)) return nullptr; - if (!mulOp->gtOverflow()) + if (mulOp->gtOverflow()) return nullptr; GenTree* op1 = mulOp->gtGetOp1(); @@ -4204,7 +4207,7 @@ GenTree* Lowering::TryLowerMulOpToLshSub(GenTreeOp* mulOp) BlockRange().InsertBefore(mulOp, op1); BlockRange().InsertBefore(mulOp, mulOp->gtOp1); - return LowerNode(mulOp); + return mulOp; } //---------------------------------------------------------------------------------------------- From 6e8f28c24c5744683df6f1aa5c03628f8225767f Mon Sep 17 00:00:00 2001 From: Will Smith Date: Thu, 13 Oct 2022 22:43:44 -0700 Subject: [PATCH 09/53] Fully works in lowering now --- src/coreclr/jit/lowerxarch.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f048d108d36235..8719f7c7de5762 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4157,10 +4157,10 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) } //---------------------------------------------------------------------------------------------- -// Lowering::TryLowerAndOpToAndNot: Lowers a tree AND(X, NOT(Y)) to HWIntrinsic::AndNot +// Lowering::TryLowerMulOpToLshSub: Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS), X) // // Arguments: -// andNode - GT_AND node of integral type +// mulOp - GT_MUL node of integral type // // Return Value: // Returns the replacement node if one is created else nullptr indicating no replacement @@ -4202,10 +4202,15 @@ GenTree* Lowering::TryLowerMulOpToLshSub(GenTreeOp* mulOp) mulOp->gtOp1 = comp->gtNewOperNode(GT_LSH, mulOp->gtType, op1, cns); mulOp->gtOp2 = comp->gtClone(op1); - BlockRange().InsertBefore(mulOp, mulOp->gtOp2); + BlockRange().Remove(op1); + BlockRange().Remove(cns); + BlockRange().InsertBefore(mulOp, mulOp->gtGetOp2()); BlockRange().InsertBefore(mulOp, cns); BlockRange().InsertBefore(mulOp, op1); - BlockRange().InsertBefore(mulOp, mulOp->gtOp1); + BlockRange().InsertBefore(mulOp, mulOp->gtGetOp1()); + + ContainCheckBinary(mulOp); + ContainCheckShiftRotate(mulOp->gtGetOp1()->AsOp()); return mulOp; } From 94968a43050ca809ecb8426678144cc16141e0e3 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 00:08:44 -0700 Subject: [PATCH 10/53] Account for all ints --- src/coreclr/jit/lowerxarch.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 8719f7c7de5762..599a589b5eed72 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4157,7 +4157,7 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) } //---------------------------------------------------------------------------------------------- -// Lowering::TryLowerMulOpToLshSub: Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS), X) +// Lowering::TryLowerMulOpToLshSub: Lowers a tree MUL(X, CNS) to SUB(LSH(X, log2(CNS)), X) // // Arguments: // mulOp - GT_MUL node of integral type @@ -4171,7 +4171,7 @@ GenTree* Lowering::TryLowerMulOpToLshSub(GenTreeOp* mulOp) { assert(mulOp->OperIs(GT_MUL)); - if (!varTypeIsLong(mulOp)) + if (!varTypeIsIntegral(mulOp)) return nullptr; if (mulOp->gtOverflow()) @@ -4196,8 +4196,14 @@ GenTree* Lowering::TryLowerMulOpToLshSub(GenTreeOp* mulOp) if (!isPow2(cnsVal)) return nullptr; + unsigned int shiftAmount = genLog2((unsigned int)cnsVal); + + // If shift amount is less than or equal to two then this will emit INS_lea, so return. + if (shiftAmount <= 2) + return nullptr; + mulOp->ChangeOper(GT_SUB); - cns->SetIconValue(genLog2((unsigned int)cnsVal)); + cns->SetIconValue(shiftAmount); mulOp->gtOp1 = comp->gtNewOperNode(GT_LSH, mulOp->gtType, op1, cns); mulOp->gtOp2 = comp->gtClone(op1); From 64b523ae133a23d8a4c9ce3889181bbb8ccb739a Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 01:51:16 -0700 Subject: [PATCH 11/53] Take into account codegen opts --- src/coreclr/jit/lower.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 55 +++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 0562edf420ec12..92c15239b7562d 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -361,7 +361,7 @@ class Lowering final : public Phase GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode); GenTree* TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode); - GenTree* TryLowerMulOpToLshSub(GenTreeOp* mulOp); + GenTree* TryLowerMul(GenTreeOp* mulOp); void LowerBswapOp(GenTreeOp* node); #elif defined(TARGET_ARM64) bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 599a589b5eed72..cf9de341f0a68c 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -122,7 +122,7 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul) if (mul->OperIs(GT_MUL)) { - GenTree* replacementNode = TryLowerMulOpToLshSub(mul); + GenTree* replacementNode = TryLowerMul(mul); if (replacementNode != nullptr) { return replacementNode->gtNext; @@ -4157,7 +4157,7 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) } //---------------------------------------------------------------------------------------------- -// Lowering::TryLowerMulOpToLshSub: Lowers a tree MUL(X, CNS) to SUB(LSH(X, log2(CNS)), X) +// Lowering::TryLowerMul: Lowers a tree MUL(X, CNS) to SUB(LSH(X, log2(CNS)), X) // // Arguments: // mulOp - GT_MUL node of integral type @@ -4167,10 +4167,13 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) // // Notes: // Performs containment checks on the replacement node if one is created -GenTree* Lowering::TryLowerMulOpToLshSub(GenTreeOp* mulOp) +GenTree* Lowering::TryLowerMul(GenTreeOp* mulOp) { assert(mulOp->OperIs(GT_MUL)); +#if TARGET_X86 + return nullptr; +#else if (!varTypeIsIntegral(mulOp)) return nullptr; @@ -4189,20 +4192,51 @@ GenTree* Lowering::TryLowerMulOpToLshSub(GenTreeOp* mulOp) if (!op2->IsCnsIntOrI()) return nullptr; - GenTreeIntConCommon* cns = op2->AsIntConCommon(); + GenTreeIntConCommon* cns = op2->AsIntConCommon(); + ssize_t cnsVal = cns->IconValue(); - ssize_t cnsVal = cns->IconValue() + 1; + // Use GT_LSH if cnsVal is a power of two. + // This is handled in codegen. + if (isPow2(cnsVal)) + { + return nullptr; + } - if (!isPow2(cnsVal)) + // Use GT_LEA if cnsVal is 3, 5, or 9. + // This is handled in codegen. + if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9) + { return nullptr; + } - unsigned int shiftAmount = genLog2((unsigned int)cnsVal); + ssize_t cnsValPlusOne = cnsVal + 1; + ssize_t cnsValMinusOne = cnsVal - 1; + + bool useSub = isPow2(cnsValPlusOne); - // If shift amount is less than or equal to two then this will emit INS_lea, so return. - if (shiftAmount <= 2) + if (!useSub && !isPow2(cnsValMinusOne)) return nullptr; - mulOp->ChangeOper(GT_SUB); + if (useSub) + { + cnsVal = cnsValPlusOne; + } + else + { + cnsVal = cnsValMinusOne; + } + + unsigned int shiftAmount = genLog2((unsigned int)cnsVal); + + if (useSub) + { + mulOp->ChangeOper(GT_SUB); + } + else + { + mulOp->ChangeOper(GT_ADD); + } + cns->SetIconValue(shiftAmount); mulOp->gtOp1 = comp->gtNewOperNode(GT_LSH, mulOp->gtType, op1, cns); @@ -4219,6 +4253,7 @@ GenTree* Lowering::TryLowerMulOpToLshSub(GenTreeOp* mulOp) ContainCheckShiftRotate(mulOp->gtGetOp1()->AsOp()); return mulOp; +#endif } //---------------------------------------------------------------------------------------------- From 1cde9c09b6564db0a854a48dcaf82898e7c7b1b8 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 01:53:27 -0700 Subject: [PATCH 12/53] Minor cleanup --- src/coreclr/jit/lowerxarch.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index cf9de341f0a68c..cac18797841866 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4220,23 +4220,15 @@ GenTree* Lowering::TryLowerMul(GenTreeOp* mulOp) if (useSub) { cnsVal = cnsValPlusOne; - } - else - { - cnsVal = cnsValMinusOne; - } - - unsigned int shiftAmount = genLog2((unsigned int)cnsVal); - - if (useSub) - { mulOp->ChangeOper(GT_SUB); } else { + cnsVal = cnsValMinusOne; mulOp->ChangeOper(GT_ADD); } + unsigned int shiftAmount = genLog2((unsigned int)cnsVal); cns->SetIconValue(shiftAmount); mulOp->gtOp1 = comp->gtNewOperNode(GT_LSH, mulOp->gtType, op1, cns); From c6fba6e02b9365fad694789444ce3f295288c41c Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 01:54:56 -0700 Subject: [PATCH 13/53] Minor cleanup --- src/coreclr/jit/lowerxarch.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index cac18797841866..e81e0225b2831f 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4198,16 +4198,12 @@ GenTree* Lowering::TryLowerMul(GenTreeOp* mulOp) // Use GT_LSH if cnsVal is a power of two. // This is handled in codegen. if (isPow2(cnsVal)) - { return nullptr; - } // Use GT_LEA if cnsVal is 3, 5, or 9. // This is handled in codegen. if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9) - { return nullptr; - } ssize_t cnsValPlusOne = cnsVal + 1; ssize_t cnsValMinusOne = cnsVal - 1; From ff2b9a12cd4ebb4512323c69a8af39e888675420 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 13:43:30 -0700 Subject: [PATCH 14/53] Fixed test --- src/coreclr/jit/lowerxarch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index e81e0225b2831f..d0fd036b8aa4fe 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4173,7 +4173,7 @@ GenTree* Lowering::TryLowerMul(GenTreeOp* mulOp) #if TARGET_X86 return nullptr; -#else +#else // !TARGET_X86 if (!varTypeIsIntegral(mulOp)) return nullptr; @@ -4224,7 +4224,7 @@ GenTree* Lowering::TryLowerMul(GenTreeOp* mulOp) mulOp->ChangeOper(GT_ADD); } - unsigned int shiftAmount = genLog2((unsigned int)cnsVal); + unsigned int shiftAmount = genLog2(static_cast(static_cast(cnsVal))); cns->SetIconValue(shiftAmount); mulOp->gtOp1 = comp->gtNewOperNode(GT_LSH, mulOp->gtType, op1, cns); @@ -4241,7 +4241,7 @@ GenTree* Lowering::TryLowerMul(GenTreeOp* mulOp) ContainCheckShiftRotate(mulOp->gtGetOp1()->AsOp()); return mulOp; -#endif +#endif // !TARGET_X86 } //---------------------------------------------------------------------------------------------- From 338dbe56c2e28f626d04fbbeb9267ba89971d62d Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 16:10:45 -0700 Subject: [PATCH 15/53] Added int multiply disasm checks. Fixed SuperFileCheck namespace bug. Made SuperFileCheck anchors more likely to match. --- src/coreclr/tools/SuperFileCheck/Program.cs | 6 +- src/tests/JIT/opt/Multiply/IntMultiply.cs | 181 ++++++++++++++++++ src/tests/JIT/opt/Multiply/IntMultiply.csproj | 26 +++ 3 files changed, 210 insertions(+), 3 deletions(-) create mode 100644 src/tests/JIT/opt/Multiply/IntMultiply.cs create mode 100644 src/tests/JIT/opt/Multiply/IntMultiply.csproj diff --git a/src/coreclr/tools/SuperFileCheck/Program.cs b/src/coreclr/tools/SuperFileCheck/Program.cs index 79b5d1548b101a..6d9e8d4f5aab98 100644 --- a/src/coreclr/tools/SuperFileCheck/Program.cs +++ b/src/coreclr/tools/SuperFileCheck/Program.cs @@ -224,7 +224,7 @@ static string GetFullyQualifiedEnclosingTypeName(MethodDeclarationSyntax methodD if (namespaceDecl != null) { var identifiers = - namespaceDecl.DescendantTokens().Where(x => x.IsKind(SyntaxKind.IdentifierToken)).Select(x => x.ValueText); + namespaceDecl.Name.DescendantTokens().Where(x => x.IsKind(SyntaxKind.IdentifierToken)).Select(x => x.ValueText); return $"{String.Join(".", identifiers)}.{qualifiedTypeName}"; } @@ -373,8 +373,8 @@ static string PreProcessMethod(MethodDeclarationInfo methodDeclInfo, string[] ch var methodName = methodDeclInfo.FullyQualifiedName.Replace("*", "{{.*}}"); // Change wild-card to FileCheck wild-card syntax. // Create anchors from the first prefix. - var startAnchorText = $"// {checkPrefixes[0]}-LABEL: {methodName}"; - var endAnchorText = $"// {checkPrefixes[0]}: {methodName}"; + var startAnchorText = $"// {checkPrefixes[0]}-LABEL: for method {methodName}"; + var endAnchorText = $"// {checkPrefixes[0]}: for method {methodName}"; // Create temp source file based on the source text of the method. // Newlines are added to pad the text so FileCheck's error messages will correspond diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs new file mode 100644 index 00000000000000..3b75ac52a7797c --- /dev/null +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -0,0 +1,181 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Runtime.CompilerServices; + +namespace CodeGenTests +{ + static class IntMultiply + { + [MethodImpl(MethodImplOptions.NoInlining)] + static uint UInt32_MultiplyWithUInt32MaxValue(uint value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: neg [[REG0]] + return value * UInt32.MaxValue; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWithUInt32MaxValue(ulong value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: shl [[REG0]], 32 + // X64-FULL-LINE-NEXT: sub [[REG0]], [[REG1]] + return value * (ulong)UInt32.MaxValue; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWithUInt32MaxValuePlusOne(ulong value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: shl [[REG0]], 32 + return value * ((ulong)UInt32.MaxValue + 1); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWithUInt32MaxValuePlusTwo(ulong value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: shl [[REG0]], 32 + // X64-FULL-LINE-NEXT: add [[REG0]], [[REG1]] + return value * ((ulong)UInt32.MaxValue + 2); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith2(ulong value) + { + // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+[[REG1]]{{\]}} + return value * 2; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith3(ulong value) + { + // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} + return value * 3; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith4(ulong value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: shl [[REG0]], 2 + return value * 4; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith5(ulong value) + { + // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}} + return value * 5; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith6(ulong value) + { + // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} + return value * 6; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith7(ulong value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: shl [[REG0]], 3 + // X64-FULL-LINE-NEXT: sub [[REG0]], [[REG1]] + return value * 7; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith8(ulong value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: shl [[REG0]], 3 + return value * 8; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith9(ulong value) + { + // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+8*[[REG1]]{{\]}} + return value * 9; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith15(ulong value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: shl [[REG0]], 4 + // X64-FULL-LINE-NEXT: sub [[REG0]], [[REG1]] + return value * 15; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith16(ulong value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: shl [[REG0]], 4 + return value * 16; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith17(ulong value) + { + // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-FULL-LINE-NEXT: shl [[REG0]], 4 + // X64-FULL-LINE-NEXT: add [[REG0]], [[REG1]] + return value * 17; + } + + static int Main() + { + if (UInt32_MultiplyWithUInt32MaxValue(1) != UInt32.MaxValue) + return 0; + + if (UInt64_MultiplyWithUInt32MaxValue(1) != (ulong)UInt32.MaxValue) + return 0; + + if (UInt64_MultiplyWithUInt32MaxValuePlusOne(1) != ((ulong)UInt32.MaxValue + 1)) + return 0; + + if (UInt64_MultiplyWithUInt32MaxValuePlusTwo(1) != ((ulong)UInt32.MaxValue + 2)) + return 0; + + if (UInt64_MultiplyWith2(1) != 2) + return 0; + + if (UInt64_MultiplyWith3(1) != 3) + return 0; + + if (UInt64_MultiplyWith4(1) != 4) + return 0; + + if (UInt64_MultiplyWith5(1) != 5) + return 0; + + if (UInt64_MultiplyWith6(1) != 6) + return 0; + + if (UInt64_MultiplyWith7(1) != 7) + return 0; + + if (UInt64_MultiplyWith8(1) != 8) + return 0; + + if (UInt64_MultiplyWith9(1) != 9) + return 0; + + if (UInt64_MultiplyWith15(1) != 15) + return 0; + + if (UInt64_MultiplyWith16(1) != 16) + return 0; + + if (UInt64_MultiplyWith17(1) != 17) + return 0; + + return 100; + } + } +} diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.csproj b/src/tests/JIT/opt/Multiply/IntMultiply.csproj new file mode 100644 index 00000000000000..ebb302cec184c5 --- /dev/null +++ b/src/tests/JIT/opt/Multiply/IntMultiply.csproj @@ -0,0 +1,26 @@ + + + Exe + + + None + True + + + + true + + + + + + + From 106c6b71541664bc2552fa073c9b4aeeb90957a3 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 16:13:30 -0700 Subject: [PATCH 16/53] Update comments --- src/coreclr/jit/lowerxarch.cpp | 39 ++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index d0fd036b8aa4fe..cc0861baa5bafe 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4157,7 +4157,10 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) } //---------------------------------------------------------------------------------------------- -// Lowering::TryLowerMul: Lowers a tree MUL(X, CNS) to SUB(LSH(X, log2(CNS)), X) +// Lowering::TryLowerMul: +// Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X) +// or +// Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) // // Arguments: // mulOp - GT_MUL node of integral type @@ -4167,21 +4170,21 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) // // Notes: // Performs containment checks on the replacement node if one is created -GenTree* Lowering::TryLowerMul(GenTreeOp* mulOp) +GenTree* Lowering::TryLowerMul(GenTreeOp* node) { - assert(mulOp->OperIs(GT_MUL)); + assert(node->OperIs(GT_MUL)); #if TARGET_X86 return nullptr; #else // !TARGET_X86 - if (!varTypeIsIntegral(mulOp)) + if (!varTypeIsIntegral(node)) return nullptr; - if (mulOp->gtOverflow()) + if (node->gtOverflow()) return nullptr; - GenTree* op1 = mulOp->gtGetOp1(); - GenTree* op2 = mulOp->gtGetOp2(); + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); if (op1->isContained() || op2->isContained()) return nullptr; @@ -4216,31 +4219,31 @@ GenTree* Lowering::TryLowerMul(GenTreeOp* mulOp) if (useSub) { cnsVal = cnsValPlusOne; - mulOp->ChangeOper(GT_SUB); + node->ChangeOper(GT_SUB); } else { cnsVal = cnsValMinusOne; - mulOp->ChangeOper(GT_ADD); + node->ChangeOper(GT_ADD); } unsigned int shiftAmount = genLog2(static_cast(static_cast(cnsVal))); cns->SetIconValue(shiftAmount); - mulOp->gtOp1 = comp->gtNewOperNode(GT_LSH, mulOp->gtType, op1, cns); - mulOp->gtOp2 = comp->gtClone(op1); + node->gtOp1 = comp->gtNewOperNode(GT_LSH, node->gtType, op1, cns); + node->gtOp2 = comp->gtClone(op1); BlockRange().Remove(op1); BlockRange().Remove(cns); - BlockRange().InsertBefore(mulOp, mulOp->gtGetOp2()); - BlockRange().InsertBefore(mulOp, cns); - BlockRange().InsertBefore(mulOp, op1); - BlockRange().InsertBefore(mulOp, mulOp->gtGetOp1()); + BlockRange().InsertBefore(node, node->gtGetOp2()); + BlockRange().InsertBefore(node, cns); + BlockRange().InsertBefore(node, op1); + BlockRange().InsertBefore(node, node->gtGetOp1()); - ContainCheckBinary(mulOp); - ContainCheckShiftRotate(mulOp->gtGetOp1()->AsOp()); + ContainCheckBinary(node); + ContainCheckShiftRotate(node->gtGetOp1()->AsOp()); - return mulOp; + return node; #endif // !TARGET_X86 } From 8fc5b3716995377338b03b1d9722ab6a6dbba56d Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 16:15:14 -0700 Subject: [PATCH 17/53] Update comments --- src/coreclr/jit/lowerxarch.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index cc0861baa5bafe..9e2fb514d7f4c5 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4174,6 +4174,7 @@ GenTree* Lowering::TryLowerMul(GenTreeOp* node) { assert(node->OperIs(GT_MUL)); +// We do not do this optimization in X86 as it is not recommended. #if TARGET_X86 return nullptr; #else // !TARGET_X86 From e5835db8f6345cf665780565a80cda8a7d2fa41e Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 16:21:55 -0700 Subject: [PATCH 18/53] Update comments --- src/coreclr/jit/lowerxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 9e2fb514d7f4c5..2fce28bb0db6de 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4163,7 +4163,7 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) // Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) // // Arguments: -// mulOp - GT_MUL node of integral type +// node - GT_MUL node of integral type // // Return Value: // Returns the replacement node if one is created else nullptr indicating no replacement From 3be48bfa6b251be8a53c1530b01b1122abf73b2f Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 16:26:00 -0700 Subject: [PATCH 19/53] Update comments --- src/coreclr/jit/lower.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 92c15239b7562d..931a250a4d6cb7 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -361,7 +361,7 @@ class Lowering final : public Phase GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode); GenTree* TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode); - GenTree* TryLowerMul(GenTreeOp* mulOp); + GenTree* TryLowerMul(GenTreeOp* node); void LowerBswapOp(GenTreeOp* node); #elif defined(TARGET_ARM64) bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); From b3d4a5faea6c52b40e14e875e086a8b9aedd6df3 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 14 Oct 2022 17:21:26 -0700 Subject: [PATCH 20/53] Formatting --- src/coreclr/jit/lowerxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 2fce28bb0db6de..6eec3eae87442c 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4177,7 +4177,7 @@ GenTree* Lowering::TryLowerMul(GenTreeOp* node) // We do not do this optimization in X86 as it is not recommended. #if TARGET_X86 return nullptr; -#else // !TARGET_X86 +#else // !TARGET_X86 if (!varTypeIsIntegral(node)) return nullptr; From 843617ae33a1b2a56d0b2f1e6488861bcb6b164b Mon Sep 17 00:00:00 2001 From: Will Smith Date: Sun, 16 Oct 2022 13:30:44 -0700 Subject: [PATCH 21/53] Fixing build --- src/coreclr/jit/lower.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 931a250a4d6cb7..5ceca1bd8a607b 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -317,6 +317,7 @@ class Lowering final : public Phase void LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode); #ifdef TARGET_XARCH void LowerPutArgStk(GenTreePutArgStk* putArgStk); + GenTree* TryLowerMul(GenTreeOp* node); #endif // TARGET_XARCH bool TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent); @@ -361,7 +362,6 @@ class Lowering final : public Phase GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode); GenTree* TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode); - GenTree* TryLowerMul(GenTreeOp* node); void LowerBswapOp(GenTreeOp* node); #elif defined(TARGET_ARM64) bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); From 74b1071710d6fb1b5dfdc7e7535d49bf7bf42fc1 Mon Sep 17 00:00:00 2001 From: TIHan Date: Mon, 17 Oct 2022 10:41:54 -0700 Subject: [PATCH 22/53] Fixing build again --- src/coreclr/jit/lower.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 186 ++++++++++++++++----------------- 2 files changed, 94 insertions(+), 94 deletions(-) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 5ceca1bd8a607b..ddf3f286527344 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -317,7 +317,7 @@ class Lowering final : public Phase void LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode); #ifdef TARGET_XARCH void LowerPutArgStk(GenTreePutArgStk* putArgStk); - GenTree* TryLowerMul(GenTreeOp* node); + GenTree* TryLowerMulToShlSubOrShlAdd(GenTreeOp* node); #endif // TARGET_XARCH bool TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 6eec3eae87442c..a164d92235a726 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -105,6 +105,98 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) ContainCheckStoreIndir(node); } +//---------------------------------------------------------------------------------------------- +// Lowering::TryLowerMulToShlSubOrShlAdd: +// Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X) +// or +// Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) +// +// Arguments: +// node - GT_MUL node of integral type +// +// Return Value: +// Returns the replacement node if one is created else nullptr indicating no replacement +// +// Notes: +// Performs containment checks on the replacement node if one is created +GenTree* Lowering::TryLowerMulToShlSubOrShlAdd(GenTreeOp* node) +{ + assert(node->OperIs(GT_MUL)); + +// We do not do this optimization in X86 as it is not recommended. +#if TARGET_X86 + return nullptr; +#else // !TARGET_X86 + if (!varTypeIsIntegral(node)) + return nullptr; + + if (node->gtOverflow()) + return nullptr; + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + if (op1->isContained() || op2->isContained()) + return nullptr; + + if (!op1->OperIs(GT_LCL_VAR)) + return nullptr; + + if (!op2->IsCnsIntOrI()) + return nullptr; + + GenTreeIntConCommon* cns = op2->AsIntConCommon(); + ssize_t cnsVal = cns->IconValue(); + + // Use GT_LSH if cnsVal is a power of two. + // This is handled in codegen. + if (isPow2(cnsVal)) + return nullptr; + + // Use GT_LEA if cnsVal is 3, 5, or 9. + // This is handled in codegen. + if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9) + return nullptr; + + ssize_t cnsValPlusOne = cnsVal + 1; + ssize_t cnsValMinusOne = cnsVal - 1; + + bool useSub = isPow2(cnsValPlusOne); + + if (!useSub && !isPow2(cnsValMinusOne)) + return nullptr; + + if (useSub) + { + cnsVal = cnsValPlusOne; + node->ChangeOper(GT_SUB); + } + else + { + cnsVal = cnsValMinusOne; + node->ChangeOper(GT_ADD); + } + + unsigned int shiftAmount = genLog2(static_cast(static_cast(cnsVal))); + cns->SetIconValue(shiftAmount); + + node->gtOp1 = comp->gtNewOperNode(GT_LSH, node->gtType, op1, cns); + node->gtOp2 = comp->gtClone(op1); + + BlockRange().Remove(op1); + BlockRange().Remove(cns); + BlockRange().InsertBefore(node, node->gtGetOp2()); + BlockRange().InsertBefore(node, cns); + BlockRange().InsertBefore(node, op1); + BlockRange().InsertBefore(node, node->gtGetOp1()); + + ContainCheckBinary(node); + ContainCheckShiftRotate(node->gtGetOp1()->AsOp()); + + return node; +#endif // !TARGET_X86 +} + //------------------------------------------------------------------------ // LowerMul: Lower a GT_MUL/GT_MULHI/GT_MUL_LONG node. // @@ -122,7 +214,7 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul) if (mul->OperIs(GT_MUL)) { - GenTree* replacementNode = TryLowerMul(mul); + GenTree* replacementNode = TryLowerMulToShlSubOrShlAdd(mul); if (replacementNode != nullptr) { return replacementNode->gtNext; @@ -4156,98 +4248,6 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) return blsmskNode; } -//---------------------------------------------------------------------------------------------- -// Lowering::TryLowerMul: -// Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X) -// or -// Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) -// -// Arguments: -// node - GT_MUL node of integral type -// -// Return Value: -// Returns the replacement node if one is created else nullptr indicating no replacement -// -// Notes: -// Performs containment checks on the replacement node if one is created -GenTree* Lowering::TryLowerMul(GenTreeOp* node) -{ - assert(node->OperIs(GT_MUL)); - -// We do not do this optimization in X86 as it is not recommended. -#if TARGET_X86 - return nullptr; -#else // !TARGET_X86 - if (!varTypeIsIntegral(node)) - return nullptr; - - if (node->gtOverflow()) - return nullptr; - - GenTree* op1 = node->gtGetOp1(); - GenTree* op2 = node->gtGetOp2(); - - if (op1->isContained() || op2->isContained()) - return nullptr; - - if (!op1->OperIs(GT_LCL_VAR)) - return nullptr; - - if (!op2->IsCnsIntOrI()) - return nullptr; - - GenTreeIntConCommon* cns = op2->AsIntConCommon(); - ssize_t cnsVal = cns->IconValue(); - - // Use GT_LSH if cnsVal is a power of two. - // This is handled in codegen. - if (isPow2(cnsVal)) - return nullptr; - - // Use GT_LEA if cnsVal is 3, 5, or 9. - // This is handled in codegen. - if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9) - return nullptr; - - ssize_t cnsValPlusOne = cnsVal + 1; - ssize_t cnsValMinusOne = cnsVal - 1; - - bool useSub = isPow2(cnsValPlusOne); - - if (!useSub && !isPow2(cnsValMinusOne)) - return nullptr; - - if (useSub) - { - cnsVal = cnsValPlusOne; - node->ChangeOper(GT_SUB); - } - else - { - cnsVal = cnsValMinusOne; - node->ChangeOper(GT_ADD); - } - - unsigned int shiftAmount = genLog2(static_cast(static_cast(cnsVal))); - cns->SetIconValue(shiftAmount); - - node->gtOp1 = comp->gtNewOperNode(GT_LSH, node->gtType, op1, cns); - node->gtOp2 = comp->gtClone(op1); - - BlockRange().Remove(op1); - BlockRange().Remove(cns); - BlockRange().InsertBefore(node, node->gtGetOp2()); - BlockRange().InsertBefore(node, cns); - BlockRange().InsertBefore(node, op1); - BlockRange().InsertBefore(node, node->gtGetOp1()); - - ContainCheckBinary(node); - ContainCheckShiftRotate(node->gtGetOp1()->AsOp()); - - return node; -#endif // !TARGET_X86 -} - //---------------------------------------------------------------------------------------------- // Lowering::LowerBswapOp: Tries to contain GT_BSWAP node when possible // From 90b7e7d3f0ebba7b96060df9bd8a06c4f1aea5d7 Mon Sep 17 00:00:00 2001 From: TIHan Date: Mon, 17 Oct 2022 14:39:38 -0700 Subject: [PATCH 23/53] minor rename --- src/coreclr/jit/lower.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index ddf3f286527344..1dad394f97b4a5 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -317,7 +317,7 @@ class Lowering final : public Phase void LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode); #ifdef TARGET_XARCH void LowerPutArgStk(GenTreePutArgStk* putArgStk); - GenTree* TryLowerMulToShlSubOrShlAdd(GenTreeOp* node); + GenTree* TryLowerMulToLshSubOrLshAdd(GenTreeOp* node); #endif // TARGET_XARCH bool TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index a164d92235a726..c669d97b24bcab 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -106,7 +106,7 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) } //---------------------------------------------------------------------------------------------- -// Lowering::TryLowerMulToShlSubOrShlAdd: +// Lowering::TryLowerMulToLshSubOrLshAdd: // Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X) // or // Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) @@ -119,7 +119,7 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) // // Notes: // Performs containment checks on the replacement node if one is created -GenTree* Lowering::TryLowerMulToShlSubOrShlAdd(GenTreeOp* node) +GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node) { assert(node->OperIs(GT_MUL)); @@ -214,7 +214,7 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul) if (mul->OperIs(GT_MUL)) { - GenTree* replacementNode = TryLowerMulToShlSubOrShlAdd(mul); + GenTree* replacementNode = TryLowerMulToLshSubOrLshAdd(mul); if (replacementNode != nullptr) { return replacementNode->gtNext; From ead83a55716910d9fa0123778e814be94d84a0d0 Mon Sep 17 00:00:00 2001 From: TIHan Date: Mon, 17 Oct 2022 16:17:29 -0700 Subject: [PATCH 24/53] Moving x64 multiply codegen optimizations to lowering --- src/coreclr/jit/codegenxarch.cpp | 28 ++-------------- src/coreclr/jit/lower.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 57 ++++++++++++++++++++++++++------ 3 files changed, 50 insertions(+), 37 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index b634ceab5ff0df..e5765eac5e741f 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1085,33 +1085,11 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) if (immOp != nullptr) { - // CQ: When possible use LEA for mul by imm 3, 5 or 9 ssize_t imm = immOp->AsIntConCommon()->IconValue(); - if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9))) - { - // We will use the LEA instruction to perform this multiply - // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. - unsigned int scale = (unsigned int)(imm - 1); - GetEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->GetRegNum(), rmOp->GetRegNum(), scale, 0); - } - else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm == genFindLowestBit(imm)) && (imm != 0)) - { - // Use shift for constant multiply when legal - uint64_t zextImm = static_cast(static_cast(imm)); - unsigned int shiftAmount = genLog2(zextImm); - - // Copy reg src to dest register - inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); - - inst_RV_SH(INS_shl, size, targetReg, shiftAmount); - } - else - { - // use the 3-op form with immediate - ins = GetEmitter()->inst3opImulForReg(targetReg); - emit->emitInsBinary(ins, size, rmOp, immOp); - } + // use the 3-op form with immediate + ins = GetEmitter()->inst3opImulForReg(targetReg); + emit->emitInsBinary(ins, size, rmOp, immOp); } else // we have no contained immediate operand { diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 1dad394f97b4a5..5340a13dddcaf9 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -317,7 +317,7 @@ class Lowering final : public Phase void LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode); #ifdef TARGET_XARCH void LowerPutArgStk(GenTreePutArgStk* putArgStk); - GenTree* TryLowerMulToLshSubOrLshAdd(GenTreeOp* node); + GenTree* TryLowerMulWithConstant(GenTreeOp* node); #endif // TARGET_XARCH bool TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index c669d97b24bcab..64b6c991bc6841 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -106,7 +106,7 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) } //---------------------------------------------------------------------------------------------- -// Lowering::TryLowerMulToLshSubOrLshAdd: +// Lowering::TryLowerMulWithConstant: // Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X) // or // Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) @@ -119,14 +119,10 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) // // Notes: // Performs containment checks on the replacement node if one is created -GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node) +GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) { assert(node->OperIs(GT_MUL)); -// We do not do this optimization in X86 as it is not recommended. -#if TARGET_X86 - return nullptr; -#else // !TARGET_X86 if (!varTypeIsIntegral(node)) return nullptr; @@ -149,14 +145,53 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node) ssize_t cnsVal = cns->IconValue(); // Use GT_LSH if cnsVal is a power of two. - // This is handled in codegen. if (isPow2(cnsVal)) - return nullptr; + { + // Use shift for constant multiply when legal + unsigned int shiftAmount = genLog2(static_cast(static_cast(cnsVal))); + + cns->SetIconValue(shiftAmount); + node->ChangeOper(GT_LSH); + + ContainCheckShiftRotate(node); + + return node; + } // Use GT_LEA if cnsVal is 3, 5, or 9. - // This is handled in codegen. if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9) - return nullptr; + { + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + // We will use the LEA instruction to perform this multiply + // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. + unsigned int scale = (unsigned int)(cnsVal - 1); + + GenTree* lea = OffsetByIndexWithScale(op1, comp->gtClone(op1), scale); + + BlockRange().Remove(cns); + BlockRange().Remove(op1); + BlockRange().InsertBefore(node, lea->gtGetOp2()); + BlockRange().InsertBefore(node, lea->gtGetOp1()); + BlockRange().InsertBefore(node, lea); + + use.ReplaceWith(lea); + + BlockRange().Remove(node); + + return lea; + } + else + { + return nullptr; + } + } + +// We do not do this optimization in X86 as it is not recommended. +#if TARGET_X86 + return nullptr; +#else // !TARGET_X86 ssize_t cnsValPlusOne = cnsVal + 1; ssize_t cnsValMinusOne = cnsVal - 1; @@ -214,7 +249,7 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul) if (mul->OperIs(GT_MUL)) { - GenTree* replacementNode = TryLowerMulToLshSubOrLshAdd(mul); + GenTree* replacementNode = TryLowerMulWithConstant(mul); if (replacementNode != nullptr) { return replacementNode->gtNext; From b5193c00dd1ee06373a628c78d7695863bb40e3e Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 18 Oct 2022 18:38:09 -0700 Subject: [PATCH 25/53] Using ReplaceWithLclVar --- src/coreclr/jit/lowerxarch.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 64b6c991bc6841..61416fd20196df 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -135,9 +135,6 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) if (op1->isContained() || op2->isContained()) return nullptr; - if (!op1->OperIs(GT_LCL_VAR)) - return nullptr; - if (!op2->IsCnsIntOrI()) return nullptr; @@ -164,6 +161,9 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) LIR::Use use; if (BlockRange().TryGetUse(node, &use)) { + LIR::Use op1Use(BlockRange(), &node->gtOp1, node); + op1 = ReplaceWithLclVar(op1Use); + // We will use the LEA instruction to perform this multiply // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. unsigned int scale = (unsigned int)(cnsVal - 1); @@ -193,6 +193,9 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) return nullptr; #else // !TARGET_X86 + LIR::Use op1Use(BlockRange(), &node->gtOp1, node); + op1 = ReplaceWithLclVar(op1Use); + ssize_t cnsValPlusOne = cnsVal + 1; ssize_t cnsValMinusOne = cnsVal - 1; From 214625fd7fc1ad6eab21040a59ffcad02c98f07c Mon Sep 17 00:00:00 2001 From: TIHan Date: Wed, 19 Oct 2022 12:38:55 -0700 Subject: [PATCH 26/53] Feedback. Removed use of FULL-LINE as it is more readable not strictly necessary. Forgot to add an additional instruction to a disasm test. --- src/coreclr/jit/lowerxarch.cpp | 3 +- src/tests/JIT/opt/Multiply/IntMultiply.cs | 61 ++++++++++--------- src/tests/JIT/opt/Multiply/IntMultiply.csproj | 15 +---- 3 files changed, 35 insertions(+), 44 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index c669d97b24bcab..54c04ab35670f2 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -126,7 +126,7 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node) // We do not do this optimization in X86 as it is not recommended. #if TARGET_X86 return nullptr; -#else // !TARGET_X86 +#endif // TARGET_X86 if (!varTypeIsIntegral(node)) return nullptr; @@ -194,7 +194,6 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node) ContainCheckShiftRotate(node->gtGetOp1()->AsOp()); return node; -#endif // !TARGET_X86 } //------------------------------------------------------------------------ diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs index 3b75ac52a7797c..1bb274f0df9765 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.cs +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -11,120 +11,121 @@ static class IntMultiply [MethodImpl(MethodImplOptions.NoInlining)] static uint UInt32_MultiplyWithUInt32MaxValue(uint value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: neg [[REG0]] + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: neg [[REG0]] return value * UInt32.MaxValue; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWithUInt32MaxValue(ulong value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: shl [[REG0]], 32 - // X64-FULL-LINE-NEXT: sub [[REG0]], [[REG1]] + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: shl [[REG0]], 32 + // X64-NEXT: sub [[REG0]], [[REG1]] return value * (ulong)UInt32.MaxValue; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWithUInt32MaxValuePlusOne(ulong value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: shl [[REG0]], 32 + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: shl [[REG0]], 32 return value * ((ulong)UInt32.MaxValue + 1); } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWithUInt32MaxValuePlusTwo(ulong value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: shl [[REG0]], 32 - // X64-FULL-LINE-NEXT: add [[REG0]], [[REG1]] + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: shl [[REG0]], 32 + // X64-NEXT: add [[REG0]], [[REG1]] return value * ((ulong)UInt32.MaxValue + 2); } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith2(ulong value) { - // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+[[REG1]]{{\]}} + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+[[REG1]]{{\]}} return value * 2; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith3(ulong value) { - // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} return value * 3; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith4(ulong value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: shl [[REG0]], 2 + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: shl [[REG0]], 2 return value * 4; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith5(ulong value) { - // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}} + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}} return value * 5; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith6(ulong value) { - // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} + // X64-NEXT: add [[REG0]], [[REG0]] return value * 6; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith7(ulong value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: shl [[REG0]], 3 - // X64-FULL-LINE-NEXT: sub [[REG0]], [[REG1]] + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: shl [[REG0]], 3 + // X64-NEXT: sub [[REG0]], [[REG1]] return value * 7; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith8(ulong value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: shl [[REG0]], 3 + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: shl [[REG0]], 3 return value * 8; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith9(ulong value) { - // X64-FULL-LINE: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+8*[[REG1]]{{\]}} + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+8*[[REG1]]{{\]}} return value * 9; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith15(ulong value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: shl [[REG0]], 4 - // X64-FULL-LINE-NEXT: sub [[REG0]], [[REG1]] + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: shl [[REG0]], 4 + // X64-NEXT: sub [[REG0]], [[REG1]] return value * 15; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith16(ulong value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: shl [[REG0]], 4 + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: shl [[REG0]], 4 return value * 16; } [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith17(ulong value) { - // X64-FULL-LINE: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-FULL-LINE-NEXT: shl [[REG0]], 4 - // X64-FULL-LINE-NEXT: add [[REG0]], [[REG1]] + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] + // X64-NEXT: shl [[REG0]], 4 + // X64-NEXT: add [[REG0]], [[REG1]] return value * 17; } diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.csproj b/src/tests/JIT/opt/Multiply/IntMultiply.csproj index ebb302cec184c5..43448e90aa4688 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.csproj +++ b/src/tests/JIT/opt/Multiply/IntMultiply.csproj @@ -10,17 +10,8 @@ true + + + - - - - From 9ff6393f32729ba6a9cd069b8c5ec6352371f592 Mon Sep 17 00:00:00 2001 From: TIHan Date: Wed, 19 Oct 2022 13:44:58 -0700 Subject: [PATCH 27/53] Minor fix --- src/coreclr/jit/lowerxarch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index fd1457eb1bd22a..905b0f1182245b 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -193,9 +193,6 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) return nullptr; #endif // TARGET_X86 - LIR::Use op1Use(BlockRange(), &node->gtOp1, node); - op1 = ReplaceWithLclVar(op1Use); - ssize_t cnsValPlusOne = cnsVal + 1; ssize_t cnsValMinusOne = cnsVal - 1; @@ -204,6 +201,9 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) if (!useSub && !isPow2(cnsValMinusOne)) return nullptr; + LIR::Use op1Use(BlockRange(), &node->gtOp1, node); + op1 = ReplaceWithLclVar(op1Use); + if (useSub) { cnsVal = cnsValPlusOne; From 75313fbe9ec70caae112f5677f840176e6c39ed1 Mon Sep 17 00:00:00 2001 From: TIHan Date: Wed, 19 Oct 2022 18:15:07 -0700 Subject: [PATCH 28/53] Formatting --- src/coreclr/jit/lowerxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 54c04ab35670f2..72d9720c5ec3dd 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -126,7 +126,7 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node) // We do not do this optimization in X86 as it is not recommended. #if TARGET_X86 return nullptr; -#endif // TARGET_X86 +#endif // TARGET_X86 if (!varTypeIsIntegral(node)) return nullptr; From 79bc2d5f5d77c35e587e02b5977bf2cbc4ba0db0 Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 11:05:06 -0700 Subject: [PATCH 29/53] Adding codegen opts back, but disabling lowering opts if min-opts is enabled --- src/coreclr/jit/codegenxarch.cpp | 27 ++++++++++++-- src/coreclr/jit/lowerxarch.cpp | 63 ++++++++++++++++++++++++-------- 2 files changed, 71 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e5765eac5e741f..d7fd4222a0c11a 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1087,9 +1087,30 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) { ssize_t imm = immOp->AsIntConCommon()->IconValue(); - // use the 3-op form with immediate - ins = GetEmitter()->inst3opImulForReg(targetReg); - emit->emitInsBinary(ins, size, rmOp, immOp); + if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9))) + { + // We will use the LEA instruction to perform this multiply + // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. + unsigned int scale = (unsigned int)(imm - 1); + GetEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->GetRegNum(), rmOp->GetRegNum(), scale, 0); + } + else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm == genFindLowestBit(imm)) && (imm != 0)) + { + // Use shift for constant multiply when legal + uint64_t zextImm = static_cast(static_cast(imm)); + unsigned int shiftAmount = genLog2(zextImm); + + // Copy reg src to dest register + inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); + + inst_RV_SH(INS_shl, size, targetReg, shiftAmount); + } + else + { + // use the 3-op form with immediate + ins = GetEmitter()->inst3opImulForReg(targetReg); + emit->emitInsBinary(ins, size, rmOp, immOp); + } } else // we have no contained immediate operand { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index bcfa6fc30232ce..2832ac51f03926 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -123,6 +123,12 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) { assert(node->OperIs(GT_MUL)); + //// Do not do this optimization with min-opts. + //// Codegen has similar optimizations, though limited, + //// so it might be able to do the optimization even with min-opts. + if (comp->opts.MinOpts()) + return nullptr; + if (!varTypeIsIntegral(node)) return nullptr; @@ -164,23 +170,52 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) LIR::Use op1Use(BlockRange(), &node->gtOp1, node); op1 = ReplaceWithLclVar(op1Use); - // We will use the LEA instruction to perform this multiply - // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. - unsigned int scale = (unsigned int)(cnsVal - 1); + if (!comp->lvaGetDesc(op1->AsLclVar())->IsEnregisterableLcl()) + { + unsigned lclNumTmp = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp")); + GenTree* lclvNodeStore = comp->gtNewTempAssign(lclNumTmp, op1); + GenTree* tmpTree = comp->gtNewLclvNode(lclNumTmp, op1->TypeGet()); - GenTree* lea = OffsetByIndexWithScale(op1, comp->gtClone(op1), scale); + // We will use the LEA instruction to perform this multiply + // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. + unsigned int scale = (unsigned int)(cnsVal - 1); - BlockRange().Remove(cns); - BlockRange().Remove(op1); - BlockRange().InsertBefore(node, lea->gtGetOp2()); - BlockRange().InsertBefore(node, lea->gtGetOp1()); - BlockRange().InsertBefore(node, lea); + GenTree* lea = OffsetByIndexWithScale(tmpTree, comp->gtClone(tmpTree), scale); - use.ReplaceWith(lea); + BlockRange().Remove(cns); + BlockRange().Remove(op1); + BlockRange().InsertBefore(node, op1); + BlockRange().InsertBefore(node, lclvNodeStore); + BlockRange().InsertBefore(node, lea->gtGetOp2()); + BlockRange().InsertBefore(node, lea->gtGetOp1()); + BlockRange().InsertBefore(node, lea); - BlockRange().Remove(node); + use.ReplaceWith(lea); + + BlockRange().Remove(node); + + return lea; + } + else + { + // We will use the LEA instruction to perform this multiply + // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. + unsigned int scale = (unsigned int)(cnsVal - 1); + + GenTree* lea = OffsetByIndexWithScale(op1, comp->gtClone(op1), scale); + + BlockRange().Remove(cns); + BlockRange().Remove(op1); + BlockRange().InsertBefore(node, lea->gtGetOp2()); + BlockRange().InsertBefore(node, lea->gtGetOp1()); + BlockRange().InsertBefore(node, lea); - return lea; + use.ReplaceWith(lea); + + BlockRange().Remove(node); + + return lea; + } } else { @@ -251,11 +286,7 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul) if (mul->OperIs(GT_MUL)) { -<<<<<<< HEAD GenTree* replacementNode = TryLowerMulWithConstant(mul); -======= - GenTree* replacementNode = TryLowerMulToLshSubOrLshAdd(mul); ->>>>>>> upstream/main if (replacementNode != nullptr) { return replacementNode->gtNext; From 47c818d32eb71711c7bba33ae9d3f4855937af93 Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 11:42:29 -0700 Subject: [PATCH 30/53] Little more changes --- src/coreclr/jit/codegenxarch.cpp | 8 ++++++-- src/coreclr/jit/lowerxarch.cpp | 12 ++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index d7fd4222a0c11a..3e8ce23b4bf26e 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1087,14 +1087,18 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) { ssize_t imm = immOp->AsIntConCommon()->IconValue(); - if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9))) + // Only do these optimizations if min-opts is enabled. + // Otherwise, lowering will do them. + if (compiler->opts.MinOpts() && !requiresOverflowCheck && rmOp->isUsedFromReg() && + ((imm == 3) || (imm == 5) || (imm == 9))) { // We will use the LEA instruction to perform this multiply // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. unsigned int scale = (unsigned int)(imm - 1); GetEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->GetRegNum(), rmOp->GetRegNum(), scale, 0); } - else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm == genFindLowestBit(imm)) && (imm != 0)) + else if (compiler->opts.MinOpts() && !requiresOverflowCheck && rmOp->isUsedFromReg() && + (imm == genFindLowestBit(imm)) && (imm != 0)) { // Use shift for constant multiply when legal uint64_t zextImm = static_cast(static_cast(imm)); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 2832ac51f03926..bc6e9d9bafaf37 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -170,16 +170,16 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) LIR::Use op1Use(BlockRange(), &node->gtOp1, node); op1 = ReplaceWithLclVar(op1Use); + // We will use the LEA instruction to perform this multiply + // Note that an LEA with base=x, index=x and scale=(cnsVal-1) computes x*cnsVal when cnsVal=3,5 or 9. + unsigned int scale = (unsigned int)(cnsVal - 1); + if (!comp->lvaGetDesc(op1->AsLclVar())->IsEnregisterableLcl()) { unsigned lclNumTmp = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp")); GenTree* lclvNodeStore = comp->gtNewTempAssign(lclNumTmp, op1); GenTree* tmpTree = comp->gtNewLclvNode(lclNumTmp, op1->TypeGet()); - // We will use the LEA instruction to perform this multiply - // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. - unsigned int scale = (unsigned int)(cnsVal - 1); - GenTree* lea = OffsetByIndexWithScale(tmpTree, comp->gtClone(tmpTree), scale); BlockRange().Remove(cns); @@ -198,10 +198,6 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) } else { - // We will use the LEA instruction to perform this multiply - // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. - unsigned int scale = (unsigned int)(cnsVal - 1); - GenTree* lea = OffsetByIndexWithScale(op1, comp->gtClone(op1), scale); BlockRange().Remove(cns); From 55b6212f4d8bfec6caaf7b8992383327fb43b867 Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 11:45:10 -0700 Subject: [PATCH 31/53] Formatting --- src/coreclr/jit/lowerxarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index bc6e9d9bafaf37..5147f2b531db73 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -176,9 +176,9 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) if (!comp->lvaGetDesc(op1->AsLclVar())->IsEnregisterableLcl()) { - unsigned lclNumTmp = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp")); + unsigned lclNumTmp = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp")); GenTree* lclvNodeStore = comp->gtNewTempAssign(lclNumTmp, op1); - GenTree* tmpTree = comp->gtNewLclvNode(lclNumTmp, op1->TypeGet()); + GenTree* tmpTree = comp->gtNewLclvNode(lclNumTmp, op1->TypeGet()); GenTree* lea = OffsetByIndexWithScale(tmpTree, comp->gtClone(tmpTree), scale); From 1a736ff18e34a53a369df2cb68638f415b624efc Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 12:18:54 -0700 Subject: [PATCH 32/53] Remove codegen opts --- src/coreclr/jit/codegenxarch.cpp | 31 +++---------------------------- src/coreclr/jit/lowerxarch.cpp | 14 ++++++++++---- 2 files changed, 13 insertions(+), 32 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 3e8ce23b4bf26e..e5765eac5e741f 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1087,34 +1087,9 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) { ssize_t imm = immOp->AsIntConCommon()->IconValue(); - // Only do these optimizations if min-opts is enabled. - // Otherwise, lowering will do them. - if (compiler->opts.MinOpts() && !requiresOverflowCheck && rmOp->isUsedFromReg() && - ((imm == 3) || (imm == 5) || (imm == 9))) - { - // We will use the LEA instruction to perform this multiply - // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. - unsigned int scale = (unsigned int)(imm - 1); - GetEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->GetRegNum(), rmOp->GetRegNum(), scale, 0); - } - else if (compiler->opts.MinOpts() && !requiresOverflowCheck && rmOp->isUsedFromReg() && - (imm == genFindLowestBit(imm)) && (imm != 0)) - { - // Use shift for constant multiply when legal - uint64_t zextImm = static_cast(static_cast(imm)); - unsigned int shiftAmount = genLog2(zextImm); - - // Copy reg src to dest register - inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); - - inst_RV_SH(INS_shl, size, targetReg, shiftAmount); - } - else - { - // use the 3-op form with immediate - ins = GetEmitter()->inst3opImulForReg(targetReg); - emit->emitInsBinary(ins, size, rmOp, immOp); - } + // use the 3-op form with immediate + ins = GetEmitter()->inst3opImulForReg(targetReg); + emit->emitInsBinary(ins, size, rmOp, immOp); } else // we have no contained immediate operand { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 5147f2b531db73..b12689dfbb14da 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -107,6 +107,9 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) //---------------------------------------------------------------------------------------------- // Lowering::TryLowerMulWithConstant: +// Lowers a tree MUL(X, CNS) to LSH(X, CNS_SHIFT) +// or +// Lowers a tree MUL(X, CNS) to LEA(X, X // Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X) // or // Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) @@ -123,9 +126,9 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) { assert(node->OperIs(GT_MUL)); - //// Do not do this optimization with min-opts. - //// Codegen has similar optimizations, though limited, - //// so it might be able to do the optimization even with min-opts. + // Do not do this optimization with min-opts enabled as + // this could create more tmp locals that need to be optimized + // in LSRA. if (comp->opts.MinOpts()) return nullptr; @@ -170,10 +173,13 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) LIR::Use op1Use(BlockRange(), &node->gtOp1, node); op1 = ReplaceWithLclVar(op1Use); - // We will use the LEA instruction to perform this multiply // Note that an LEA with base=x, index=x and scale=(cnsVal-1) computes x*cnsVal when cnsVal=3,5 or 9. unsigned int scale = (unsigned int)(cnsVal - 1); + // If the local is not enregisterable, create a new tmp local + // that only gets used as the index and base of GT_LEA. + // LSRA should optimize this to ensure that index and base will + // be in the same register. if (!comp->lvaGetDesc(op1->AsLclVar())->IsEnregisterableLcl()) { unsigned lclNumTmp = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp")); From c042862d49ab69a79e183c7810c97f14ffce24f9 Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 12:22:41 -0700 Subject: [PATCH 33/53] Update comments --- src/coreclr/jit/lowerxarch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index b12689dfbb14da..ccbd1997198b09 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -109,7 +109,8 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) // Lowering::TryLowerMulWithConstant: // Lowers a tree MUL(X, CNS) to LSH(X, CNS_SHIFT) // or -// Lowers a tree MUL(X, CNS) to LEA(X, X +// Lowers a tree MUL(X, CNS) to LEA(X, X, CNS_MINUS_ONE) +// or // Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X) // or // Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) @@ -173,12 +174,11 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) LIR::Use op1Use(BlockRange(), &node->gtOp1, node); op1 = ReplaceWithLclVar(op1Use); - // Note that an LEA with base=x, index=x and scale=(cnsVal-1) computes x*cnsVal when cnsVal=3,5 or 9. unsigned int scale = (unsigned int)(cnsVal - 1); // If the local is not enregisterable, create a new tmp local // that only gets used as the index and base of GT_LEA. - // LSRA should optimize this to ensure that index and base will + // LSRA should optimize this to ensure that the index and base will // be in the same register. if (!comp->lvaGetDesc(op1->AsLclVar())->IsEnregisterableLcl()) { From bba5ba06dc7df5243b1df79be92840b2e2a68fec Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 12:27:31 -0700 Subject: [PATCH 34/53] Added test case for address-exposed --- src/tests/JIT/opt/Multiply/IntMultiply.cs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs index 1bb274f0df9765..39a3c9397f0912 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.cs +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -129,6 +129,24 @@ static ulong UInt64_MultiplyWith17(ulong value) return value * 17; } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith5_AddressExposed(ulong value) + { + // X64: mov [[REG0:[a-z]+]], qword ptr + // X64-NOT: mov + // X64-NEXT: lea [[REG1:[a-z]+]], {{\[}}[[REG0:[a-z]+]]+4*[[REG0]]{{\]}} + var value2 = value * 5; + UInt64_AddressExposed(ref value); + return value2; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void UInt64_AddressExposed(ref ulong value) + { + + } + static int Main() { if (UInt32_MultiplyWithUInt32MaxValue(1) != UInt32.MaxValue) @@ -176,6 +194,9 @@ static int Main() if (UInt64_MultiplyWith17(1) != 17) return 0; + if (UInt64_MultiplyWith5_AddressExposed(1) != 5) + return 0; + return 100; } } From 164f233914409f1714d2a4132738eda228ec586c Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 17:28:16 -0700 Subject: [PATCH 35/53] Added TryLowerLshWithConstant --- src/coreclr/jit/lower.cpp | 40 +++++++++++++++++++--- src/coreclr/jit/lower.h | 9 ++++- src/coreclr/jit/lowerxarch.cpp | 61 ++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 0d659452800754..00ae2a1c166e60 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -297,7 +297,7 @@ GenTree* Lowering::LowerNode(GenTree* node) case GT_RSH: case GT_RSZ: #if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) - LowerShift(node->AsOp()); + return LowerShift(node->AsOp()); #else ContainCheckShiftRotate(node->AsOp()); #endif @@ -5409,16 +5409,30 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par BlockRange().Remove(unused); - if (unused->OperIs(GT_ADD, GT_MUL, GT_LSH)) + if (unused->OperIs(GT_ADD, GT_MUL, GT_LSH, GT_LEA, GT_LCL_VAR)) { // Push the first operand and loop back to process the second one. // This minimizes the stack depth because the second one tends to be // a constant so it gets processed and then the first one gets popped. - unusedStack.Push(unused->AsOp()->gtGetOp1()); - unused = unused->AsOp()->gtGetOp2(); + if (unused->OperIs(GT_LEA)) + { + assert(unused->gtGetOp2()->OperIs(GT_LCL_VAR)); + BlockRange().Remove(unused->gtGetOp2()); + unused = nullptr; + } + else if (unused->OperIs(GT_LCL_VAR)) + { + unused = nullptr; + } + else + { + unusedStack.Push(unused->AsOp()->gtGetOp1()); + unused = unused->AsOp()->gtGetOp2(); + } } else { + assert(!unused->OperIs(GT_LCL_VAR)); assert(unused->OperIs(GT_CNS_INT)); break; } @@ -6159,11 +6173,14 @@ GenTree* Lowering::LowerSignedDivOrMod(GenTree* node) // Arguments: // shift - the shift node (GT_LSH, GT_RSH or GT_RSZ) // +// Return Value: +// The next node to lower. +// // Notes: // Remove unnecessary shift count masking, xarch shift instructions // mask the shift count to 5 bits (or 6 bits for 64 bit operations). -void Lowering::LowerShift(GenTreeOp* shift) +GenTree* Lowering::LowerShift(GenTreeOp* shift) { assert(shift->OperIs(GT_LSH, GT_RSH, GT_RSZ)); @@ -6198,6 +6215,17 @@ void Lowering::LowerShift(GenTreeOp* shift) shift->gtOp2->ClearContained(); } +#ifdef TARGET_XARCH + if (shift->OperIs(GT_LSH)) + { + GenTree* replacementNode = TryLowerLshWithConstant(shift); + if (replacementNode != nullptr) + { + return replacementNode->gtNext; + } + } +#endif // TARGET_XARCH + ContainCheckShiftRotate(shift); #ifdef TARGET_ARM64 @@ -6228,6 +6256,8 @@ void Lowering::LowerShift(GenTreeOp* shift) } } #endif + + return shift->gtNext; } void Lowering::WidenSIMD12IfNecessary(GenTreeLclVarCommon* node) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 2e3f90828e62ef..aefab0ff6ece56 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -210,6 +210,12 @@ class Lowering final : public Phase return new (comp, GT_LEA) GenTreeAddrMode(resultType, base, nullptr, 0, offset); } + GenTree* IndexWithScale(GenTree* index, unsigned scale) + { + var_types resultType = (index->TypeGet() == TYP_REF) ? TYP_BYREF : index->TypeGet(); + return new (comp, GT_LEA) GenTreeAddrMode(resultType, nullptr, index, scale, 0); + } + GenTree* OffsetByIndex(GenTree* base, GenTree* index) { var_types resultType = (base->TypeGet() == TYP_REF) ? TYP_BYREF : base->TypeGet(); @@ -320,6 +326,7 @@ class Lowering final : public Phase #ifdef TARGET_XARCH void LowerPutArgStk(GenTreePutArgStk* putArgStk); GenTree* TryLowerMulWithConstant(GenTreeOp* node); + GenTree* TryLowerLshWithConstant(GenTreeOp* node); #endif // TARGET_XARCH bool TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent); @@ -344,7 +351,7 @@ class Lowering final : public Phase void LowerStoreLoc(GenTreeLclVarCommon* tree); GenTree* LowerArrElem(GenTreeArrElem* arrElem); void LowerRotate(GenTree* tree); - void LowerShift(GenTreeOp* shift); + GenTree* LowerShift(GenTreeOp* shift); #ifdef FEATURE_SIMD void LowerSIMD(GenTreeSIMD* simdNode); #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index ccbd1997198b09..00b823e73113da 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -271,6 +271,67 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) return node; } +//---------------------------------------------------------------------------------------------- +// Lowering::TryLowerLshWithConstant: +// Lowers a tree LSH(X, CNS) to LEA(X, CNS) +// +// Arguments: +// node - GT_LSH node of integral type +// +// Return Value: +// Returns the replacement node if one is created else nullptr indicating no replacement +// +// Notes: +// Performs containment checks on the replacement node if one is created +GenTree* Lowering::TryLowerLshWithConstant(GenTreeOp* node) +{ + assert(node->OperIs(GT_LSH)); + + if (comp->opts.MinOpts()) + return nullptr; + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + if (op1->isContained() || op2->isContained()) + return nullptr; + + if (!op2->IsCnsIntOrI()) + return nullptr; + + GenTreeIntConCommon* cns = op2->AsIntConCommon(); + ssize_t cnsVal = cns->IconValue(); + + if (cnsVal == 2) + { + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + LIR::Use op1Use(BlockRange(), &node->gtOp1, node); + op1 = ReplaceWithLclVar(op1Use); + + GenTree* lea = IndexWithScale(op1, static_cast(cnsVal * 2)); + + BlockRange().Remove(cns); + BlockRange().Remove(op1); + BlockRange().InsertBefore(node, lea->gtGetOp2()); + BlockRange().InsertBefore(node, lea); + + use.ReplaceWith(lea); + + BlockRange().Remove(node); + + return lea; + } + else + { + return nullptr; + } + } + + return nullptr; +} + //------------------------------------------------------------------------ // LowerMul: Lower a GT_MUL/GT_MULHI/GT_MUL_LONG node. // From 88311d6f97363f24e6dde5664b16e231d2c6ee8f Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 17:34:22 -0700 Subject: [PATCH 36/53] Cleanup --- src/coreclr/jit/lower.cpp | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 00ae2a1c166e60..42154945677e83 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5409,30 +5409,16 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par BlockRange().Remove(unused); - if (unused->OperIs(GT_ADD, GT_MUL, GT_LSH, GT_LEA, GT_LCL_VAR)) + if (unused->OperIs(GT_ADD, GT_MUL, GT_LSH)) { // Push the first operand and loop back to process the second one. // This minimizes the stack depth because the second one tends to be // a constant so it gets processed and then the first one gets popped. - if (unused->OperIs(GT_LEA)) - { - assert(unused->gtGetOp2()->OperIs(GT_LCL_VAR)); - BlockRange().Remove(unused->gtGetOp2()); - unused = nullptr; - } - else if (unused->OperIs(GT_LCL_VAR)) - { - unused = nullptr; - } - else - { - unusedStack.Push(unused->AsOp()->gtGetOp1()); - unused = unused->AsOp()->gtGetOp2(); - } + unusedStack.Push(unused->AsOp()->gtGetOp1()); + unused = unused->AsOp()->gtGetOp2(); } else { - assert(!unused->OperIs(GT_LCL_VAR)); assert(unused->OperIs(GT_CNS_INT)); break; } From 160c68e2095aa434e645443dece4e562bda8bef5 Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 18:05:06 -0700 Subject: [PATCH 37/53] Removed TryLowerLshWithConstant --- src/coreclr/jit/codegenxarch.cpp | 5 +++ src/coreclr/jit/lower.cpp | 11 ------ src/coreclr/jit/lower.h | 1 - src/coreclr/jit/lowerxarch.cpp | 61 -------------------------------- 4 files changed, 5 insertions(+), 73 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e5765eac5e741f..02523b9683ca90 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -4434,6 +4434,11 @@ void CodeGen::genCodeForShift(GenTree* tree) GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0); } } + else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(2) && + tree->GetRegNum() != operandReg) + { + GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0); + } else { int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue(); diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 42154945677e83..49f211e7b81026 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -6201,17 +6201,6 @@ GenTree* Lowering::LowerShift(GenTreeOp* shift) shift->gtOp2->ClearContained(); } -#ifdef TARGET_XARCH - if (shift->OperIs(GT_LSH)) - { - GenTree* replacementNode = TryLowerLshWithConstant(shift); - if (replacementNode != nullptr) - { - return replacementNode->gtNext; - } - } -#endif // TARGET_XARCH - ContainCheckShiftRotate(shift); #ifdef TARGET_ARM64 diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index aefab0ff6ece56..af55d5d3e8f15b 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -326,7 +326,6 @@ class Lowering final : public Phase #ifdef TARGET_XARCH void LowerPutArgStk(GenTreePutArgStk* putArgStk); GenTree* TryLowerMulWithConstant(GenTreeOp* node); - GenTree* TryLowerLshWithConstant(GenTreeOp* node); #endif // TARGET_XARCH bool TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 00b823e73113da..ccbd1997198b09 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -271,67 +271,6 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) return node; } -//---------------------------------------------------------------------------------------------- -// Lowering::TryLowerLshWithConstant: -// Lowers a tree LSH(X, CNS) to LEA(X, CNS) -// -// Arguments: -// node - GT_LSH node of integral type -// -// Return Value: -// Returns the replacement node if one is created else nullptr indicating no replacement -// -// Notes: -// Performs containment checks on the replacement node if one is created -GenTree* Lowering::TryLowerLshWithConstant(GenTreeOp* node) -{ - assert(node->OperIs(GT_LSH)); - - if (comp->opts.MinOpts()) - return nullptr; - - GenTree* op1 = node->gtGetOp1(); - GenTree* op2 = node->gtGetOp2(); - - if (op1->isContained() || op2->isContained()) - return nullptr; - - if (!op2->IsCnsIntOrI()) - return nullptr; - - GenTreeIntConCommon* cns = op2->AsIntConCommon(); - ssize_t cnsVal = cns->IconValue(); - - if (cnsVal == 2) - { - LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) - { - LIR::Use op1Use(BlockRange(), &node->gtOp1, node); - op1 = ReplaceWithLclVar(op1Use); - - GenTree* lea = IndexWithScale(op1, static_cast(cnsVal * 2)); - - BlockRange().Remove(cns); - BlockRange().Remove(op1); - BlockRange().InsertBefore(node, lea->gtGetOp2()); - BlockRange().InsertBefore(node, lea); - - use.ReplaceWith(lea); - - BlockRange().Remove(node); - - return lea; - } - else - { - return nullptr; - } - } - - return nullptr; -} - //------------------------------------------------------------------------ // LowerMul: Lower a GT_MUL/GT_MULHI/GT_MUL_LONG node. // From 4c9f7d3745b9a7208b9236fc7440587afef96527 Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 20 Oct 2022 18:31:18 -0700 Subject: [PATCH 38/53] Handle GT_LSH to emit lea if const is 3 --- src/coreclr/jit/codegenxarch.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 02523b9683ca90..b8c5e46f57c600 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -4439,6 +4439,11 @@ void CodeGen::genCodeForShift(GenTree* tree) { GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0); } + else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(3) && + tree->GetRegNum() != operandReg) + { + GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 8, 0); + } else { int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue(); From a80dd11c944596e6ed7c42ffc575811bac7e0422 Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 10:15:13 -0700 Subject: [PATCH 39/53] Added tests. --- src/coreclr/jit/codegenxarch.cpp | 16 +- src/coreclr/jit/lowerxarch.cpp | 67 +------ src/tests/JIT/opt/Multiply/IntMultiply.cs | 163 +++++++++++++++++- src/tests/JIT/opt/Multiply/IntMultiply.csproj | 4 +- 4 files changed, 176 insertions(+), 74 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index b8c5e46f57c600..b472f752b122ba 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1087,9 +1087,19 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) { ssize_t imm = immOp->AsIntConCommon()->IconValue(); - // use the 3-op form with immediate - ins = GetEmitter()->inst3opImulForReg(targetReg); - emit->emitInsBinary(ins, size, rmOp, immOp); + if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9))) + { + // We will use the LEA instruction to perform this multiply + // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. + unsigned int scale = (unsigned int)(imm - 1); + GetEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->GetRegNum(), rmOp->GetRegNum(), scale, 0); + } + else + { + // use the 3-op form with immediate + ins = GetEmitter()->inst3opImulForReg(targetReg); + emit->emitInsBinary(ins, size, rmOp, immOp); + } } else // we have no contained immediate operand { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index ccbd1997198b09..0fce8cd63bfc91 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -109,8 +109,6 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) // Lowering::TryLowerMulWithConstant: // Lowers a tree MUL(X, CNS) to LSH(X, CNS_SHIFT) // or -// Lowers a tree MUL(X, CNS) to LEA(X, X, CNS_MINUS_ONE) -// or // Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X) // or // Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) @@ -151,6 +149,11 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) GenTreeIntConCommon* cns = op2->AsIntConCommon(); ssize_t cnsVal = cns->IconValue(); + // Use GT_LEA if cnsVal is 3, 5, or 9. + // These are handled in codegen. + if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9) + return nullptr; + // Use GT_LSH if cnsVal is a power of two. if (isPow2(cnsVal)) { @@ -165,66 +168,6 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) return node; } - // Use GT_LEA if cnsVal is 3, 5, or 9. - if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9) - { - LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) - { - LIR::Use op1Use(BlockRange(), &node->gtOp1, node); - op1 = ReplaceWithLclVar(op1Use); - - unsigned int scale = (unsigned int)(cnsVal - 1); - - // If the local is not enregisterable, create a new tmp local - // that only gets used as the index and base of GT_LEA. - // LSRA should optimize this to ensure that the index and base will - // be in the same register. - if (!comp->lvaGetDesc(op1->AsLclVar())->IsEnregisterableLcl()) - { - unsigned lclNumTmp = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp")); - GenTree* lclvNodeStore = comp->gtNewTempAssign(lclNumTmp, op1); - GenTree* tmpTree = comp->gtNewLclvNode(lclNumTmp, op1->TypeGet()); - - GenTree* lea = OffsetByIndexWithScale(tmpTree, comp->gtClone(tmpTree), scale); - - BlockRange().Remove(cns); - BlockRange().Remove(op1); - BlockRange().InsertBefore(node, op1); - BlockRange().InsertBefore(node, lclvNodeStore); - BlockRange().InsertBefore(node, lea->gtGetOp2()); - BlockRange().InsertBefore(node, lea->gtGetOp1()); - BlockRange().InsertBefore(node, lea); - - use.ReplaceWith(lea); - - BlockRange().Remove(node); - - return lea; - } - else - { - GenTree* lea = OffsetByIndexWithScale(op1, comp->gtClone(op1), scale); - - BlockRange().Remove(cns); - BlockRange().Remove(op1); - BlockRange().InsertBefore(node, lea->gtGetOp2()); - BlockRange().InsertBefore(node, lea->gtGetOp1()); - BlockRange().InsertBefore(node, lea); - - use.ReplaceWith(lea); - - BlockRange().Remove(node); - - return lea; - } - } - else - { - return nullptr; - } - } - // We do not do this optimization in X86 as it is not recommended. #if TARGET_X86 return nullptr; diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs index 39a3c9397f0912..fb4e18943e060b 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.cs +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -59,8 +59,7 @@ static ulong UInt64_MultiplyWith3(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith4(ulong value) { - // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-NEXT: shl [[REG0]], 2 + // X64: lea [[REG0:[a-z]+]], {{\[}}4*[[REG1:[a-z]+]]{{\]}} return value * 4; } @@ -82,8 +81,7 @@ static ulong UInt64_MultiplyWith6(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith7(ulong value) { - // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-NEXT: shl [[REG0]], 3 + // X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1]]{{\]}} // X64-NEXT: sub [[REG0]], [[REG1]] return value * 7; } @@ -91,8 +89,7 @@ static ulong UInt64_MultiplyWith7(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith8(ulong value) { - // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-NEXT: shl [[REG0]], 3 + // X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1]]{{\]}} return value * 8; } @@ -103,9 +100,54 @@ static ulong UInt64_MultiplyWith9(ulong value) return value * 9; } + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith10(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}} + // X64-NEXT: add [[REG0]], [[REG0]] + return value * 10; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith11(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 11; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith12(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} + // X64-NEXT: shl [[REG0]], 2 + return value * 12; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith13(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 13; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith14(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 4 instructions which is too slow. + + // X64: imul + return value * 14; + } + [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith15(ulong value) { + // We expect these instructions since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] // X64-NEXT: shl [[REG0]], 4 // X64-NEXT: sub [[REG0]], [[REG1]] @@ -129,7 +171,75 @@ static ulong UInt64_MultiplyWith17(ulong value) return value * 17; } - + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith18(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+8*[[REG1]]{{\]}} + // X64-NEXT: add [[REG0]], [[REG0]] + return value * 18; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith19(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 19; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith20(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}} + // X64-NEXT: shl [[REG0]], 2 + return value * 20; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith21(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 21; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith22(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions and 1 ADD instruction which is slower. + + // X64: imul + return value * 22; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith23(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 1 three-component LEA instructions, 1 SHL instruction, and 1 ADD instruction which is slower. + + // X64: imul + return value * 23; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith24(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}} + // X64-NEXT: shl [[REG0]], 3 + return value * 24; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith25(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 25; + } + [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith5_AddressExposed(ulong value) { @@ -185,6 +295,21 @@ static int Main() if (UInt64_MultiplyWith9(1) != 9) return 0; + if (UInt64_MultiplyWith10(1) != 10) + return 0; + + if (UInt64_MultiplyWith11(1) != 11) + return 0; + + if (UInt64_MultiplyWith12(1) != 12) + return 0; + + if (UInt64_MultiplyWith13(1) != 13) + return 0; + + if (UInt64_MultiplyWith14(1) != 14) + return 0; + if (UInt64_MultiplyWith15(1) != 15) return 0; @@ -194,6 +319,30 @@ static int Main() if (UInt64_MultiplyWith17(1) != 17) return 0; + if (UInt64_MultiplyWith18(1) != 18) + return 0; + + if (UInt64_MultiplyWith19(1) != 19) + return 0; + + if (UInt64_MultiplyWith20(1) != 20) + return 0; + + if (UInt64_MultiplyWith21(1) != 21) + return 0; + + if (UInt64_MultiplyWith22(1) != 22) + return 0; + + if (UInt64_MultiplyWith23(1) != 23) + return 0; + + if (UInt64_MultiplyWith24(1) != 24) + return 0; + + if (UInt64_MultiplyWith25(1) != 25) + return 0; + if (UInt64_MultiplyWith5_AddressExposed(1) != 5) return 0; diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.csproj b/src/tests/JIT/opt/Multiply/IntMultiply.csproj index 43448e90aa4688..42a89c8384d74e 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.csproj +++ b/src/tests/JIT/opt/Multiply/IntMultiply.csproj @@ -11,7 +11,7 @@ true - - + + From d8b4228ac73c5e20e834e5ddf40f470bd0da9c13 Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 10:20:41 -0700 Subject: [PATCH 40/53] Updated comments --- src/coreclr/jit/codegenxarch.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index b472f752b122ba..9a79866bcffdf0 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -4444,11 +4444,13 @@ void CodeGen::genCodeForShift(GenTree* tree) GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0); } } + // Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will remove a 'mov'. else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(2) && tree->GetRegNum() != operandReg) { GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0); } + // Optimize "X<<2" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will remove a 'mov'. else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg) { From e7b140d2d8e9ee4a3a3611db6ef4727be92050e1 Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 10:22:35 -0700 Subject: [PATCH 41/53] Revert LowerShift --- src/coreclr/jit/lower.cpp | 7 +------ src/coreclr/jit/lower.h | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 49f211e7b81026..947ecea3b103f4 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -6159,14 +6159,11 @@ GenTree* Lowering::LowerSignedDivOrMod(GenTree* node) // Arguments: // shift - the shift node (GT_LSH, GT_RSH or GT_RSZ) // -// Return Value: -// The next node to lower. -// // Notes: // Remove unnecessary shift count masking, xarch shift instructions // mask the shift count to 5 bits (or 6 bits for 64 bit operations). -GenTree* Lowering::LowerShift(GenTreeOp* shift) +void Lowering::LowerShift(GenTreeOp* shift) { assert(shift->OperIs(GT_LSH, GT_RSH, GT_RSZ)); @@ -6231,8 +6228,6 @@ GenTree* Lowering::LowerShift(GenTreeOp* shift) } } #endif - - return shift->gtNext; } void Lowering::WidenSIMD12IfNecessary(GenTreeLclVarCommon* node) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index af55d5d3e8f15b..9650d58cccc403 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -350,7 +350,7 @@ class Lowering final : public Phase void LowerStoreLoc(GenTreeLclVarCommon* tree); GenTree* LowerArrElem(GenTreeArrElem* arrElem); void LowerRotate(GenTree* tree); - GenTree* LowerShift(GenTreeOp* shift); + void LowerShift(GenTreeOp* shift); #ifdef FEATURE_SIMD void LowerSIMD(GenTreeSIMD* simdNode); #endif // FEATURE_SIMD From 384ff0bca7b6085324da980dab80670d10aff566 Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 10:23:39 -0700 Subject: [PATCH 42/53] Remove IndexWithScale --- src/coreclr/jit/lower.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 9650d58cccc403..2e3f90828e62ef 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -210,12 +210,6 @@ class Lowering final : public Phase return new (comp, GT_LEA) GenTreeAddrMode(resultType, base, nullptr, 0, offset); } - GenTree* IndexWithScale(GenTree* index, unsigned scale) - { - var_types resultType = (index->TypeGet() == TYP_REF) ? TYP_BYREF : index->TypeGet(); - return new (comp, GT_LEA) GenTreeAddrMode(resultType, nullptr, index, scale, 0); - } - GenTree* OffsetByIndex(GenTree* base, GenTree* index) { var_types resultType = (base->TypeGet() == TYP_REF) ? TYP_BYREF : base->TypeGet(); From a935699b96400b7ee3be436a8a407fd29e1e191b Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 10:24:28 -0700 Subject: [PATCH 43/53] Fix build --- src/coreclr/jit/lower.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 947ecea3b103f4..0d659452800754 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -297,7 +297,7 @@ GenTree* Lowering::LowerNode(GenTree* node) case GT_RSH: case GT_RSZ: #if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) - return LowerShift(node->AsOp()); + LowerShift(node->AsOp()); #else ContainCheckShiftRotate(node->AsOp()); #endif From 21335a6fa56603f5e9b2cd0603d03c2fdc0c2425 Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 10:25:01 -0700 Subject: [PATCH 44/53] Update comment --- src/coreclr/jit/codegenxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 9a79866bcffdf0..a96f351d835d07 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -4450,7 +4450,7 @@ void CodeGen::genCodeForShift(GenTree* tree) { GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0); } - // Optimize "X<<2" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will remove a 'mov'. + // Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will remove a 'mov'. else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg) { From 05ac3f70f700e25cba642008b74d0241185b7dbc Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 10:48:19 -0700 Subject: [PATCH 45/53] Formatting --- src/coreclr/jit/codegenxarch.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index a96f351d835d07..01ff4ddd5ff056 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -4444,13 +4444,15 @@ void CodeGen::genCodeForShift(GenTree* tree) GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0); } } - // Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will remove a 'mov'. + // Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will + // remove a 'mov'. else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(2) && tree->GetRegNum() != operandReg) { GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0); } - // Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will remove a 'mov'. + // Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will + // remove a 'mov'. else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg) { From 85ca0f4f0babc629c3c54b8909d87425909e8550 Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 14:39:04 -0700 Subject: [PATCH 46/53] Update comments. Fix disasm test. --- src/coreclr/jit/lowerxarch.cpp | 4 +--- src/tests/JIT/opt/Multiply/IntMultiply.cs | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 0fce8cd63bfc91..b966f86dff66df 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -125,9 +125,7 @@ GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) { assert(node->OperIs(GT_MUL)); - // Do not do this optimization with min-opts enabled as - // this could create more tmp locals that need to be optimized - // in LSRA. + // Do not do these optimizations when min-opts enabled. if (comp->opts.MinOpts()) return nullptr; diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs index fb4e18943e060b..60cc2ca9db93df 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.cs +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -81,7 +81,7 @@ static ulong UInt64_MultiplyWith6(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith7(ulong value) { - // X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1]]{{\]}} + // X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1:[a-z]+]]{{\]}} // X64-NEXT: sub [[REG0]], [[REG1]] return value * 7; } From e633fefb5e04315631b99f8e06b16a4ee6dda5dd Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 14:45:40 -0700 Subject: [PATCH 47/53] Another minor test change --- src/tests/JIT/opt/Multiply/IntMultiply.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs index 60cc2ca9db93df..f85632ad417900 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.cs +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -245,7 +245,7 @@ static ulong UInt64_MultiplyWith5_AddressExposed(ulong value) { // X64: mov [[REG0:[a-z]+]], qword ptr // X64-NOT: mov - // X64-NEXT: lea [[REG1:[a-z]+]], {{\[}}[[REG0:[a-z]+]]+4*[[REG0]]{{\]}} + // X64-NEXT: lea [[REG1:[a-z]+]], {{\[}}[[REG0]]+4*[[REG0]]{{\]}} var value2 = value * 5; UInt64_AddressExposed(ref value); return value2; From 5a459d882c36a40aa04cb7c796a14b7af53d73fb Mon Sep 17 00:00:00 2001 From: TIHan Date: Fri, 21 Oct 2022 18:00:04 -0700 Subject: [PATCH 48/53] Fix test --- src/tests/JIT/opt/Multiply/IntMultiply.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs index f85632ad417900..3dd4aa76e36b5a 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.cs +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -89,7 +89,7 @@ static ulong UInt64_MultiplyWith7(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith8(ulong value) { - // X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1]]{{\]}} + // X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1:[a-z]+]]{{\]}} return value * 8; } From f40aa1cf13e350bf42b8285b046d5f42b2b96fed Mon Sep 17 00:00:00 2001 From: TIHan Date: Sat, 22 Oct 2022 11:51:08 -0700 Subject: [PATCH 49/53] Fix test --- src/tests/JIT/opt/Multiply/IntMultiply.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs index 3dd4aa76e36b5a..a6e62b374e4d55 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.cs +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -226,7 +226,7 @@ static ulong UInt64_MultiplyWith23(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith24(ulong value) { - // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}} + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} // X64-NEXT: shl [[REG0]], 3 return value * 24; } From 19d26d18f48241e2c5b767b2574d565e3826f962 Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 25 Oct 2022 13:24:22 -0700 Subject: [PATCH 50/53] Feedback --- src/coreclr/jit/codegenxarch.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 01ff4ddd5ff056..3f6ea659bdfb8f 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -4432,8 +4432,10 @@ void CodeGen::genCodeForShift(GenTree* tree) { emitAttr size = emitTypeSize(tree); + bool mightOptimizeLsh = tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags(); + // Optimize "X<<1" to "lea [reg+reg]" or "add reg, reg" - if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(1)) + if (mightOptimizeLsh && shiftBy->IsIntegralConst(1)) { if (tree->GetRegNum() == operandReg) { @@ -4446,15 +4448,13 @@ void CodeGen::genCodeForShift(GenTree* tree) } // Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will // remove a 'mov'. - else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(2) && - tree->GetRegNum() != operandReg) + else if (mightOptimizeLsh && shiftBy->IsIntegralConst(2) && tree->GetRegNum() != operandReg) { GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0); } // Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will // remove a 'mov'. - else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(3) && - tree->GetRegNum() != operandReg) + else if (mightOptimizeLsh && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg) { GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 8, 0); } From 923d10b4b366586e9f92b4f8bf666f8e72051e88 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Fri, 28 Oct 2022 13:06:40 -0700 Subject: [PATCH 51/53] Update IntMultiply.cs --- src/tests/JIT/opt/Multiply/IntMultiply.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs index a6e62b374e4d55..b67ef6cf5d2d8e 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.cs +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -217,7 +217,7 @@ static ulong UInt64_MultiplyWith22(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith23(ulong value) { - // We expect 'imul' since the alternative replacement sequence would require 1 three-component LEA instructions, 1 SHL instruction, and 1 ADD instruction which is slower. + // We expect 'imul' since the alternative replacement sequence would require 1 three-component LEA instruction, 1 SHL instruction, and 1 ADD instruction which is slower. // X64: imul return value * 23; From 5ac902d7dd583fbe07b360a71e57a337d8a1b7a3 Mon Sep 17 00:00:00 2001 From: Will Smith Date: Mon, 31 Oct 2022 14:55:56 -0700 Subject: [PATCH 52/53] Update codegenxarch.cpp --- src/coreclr/jit/codegenxarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 3f6ea659bdfb8f..a09e3c116fb701 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -4446,13 +4446,13 @@ void CodeGen::genCodeForShift(GenTree* tree) GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0); } } - // Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will + // Optimize "X<<2" to "lea [reg*4]" -- we only do this when the dst and src registers are different since it will // remove a 'mov'. else if (mightOptimizeLsh && shiftBy->IsIntegralConst(2) && tree->GetRegNum() != operandReg) { GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0); } - // Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will + // Optimize "X<<3" to "lea [reg*8]" -- we only do this when the dst and src registers are different since it will // remove a 'mov'. else if (mightOptimizeLsh && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg) { From c8fb73b69388a60940780fbeef187230f1a0b90a Mon Sep 17 00:00:00 2001 From: Will Smith Date: Tue, 1 Nov 2022 16:24:48 -0700 Subject: [PATCH 53/53] Update codegenxarch.cpp --- src/coreclr/jit/codegenxarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index a09e3c116fb701..3f6ea659bdfb8f 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -4446,13 +4446,13 @@ void CodeGen::genCodeForShift(GenTree* tree) GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0); } } - // Optimize "X<<2" to "lea [reg*4]" -- we only do this when the dst and src registers are different since it will + // Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will // remove a 'mov'. else if (mightOptimizeLsh && shiftBy->IsIntegralConst(2) && tree->GetRegNum() != operandReg) { GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0); } - // Optimize "X<<3" to "lea [reg*8]" -- we only do this when the dst and src registers are different since it will + // Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will // remove a 'mov'. else if (mightOptimizeLsh && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg) {