diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index b634ceab5ff0df..3f6ea659bdfb8f 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1085,7 +1085,6 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) if (immOp != nullptr) { - // CQ: When possible use LEA for mul by imm 3, 5 or 9 ssize_t imm = immOp->AsIntConCommon()->IconValue(); if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9))) @@ -1095,17 +1094,6 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) unsigned int scale = (unsigned int)(imm - 1); GetEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->GetRegNum(), rmOp->GetRegNum(), scale, 0); } - else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm == genFindLowestBit(imm)) && (imm != 0)) - { - // Use shift for constant multiply when legal - uint64_t zextImm = static_cast(static_cast(imm)); - unsigned int shiftAmount = genLog2(zextImm); - - // Copy reg src to dest register - inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true); - - inst_RV_SH(INS_shl, size, targetReg, shiftAmount); - } else { // use the 3-op form with immediate @@ -4444,8 +4432,10 @@ void CodeGen::genCodeForShift(GenTree* tree) { emitAttr size = emitTypeSize(tree); + bool mightOptimizeLsh = tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags(); + // Optimize "X<<1" to "lea [reg+reg]" or "add reg, reg" - if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(1)) + if (mightOptimizeLsh && shiftBy->IsIntegralConst(1)) { if (tree->GetRegNum() == operandReg) { @@ -4456,6 +4446,18 @@ void CodeGen::genCodeForShift(GenTree* tree) GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0); } } + // Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will + // remove a 'mov'. + else if (mightOptimizeLsh && shiftBy->IsIntegralConst(2) && tree->GetRegNum() != operandReg) + { + GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0); + } + // Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will + // remove a 'mov'. + else if (mightOptimizeLsh && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg) + { + GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 8, 0); + } else { int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue(); diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 30b12a38729e69..2e3f90828e62ef 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -319,7 +319,7 @@ class Lowering final : public Phase void LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode); #ifdef TARGET_XARCH void LowerPutArgStk(GenTreePutArgStk* putArgStk); - GenTree* TryLowerMulToLshSubOrLshAdd(GenTreeOp* node); + GenTree* TryLowerMulWithConstant(GenTreeOp* node); #endif // TARGET_XARCH bool TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 72d9720c5ec3dd..b966f86dff66df 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -106,7 +106,9 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) } //---------------------------------------------------------------------------------------------- -// Lowering::TryLowerMulToLshSubOrLshAdd: +// Lowering::TryLowerMulWithConstant: +// Lowers a tree MUL(X, CNS) to LSH(X, CNS_SHIFT) +// or // Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X) // or // Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X) @@ -119,14 +121,14 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) // // Notes: // Performs containment checks on the replacement node if one is created -GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node) +GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node) { assert(node->OperIs(GT_MUL)); -// We do not do this optimization in X86 as it is not recommended. -#if TARGET_X86 - return nullptr; -#endif // TARGET_X86 + // Do not do these optimizations when min-opts enabled. + if (comp->opts.MinOpts()) + return nullptr; + if (!varTypeIsIntegral(node)) return nullptr; @@ -139,25 +141,36 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node) if (op1->isContained() || op2->isContained()) return nullptr; - if (!op1->OperIs(GT_LCL_VAR)) - return nullptr; - if (!op2->IsCnsIntOrI()) return nullptr; GenTreeIntConCommon* cns = op2->AsIntConCommon(); ssize_t cnsVal = cns->IconValue(); - // Use GT_LSH if cnsVal is a power of two. - // This is handled in codegen. - if (isPow2(cnsVal)) - return nullptr; - // Use GT_LEA if cnsVal is 3, 5, or 9. - // This is handled in codegen. + // These are handled in codegen. if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9) return nullptr; + // Use GT_LSH if cnsVal is a power of two. + if (isPow2(cnsVal)) + { + // Use shift for constant multiply when legal + unsigned int shiftAmount = genLog2(static_cast(static_cast(cnsVal))); + + cns->SetIconValue(shiftAmount); + node->ChangeOper(GT_LSH); + + ContainCheckShiftRotate(node); + + return node; + } + +// We do not do this optimization in X86 as it is not recommended. +#if TARGET_X86 + return nullptr; +#endif // TARGET_X86 + ssize_t cnsValPlusOne = cnsVal + 1; ssize_t cnsValMinusOne = cnsVal - 1; @@ -166,6 +179,9 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node) if (!useSub && !isPow2(cnsValMinusOne)) return nullptr; + LIR::Use op1Use(BlockRange(), &node->gtOp1, node); + op1 = ReplaceWithLclVar(op1Use); + if (useSub) { cnsVal = cnsValPlusOne; @@ -213,7 +229,7 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul) if (mul->OperIs(GT_MUL)) { - GenTree* replacementNode = TryLowerMulToLshSubOrLshAdd(mul); + GenTree* replacementNode = TryLowerMulWithConstant(mul); if (replacementNode != nullptr) { return replacementNode->gtNext; diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs index 1bb274f0df9765..b67ef6cf5d2d8e 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.cs +++ b/src/tests/JIT/opt/Multiply/IntMultiply.cs @@ -59,8 +59,7 @@ static ulong UInt64_MultiplyWith3(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith4(ulong value) { - // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-NEXT: shl [[REG0]], 2 + // X64: lea [[REG0:[a-z]+]], {{\[}}4*[[REG1:[a-z]+]]{{\]}} return value * 4; } @@ -82,8 +81,7 @@ static ulong UInt64_MultiplyWith6(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith7(ulong value) { - // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-NEXT: shl [[REG0]], 3 + // X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1:[a-z]+]]{{\]}} // X64-NEXT: sub [[REG0]], [[REG1]] return value * 7; } @@ -91,8 +89,7 @@ static ulong UInt64_MultiplyWith7(ulong value) [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith8(ulong value) { - // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] - // X64-NEXT: shl [[REG0]], 3 + // X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1:[a-z]+]]{{\]}} return value * 8; } @@ -103,9 +100,54 @@ static ulong UInt64_MultiplyWith9(ulong value) return value * 9; } + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith10(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}} + // X64-NEXT: add [[REG0]], [[REG0]] + return value * 10; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith11(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 11; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith12(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} + // X64-NEXT: shl [[REG0]], 2 + return value * 12; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith13(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 13; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith14(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 4 instructions which is too slow. + + // X64: imul + return value * 14; + } + [MethodImpl(MethodImplOptions.NoInlining)] static ulong UInt64_MultiplyWith15(ulong value) { + // We expect these instructions since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + // X64: mov [[REG0:[a-z]+]], [[REG1:[a-z]+]] // X64-NEXT: shl [[REG0]], 4 // X64-NEXT: sub [[REG0]], [[REG1]] @@ -129,6 +171,92 @@ static ulong UInt64_MultiplyWith17(ulong value) return value * 17; } + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith18(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+8*[[REG1]]{{\]}} + // X64-NEXT: add [[REG0]], [[REG0]] + return value * 18; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith19(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 19; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith20(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}} + // X64-NEXT: shl [[REG0]], 2 + return value * 20; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith21(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 21; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith22(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions and 1 ADD instruction which is slower. + + // X64: imul + return value * 22; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith23(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 1 three-component LEA instruction, 1 SHL instruction, and 1 ADD instruction which is slower. + + // X64: imul + return value * 23; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith24(ulong value) + { + // X64: lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}} + // X64-NEXT: shl [[REG0]], 3 + return value * 24; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith25(ulong value) + { + // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower. + + // X64: imul + return value * 25; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static ulong UInt64_MultiplyWith5_AddressExposed(ulong value) + { + // X64: mov [[REG0:[a-z]+]], qword ptr + // X64-NOT: mov + // X64-NEXT: lea [[REG1:[a-z]+]], {{\[}}[[REG0]]+4*[[REG0]]{{\]}} + var value2 = value * 5; + UInt64_AddressExposed(ref value); + return value2; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void UInt64_AddressExposed(ref ulong value) + { + + } + static int Main() { if (UInt32_MultiplyWithUInt32MaxValue(1) != UInt32.MaxValue) @@ -167,6 +295,21 @@ static int Main() if (UInt64_MultiplyWith9(1) != 9) return 0; + if (UInt64_MultiplyWith10(1) != 10) + return 0; + + if (UInt64_MultiplyWith11(1) != 11) + return 0; + + if (UInt64_MultiplyWith12(1) != 12) + return 0; + + if (UInt64_MultiplyWith13(1) != 13) + return 0; + + if (UInt64_MultiplyWith14(1) != 14) + return 0; + if (UInt64_MultiplyWith15(1) != 15) return 0; @@ -176,6 +319,33 @@ static int Main() if (UInt64_MultiplyWith17(1) != 17) return 0; + if (UInt64_MultiplyWith18(1) != 18) + return 0; + + if (UInt64_MultiplyWith19(1) != 19) + return 0; + + if (UInt64_MultiplyWith20(1) != 20) + return 0; + + if (UInt64_MultiplyWith21(1) != 21) + return 0; + + if (UInt64_MultiplyWith22(1) != 22) + return 0; + + if (UInt64_MultiplyWith23(1) != 23) + return 0; + + if (UInt64_MultiplyWith24(1) != 24) + return 0; + + if (UInt64_MultiplyWith25(1) != 25) + return 0; + + if (UInt64_MultiplyWith5_AddressExposed(1) != 5) + return 0; + return 100; } } diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.csproj b/src/tests/JIT/opt/Multiply/IntMultiply.csproj index 43448e90aa4688..42a89c8384d74e 100644 --- a/src/tests/JIT/opt/Multiply/IntMultiply.csproj +++ b/src/tests/JIT/opt/Multiply/IntMultiply.csproj @@ -11,7 +11,7 @@ true - - + +