Fast cube root, square root, and reciprocal for x86/SSE CPUs.

References : Posted by kaleja[AT]estarcion[DOT]com
Notes :
All of these methods use SSE instructions or bit twiddling tricks to get a rough approximation to cube root, square root, or reciprocal, which is then refined with one or more Newton-Raphson approximation steps.

Each is named to indicate its approximate level of accuracy and a comment describes its performance relative to the conventional versions.


Code :
// These functions approximate reciprocal, square root, and
// cube root to varying degrees of precision substantially
// faster than the straightforward methods 1/x, sqrtf(x),
// and powf( x, 1.0f/3.0f ). All require SSE-enabled CPU & OS.
// Unlike the powf() solution, the cube roots also correctly
// handle negative inputs.  

#define REALLY_INLINE __forceinline

// ~34 clocks on Pentium M vs. ~360 for powf
REALLY_INLINE float cuberoot_sse_8bits( float x )
{
    float z;
    static const float three = 3.0f;
    _asm
    {
        mov        eax, x                // x as bits
        movss    xmm2, x                // x2: x
        movss    xmm1, three            // x1: 3
        // Magic floating point cube root done with integer math.
        // The exponent is divided by three in such a way that
        // remainder bits get shoved into the top of the normalized
        // mantissa.
        mov        ecx, eax            // copy of x
        and        eax, 0x7FFFFFFF        // exponent & mantissa of x in biased-127
        sub     eax, 0x3F800000        // exponent & mantissa of x in 2's comp
        sar     eax, 10                //
        imul    eax, 341            // 341/1024 ~= .333
        add        eax, 0x3F800000        // back to biased-127
        and     eax, 0x7FFFFFFF        // remask
        and        ecx, 0x80000000        // original sign and mantissa
        or      eax, ecx            // masked new exponent, old sign and mantissa
        mov        z, eax                //
        
        // use SSE to refine the first approximation
        movss    xmm0, z                ;// x0: z
        movss    xmm3, xmm0            ;// x3: z
        mulss    xmm3, xmm0            ;// x3: z*z
        movss    xmm4, xmm3            ;// x4: z*z
        mulss   xmm3, xmm1            ;// x3: 3*z*z
        rcpss    xmm3, xmm3            ;// x3: ~ 1/(3*z*z)
        mulss   xmm4, xmm0            ;// x4: z*z*z
        subss    xmm4, xmm2            ;// x4: z*z*z-x
        mulss    xmm4, xmm3            ;// x4: (z*z*z-x) / (3*z*z)
        subss    xmm0, xmm4            ;// x0: z' accurate to within about 0.3%
        movss    z, xmm0
    }

    return z;
}

// ~60 clocks on Pentium M vs. ~360 for powf
REALLY_INLINE float cuberoot_sse_16bits( float x )
{
    float z;
    static const float three = 3.0f;
    _asm
    {
        mov        eax, x                // x as bits
        movss    xmm2, x                // x2: x
        movss    xmm1, three            // x1: 3
        // Magic floating point cube root done with integer math.
        // The exponent is divided by three in such a way that
        // remainder bits get shoved into the top of the normalized
        // mantissa.
        mov        ecx, eax            // copy of x
        and        eax, 0x7FFFFFFF        // exponent & mantissa of x in biased-127
        sub     eax, 0x3F800000        // exponent & mantissa of x in 2's comp
        sar     eax, 10                //
        imul    eax, 341            // 341/1024 ~= .333
        add        eax, 0x3F800000        // back to biased-127
        and     eax, 0x7FFFFFFF        // remask
        and        ecx, 0x80000000        // original sign and mantissa
        or      eax, ecx            // masked new exponent, old sign and mantissa
        mov        z, eax                //
        
        // use SSE to refine the first approximation
        movss    xmm0, z                ;// x0: z
        movss    xmm3, xmm0            ;// x3: z
        mulss    xmm3, xmm0            ;// x3: z*z
        movss    xmm4, xmm3            ;// x4: z*z
        mulss   xmm3, xmm1            ;// x3: 3*z*z
        rcpss    xmm3, xmm3            ;// x3: ~ 1/(3*z*z)
        mulss   xmm4, xmm0            ;// x4: z*z*z
        subss    xmm4, xmm2            ;// x4: z*z*z-x
        mulss    xmm4, xmm3            ;// x4: (z*z*z-x) / (3*z*z)
        subss    xmm0, xmm4            ;// x0: z' accurate to within about 0.3%

        movss    xmm3, xmm0            ;// x3: z
        mulss    xmm3, xmm0            ;// x3: z*z
        movss    xmm4, xmm3            ;// x4: z*z
        mulss   xmm3, xmm1            ;// x3: 3*z*z
        rcpss    xmm3, xmm3            ;// x3: ~ 1/(3*z*z)
        mulss   xmm4, xmm0            ;// x4: z*z*z
        subss    xmm4, xmm2            ;// x4: z*z*z-x
        mulss    xmm4, xmm3            ;// x4: (z*z*z-x) / (3*z*z)
        subss    xmm0, xmm4            ;// x0: z'' accurate to within about 0.001%

        movss    z, xmm0
    }
    
    return z;
}

// ~77 clocks on Pentium M vs. ~360 for powf
REALLY_INLINE float cuberoot_sse_22bits( float x )
{
    float z;
    static const float three = 3.0f;
    _asm
    {
        mov        eax, x                // x as bits
        movss    xmm2, x                // x2: x
        movss    xmm1, three            // x1: 3
        // Magic floating point cube root done with integer math.
        // The exponent is divided by three in such a way that
        // remainder bits get shoved into the top of the normalized
        // mantissa.
        mov        ecx, eax            // copy of x
        and        eax, 0x7FFFFFFF        // exponent & mantissa of x in biased-127
        sub     eax, 0x3F800000        // exponent & mantissa of x in 2's comp
        sar     eax, 10                //
        imul    eax, 341            // 341/1024 ~= .333
        add        eax, 0x3F800000        // back to biased-127
        and     eax, 0x7FFFFFFF        // remask
        and        ecx, 0x80000000        // original sign and mantissa
        or      eax, ecx            // masked new exponent, old sign and mantissa
        mov        z, eax                //
        
        // use SSE to refine the first approximation
        movss    xmm0, z                // x0: z
        movss    xmm3, xmm0            // x3: z
        mulss    xmm3, xmm0            // x3: z*z
        movss    xmm4, xmm3            // x4: z*z
        mulss   xmm3, xmm1            // x3: 3*z*z
        rcpss    xmm3, xmm3            // x3: ~ 1/(3*z*z)
        mulss   xmm4, xmm0            // x4: z*z*z
        subss    xmm4, xmm2            // x4: z*z*z-x
        mulss    xmm4, xmm3            // x4: (z*z*z-x) / (3*z*z)
        subss    xmm0, xmm4            // x0: z' accurate to within about 0.3%
        
        movss    xmm3, xmm0            // x3: z
        mulss    xmm3, xmm0            // x3: z*z
        movss    xmm4, xmm3            // x4: z*z
        mulss   xmm3, xmm1            // x3: 3*z*z
        rcpss    xmm3, xmm3            // x3: ~ 1/(3*z*z)
        mulss   xmm4, xmm0            // x4: z*z*z
        subss    xmm4, xmm2            // x4: z*z*z-x
        mulss    xmm4, xmm3            // x4: (z*z*z-x) / (3*z*z)
        subss    xmm0, xmm4            // x0: z'' accurate to within about 0.001%

        movss    xmm3, xmm0            // x3: z
        mulss    xmm3, xmm0            // x3: z*z
        movss    xmm4, xmm3            // x4: z*z
        mulss   xmm3, xmm1            // x3: 3*z*z
        rcpss    xmm3, xmm3            // x3: ~ 1/(3*z*z)
        mulss   xmm4, xmm0            // x4: z*z*z
        subss    xmm4, xmm2            // x4: z*z*z-x
        mulss    xmm4, xmm3            // x4: (z*z*z-x) / (3*z*z)
        subss    xmm0, xmm4            // x0: z''' accurate to within about 0.000012%
        
        movss    z, xmm0
    }
    
    return z;
}

// ~6 clocks on Pentium M vs. ~24 for single precision sqrtf
REALLY_INLINE float squareroot_sse_11bits( float x )
{
    float z;
    _asm
    {
        rsqrtss xmm0, x
        rcpss    xmm0, xmm0
        movss    z, xmm0            // z ~= sqrt(x) to 0.038%
    }
    return z;
}

// ~19 clocks on Pentium M vs. ~24 for single precision sqrtf
REALLY_INLINE float squareroot_sse_22bits( float x )
{
    static float half = 0.5f;
    float z;
    _asm
    {
        movss    xmm1, x            // x1: x
        rsqrtss xmm2, xmm1        // x2: ~1/sqrt(x) = 1/z
        rcpss    xmm0, xmm2        // x0: z == ~sqrt(x) to 0.05%
        
        movss    xmm4, xmm0        // x4: z
        movss    xmm3, half
        mulss    xmm4, xmm4        // x4: z*z
        mulss    xmm2, xmm3        // x2: 1 / 2z
        subss    xmm4, xmm1        // x4: z*z-x
        mulss    xmm4, xmm2        // x4: (z*z-x)/2z
        subss    xmm0, xmm4        // x0: z' to 0.000015%
        
        movss    z, xmm0          
    }
    return z;
}

// ~12 clocks on Pentium M vs. ~16 for single precision divide
REALLY_INLINE float reciprocal_sse_22bits( float x )
{
    float z;
    _asm
    {
        rcpss    xmm0, x            // x0: z ~= 1/x
        movss    xmm2, x            // x2: x
        movss    xmm1, xmm0        // x1: z ~= 1/x
        addss    xmm0, xmm0        // x0: 2z
        mulss    xmm1, xmm1        // x1: z^2
        mulss    xmm1, xmm2        // x1: xz^2
        subss    xmm0, xmm1        // x0: z' ~= 1/x to 0.000012%

        movss    z, xmm0          
    }
    return z;
}

Comments
from : Christian[AT]savioursofsoul[DOT]de
comment : Good job!

from : martini[AT]slightlyintoxicated[DOT]com
comment : Dude, what's the freakin' point of using SSE for scalar functions?

from : kaleja[AT]estarcion[DOT]com
comment : The freakin' point is that it's faster than x87, and sometimes you can't do 4 at once. Also, the function signatures (float func(float)) are well established for the scalar versions, but different people have different standards for the vector versions (float* vs __m128& or whatever). Converting from scalar to vector is trivial for the square root and reciprocal examples, and I'm not going to take the time to investigate a 2-way MMX solution to the first approximation of cube root, but you can feel free.

from : nobody[AT]nowhere[DOT]com
comment : These are terrific. Wish I had seen them sooner.

from :
comment : order cheap soma ,cheap soma cheap soma