Playing With The .NET JIT Part 3

Integrating unmanaged code into the managed platform is one of the problem areas of the managed world. Often the exact costs of calling into unmanaged code are unknown. This obviously leads to some confusion as to when it is appropriate to mix in unmanaged code to help improve the performance of our application.

PInvoke

There are three ways to access an unmanaged function from managed code. The first is to use the PInvoke capabilities of the language. In C# this is done by declaring a method with external linkage and indicating (using the DllImportAttribute attribute) in which DLL the method may be found. The second way would be to obtain a pointer to the function (using LoadLibrary/GetProcAddress/FreeLibrary), and marshal that pointer to a managed delegate using Marshal.GetDelegateForFunctionPointer. Finally you can write an unmanaged wrapper around the function, using C++/CLI, and invoke that managed method, which will in turn call the unmanaged method.

For the purposes of this post we'll be using two mathematical sample functions. The first being the standard inner product on R4 (aka the dot product, over four components), and the second will be a 4x4 matrix multiplication. We'll be comparing two implementations, the first will be a trivial managed implementation of them, and the second will be a SSE2 optimized version. Thanks must be given to Arseny Kapoulkine for the SSE2 version of the matrix multiplication.

First up are the implementations of the inner product functions, it should be noted that I'll be doing the profiling in x64 mode, however the results are similar (albeit a bit slower) for x86.

/// <summary>
/// Computes the four-component dot product of <paramref name="v1"/> and
/// <paramref name="v2"/>. Both arrays must hold at least four elements.
/// </summary>
public static float InnerProduct2(float[] v1, float[] v2) {  
    // Seed with the first term, then accumulate left-to-right so the
    // floating-point association matches the expanded expression exactly.
    float sum = v1[0] * v2[0];
    for (int i = 1; i < 4; ++i)
        sum += v1[i] * v2[i];
    return sum;
}
// SSE2 dot product of two 4-float vectors. Unaligned loads, so callers
// need not align their arrays.
float __declspec(dllexport) inner_product(float const* v1, float const* v2) {  
    float dot;
    // Element-wise products { p0, p1, p2, p3 }.
    __m128 prod = _mm_mul_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2));
    // First horizontal step: add the pair-swapped vector, giving
    // { p0+p2, p1+p3, p2+p0, p3+p1 }.
    __m128 swapped = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2));
    __m128 pairs = _mm_add_ps(prod, swapped);
    // Second step: add the fully reversed vector; lane 0 now holds the sum.
    __m128 reversed = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 1, 2, 3));
    _mm_store_ss(&dot, _mm_add_ps(pairs, reversed));
    return dot;
}

Things that should be noted about these implementations is that they both operate solely on arrays of floats. InnerProduct2 is inlineable since it's only 23 bytes long and takes reference types as parameters. The unmanaged inner product could also be implemented using the SSE3 haddps instruction, however I decided to keep it as processor neutral as possible by using only SSE2 instructions.

The implementations of the matrix multiplication vary quite significantly as well, the managed version is the trivial implementation, but its expansion into machine code is quite long. The unmanaged version is an SSE2 optimized one, the raw performance boost of using it is quite significant.

/// <summary>
/// Row-major 4x4 matrix product: o = m1 * m2. All three arrays must hold
/// 16 floats; o is fully overwritten.
/// </summary>
public static void MatrixMul2(float[] m1, float[] m2, float[] o) {  
    // row walks the output/m1 row offsets (0, 4, 8, 12); col selects the
    // output column. Terms are summed left-to-right, matching the fully
    // unrolled expansion exactly.
    for (int row = 0; row < 16; row += 4) {
        for (int col = 0; col < 4; ++col) {
            o[row + col] =
                m1[row]     * m2[col]      +
                m1[row + 1] * m2[col + 4]  +
                m1[row + 2] * m2[col + 8]  +
                m1[row + 3] * m2[col + 12];
        }
    }
}
// Row-major 4x4 matrix product: out = m1 * m2, using only SSE2 (no SSE3
// horizontal adds). The four rows of m2 stay resident in registers for the
// whole multiply; all loads/stores are unaligned.
void __declspec(dllexport) matrix_mul(float const* m1, float const* m2, float* out)  
{
    __m128 b0 = _mm_loadu_ps(m2);
    __m128 b1 = _mm_loadu_ps(m2 + 4);
    __m128 b2 = _mm_loadu_ps(m2 + 8);
    __m128 b3 = _mm_loadu_ps(m2 + 12);

    for (int i = 0; i < 16; i += 4) {
        // Broadcast each element of this row of m1 across a register,
        // scale the matching row of m2, and sum the four partial rows.
        __m128 row = _mm_loadu_ps(m1 + i);
        __m128 acc = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(0, 0, 0, 0)), b0),
                     _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(1, 1, 1, 1)), b1),
                     _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(2, 2, 2, 2)), b2),
                                _mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(3, 3, 3, 3)), b3))));
        _mm_storeu_ps(out + i, acc);
    }
}

It is trivially obvious that the managed version of the matrix multiplication cannot be inlined. The overhead of the function call is really the least of your worries though (it is the smallest cost of the entire method really). The unmanaged version is a nicely optimized SSE2 method, and requires only a minimal number of loads and stores from main memory, and the loads and stores are reasonably cache friendly (P4 will prefetch 128 bytes of memory).

PInvoke

Of course, the question is, how do these perform against each other when called from a managed application. The profiling setup is quite simple. It simply runs the methods against a set of matricies and vectors (randomly generated) a million times. It repeats those tests several more times (100 in this case), and averages the results. Full optimizations were turned on for both the unmanaged and managed tests. The Internal calls are made from a managed class that directly calls to the unmanaged methods. Both the managed wrapper and the unmanaged methods are hosted in the same DLL (source for the full DLL at the end of this entry).

PInvoke MatrixMul : 00:00:15.0203285 Average: 00:00:00.1502032
Delegate MatrixMul: 00:00:13.1004306 Average: 00:00:00.1310043
Managed MatrixMul: 00:00:10.2809715 Average: 00:00:00.1028097
Internal MatrixMul: 00:00:08.8992407 Average: 00:00:00.0889924
PInvoke Inner Product: 00:00:10.6779944 Average: 00:00:00.1067799
Delegate Inner Product: 00:00:09.3359882 Average: 00:00:00.0933598
Managed Inner Product: 00:00:01.3460812 Average: 00:00:00.0134608
Internal Inner Product: 00:00:05.6842336 Average: 00:00:00.0568423

The first thing to note is that the PInvoke calls for both the matrix multiplication and inner product were the slowest. The delegate calls were only slightly faster than the PInvoke calls. As we move into the managed territory we find that the results begin to diverge. The managed matrix multiplication is slower than the internal matrix multiplication, however the managed inner product is several times faster than the internal one.

Part of the reason behind this divergence is a result of the invocation framework. There is a cost to calling unmanaged methods from managed code, as each method must be wrapped to perform operations such as fixing any managed resources, performing marshalling for non-blittable types, and finally calling the actual native method. After returning the method further marshalling of the return type may be required, along with checks on the condition of the stack and exception checks (SEH exceptions are caught and wrapped in the SEHException class). Even the internal calls to the unmanaged method require some amount of this, although the actual marshalling requirements are avoided, as are some of the other costs. The result is that the costs add up over time, and in the case of the inner product the additional cost overrode the complexity requirements of the method (which is fairly trivial). The case, on the average, is different for the matrix multiplication. The additional costs of the call do not add a significant amount of overhead compared to that of the body of the method, which executes faster than that of the managed matrix multiplication due to vectorization.

Performing further testing with counts of 50, 25, and 5 reveals similar results, however the managed matrix multiplication begins to approach the performance of the internal one. However, even at a count of 1 (that's one million matrix multiplications), the internal matrix multiplication is faster than the managed version.

Count = 50
PInvoke MatrixMul : 00:00:07.4730356 Average: 00:00:00.1494607
Delegate MatrixMul: 00:00:06.4519274 Average: 00:00:00.1290385
Managed MatrixMul: 00:00:05.1662482 Average: 00:00:00.1033249
Internal MatrixMul: 00:00:04.3371530 Average: 00:00:00.0867430
PInvoke Inner Product: 00:00:05.3891030 Average: 00:00:00.1077820
Delegate Inner Product: 00:00:04.7625597 Average: 00:00:00.0952511
Managed Inner Product: 00:00:00.6791549 Average: 00:00:00.0135830
Internal Inner Product: 00:00:02.6719175 Average: 00:00:00.0534383

Count = 25
PInvoke MatrixMul : 00:00:03.7432932 Average: 00:00:00.1497317
Delegate MatrixMul: 00:00:03.2074834 Average: 00:00:00.1282993
Managed MatrixMul: 00:00:02.6200096 Average: 00:00:00.1048003
Internal MatrixMul: 00:00:02.2144342 Average: 00:00:00.0885773
PInvoke Inner Product: 00:00:02.8778559 Average: 00:00:00.1151142
Delegate Inner Product: 00:00:02.0178957 Average: 00:00:00.0807158
Managed Inner Product: 00:00:00.3385675 Average: 00:00:00.0135427
Internal Inner Product: 00:00:01.4391529 Average: 00:00:00.0575661

Count = 5
PInvoke MatrixMul : 00:00:00.7642981 Average: 00:00:00.1528596
Delegate MatrixMul: 00:00:00.6407667 Average: 00:00:00.1281533
Managed MatrixMul: 00:00:00.5231416 Average: 00:00:00.1046283
Internal MatrixMul: 00:00:00.4458765 Average: 00:00:00.0891753
PInvoke Inner Product: 00:00:00.5702666 Average: 00:00:00.1140533
Delegate Inner Product: 00:00:00.4122217 Average: 00:00:00.0824443
Managed Inner Product: 00:00:00.0683842 Average: 00:00:00.0136768
Internal Inner Product: 00:00:00.2899304 Average: 00:00:00.0579860

Count = 1
PInvoke MatrixMul : 00:00:00.1476958 Average: 00:00:00.1476958
Delegate MatrixMul: 00:00:00.1337818 Average: 00:00:00.1337818
Managed MatrixMul: 00:00:00.1155993 Average: 00:00:00.1155993
Internal MatrixMul: 00:00:00.0919538 Average: 00:00:00.0919538
PInvoke Inner Product: 00:00:00.1155769 Average: 00:00:00.1155769
Delegate Inner Product: 00:00:00.0906768 Average: 00:00:00.0906768
Managed Inner Product: 00:00:00.0155480 Average: 00:00:00.0155480
Internal Inner Product: 00:00:00.0653527 Average: 00:00:00.0653527

Conclusion

Clearly we should reserve unmanaged operations for longer running methods where the cost of the managed wrappers is negligible compared to the cost of the method. Even heavily optimized methods cost significantly in the wrapping code, and so trivial optimizations are easily overshadowed by that cost. It is best to use unmanaged operations wrapped in a C++/CLI wrapper (and preferably the wrapper will be part of the library that the operations are in). Next time we'll look at the assembly produced by the JIT for these methods under varying circumstances.

Source for Managed DLL:

#pragma managed(push, off)

extern "C" {  
    // SSE2 dot product of two 4-float vectors (unaligned loads).
    float __declspec(dllexport) inner_product(float const* v1, float const* v2) {
        float sum;
        // Element-wise products of the two vectors.
        __m128 p = _mm_mul_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2));
        // Horizontal step one: add the pair-swapped copy onto itself.
        p = _mm_add_ps(p, _mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 0, 3, 2)));
        // Horizontal step two: add the fully reversed copy; lane 0 is the total.
        _mm_store_ss(&sum, _mm_add_ps(p, _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 1, 2, 3))));
        return sum;
    }

    // Row-major 4x4 matrix product (out = m1 * m2) using SSE2 only.
    void __declspec(dllexport) matrix_mul(float const* m1, float const* m2, float* out)
    {
        // Keep the four rows of m2 in registers for the whole multiply.
        __m128 b0 = _mm_loadu_ps(m2);
        __m128 b1 = _mm_loadu_ps(m2 + 4);
        __m128 b2 = _mm_loadu_ps(m2 + 8);
        __m128 b3 = _mm_loadu_ps(m2 + 12);

        for (int i = 0; i < 16; i += 4) {
            // Splat each element of this row of m1, scale the matching row
            // of m2, and sum the four partial rows into the output row.
            __m128 row = _mm_loadu_ps(m1 + i);
            __m128 acc = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(0, 0, 0, 0)), b0),
                         _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(1, 1, 1, 1)), b1),
                         _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(2, 2, 2, 2)), b2),
                                    _mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(3, 3, 3, 3)), b3))));
            _mm_storeu_ps(out + i, acc);
        }
    }
}
#pragma managed(pop)

using namespace System;

namespace ManagedMathLib {  
    // C++/CLI wrapper around the unmanaged SSE2 routines so managed callers
    // can reach them as internal calls (no P/Invoke marshalling stubs).
    // NOTE: the generic arguments below (array<float>^, pin_ptr<float>) were
    // stripped to "array^"/"pin_ptr" in the published listing — almost
    // certainly lost angle brackets from HTML escaping; restored here.
    public ref class ManagedMath {
    public:
        // Raw unmanaged function pointers, exposed so callers can build
        // delegates via Marshal.GetDelegateForFunctionPointer.
        static IntPtr InnerProductPtr = IntPtr(inner_product);
        static IntPtr MatrixMulPtr = IntPtr(matrix_mul);

        // Dot product of two 4-element managed float arrays.
        static float InnerProduct(array<float>^ v1, array<float>^ v2) {
            // Pin the managed arrays so the GC cannot move them while the
            // unmanaged code reads through the raw pointers.
            pin_ptr<float> pv1 = &v1[0];
            pin_ptr<float> pv2 = &v2[0];

            return inner_product(pv1, pv2);
        }

        // 4x4 matrix product, out = m1 * m2; all arrays are 16 row-major floats.
        static void MatrixMul(array<float>^ m1, array<float>^ m2, array<float>^ out) {
            pin_ptr<float> pm1 = &m1[0];
            pin_ptr<float> pm2 = &m2[0];
            pin_ptr<float> outp = &out[0];
            matrix_mul(pm1, pm2, outp);
        }
    };
}