One of my other hobbies is game development. I remember my family's first PC, a then state-of-the-art IBM PC with an Intel 80386. We had a few games for the machine, courtesy of my father. I remember being fascinated with the DOS version of Rogue (you can still find the binary and source for it here!). I played that and other games a lot growing up, and ever since then I've been really interested in computer game development.

I've been working, off and on, on my own game development projects. Most are entirely for fun and learning, and likely no released game will ever come of them. Some time ago I wrote a simple vector/matrix library, since most game engines are built largely on vector and matrix manipulations. Recently I was curious how the compiler would translate these operations into machine instructions. Mainly I wondered if it would use the MMX/SIMD instructions. These have been widely available since 1996, so surely the compiler technology has had time to catch up?

Consider the following code:
``````
/// Four-component single-precision vector stored as consecutive floats
/// in the order x, y, z, w.
class vector4 {

public:
    /// Creates the zero vector so that no component is ever read uninitialized.
    vector4() : _x(0.0f), _y(0.0f), _z(0.0f), _w(0.0f) {}

    /// Creates a vector from its four components.
    vector4(float x, float y, float z, float w) : _x(x), _y(y), _z(z), _w(w) {}

    /// Stores the 3D cross product vector_1 x vector_2 in this vector.
    ///
    /// Only the x, y and z components of the arguments are read; the w
    /// component of the result is always set to 1.
    ///
    /// Each component is stored as soon as it has been computed, so neither
    /// argument may refer to this object: the later components would then be
    /// computed from already overwritten values.
    void cross_product(const vector4 &vector_1, const vector4 &vector_2)
    {
        _x = (vector_1._y * vector_2._z) - (vector_1._z * vector_2._y);
        _y = (vector_1._z * vector_2._x) - (vector_1._x * vector_2._z);
        _z = (vector_1._x * vector_2._y) - (vector_1._y * vector_2._x);
        _w = 1.0f;
    }

private:
    float _x;
    float _y;
    float _z;
    float _w;
};
``````

Pretty straightforward. The method cross_product will calculate the cross product of the two vectors passed in. Now take a look at the compiled code (using gcc 4.8.1 on Linux). You'll probably note that this includes argument setup, implying a function call. So before you ask, yes, this is compiled without any optimization flags, but I did check with maximum optimization and the result is basically the same (except that the optimized build inlined all the function calls).

``````
400b5e:	55                   	push   %rbp
400b5f:	48 89 e5             	mov    %rsp,%rbp
400b62:	48 89 7d f8          	mov    %rdi,-0x8(%rbp)
400b66:	48 89 75 f0          	mov    %rsi,-0x10(%rbp)
400b6a:	48 89 55 e8          	mov    %rdx,-0x18(%rbp)
400b6e:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
400b72:	f3 0f 10 48 04       	movss  0x4(%rax),%xmm1
400b77:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
400b7b:	f3 0f 10 40 08       	movss  0x8(%rax),%xmm0
400b80:	f3 0f 59 c1          	mulss  %xmm1,%xmm0
400b84:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
400b88:	f3 0f 10 50 08       	movss  0x8(%rax),%xmm2
400b8d:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
400b91:	f3 0f 10 48 04       	movss  0x4(%rax),%xmm1
400b96:	f3 0f 59 ca          	mulss  %xmm2,%xmm1
400b9a:	f3 0f 5c c1          	subss  %xmm1,%xmm0
400b9e:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
400ba2:	f3 0f 11 00          	movss  %xmm0,(%rax)
400ba6:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
400baa:	f3 0f 10 48 08       	movss  0x8(%rax),%xmm1
400baf:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
400bb3:	f3 0f 10 00          	movss  (%rax),%xmm0
400bb7:	f3 0f 59 c1          	mulss  %xmm1,%xmm0
400bbb:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
400bbf:	f3 0f 10 10          	movss  (%rax),%xmm2
400bc3:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
400bc7:	f3 0f 10 48 08       	movss  0x8(%rax),%xmm1
400bcc:	f3 0f 59 ca          	mulss  %xmm2,%xmm1
400bd0:	f3 0f 5c c1          	subss  %xmm1,%xmm0
400bd4:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
400bd8:	f3 0f 11 40 04       	movss  %xmm0,0x4(%rax)
400bdd:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
400be1:	f3 0f 10 08          	movss  (%rax),%xmm1
400be5:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
400be9:	f3 0f 10 40 04       	movss  0x4(%rax),%xmm0
400bee:	f3 0f 59 c1          	mulss  %xmm1,%xmm0
400bf2:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
400bf6:	f3 0f 10 50 04       	movss  0x4(%rax),%xmm2
400bfb:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
400bff:	f3 0f 10 08          	movss  (%rax),%xmm1
400c03:	f3 0f 59 ca          	mulss  %xmm2,%xmm1
400c07:	f3 0f 5c c1          	subss  %xmm1,%xmm0
400c0b:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
400c0f:	f3 0f 11 40 08       	movss  %xmm0,0x8(%rax)
400c14:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
400c18:	8b 05 6e 01 00 00    	mov    0x16e(%rip),%eax        # 400d8c <_IO_stdin_used+0xc>
400c1e:	89 42 0c             	mov    %eax,0xc(%rdx)
400c21:	5d                   	pop    %rbp
400c22:	c3                   	retq
``````

Interesting. It does use the SSE instructions, but only the scalar forms (movss, mulss, subss), not the packed variants. Consider this: floats in C++ are 32-bit values, and the XMM registers are 128 bits wide. A vector contains four 32-bit values. What if you could pack all 4 of those in there and do multiple operations at once? Well, you can, thanks to the mulps operation. The cross product function can be modified like so to force the use of the packed operations, rather than handling them one at a time. (Apologies in advance for the horrible GAS syntax. I prefer Intel syntax but I don't think it's possible to force GCC/GAS to use/accept Intel syntax.) We also want to shuffle the floats around as we perform the operations. They are moved into the xmm registers in series (x, y, z, w) but we need them to "line up" properly before we can execute the multiply and subtraction operations.

``````
// Stores the 3D cross product vector_1 x vector_2 in this vector using packed
// SSE instructions, then sets the w component to 1.
//
// Both arguments and this object must be 16-byte aligned: movaps faults on an
// unaligned memory operand.
void cross_product(vector4 &vector_1, vector4 &vector_2)
{
    asm volatile(
        "movq %0, %%rsi;"
        "movq %1, %%rdi;"
        // Load both operands (lanes x, y, z, w) and keep a second copy of each.
        "movaps (%%rsi), %%xmm0;"
        "movaps (%%rdi), %%xmm1;"
        "movaps %%xmm0, %%xmm2;"
        "movaps %%xmm1, %%xmm3;"
        // 0xC9 reorders the lanes to (y, z, x, w), 0xD2 to (z, x, y, w).
        // xmm0 = v1.(y, z, x) * v2.(z, x, y)
        "shufps $0xC9, %%xmm0, %%xmm0;"
        "shufps $0xD2, %%xmm1, %%xmm1;"
        "mulps %%xmm1, %%xmm0;"
        // xmm2 = v1.(z, x, y) * v2.(y, z, x)
        "shufps $0xD2, %%xmm2, %%xmm2;"
        "shufps $0xC9, %%xmm3, %%xmm3;"
        "mulps %%xmm3, %%xmm2;"
        // xmm0 = xmm0 - xmm2, i.e. the cross product in lanes x, y, z.
        "subps %%xmm2, %%xmm0;"
        // Write all four lanes to *this; lane w is replaced right after the asm.
        "movq %2, %%rsi;"
        "movaps %%xmm0, (%%rsi);"
        : /* none */
        : "r" (&vector_1), "r" (&vector_2), "r" (this)
        // The asm overwrites xmm0-xmm3 and writes memory through a pointer the
        // compiler cannot see, so both must be declared; otherwise an inlined,
        // optimized build may keep live values in those registers or move the
        // store to _w ahead of the asm.
        : "%rsi", "%rdi", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"
    );

    _w = 1.0;
}

``````

The generated code has the expected setup, but is clearly less intensive computationally than the GCC produced original.

``````
400b02:	55                   	push   %rbp
400b03:	48 89 e5             	mov    %rsp,%rbp
400b06:	48 89 7d f8          	mov    %rdi,-0x8(%rbp)
400b0a:	48 89 75 f0          	mov    %rsi,-0x10(%rbp)
400b0e:	48 89 55 e8          	mov    %rdx,-0x18(%rbp)
400b12:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
400b16:	48 8b 55 e8          	mov    -0x18(%rbp),%rdx
400b1a:	48 8b 4d f8          	mov    -0x8(%rbp),%rcx
400b1e:	48 89 c6             	mov    %rax,%rsi
400b21:	48 89 d7             	mov    %rdx,%rdi
400b24:	0f 28 06             	movaps (%rsi),%xmm0
400b27:	0f 28 0f             	movaps (%rdi),%xmm1
400b2a:	0f 28 d0             	movaps %xmm0,%xmm2
400b2d:	0f 28 d9             	movaps %xmm1,%xmm3
400b30:	0f c6 c0 c9          	shufps $0xc9,%xmm0,%xmm0
400b34:	0f c6 c9 d2          	shufps $0xd2,%xmm1,%xmm1
400b38:	0f 59 c1             	mulps  %xmm1,%xmm0
400b3b:	0f c6 d2 d2          	shufps $0xd2,%xmm2,%xmm2
400b3f:	0f c6 db c9          	shufps $0xc9,%xmm3,%xmm3
400b43:	0f 59 d3             	mulps  %xmm3,%xmm2
400b46:	0f 5c c2             	subps  %xmm2,%xmm0
400b49:	48 89 ce             	mov    %rcx,%rsi
400b4c:	0f 29 06             	movaps %xmm0,(%rsi)
400b4f:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
400b53:	8b 05 33 02 00 00    	mov    0x233(%rip),%eax        # 400d8c <_IO_stdin_used+0xc>
400b59:	89 42 0c             	mov    %eax,0xc(%rdx)
400b5c:	5d                   	pop    %rbp
400b5d:	c3                   	retq
``````

Even with maximum optimizations, GCC will still leave this code as is (it might inline it), so you can force better behavior, when necessary.