SSE4.1: PHMINPOSUW — insertion sort

Author: Wojciech Muła
Added on:2008-08-03

Unusual application of PHMINPOSUW instruction as key part of insertion sort for 8 element tables. I guess it won't find any practical usage.

Implementation:

typedef uint16_t table[8];

table max[8] = {
    {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
    {0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
    {0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
    {0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000},
    {0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000},
    {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000},
    {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000},
    {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff}
};

void sse4_sort(table T) {
    uint32_t dummy;

    __asm__ volatile (
    "       movdqu (%%eax), %%xmm0          \n"
    "       xor %%ecx, %%ecx                \n"     // i = 0
    "1:                                     \n"
    "       phminposuw %%xmm0, %%xmm1       \n"     // find min, and its index j
    "       movd %%xmm1, %%edx              \n"
    "       movw   %%dx, (%%eax, %%ecx, 2)  \n"     // save min at i-th position
    "                                       \n"
    "       shrl   $16, %%edx               \n"
    "       shll    $4, %%edx               \n"
    "                                       \n"
    "       por  max(%%edx), %%xmm0         \n"     // set max at pisition j
    "                                       \n"
    "       addl    $1, %%ecx               \n"     // i += 1
    "       cmp     $8, %%ecx               \n"
    "       jl      1b                      \n"

    :
    : "a" (T)
    : "ecx", "edx"
    );
}