AVX512: printing u64 as binary

Author: Wojciech Muła
Added on: 2025-01-18

Problem

Printing 64-bit numbers in binary format can be done nicely with AVX-512 instructions. First, we replicate each byte of the number across a separate 64-bit word of an AVX-512 register, so that every 64-bit word holds eight copies of a single input byte:

    ┌───┬───┬───┬───┬───┬───┬───┬───┐
x = │ h │ g │ f │ e │ d │ c │ b │ a │
    └───┴───┴───┴───┴───┴───┴───┴───┘
      |   |                   |   
                               └──────────────────┐
                            └──────────┐           
         └──────────┐                              
      └──┐                                         
                                                  
         ├─╴┈┈┈╶─┐   ├─╴┈┈┈╶─┐           ├─╴┈┈┈╶─┐   ├─╴┈┈┈╶─┐
                                                      
                                                      
       ┌───┬┈┈┈┬───┬───┬┈┈┈┬───┬┈┈┈┈┈┈┈┬───┬┈┈┈┬───┬───┬┈┈┈┬───┐
zmm0 = │ h │   │ h │ g │   │ g │       │ b │   │ b │ a │   │ a │
       └───┴┈┈┈┴───┴───┴┈┈┈┴───┴┈┈┈┈┈┈┈┴───┴┈┈┈┴───┴───┴┈┈┈┴───┘

       │           │                               │           │
       ╰─ word 7 ╶─╯                               ╰─ word 0 ╶─╯

Then, in each byte of every 64-bit word we isolate the i-th bit, where i is the byte's position within its 64-bit word.

     ┈┈┈┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬┈┈┈
zmm0    │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │
     ┈┈┈┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴┈┈┈
     ┈┈┈┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬┈┈┈
zmm1    │ 10000000 │ 01000000 │ 00100000 │ 00010000 │ 00001000 │ 00000100 │ 00000010 │ 00000001 │
     ┈┈┈┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴┈┈┈

zmm0 & zmm1 =

     ┈┈┈┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬┈┈┈
        │ 00000000 │ 01000000 │ 00000000 │ 00010000 │ 00000000 │ 00000100 │ 00000000 │ 00000000 │
     ┈┈┈┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴┈┈┈

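Finally, we convert non-zero bytes into ASCII '1' (0x31) and zero bytes into ASCII '0' (0x30). This particular operation can be done in two different ways:

1. Clamp each byte to 0 or 1 by taking the unsigned byte-wise minimum with 1, then add ASCII '0' (variant 1).
2. Use a byte-wise test instruction, which performs the AND itself, to obtain a 64-bit mask of the non-zero bytes, then use a masked add to turn '0' into '1' only in the selected bytes (variant 2).

These two methods do not differ in performance; the only difference is that the first one does not use mask registers.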

Variant 1

// Writes the 64 binary digits of `v` into `buf` as ASCII '0'/'1' characters,
// most significant bit first. Exactly 64 bytes are written with one unaligned
// store, so `buf` needs room for 64 bytes; no NUL terminator is appended.
void avx512_u64_to_bin_ver1(uint64_t v, char* buf) {
    // Every 64-bit word starts out as a copy of the input.
    const __m512i broadcasted = _mm512_set1_epi64(v);

    // Shuffle pattern, listed from the lowest 64-bit word to the highest:
    // word k is filled with eight copies of input byte (7 - k), so the most
    // significant byte lands first in memory. The byte shuffle works within
    // 128-bit lanes; each lane holds two copies of the input and all indices
    // are below 8, so every lookup hits the right byte.
    const __m512i byte_selector = _mm512_setr_epi64(
        0x0707070707070707,
        0x0606060606060606,
        0x0505050505050505,
        0x0404040404040404,
        0x0303030303030303,
        0x0202020202020202,
        0x0101010101010101,
        0x0000000000000000
    );
    const __m512i spread = _mm512_shuffle_epi8(broadcasted, byte_selector);

    // Within each 64-bit word, byte j keeps only bit (7 - j) of the replicated
    // input byte (0x80 for the lowest byte, 0x01 for the highest).
    const __m512i bit_selector = _mm512_set1_epi64(0x0102040810204080);
    const __m512i isolated = _mm512_and_si512(bit_selector, spread);

    // Any non-zero byte collapses to 1 (unsigned minimum against 1); adding
    // '0' then yields the ASCII digit.
    const __m512i one = _mm512_set1_epi8(1);
    const __m512i ascii_zero = _mm512_set1_epi8('0');
    const __m512i zero_or_one = _mm512_min_epu8(isolated, one);
    const __m512i digits = _mm512_add_epi8(zero_or_one, ascii_zero);

    _mm512_storeu_si512((__m512i*)buf, digits);
}

Assembly output from GCC 14.1:

vpbroadcastq    %rdi, %zmm1
mov             $0x1, %eax
vpshufb         0x0(%rip), %zmm1, %zmm1
vpbroadcastb    %eax, %zmm0
mov             $0x30, %eax
vpandd          0x0(%rip), %zmm1, %zmm1
vpminub         %zmm1, %zmm0, %zmm0
vpbroadcastb    %eax, %zmm1
vpaddb          %zmm1, %zmm0, %zmm0
vmovdqu64       %zmm0, (%rsi)

Variant 2

// Writes the 64 binary digits of `v` into `buf` as ASCII '0'/'1' characters,
// most significant bit first, using a mask register for the final
// conversion. Exactly 64 bytes are written with one unaligned store, so
// `buf` needs room for 64 bytes; no NUL terminator is appended.
void avx512_u64_to_bin_ver2(uint64_t v, char* buf) {
    // Every 64-bit word starts out as a copy of the input.
    const __m512i broadcasted = _mm512_set1_epi64(v);

    // Shuffle pattern, listed from the lowest 64-bit word to the highest:
    // word k is filled with eight copies of input byte (7 - k), so the most
    // significant byte lands first in memory. The byte shuffle works within
    // 128-bit lanes; each lane holds two copies of the input and all indices
    // are below 8, so every lookup hits the right byte.
    const __m512i byte_selector = _mm512_setr_epi64(
        0x0707070707070707,
        0x0606060606060606,
        0x0505050505050505,
        0x0404040404040404,
        0x0303030303030303,
        0x0202020202020202,
        0x0101010101010101,
        0x0000000000000000
    );
    const __m512i spread = _mm512_shuffle_epi8(broadcasted, byte_selector);

    // Within each 64-bit word, byte j is tested against bit (7 - j) only
    // (0x80 for the lowest byte, 0x01 for the highest). The test ANDs both
    // operands and sets one mask bit per byte whose result is non-zero.
    const __m512i bit_selector = _mm512_set1_epi64(0x0102040810204080);
    const __mmask64 is_set = _mm512_test_epi8_mask(bit_selector, spread);

    // Start from all '0' characters; only the bytes selected by the mask
    // receive '0' + 1 == '1', the others keep the pass-through '0'.
    const __m512i ascii_zero = _mm512_set1_epi8('0');
    const __m512i one = _mm512_set1_epi8(1);
    const __m512i digits = _mm512_mask_add_epi8(ascii_zero, is_set, ascii_zero, one);

    _mm512_storeu_si512((__m512i*)buf, digits);
}

Assembly output from GCC 14.1:

vmovdqa64       0x0(%rip), %zmm1
vpbroadcastq    %rdi, %zmm0
mov             $0x30, %eax
vpshufb         0x0(%rip), %zmm0, %zmm0
vptestmb        %zmm0, %zmm1, %k1
vpbroadcastb    %eax, %zmm0
mov             $0x1, %eax
vpbroadcastb    %eax, %zmm1
vmovdqa64       %zmm0, %zmm2
vpaddb          %zmm1, %zmm0, %zmm2{%k1}
vmovdqu64       %zmm2, (%rsi)

Source code

A sample implementation is available on GitHub.