| Author: | Wojciech Muła | 
|---|---|
| Added on: | 2025-01-18 | 
Printing 64-bit numbers in binary format can be done nicely with AVX-512 instructions. First, we replicate each byte of the number into all eight bytes of a separate 64-bit word of an AVX-512 register:
    ┌───┬───┬───┬───┬───┬───┬───┬───┐
x = │ h │ g │ f │ e │ d │ c │ b │ a │
    └───┴───┴───┴───┴───┴───┴───┴───┘
      |   |                   |   │
      │   │                   │   └──────────────────┐
      │   │                   └──────────┐           │
      │   └──────────┐                   │           │
      └──┐           │                   │           │
         │           │                   │           │
         ├─╴┈┈┈╶─┐   ├─╴┈┈┈╶─┐           ├─╴┈┈┈╶─┐   ├─╴┈┈┈╶─┐
         │       │   │       │           │       │   │       │
         ▼       ▼   ▼       ▼           ▼       ▼   ▼       ▼
       ┌───┬┈┈┈┬───┬───┬┈┈┈┬───┬┈┈┈┈┈┈┈┬───┬┈┈┈┬───┬───┬┈┈┈┬───┐
zmm0 = │ h │   │ h │ g │   │ g │       │ b │   │ b │ a │   │ a │
       └───┴┈┈┈┴───┴───┴┈┈┈┴───┴┈┈┈┈┈┈┈┴───┴┈┈┈┴───┴───┴┈┈┈┴───┘
       │           │                               │           │
       ╰─ word 7 ╶─╯                               ╰─ word 0 ╶─╯

Then, in each byte of every 64-bit word we isolate the i-th bit, where i is the byte's position within its 64-bit word.
     ┈┈┈┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬┈┈┈
zmm0    │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │
     ┈┈┈┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴┈┈┈
     ┈┈┈┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬┈┈┈
zmm1    │ 10000000 │ 01000000 │ 00100000 │ 00010000 │ 00001000 │ 00000100 │ 00000010 │ 00000001 │
     ┈┈┈┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴┈┈┈
zmm0 & zmm1 =
     ┈┈┈┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬┈┈┈
        │ 00000000 │ 01000000 │ 00000000 │ 00010000 │ 00000000 │ 00000100 │ 00000000 │ 00000000 │
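     ┈┈┈┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴┈┈┈

Finally, we convert non-zero bytes into ASCII '1' (0x31) and zero bytes into ASCII '0' (0x30). This particular operation can be done in two different ways:

1. clamp every byte to 0 or 1 with an unsigned byte-wise minimum against 1, then add ASCII '0' to each byte (`avx512_u64_to_bin_ver1`);
2. test the selected bits directly into a 64-bit mask register, then use a masked add that turns '0' into '1' only in the bytes whose bit was set (`avx512_u64_to_bin_ver2`).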
The two methods do not differ in performance; the only difference is that the first one does not use mask registers.
/*
 * Writes the 64 binary digits of `v` into `buf` as ASCII '0'/'1' characters,
 * most significant bit first.
 *
 * Exactly 64 bytes are stored with an unaligned store and no terminating NUL
 * is written, so `buf` must point to at least 64 writable bytes.
 *
 * This variant works on vector registers only (no mask registers). The
 * byte-granular intrinsics used here belong to the AVX-512BW extension.
 */
void avx512_u64_to_bin_ver1(uint64_t v, char* buf) {
    const __m512i one        = _mm512_set1_epi8(1);
    const __m512i ascii_zero = _mm512_set1_epi8('0');

    // Shuffle pattern, listed from the lowest 64-bit word upwards: word 0
    // (stored first) replicates byte 7 of the input, word 7 replicates byte 0.
    const __m512i byte_select = _mm512_setr_epi64(
        0x0707070707070707,
        0x0606060606060606,
        0x0505050505050505,
        0x0404040404040404,
        0x0303030303030303,
        0x0202020202020202,
        0x0101010101010101,
        0x0000000000000000
    );

    // Within each 64-bit word, byte j keeps only bit (7 - j).
    const __m512i bit_select = _mm512_set1_epi64(0x0102040810204080);

    // 1. every 64-bit word holds a full copy of the input
    const __m512i copies = _mm512_set1_epi64(v);

    // 2. fill each 64-bit word with a single input byte; the shuffle works
    //    per 128-bit lane, and every lane contains the whole input twice,
    //    so indices 0..7 reach all of its bytes
    const __m512i spread = _mm512_shuffle_epi8(copies, byte_select);

    // 3. isolate one bit per byte
    const __m512i isolated = _mm512_and_si512(spread, bit_select);

    // 4. collapse any non-zero byte to 1, leave zero bytes as 0
    const __m512i zero_or_one = _mm512_min_epu8(isolated, one);

    // 5. turn 0/1 into ASCII '0' (0x30) / '1' (0x31)
    const __m512i digits = _mm512_add_epi8(zero_or_one, ascii_zero);

    _mm512_storeu_si512(buf, digits);
}
Assembly output from GCC 14.1:
# Compiler output for avx512_u64_to_bin_ver1 (AT&T syntax).
# NOTE(review): %rdi/%rsi are taken to be the arguments v/buf; the 0x0(%rip)
# operands look like unrelocated references to the two 64-byte constants.
# step 1: copy the 64-bit input into every 64-bit word of zmm1
vpbroadcastq %rdi, %zmm1
# eax = 1, the clamp value used in step 4
mov $0x1, %eax
# step 2: byte shuffle with a pattern read from memory (presumably `populate`)
vpshufb 0x0(%rip), %zmm1, %zmm1
# zmm0 = 1 in every byte
vpbroadcastb %eax, %zmm0
# eax = 0x30, ASCII '0'
mov $0x30, %eax
# step 3: AND with a constant read from memory (presumably the bit mask)
vpandd 0x0(%rip), %zmm1, %zmm1
# step 4: unsigned byte minimum against 1 -> each byte becomes 0 or 1
vpminub %zmm1, %zmm0, %zmm0
# zmm1 = '0' in every byte
vpbroadcastb %eax, %zmm1
# step 5: add '0' to every byte
vpaddb %zmm1, %zmm0, %zmm0
# unaligned 64-byte store to the address in rsi
vmovdqu64 %zmm0, (%rsi)
/*
 * Writes the 64 binary digits of `v` into `buf` as ASCII '0'/'1' characters,
 * most significant bit first.
 *
 * Exactly 64 bytes are stored with an unaligned store and no terminating NUL
 * is written, so `buf` must point to at least 64 writable bytes.
 *
 * This variant tests the bits into a 64-bit mask register and produces the
 * digits with a single masked add. The byte-granular intrinsics used here
 * belong to the AVX-512BW extension.
 */
void avx512_u64_to_bin_ver2(uint64_t v, char* buf) {
    const __m512i ascii_zero = _mm512_set1_epi8('0');
    const __m512i one        = _mm512_set1_epi8(1);

    // Shuffle pattern, listed from the lowest 64-bit word upwards: word 0
    // (stored first) replicates byte 7 of the input, word 7 replicates byte 0.
    const __m512i byte_select = _mm512_setr_epi64(
        0x0707070707070707,
        0x0606060606060606,
        0x0505050505050505,
        0x0404040404040404,
        0x0303030303030303,
        0x0202020202020202,
        0x0101010101010101,
        0x0000000000000000
    );

    // Within each 64-bit word, byte j selects bit (7 - j).
    const __m512i bit_select = _mm512_set1_epi64(0x0102040810204080);

    // 1. every 64-bit word holds a full copy of the input
    const __m512i copies = _mm512_set1_epi64(v);

    // 2. fill each 64-bit word with a single input byte; the shuffle works
    //    per 128-bit lane, and every lane contains the whole input twice,
    //    so indices 0..7 reach all of its bytes
    const __m512i spread = _mm512_shuffle_epi8(copies, byte_select);

    // 3. one mask bit per output character: set when the selected bit is 1
    const __mmask64 bit_is_set = _mm512_test_epi8_mask(spread, bit_select);

    // 4. start from '0' everywhere; where the mask is set, store '0' + 1,
    //    i.e. ASCII '1' (0x31)
    const __m512i digits =
        _mm512_mask_add_epi8(ascii_zero, bit_is_set, ascii_zero, one);

    _mm512_storeu_si512(buf, digits);
}
Assembly output from GCC 14.1:
# Compiler output for avx512_u64_to_bin_ver2 (AT&T syntax).
# NOTE(review): %rdi/%rsi are taken to be the arguments v/buf; the 0x0(%rip)
# operands look like unrelocated references to the two 64-byte constants.
# load a 64-byte constant from memory (presumably the `bits` pattern)
vmovdqa64       0x0(%rip), %zmm1
# step 1: copy the 64-bit input into every 64-bit word of zmm0
vpbroadcastq    %rdi, %zmm0
# eax = 0x30, ASCII '0'
mov             $0x30, %eax
# step 2: byte shuffle with a pattern read from memory (presumably `populate`)
vpshufb         0x0(%rip), %zmm0, %zmm0
# step 3: k1 gets one bit per byte, set where (zmm0 AND zmm1) is non-zero
vptestmb        %zmm0, %zmm1, %k1
# zmm0 = '0' in every byte
vpbroadcastb    %eax, %zmm0
# eax = 1
mov             $0x1, %eax
# zmm1 = 1 in every byte
vpbroadcastb    %eax, %zmm1
# zmm2 = copy of the '0' vector; bytes not selected by k1 keep this value
vmovdqa64       %zmm0, %zmm2
# step 4: masked add, bytes selected by k1 become '0' + 1 = '1'
vpaddb          %zmm1, %zmm0, %zmm2{%k1}
# unaligned 64-byte store to the address in rsi
vmovdqu64       %zmm2, (%rsi)
Sample implementation is available at GitHub.