Author: | Wojciech Muła |
---|---|
Added on: | 2025-01-18 |
Printing 64-bit numbers in binary format can be done nicely with AVX-512 instructions. First, we replicate each byte of the number across a separate 64-bit word of an AVX-512 register:
    ┌───┬───┬───┬───┬───┬───┬───┬───┐
x = │ h │ g │ f │ e │ d │ c │ b │ a │
    └───┴───┴───┴───┴───┴───┴───┴───┘
      │   │                   │   │
      │   │                   │   └──────────────────┐
      │   │                   └──────────┐           │
      │   └──────────┐                   │           │
      └──┐           │                   │           │
         │           │                   │           │
         ├─╴┈┈┈╶─┐   ├─╴┈┈┈╶─┐           ├─╴┈┈┈╶─┐   ├─╴┈┈┈╶─┐
         │       │   │       │           │       │   │       │
         ▼       ▼   ▼       ▼           ▼       ▼   ▼       ▼
       ┌───┬┈┈┈┬───┬───┬┈┈┈┬───┬┈┈┈┈┈┈┈┬───┬┈┈┈┬───┬───┬┈┈┈┬───┐
zmm0 = │ h │   │ h │ g │   │ g │       │ b │   │ b │ a │   │ a │
       └───┴┈┈┈┴───┴───┴┈┈┈┴───┴┈┈┈┈┈┈┈┴───┴┈┈┈┴───┴───┴┈┈┈┴───┘
       │           │                               │           │
       ╰─ word 7 ╶─╯                               ╰─ word 0 ╶─╯
Then, in each byte of every 64-bit word, we isolate the i-th bit, where i is the position of the byte within its 64-bit word.
    ┈┈┈┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬┈┈┈
zmm0   │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │ 01010100 │
    ┈┈┈┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴┈┈┈

    ┈┈┈┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬┈┈┈
zmm1   │ 10000000 │ 01000000 │ 00100000 │ 00010000 │ 00001000 │ 00000100 │ 00000010 │ 00000001 │
    ┈┈┈┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴┈┈┈

zmm0 & zmm1 =
    ┈┈┈┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬┈┈┈
       │ 00000000 │ 01000000 │ 00000000 │ 00010000 │ 00000000 │ 00000100 │ 00000000 │ 00000000 │
    ┈┈┈┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴┈┈┈
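Finally, we convert non-zero bytes into ASCII '1' (0x31) and zero bytes into ASCII '0' (0x30). This particular operation can be done in two different ways:
1. clamp every byte to 0 or 1 with an unsigned byte minimum against 1, then add '0' (`avx512_u64_to_bin_ver1`);
2. test the bytes to build a 64-bit mask of the non-zero positions, then use a masked addition that adds 1 to '0' only at those positions (`avx512_u64_to_bin_ver2`).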
These two methods do not differ in performance; the only difference is that the first one does not use mask registers.
// Writes the 64 binary digits of `v` to `buf` as ASCII '0'/'1' characters,
// most significant bit first (buf[0] is bit 63, buf[63] is bit 0).
//
// Exactly 64 bytes are stored and no terminating NUL is written, so `buf`
// must have room for at least 64 bytes; the caller adds a terminator if a
// C string is needed. `buf` may have any alignment (unaligned store).
//
// Requires a CPU with AVX-512F and AVX-512BW.
#if defined(__GNUC__) || defined(__clang__)
// Enable the required ISA extensions for this function only, so the file
// also builds when AVX-512 is not enabled globally on the command line.
__attribute__((target("avx512f,avx512bw")))
#endif
void avx512_u64_to_bin_ver1(uint64_t v, char* buf) {
    // 1. broadcast the number to all eight 64-bit words
    //    (the intrinsic takes a signed 64-bit value; the bit pattern is kept)
    const __m512i in = _mm512_set1_epi64((long long)v);

    // 2. fill every byte of the k-th 64-bit word with byte (7 - k) of the
    //    number, so the most significant byte lands at the lowest address;
    //    _mm512_set_epi64 lists word 7 first and word 0 last
    const __m512i populate = _mm512_set_epi64(
        0x0000000000000000,
        0x0101010101010101,
        0x0202020202020202,
        0x0303030303030303,
        0x0404040404040404,
        0x0505050505050505,
        0x0606060606060606,
        0x0707070707070707
    );
    const __m512i t0 = _mm512_shuffle_epi8(in, populate);

    // 3. in the j-th byte of each 64-bit word keep only bit (7 - j)
    const __m512i mask = _mm512_set1_epi64(0x0102040810204080);
    const __m512i t1   = _mm512_and_si512(mask, t0);

    // 4. convert bits to byte 0 or 1: any non-zero byte is clamped to 1
    const __m512i t2 = _mm512_min_epu8(t1, _mm512_set1_epi8(1));

    // 5. convert to ASCII ('0' = 0x30 or '1' = 0x31)
    const __m512i t3 = _mm512_add_epi8(t2, _mm512_set1_epi8('0'));

    // The intrinsic takes void*; casting buf to __m512i* would assert a
    // 64-byte alignment that an arbitrary char buffer does not have.
    _mm512_storeu_si512(buf, t3);
}
Assembly output from GCC 14.1:
# Compiler output for avx512_u64_to_bin_ver1 (AT&T syntax): v arrives in %rdi, buf in %rsi.
# NOTE(review): both memory operands show displacement 0x0, presumably because
# the listing was taken from an object file before relocation - confirm.
vpbroadcastq %rdi, %zmm1             # step 1: copy v into every 64-bit word
mov $0x1, %eax                       # scalar 1, broadcast below
vpshufb 0x0(%rip), %zmm1, %zmm1      # step 2: byte shuffle, index vector read from memory
vpbroadcastb %eax, %zmm0             # zmm0 = 64 bytes of 0x01
mov $0x30, %eax                      # scalar '0', broadcast below
vpandd 0x0(%rip), %zmm1, %zmm1       # step 3: AND with the bit-mask constant read from memory
vpminub %zmm1, %zmm0, %zmm0          # step 4: min(byte, 1) gives 0 or 1
vpbroadcastb %eax, %zmm1             # zmm1 = 64 bytes of 0x30
vpaddb %zmm1, %zmm0, %zmm0           # step 5: add '0' to get ASCII digits
vmovdqu64 %zmm0, (%rsi)              # unaligned 64-byte store to buf
// Writes the 64 binary digits of `v` to `buf` as ASCII '0'/'1' characters,
// most significant bit first (buf[0] is bit 63, buf[63] is bit 0).
// Variant that derives a 64-bit mask of the set bits and applies a masked add.
//
// Exactly 64 bytes are stored and no terminating NUL is written, so `buf`
// must have room for at least 64 bytes; the caller adds a terminator if a
// C string is needed. `buf` may have any alignment (unaligned store).
//
// Requires a CPU with AVX-512F and AVX-512BW.
#if defined(__GNUC__) || defined(__clang__)
// Enable the required ISA extensions for this function only, so the file
// also builds when AVX-512 is not enabled globally on the command line.
__attribute__((target("avx512f,avx512bw")))
#endif
void avx512_u64_to_bin_ver2(uint64_t v, char* buf) {
    // 1. broadcast the number to all eight 64-bit words
    //    (the intrinsic takes a signed 64-bit value; the bit pattern is kept)
    const __m512i in = _mm512_set1_epi64((long long)v);

    // 2. fill every byte of the k-th 64-bit word with byte (7 - k) of the
    //    number, so the most significant byte lands at the lowest address;
    //    _mm512_set_epi64 lists word 7 first and word 0 last
    const __m512i populate = _mm512_set_epi64(
        0x0000000000000000,
        0x0101010101010101,
        0x0202020202020202,
        0x0303030303030303,
        0x0404040404040404,
        0x0505050505050505,
        0x0606060606060606,
        0x0707070707070707
    );
    const __m512i t0 = _mm512_shuffle_epi8(in, populate);

    // 3. test bit (7 - j) in the j-th byte of each 64-bit word; output
    //    position p gets mask bit p set when its digit must be '1'
    const __m512i   bits = _mm512_set1_epi64(0x0102040810204080);
    const __mmask64 mask = _mm512_test_epi8_mask(bits, t0);

    // 4. convert to ASCII ('0' = 0x30 or '1' = 0x31): positions selected by
    //    the mask become '0' + 1, all others keep the pass-through '0'
    const __m512i ascii_zero = _mm512_set1_epi8('0');
    const __m512i t2 = _mm512_mask_add_epi8(
        ascii_zero,
        mask,
        ascii_zero,
        _mm512_set1_epi8(1)
    );

    // The intrinsic takes void*; casting buf to __m512i* would assert a
    // 64-byte alignment that an arbitrary char buffer does not have.
    _mm512_storeu_si512(buf, t2);
}
Assembly output from GCC 14.1:
# Compiler output for avx512_u64_to_bin_ver2 (AT&T syntax): v arrives in %rdi, buf in %rsi.
# NOTE(review): both memory operands show displacement 0x0, presumably because
# the listing was taken from an object file before relocation - confirm.
vmovdqa64 0x0(%rip), %zmm1           # load a 64-byte constant (the bit-test pattern used by vptestmb below)
vpbroadcastq %rdi, %zmm0             # step 1: copy v into every 64-bit word
mov $0x30, %eax                      # scalar '0', broadcast below
vpshufb 0x0(%rip), %zmm0, %zmm0      # step 2: byte shuffle, index vector read from memory
vptestmb %zmm0, %zmm1, %k1           # step 3: k1 bit set where (zmm0 & zmm1) byte is non-zero
vpbroadcastb %eax, %zmm0             # zmm0 = 64 bytes of 0x30
mov $0x1, %eax                       # scalar 1, broadcast below
vpbroadcastb %eax, %zmm1             # zmm1 = 64 bytes of 0x01
vmovdqa64 %zmm0, %zmm2               # zmm2 = copy of the '0' vector (pass-through value)
vpaddb %zmm1, %zmm0, %zmm2{%k1}      # step 4: where k1 is set, zmm2 = '0' + 1
vmovdqu64 %zmm2, (%rsi)              # unaligned 64-byte store to buf
A sample implementation is available on GitHub.