Author: | Wojciech Muła |
---|---|
Added on: | 2016-10-16 |
The alignr instruction in Intel SIMD builds a new vector from a subrange of two concatenated vectors; its downside is that it accepts only compile-time constants as the shift amount. AVX512F lacks byte-wise instructions; the available variant of alignr works at the level of 32-bit words.
Byte-wise alignr is viable in AVX512F, using techniques used to handle so-called long numbers. We can do shifts at 32-bit word granularity using valignd (_mm512_alignr_epi32); then a byte-wise shift inside each 32-bit word is possible. To perform the latter shift we need bytes from the next 32-bit word.
This forces us to build two vectors, having the current and next words at corresponding positions. Then these words are shifted accordingly and finally merged into one 32-bit word.
Steps required to perform alignr(hi, lo, shift):
Calculate shift amounts for 32-bit words and bytes:
s32 = shift / 4 // for example 2 s8 = shift % 4 // for example 3 lo = [abcd|efgh|ijkl|mnop|qrst|....|....|....|....|....|....|....|....|....|....|....] hi = [uvwx|yz12|....|....|....|....|....|....|....|....|....|....|....|....|....|....]
If s8 == 0, a single 32-bit-wise shift is sufficient:
R = _mm512_alignrvar_epi32(hi, lo, s32)
Otherwise build two vectors:
L = _mm512_alignrvar_epi32(hi, lo, s32) = [ijkl|mnop|qrst|....|....|....|....|....|....|....|....|....|....|....|uvwx|yz12] H = _mm512_alignrvar_epi32(hi, lo, s32 + 1) = [mnop|qrst|....|....|....|....|....|....|....|....|....|....|....|uvwx|yz12|....]
Do shifts at byte level:
L = _mm512_srli_epi32(L, s8 * 8) // 3*8 right = [l000|p000|t000|.000|.000|.000|.000|.000|.000|.000|.000|.000|.000|.000|x000|2000] H = _mm512_slli_epi32(H, (4 - s8) * 8) // 1*8 left = [0mno|0qrs|0...|0...|0...|0...|0...|0...|0...|0...|0...|0...|0...|0uvw|0yz1|0...]
Finally merge these two vectors:
R = L | H = [lmno|pqrs|t...|....|....|....|....|....|....|....|....|....|....|.uvw|xyz1|2...]
The following sample program is a software implementation of byte-wise alignr. Please note that the procedures use run-time shifts.
/*
 * Run-time variant of _mm512_alignr_epi32.
 *
 * _mm512_alignr_epi32 requires its shift amount to be a compile-time
 * constant, so the run-time value is dispatched through a switch to the
 * matching constant.
 *
 * hi, lo - the two vectors that are treated as one concatenated vector,
 *          with lo holding the lower 16 words
 * shift  - number of 32-bit words to shift right by; valid range is [0, 16]
 *
 * Returns lo for shift == 0, hi for shift == 16, and the result of
 * _mm512_alignr_epi32(hi, lo, shift) for everything in between.
 *
 * A shift outside [0, 16] is a contract violation: it trips the assertion in
 * debug builds. When assertions are compiled out (NDEBUG) the function
 * returns hi, an arbitrary but defined value, instead of falling off the end
 * of a non-void function.
 */
__m512i _mm512_alignrvar_epi32(const __m512i hi, const __m512i lo, int shift) {
    switch (shift) {
        case 0:  return lo;
        case 1:  return _mm512_alignr_epi32(hi, lo, 1);
        case 2:  return _mm512_alignr_epi32(hi, lo, 2);
        case 3:  return _mm512_alignr_epi32(hi, lo, 3);
        case 4:  return _mm512_alignr_epi32(hi, lo, 4);
        case 5:  return _mm512_alignr_epi32(hi, lo, 5);
        case 6:  return _mm512_alignr_epi32(hi, lo, 6);
        case 7:  return _mm512_alignr_epi32(hi, lo, 7);
        case 8:  return _mm512_alignr_epi32(hi, lo, 8);
        case 9:  return _mm512_alignr_epi32(hi, lo, 9);
        case 10: return _mm512_alignr_epi32(hi, lo, 10);
        case 11: return _mm512_alignr_epi32(hi, lo, 11);
        case 12: return _mm512_alignr_epi32(hi, lo, 12);
        case 13: return _mm512_alignr_epi32(hi, lo, 13);
        case 14: return _mm512_alignr_epi32(hi, lo, 14);
        case 15: return _mm512_alignr_epi32(hi, lo, 15);
        case 16: return hi;
        default:
            assert(0 && "shift must be in range [0, 16]");
            return hi; /* defined fallback when assertions are disabled */
    }
}

/*
 * Byte-wise alignr for AVX512F, built from 32-bit word operations.
 *
 * hi, lo - the two vectors that are treated as one concatenated vector,
 *          with lo holding the lower 64 bytes
 * shift  - number of bytes to shift right by; must be less than 64
 *          (checked with an assertion)
 *
 * The shift is split into a whole-word part (shift / 4) and a byte part
 * (shift % 4). When the byte part is zero, a single word-wise alignr is
 * enough. Otherwise each result word is assembled from two source words:
 * the "current" word and the "next" word.
 */
__m512i _mm512_alignr_epi8(const __m512i hi, const __m512i lo, unsigned shift) {
    assert(shift < 64);

    const int s32 = shift / 4; /* shift amount in 32-bit words */
    const int s8  = shift % 4; /* shift amount within a 32-bit word */

    if (s8 == 0) {
        return _mm512_alignrvar_epi32(hi, lo, s32);
    } else {
        /* L holds the current words, H the next words, at matching positions.
         * s32 + 1 is at most 16 here, because shift < 64 gives s32 <= 15. */
        const __m512i L = _mm512_alignrvar_epi32(hi, lo, s32);
        const __m512i H = _mm512_alignrvar_epi32(hi, lo, s32 + 1);

        /* Move the upper bytes of each current word down, move the lower
         * bytes of each next word up, then merge the two halves. */
        return _mm512_or_si512(
            _mm512_srli_epi32(L, s8 * 8),
            _mm512_slli_epi32(H, (4 - s8) * 8)
        );
    }
}
All sources are available at github.