If you have an input array, and an output array, and you only want to write those elements which pass a condition, what is the most efficient way to do this with AVX2?
Here is a visualization of the problem:
Here is my solution, using compressed indices. It requires a LUT sized 769 bytes, so it is best suited for cases where you have a good sized array of data to work on. (If this looks familiar to a stackoverflow post that is because I am the author).
//Generate Move mask via: _mm256_movemask_ps(_mm256_castsi256_ps(mask)); etc
__m256i MoveMaskToIndices(int moveMask) {
u8 *adr = g_pack_left_table_u8x3 + moveMask * 3;
__m256i indices = _mm256_set1_epi32(*reinterpret_cast<u32*>(adr));//lower 24 bits has our LUT
__m256i m = _mm256_sllv_epi32(indices, _mm256_setr_epi32(29, 26, 23, 20, 17, 14, 11, 8));
//now shift it right to get 3 bits at bottom
__m256i shufmask = _mm256_srli_epi32(m, 29);
return shufmask;
}
//The rest of this code to build the LUT
u32 get_nth_bits(int a) {
u32 out = 0;
int c = 0;
for (int i = 0; i < 8; ++i) {
auto set = (a >> i) & 1;
if (set) {
out |= (i << (c * 3));
c++;
}
}
return out;
}
u8 g_pack_left_table_u8x3[256 * 3 + 1];
void BuildPackMask() {
for (int i = 0; i < 256; ++i) {
*reinterpret_cast<u32*>(&g_pack_left_table_u8x3[i * 3]) = get_nth_bits(i);
}
}
On stackoverflow Peter Cordes came up with a solution that is clever, it avoids the requirement for a LUT by taking advantage of the new BMI(bit manipulation) instruction set. I had not used the BMI instructions before, so this was new to me.This code is x64 only, but you can port to x86 by using the vector shift approach I used ^, and the 3 bit indices instead of 8 bit.
// Uses 64bit pdep / pext to save a step in unpacking.
__m256 compress256(__m256 src, unsigned int mask /* from movmskps */)
{
uint64_t expanded_mask = _pdep_u64(mask, 0x0101010101010101); // unpack each bit to a byte
expanded_mask *= 0xFF; // mask |= mask<<1 | mask<<2 | ... | mask<<7;
// ABC... -> AAAAAAAABBBBBBBBCCCCCCCC...: replicate each bit to fill its byte
const uint64_t identity_indices = 0x0706050403020100; // the identity shuffle for vpermps, packed to one index per byte
uint64_t wanted_indices = _pext_u64(identity_indices, expanded_mask);
__m128i bytevec = _mm_cvtsi64_si128(wanted_indices);
__m256i shufmask = _mm256_cvtepu8_epi32(bytevec);
return _mm256_permutevar8x32_ps(src, shufmask);
}
