I'm still using your vectorclass quite frequently. One of the things I wanted to do recently was an 8x8 transpose. I did this with 24 blends. However, I found a version using intrinsics which is much faster. In fact it's quicker to do four 4x4 transposes with SSE than to do one 8x8 transpose the way I have used the vectorclass. Do you have a suggestion how I could get the same speed with the vector class? I have provided code for two functions. One function uses Vec8f to do the transpose and the other uses intrinsics.
inline void transpose8_vec8f(Vec8f &row0, Vec8f &row1, Vec8f &row2, Vec8f &row3, Vec8f &row4, Vec8f &row5, Vec8f &row6, Vec8f &row7) {
Vec8f tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
tmp0 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(row0, row1);
tmp1 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(row2, row3);
tmp2 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(row4, row5);
tmp3 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(row6, row7); tmp4 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(row0, row1);
tmp5 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(row2, row3);
tmp6 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(row4, row5);
tmp7 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(row6, row7);
tmp8 = blend8f<0, 4, 8, 12, 1, 5, 9 ,13>(tmp0, tmp1);
tmp9 = blend8f<0, 4, 8, 12, 1, 5, 9, 13>(tmp2, tmp3);
row0 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(tmp8, tmp9);
row1 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(tmp8, tmp9); tmp10 = blend8f<2, 6, 10, 14, 3, 7, 11 ,15>(tmp0, tmp1);
tmp11 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(tmp2, tmp3);
row2 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(tmp10, tmp11);
row3 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(tmp10, tmp11); tmp12 = blend8f<0, 4, 8, 12, 1, 5, 9 ,13>(tmp4, tmp5);
tmp13 = blend8f<0, 4, 8, 12, 1, 5, 9, 13>(tmp6, tmp7);
row4 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(tmp12, tmp13);
row5 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(tmp12, tmp13); tmp14 = blend8f<2, 6, 10, 14, 3, 7, 11 ,15>(tmp4, tmp5);
tmp15 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(tmp6, tmp7);
row6 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(tmp14, tmp15);
row7 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(tmp14, tmp15);
}
inline void transpose8_avx(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) {
__m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
__m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
__t0 = _mm256_unpacklo_ps(row0, row1);
__t1 = _mm256_unpackhi_ps(row0, row1);
__t2 = _mm256_unpacklo_ps(row2, row3);
__t3 = _mm256_unpackhi_ps(row2, row3);
__t4 = _mm256_unpacklo_ps(row4, row5);
__t5 = _mm256_unpackhi_ps(row4, row5);
__t6 = _mm256_unpacklo_ps(row6, row7);
__t7 = _mm256_unpackhi_ps(row6, row7);
__tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
__tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
__tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
__tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
__tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
__tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
__tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
__tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
} |