Hello, I've discovered that the following vectorclass-based code, which transposes a 4x16 matrix of unsigned short integers, performs poorly when compiled for the AVX2 instruction set compared to AVX, on the same CPU. For example, on an Intel Core i5-4460 (Haswell), on the very same machine, the code compiled with gcc -march=native or -march=core-avx2 runs three times slower than the same code compiled with -march=corei7-avx. Something must be wrong here, but I don't know where. Is it a microarchitecture, compiler, or library (vectorclass) issue?
#include "vectorclass.h"

typedef Vec16us X16T;

inline void transpi (X16T y[4], X16T const x[4])
{ // transpose a 4x16 matrix x into a 16x4 matrix y
X16T x0, x1, x2, x3, y0, y1, y2, y3;
// load the four 16-element source rows
x0.load( &x[0] );
x1.load( &x[1] );
x2.load( &x[2] );
x3.load( &x[3] );
// stage 1: interleave rows 0 and 2, and rows 1 and 3, element by element
y0 = blend16us< 0,16, 1,17, 2,18, 3,19, 4,20, 5,21, 6,22, 7,23>(x0,x2);
y1 = blend16us< 8,24, 9,25,10,26,11,27,12,28,13,29,14,30,15,31>(x0,x2);
y2 = blend16us< 0,16, 1,17, 2,18, 3,19, 4,20, 5,21, 6,22, 7,23>(x1,x3);
y3 = blend16us< 8,24, 9,25,10,26,11,27,12,28,13,29,14,30,15,31>(x1,x3);
// stage 2: interleave the stage-1 results, which completes the transpose
x0 = blend16us< 0,16, 1,17, 2,18, 3,19, 4,20, 5,21, 6,22, 7,23>(y0,y2);
x1 = blend16us< 8,24, 9,25,10,26,11,27,12,28,13,29,14,30,15,31>(y0,y2);
x2 = blend16us< 0,16, 1,17, 2,18, 3,19, 4,20, 5,21, 6,22, 7,23>(y1,y3);
x3 = blend16us< 8,24, 9,25,10,26,11,27,12,28,13,29,14,30,15,31>(y1,y3);
// store the four result vectors (16 rows of 4 elements each)
x0.store( &y[0] );
x1.store( &y[1] );
x2.store( &y[2] );
x3.store( &y[3] );
}
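
For reference, here is a minimal sketch of the harness I time it with, appended to the code above. The iteration count, the dummy input data, and the loop-carried dependency that keeps the optimizer from deleting the loop are my own illustrative choices, not part of the function being measured:

#include <chrono>
#include <cstdio>

int main ()
{
    // arbitrary input data; the actual values don't matter for timing
    X16T x[4], y[4];
    for (int i = 0; i < 4; i++) x[i] = X16T((unsigned short)(i + 1));

    auto t0 = std::chrono::steady_clock::now();
    for (long n = 0; n < 100000000; n++) {
        transpi(y, x);
        x[0] += y[3];  // loop-carried dependency so the work isn't optimized away
    }
    auto t1 = std::chrono::steady_clock::now();

    std::printf("%.3f s (checksum %u)\n",
                std::chrono::duration<double>(t1 - t0).count(),
                (unsigned)x[0].extract(0));
    return 0;
}

Building this with g++ -O3 -march=corei7-avx versus g++ -O3 -march=core-avx2 is how I observe the factor-of-three difference.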