Vector Class Discussion

Bilinear interpolation of images
Author:  Date: 2013-02-05 05:56
Hi. I have implemented bilinear interpolation of images in C++ on the CPU. I found a blog post that does this with SSE2 and SSE3 instructions, which gives the fastest results:
fastcpp.blogspot.no/2011/06/bilinear-pixel-interpolation-using-sse.html#comment-form

I have tried to implement the same code using the vector class. However, the speed is lower than I had hoped for compared to the native SSE code on the blog. In fact, using fixed-point math is even faster than my code with the vector class. I thought you might be interested to see what I have done, and perhaps you have some comments on how to improve my code. Below is my implementation of the GetPixelSSE and GetPixelSSE3 functions from the fastcpp blog.

// Computes the four bilinear blend weights for the sample position (x, y).
// With dx and dy the fractional parts of x and y, the returned lanes are
//   { (1-dx)*(1-dy), dx*(1-dy), (1-dx)*dy, dx*dy }.
inline Vec4f CalcWeights_vector(float x, float y) {
    // Only lanes 0 and 1 feed the result; lanes 2 and 3 are filler.
    const Vec4f pos(x, y, x, x);
    const Vec4f fracPart = pos - floor(pos);   // dx,   dy,   -, -
    const Vec4f invFrac  = 1 - fracPart;       // 1-dx, 1-dy, -, -

    // Blend indices 0-3 select from the first operand, 4-7 from the second.
    const Vec4f horiz = blend4f<4, 0, 4, 0>(fracPart, invFrac);  // 1-dx, dx,   1-dx, dx
    const Vec4f vert  = blend4f<5, 5, 1, 1>(fracPart, invFrac);  // 1-dy, 1-dy, dy,   dy

    return horiz * vert;
}

int GetPixelSSE(const int* data, float u, float v, const int src_width, const int src_height, Vec4f& weights) {
Vec16uc p12x;
Vec16uc p34x;
//Vec16uc p12x = load_partial2(&data[0]);
//Vec16uc p34x = load_partial2(&data[src_width]);

p12x.load(&data[0]);
//p12x.load_partial(8, &data[0]);
p34x.load(&data[src_width]);
//p34x.load_partial(8, &data[src_width]);

Vec8us p12 = extend_low(p12x);
Vec8us p34 = extend_low(p34x);
weights*=256;
Vec4i weighti = round_to_int(weights);
Vec8us weighti2 = compress(weighti, 0);
Vec8us w12 = permute8us<0,0,0,0,1,1,1,1>((Vec8us)weighti2);
Vec8us w34 = permute8us<2,2,2,2,3,3,3,3>((Vec8us)weighti2);

Vec8us L1234 = w12*p12 + w34*p34;
Vec8us Lhi = permute8us<4,5,6,7,-256,-256,-256,-256>(L1234);
L1234 +=Lhi;
L1234/=256;
Vec16uc L = compress(L1234, 0);
Vec4ui out = L;
return out[0];
}

int GetPixelSSE3(const int* data, float u, float v, const int src_width, const int src_height, Vec4f& weights) {
Vec4ui row1, row2;
row1.load(&data[0]);
row2.load(&data[src_width]);
Vec4ui aos = blend4ui<0, 1, 4, 5>(row1, row2);
Vec16uc soa = permute16uc<0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15>((Vec16uc)aos); //AoS to SoA

Vec8us rg = extend_low(soa);
Vec8us ba = extend_high(soa);

Vec4ui redv = extend_low(rg);
Vec4ui greenv = extend_high(rg);
Vec4ui bluev = extend_low(ba);
weights*=256;
Vec4ui wi = round_to_int(weights);

//no mm_madd in vectorclass

int red = horizontal_add(redv*wi)/256;
int green = horizontal_add(greenv*wi)/256;
int blue = horizontal_add(bluev*wi)/256;
int color = red + (green << 8) + (blue << 16);
return color;
}

 
thread Bilinear interpolation of images - Chad Jarvis - 2013-02-05
last replythread Bilinear interpolation of images new - Agner - 2013-02-06
last reply Bilinear interpolation of images new - Chad Jarvis - 2013-02-07