我最初有这个功能,我正在尝试使用循环展开来进一步优化它 - 我遇到了麻烦 - 翻转 for 循环可以提高效率,以及在循环之外进行调用。然而,当我像我一样应用循环展开时,它错过了函数应该做的事情。
double
applyFilter(class Filter *filter, cs1300bmp *input, cs1300bmp *output)
{
long long cycStart, cycStop;
cycStart = rdtscll();
output -> width = input -> width;
output -> height = input -> height;
for(int col = 1; col < (input -> width) - 1; col = col + 1) {
for(int row = 1; row < (input -> height) - 1 ; row = row + 1) {
for(int plane = 0; plane < 3; plane++) {
output -> color[plane][row][col] = 0;
for (int j = 0; j < filter -> getSize(); j++) {
for (int i = 0; i < filter -> getSize(); i++) {
output -> color[plane][row][col]
= output -> color[plane][row][col]
+ (input -> color[plane][row + i - 1][col + j - 1]
* filter -> get(i, j) );
}
}
output -> color[plane][row][col] =
output -> color[plane][row][col] / filter -> getDivisor();
if ( output -> color[plane][row][col] < 0 ) {
output -> color[plane][row][col] = 0;
}
if ( output -> color[plane][row][col] > 255 ) {
output -> color[plane][row][col] = 255;
}
}
}
}
cycStop = rdtscll();
double diff = cycStop - cycStart;
double diffPerPixel = diff / (output -> width * output -> height);
fprintf(stderr, "Took %f cycles to process, or %f cycles per pixel\n",
diff, diff / (output -> width * output -> height));
return diffPerPixel;
}
这是我到达的地方,但它似乎不起作用。我会很感激解释我在循环展开部分做错了什么。
double applyFilter(class Filter *filter, cs1300bmp *input, cs1300bmp *output){
long long cycStart, cycStop;
cycStart = rdtscll();
//start
output -> width = input -> width;
output -> height = input -> height;
//function calls outside loop.
int filterSize = filter -> getSize();
int divisor = filter -> getDivisor();
//intializaions
int inputHlen = input -> height - 1;
int inputWlen = input -> width - 1;
// loop unrolling row + k - 1 , col + k - 1
for(int plane = 0; plane < 3; plane++){
for(int row = 1; row + 3 < inputHlen; row += 4){
for(int col = 1; col +3 < inputWlen; col += 4){
output -> color[plane][row][col] = 0;
output -> color[plane][row + 1][col + 1] = 0;
output -> color[plane][row + 2][col + 2] = 0;
output -> color[plane][row + 3][col + 3] = 0;
int acc1 = output -> color[plane][row][col];
int acc2 = output -> color[plane][row + 1][col + 1];
int acc3 = output -> color[plane][row + 2][col + 2];
int acc4 = output -> color[plane][row + 3][col + 3];
for (int j = 0; j + 3 < filterSize; j += 4) {
for (int i = 0; i + 3 < filterSize; i += 4){
acc1 = acc1 + (input -> color[plane][row + i - 1][col + j - 1] * filter -> get(i, j) );
acc2 = acc2 + (input -> color[plane][row + i][col + j] * filter -> get(i + 1, j + 1) );
acc3 = acc3 + (input -> color[plane][row + i + 1][col + j + 1] * filter -> get(i +2, j + 2) );
acc4 = acc4 + (input -> color[plane][row + i + 2][col + j + 2] * filter -> get(i + 3, j + 3) );
}
}
acc1 = acc1 / divisor;
acc2 = acc2 / divisor;
acc3 = acc3 / divisor;
acc4 = acc4 / divisor;
acc1 = (acc1 < 0) ? 0 : acc1;
acc1 = (acc1 > 255) ? 255 : acc1;
acc2 = (acc1 < 0) ? 0 : acc1;
acc2 = (acc1 > 255) ? 255 : acc1;
acc3 = (acc1 < 0) ? 0 : acc1;
acc3 = (acc1 > 255) ? 255 : acc1;
acc4 = (acc1 < 0) ? 0 : acc1;
acc4 = (acc1 > 255) ? 255 : acc1;
output -> color[plane][row][col] = acc1;
output -> color[plane][row + 1][col + 1] = acc2;
output -> color[plane][row + 2][col + 2] = acc3;
output -> color[plane][row + 3][col + 3] = acc4;
}
}
}
//end
cycStop = rdtscll();
double diff = cycStop - cycStart;
double diffPerPixel = diff / (output -> width * output -> height);
fprintf(stderr, "Took %f cycles to process, or %f cycles per pixel\n",
diff, diff / (output -> width * output -> height));
return diffPerPixel;
}