我想通过将输入矩阵分成块然后转置它们来实现矩阵的转置。我参考了相应的帖子A Cache Efficient Matrix Transpose Program? 并像这样编写我的代码:
#include<iostream>
#include<stdlib.h>
#define m 4
#include<sys/time.h>
#include<time.h>
#include<malloc.h>
using namespace std;
int **a, **b, **c;
int count = 0;
clock_t t1, t2;
int blocksize = 2;
int main(){
a = (int **)malloc(m*sizeof(int *));
for(int i = 0;i<m;i++){
a[i] = (int *)malloc(m*sizeof(int));
}
b = (int **)malloc(m*sizeof(int *));
for(int i = 0;i<m;i++){
b[i] = (int *)malloc(m*sizeof(int));
}
for(int i=0; i<m; i++){
for(int j =0; j<m; j++){
a[i][j]=(2*i)+(3*j);
}
}
for(int i=0; i<m; i++){
for(int j =0; j<m; j++){
cout << a[i][j] << "\t";
}
cout << "\n";
}
cout << "\n";
t1 = clock();
// MAIN BLOCK TRANSPOSE CODE
for (int i = 0; i < m; i += blocksize) {
for (int j = 0; j < m; j += blocksize) {
for (int k = i; k < i + blocksize; ++k) {
for (int l = j; l < j + blocksize; ++l) {
b[k + l*m] = a[l + k*m];
}
}
}
}
t2 = clock();
for(int i=0; i<m; i++){
for(int j =0; j<m; j++){
cout << b[i][j] << "\t";
}
cout << "\n";
}
free(a);
free(b);
cout << "\n";
cout << (double)(t2-t1)/CLOCKS_PER_SEC << "\n";
return 0;
}
但是,代码没有按预期工作。我实现了据说在相应帖子中工作的代码。如果可能,请提供帮助。
输入数组:
0 3 6 9
2 5 8 11
4 7 10 13
6 9 12 15
预期输出数组:
0 2 4 6
3 5 7 9
6 8 10 12
9 11 13 15
获得的结果:
0 3 6 9
Segmentation fault