Problem
I am trying to find out whether mmx
or xmm
registers are faster for copying elements of an array to another array (I know about memcpy()
but I need this function for a very specific purpose).
My souce code is below. The relevant function is copyarray()
. I can use either mmx
or xmm
registers with movq
or movsd
respectively, and the result is correct. However, when I use mmx
registers, any timer I use (either clock()
or QueryPerformanceCounter
) to time the operations returns NaN
.
Compiled with: gcc -std=c99 -O2 -m32 -msse3 -mincoming-stack-boundary=2 -mfpmath=sse,387 -masm=intel copyasm.c -o copyasm.exe
This is a very strange bug and I cannot figure out why using mmx
registers would cause a timer to return NaN
seconds, while using xmm
registers in exactly the same code returns a valid time value
EDIT
Results using xmm
registers:
Elapsed time: 0.000000 seconds, Gigabytes copied per second: inf GB
Residual = 0.000000
0.937437 0.330424 0.883267 0.118717 0.962493 0.584826 0.344371 0.423719
0.937437 0.330424 0.883267 0.118717 0.962493 0.584826 0.344371 0.423719
Results using mmx
register:
Elapsed time: nan seconds, Gigabytes copied per second: inf GB
Residual = 0.000000
0.000000 0.754173 0.615345 0.634724 0.611286 0.547655 0.729637 0.942381
0.935759 0.754173 0.615345 0.634724 0.611286 0.547655 0.729637 0.942381
Source Code
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <windows.h>
inline double
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) copyarray(
double* restrict dst,
const double* restrict src,
const int n)
{
// int i = n;
// do {
// *dst++ = *src++;
// i--;
// } while(i);
__asm__ __volatile__
(
"mov ecx, %[n] \n\t"
"mov edi, %[dst] \n\t"
"mov esi, %[src] \n\t"
"xor eax, eax \n\t"
"sub ecx,1 \n\t"
"L%=: \n\t"
"movq mm0, QWORD PTR [esi+ecx*8] \n\t"
"movq QWORD PTR [edi+ecx*8], mm0 \n\t"
"sub ecx, 1 \n\t"
"jge L%= \n\t"
: // no outputs
: // inputs
[dst] "m" (dst),
[src] "m" (src),
[n] "g" (n)
: // register clobber
"eax","ecx","edi","esi",
"mm0"
);
}
void printarray(double* restrict a, int n)
{
for(int i = 0; i < n; ++i) {
printf(" %f ", *(a++));
}
printf("\n");
}
double residual(const double* restrict dst,
const double* restrict src,
const int n)
{
double residual = 0.0;
for(int i = 0; i < n; ++i)
residual += *(dst++) - *(src++);
return(residual);
}
int main()
{
double *A = NULL;
double *B = NULL;
int n = 8;
double memops;
double time3;
clock_t time1;
// LARGE_INTEGER frequency, time1, time2;
// QueryPerformanceFrequency(&frequency);
int trials = 1 << 0;
A = _mm_malloc(n*sizeof(*A), 64);
B = _mm_malloc(n*sizeof(*B), 64);
srand(time(NULL));
for(int i = 0; i < n; ++i)
*(A+i) = (double) rand()/RAND_MAX;
// QueryPerformanceCounter(&time1);
time1 = clock();
for(int i = 0; i < trials; ++i)
copyarray(B,A,n);
// QueryPerformanceCounter(&time2);
// time3 = (double)(time2.QuadPart - time1.QuadPart) / frequency.QuadPart;
time3 = (double) (clock() - time1)/CLOCKS_PER_SEC;
memops = (double) trials*n/time3*sizeof(*A)/1.0e9;
printf("Elapsed time: %f seconds, Gigabytes copied per second: %f GB\n",time3, memops);
printf("Residual = %f\n",residual(B,A,n));
printarray(A,n);
printarray(B,n);
_mm_free(A);
_mm_free(B);
}