我正在学习汇编,并开始在 Digital-Mars C++ 编译器(英特尔 sytanx 更易于阅读)中对 SSE 和 MMX 寄存器进行实验。我已经完成了一个将 var_1 作为值并将其转换为 var_2 数字系统的程序(现在是 8 位。稍后将其扩展为 32 64 128)。程序通过两种方式做到这一点:
__asm
内联%(modulo) 运算符的常用 C++ 方式。
问题:你能告诉我更有效的使用 xmm0-7 和 mm0-7 寄存器的方法吗?你能告诉我如何用 al,ah... 8 位寄存器交换它们的确切字节吗?
与我的计算机(pentium-m centrino 2.0GHz)上的 __asm 相比,C++ 通常方式中的常用 %(modulo) 运算符非常慢。如果你能告诉我如何摆脱 __asmm 中的除法指令,它会更快。
当我运行程序时,它给了我:
(for the values: var_1=17,var_2=2,all loops are 200M times)
17 is 10001 in number system 2
__asm(clock)...........: 7250 <------too bad. it is 8-bit calc.
C++(clock).............: 12250 <------not very slow(var_2 is a power of 2)
(for the values: var_1=33,var_2=7,all loops are 200M times)
33 is 45 in number system 7
__asm(clock)..........: 2875 <-------not good. it is 8-bit calc.
C++(clock)............: 6328 <----------------really slow(var_2 is not a power of 2)
第二个 C++ 代码(带有 % 运算符的那个):///////////////////////////////// //////////////////
t1=clock();//reference time
for(int i=0;i<200000000;i++)
{
y=x;
counter=0;
while(y>g)
{
var_3[counter]=y%g;
y/=g;
counter++;
}
var_3[counter]=y%g;
}
t2=clock();//final time
_asm 代码://///////////////////////////////////// ///////////////////////////////////////// ///////////
__asm // i love assembly in some parts of C++
{
pushf //here does register backup
push eax
push ebx
push ecx
push edx
push edi
mov eax,0h //this will be outer loop counter init to zero
//init of medium-big registers to zero
movd xmm0,eax //cannot set to immediate constant: xmm0=outer loop counter
shufps xmm0,xmm0,0h //this makes all bits zero
movd xmm1,eax
movd xmm2,eax
shufps xmm1,xmm1,0h
shufps xmm2,xmm2,0h
movd xmm2,eax
shufps xmm3,xmm3,0h//could have made pxor xmm3,xmm3(single instruction)
//init complete(xmm0,xmm1,xmm2,xmm3 are zero)
movd xmm1,[var_1] //storing variable_1 to register
movd xmm2,[var_2] //storing var_2 to register
lea ebx,var_3 //calculate var_3 address
movd xmm3,ebx //storing var_3's address to register
for_loop:
mov eax,0h
//this line is index-init to zero(digit array index)
movd edx,xmm2
mov cl,dl //this is the var_1 stored in cl
movd edx,xmm1
mov al,dl //this is the var_2 stored in al
mov edx,0h
dng:
mov ah,00h //preparation for a 8-bit division
div cl //divide
movd ebx,xmm3 //get var_3 address
add ebx,edx //i couldnt find a way to multiply with 4
add ebx,edx //so i added 4 times ^^
add ebx,edx //add
add ebx,edx //last adding
//below, mov [ebx],ah is the only memory accessing instruction
mov [ebx],ah //(8 bit)this line is equivalent to var_3[i]=remainder
inc edx //i++;
cmp al,00h //is division zero?
jne dng //if no, loop again
//here edi register has the number of digits
movd eax,xmm0 //get the outer loop counter from medium-big register
add eax,01h //j++;
movd xmm0,eax //store the new counter to medium-big register
cmp eax,0BEBC200h //is j<(200,000,000) ?
jb for_loop //if yes, go loop again
mov [var_3_size],edx //now we have number of digits too!
//here does registers revert back to old values
pop edi
pop edx
pop ecx
pop ebx
pop eax
popf
}
整个代码:////////////////////////////////////// ///////////////////////////////////
#include <iostream.h>
#include <cmath>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
int main()
{
srand(time(0));
clock_t t1=clock();
clock_t t2=clock();
int var_1=17; //number itself
int var_2=2; //number system
int var_3[100]; //digits to be showed(maximum 100 as seen )
int var_3_size=0;//asm block will decide what will the number of digits be
for(int i=0;i<100;i++)
{
var_3[i]=0; //here we initialize digits to zeroes
}
t1=clock();//reference time to take
__asm // i love assembly in some parts of C++
{
pushf //here does register backup
push eax
push ebx
push ecx
push edx
push edi
mov eax,0h //this will be outer loop counter init to zero
//init of medium-big registers to zero
movd xmm0,eax //cannot set to immediate constant: xmm0=outer loop counter
shufps xmm0,xmm0,0h //this makes all bits zero
movd xmm1,eax
movd xmm2,eax
shufps xmm1,xmm1,0h
shufps xmm2,xmm2,0h
movd xmm2,eax
shufps xmm3,xmm3,0h
//init complete(xmm0,xmm1,xmm2,xmm3 are zero)
movd xmm1,[var_1] //storing variable_1 to register
movd xmm2,[var_2] //storing var_2 to register
lea ebx,var_3 //calculate var_3 address
movd xmm3,ebx //storing var_3's address to register
for_loop:
mov eax,0h
//this line is index-init to zero(digit array index)
movd edx,xmm2
mov cl,dl //this is the var_1 stored in cl
movd edx,xmm1
mov al,dl //this is the var_2 stored in al
mov edx,0h
dng:
mov ah,00h //preparation for a 8-bit division
div cl //divide
movd ebx,xmm3 //get var_3 address
add ebx,edx //i couldnt find a way to multiply with 4
add ebx,edx //so i added 4 times ^^
add ebx,edx //add
add ebx,edx //last adding
//below, mov [ebx],ah is the only memory accessing instruction
mov [ebx],ah //(8 bit)this line is equivalent to var_3[i]=remainder
inc edx //i++;
cmp al,00h //is division zero?
jne dng //if no, loop again
//here edi register has the number of digits
movd eax,xmm0 //get the outer loop counter from medium-big register
add eax,01h //j++;
movd xmm0,eax //store the new counter to medium-big register
cmp eax,0BEBC200h //is j<(200,000,000) ?
jb for_loop //if yes, go loop again
mov [var_3_size],edx //now we have number of digits too!
//here does registers revert back to old values
pop edi
pop edx
pop ecx
pop ebx
pop eax
popf
}
t2=clock(); //finish time
printf("\n assembly_inline(clocks): %i for the 200 million calculations",(t2-t1));
printf("\n value %i(in decimal) is: ",var_1);
for(int i=var_3_size-1;i>=0;i--)
{
printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",var_2);
//and: more readable form(end easier)
int counter=var_3_size;
int x=var_1;
int g=var_2;
int y=x;// backup
t1=clock();//reference time
for(int i=0;i<200000000;i++)
{
y=x;
counter=0;
while(y>g)
{
var_3[counter]=y%g;
y/=g;
counter++;
}
var_3[counter]=y%g;
}
t2=clock();//final time
printf("\n C++(clocks): %i for the 200 million calculations",(t2-t1));
printf("\n value %i(in decimal) is: ",x);
for(int i=var_3_size-1;i>=0;i--)
{
printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",g);
return 0;
}
编辑:这是 32 位版本
void get_digits_asm()
{
__asm
{
pushf //couldnt store this in other registers
movd xmm0,eax//storing in xmm registers instead of pushing
movd xmm1,ebx//
movd xmm2,ecx//
movd xmm3,edx//
movd xmm4,edi//end of push backups
mov eax,[variable_x]
mov ebx,[number_system]
mov ecx,0h
mov edi,0h
begin_loop:
mov edx,0h
div ebx
lea edi,digits
mov [edi+ecx*4],edx
add ecx,01h
cmp eax,ebx
ja begin_loop
mov edx,0
div ebx
lea edi,digits
mov [edi+ecx*4],edx
inc ecx
mov [digits_total],ecx
movd edi,xmm4//pop edi
movd edx,xmm3//pop edx
movd ecx,xmm2//pop ecx
movd ebx,xmm1//pop ebx
movd eax,xmm0//pop eax
popf
}
}