I have a murmurhash3 implementation in my FastDefaults unit at https://github.com/JBontes/FastCode
Here's the sourcecode for Murmurhash3:
{$pointermath on}
function MurmurHash3(const [ref] HashData; Len: integer; Seed: integer = 0): integer;
const
c1 = $CC9E2D51;
c2 = $1B873593;
r1 = 15;
r2 = 13;
m = 5;
n = $E6546B64;
f1 = $85EBCA6B;
f2 = $C2B2AE35;
{$IFDEF purepascal}
var
i, Len2: integer;
k: cardinal;
remaining: cardinal;
Data: PCardinal;
label
case1, case2, case3, final;
begin
Result:= Seed;
Data:= @HashData;
for i:= 0 to (Len shr 2) - 1 do begin
k:= Data[i];
k:= k * c1;
k:= (k shl r1) or (k shr (32 - r1));
k:= k * c2;
Result:= Result xor k;
Result:= (Result shl r2) or (Result shr (32 - r2));
Result:= Result * m + n;
end; {for i}
Len2:= Len;
remaining:= 0;
case Len and $3 of
1: goto case1;
2: goto case2;
3: goto case3;
else goto final;
end;
case3:
dec(Len2);
inc(remaining, PByte(Data)[Len2] shl 16);
case2:
dec(Len2);
inc(remaining, PByte(Data)[Len2] shl 8);
case1:
dec(Len2);
inc(remaining, PByte(Data)[Len2]);
remaining:= remaining * c1;
remaining:= (remaining shl r1) or (remaining shr (32 - r1));
remaining:= remaining * c2;
Result:= Result xor remaining;
final:
Result:= Result xor Len;
Result:= Result xor (Result shr 16);
Result:= Result * f1;
Result:= Result xor (Result shr 13);
Result:= Result * f2;
Result:= Result xor (Result shr 16);
end;
{$ELSE}
{$REGION 'asm'}
{$IFDEF CPUx86}
asm
push EBX
push EDI
push ESI
xchg ECX,EDX
//EAX = data
//ECX = count in bytes
//EDX = seed
mov ESI,ECX
shr ECX,2
jz @remaining_bytes
@loop:
mov EDI,[EAX]
imul EDI,EDI,c1
rol EDI,r1
imul EDI,EDI,c2
xor EDX,EDI
rol EDX,r2
lea EDX,[EDX*4+EDX+n]
lea EAX,[EAX+4]
dec ECX
jnz @loop
@remaining_bytes:
mov ECX,ESI
and ECX,$3
jz @finalization
xor EBX,EBX
dec ECX
mov BL,byte ptr [EAX+ECX]
jz @process_remaining
shl EBX,8
dec ECX
mov BL,byte ptr [EAX+ECX]
jz @process_remaining
shl EBX,8
mov BL,byte ptr [EAX]
@process_remaining:
imul EBX,EBX,c1
rol EBX,r1
imul EBX,EBX,c2
xor EDX,EBX
@finalization:
xor EDX,ESI
mov EAX,EDX
shr EDX,16
xor EDX,EAX
imul EDX,EDX,f1
mov EAX,EDX
shr EDX,13
xor EDX,EAX
imul EDX,EDX,f2
mov EAX,EDX
shr EDX,16
xor EAX,EDX
pop ESI
pop EDI
pop EBX
end;
{$ENDIF}
{$IFDEF CPUx64}
asm
push RBX
push RDI
push RSI
mov RAX,RCX
mov RCX,RDX
mov RDX,R8
//RAX = data
//RCX = count in bytes
//RDX = seed
mov ESI,ECX
shr ECX,2
jz @remaining_bytes
@loop:
mov EDI, dword ptr [RAX]
imul EDI,EDI,c1
rol EDI,r1
imul EDI,EDI,c2
xor EDX,EDI
rol EDX,r2
lea EDX,dword ptr [EDX*4+EDX+n] // *5 + n
lea RAX,qword ptr [RAX+4]
dec ECX
jnz @loop
@remaining_bytes:
mov ECX,ESI
and ECX,$3
jz @finalization
xor RBX,RBX
dec ECX
mov BL,byte ptr [RAX+RCX]
jz @process_remaining
shl EBX,8
dec ECX
mov BL,byte ptr [RAX+RCX]
jz @process_remaining
shl EBX,8
mov BL,byte ptr [RAX]
@process_remaining:
imul EBX,EBX,c1
rol EBX,r1
imul EBX,EBX,c2
xor EDX,EBX
@finalization:
xor EDX,ESI
mov EAX,EDX
shr EDX,16
xor EDX,EAX
imul EDX,EDX,f1
mov EAX,EDX
shr EDX,13
xor EDX,EAX
imul EDX,EDX,f2
mov EAX,EDX
shr EDX,16
xor EAX,EDX
pop RSI
pop RDI
pop RBX
end;
{$ENDIF}
{$ENDREGION}
{$ENDIF}
The goto's are in place to simulate C's fallthough switch/case statement.
This kind of code is easier to write in asm, which has better register usage than the asm Delphi generates.
Why is your code slow
he only place I can see a problem is here:
Project42.dpr.54: while(len >= 4) do //a for loop is faster.
00420E17 83FE04 cmp esi,$04
00420E1A 7242 jb $00420e5e
//This translates into inefficient code
Project42.dpr.56: k := PLongWord(@S[data])^;
00420E1C 8B0424 mov eax,[esp]
00420E1F 8D4438FF lea eax,[eax+edi-$01]
00420E23 8B00 mov eax,[eax]
Three indirect memory references and the read is misaligned!!
See here for more info: Purpose of memory alignment never mind the accepted answer, see the second answer.
A read of a double word should always happen on a dword boundary.
The @pointer^
trick makes the compiler to add an extra level of indirection (and an extra round-trip to memory (Oops)).
Use {$pointermath on}
and address the pointer as an array.
integer != pointer
Also it's an error to use integer
to store a pointer.
It will break in X64. Use NativeUInt instead.
The equivalent code in my version translates to:
Project42.dpr.128: Data:= @HashData;
00420FCD 89442404 mov [esp+$04],eax
Project42.dpr.129: for i:= 0 to (Len shr 2) - 1 do begin
00420FD1 8B1C24 mov ebx,[esp]
00420FD4 C1EB02 shr ebx,$02
00420FD7 4B dec ebx
00420FD8 85DB test ebx,ebx
00420FDA 7C3E jl $0042101a
00420FDC 43 inc ebx
00420FDD 33D2 xor edx,edx
////------------------------------------------///
Project42.dpr.130: k:= Data[i];
00420FDF 8B442404 mov eax,[esp+$04]
00420FE3 8B0490 mov eax,[eax+edx*4]
Note that there are no less unneeded indirect memory references and reads are properly aligned.
Of course the asm version is slightly better, but the difference between the asm and pascal version should be less than between the two Pascal versions.
This is where I think your performance is wasted.
The misaligned reads eat up a lot of (wasted) cycles on X86.
On other processors the slowdown is even worse.
Other issues with the code
Limiting your implementation to strings is a bad thing to do.
What if I want to hash a record?
Do not force binary data into strings.
Strings are a bad fit for binary data.
Just use an untyped parameter and pointers.
Seed values
I don't think there is a correct seed value. The way I understood it the seed is there so you can chain calls to Murmurhash.
You use the output of the first hash as the seed for the second run.
That way you can feed 2 variables in and get the same output as if you processed those two variables in one go.
Let me know how they perform against your code.