Improved algorithm

This commit is contained in:
Eduard Urbach 2023-07-23 00:46:39 +02:00
parent 6a0ccab604
commit 9f7388accc
Signed by: akyoto
GPG Key ID: C874F672B1AF20C0
3 changed files with 46 additions and 53 deletions

View File

@ -1,23 +1,31 @@
package hash_test
import (
"bytes"
"testing"
"git.akyoto.dev/go/hash"
)
var data = []byte(`<!doctype html>
<html lang="en">
<head>
<title>Hash</title>
</head>
<body>
<main>Test</main>
</body>
</html>`)
// BenchmarkSize runs one sub-benchmark per size, passing each power of two
// from 8 up to 4096 to bench. Every sub-benchmark is named after the decimal
// value of its size.
func BenchmarkSize(b *testing.B) {
	cases := []struct {
		name string
		size int
	}{
		{"8", 8},
		{"16", 16},
		{"32", 32},
		{"64", 64},
		{"128", 128},
		{"256", 256},
		{"512", 512},
		{"1024", 1024},
		{"2048", 2048},
		{"4096", 4096},
	}

	// Sub-benchmarks are registered in ascending size order.
	for _, c := range cases {
		b.Run(c.name, bench(c.size))
	}
}
// bench returns a benchmark function that builds an n-byte buffer filled
// with 'a' and passes it to hash.Bytes inside the b.N loop.
//
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are interleaved without +/- markers. The `func BenchmarkBytes`
// line and the `hash.Bytes(data)` call look like removed lines of the previous
// benchmark rather than part of bench — confirm against the repository.
func bench(n int) func(*testing.B) {
return func(b *testing.B) {
tmp := bytes.Repeat([]byte{'a'}, n)
func BenchmarkBytes(b *testing.B) {
for i := 0; i < b.N; i++ {
hash.Bytes(data)
hash.Bytes(tmp)
}
}
}

49
hash.go
View File

@ -9,7 +9,6 @@ func Bytes(in []byte) uint64 {
return add(0, in)
}
// add implements the actual hashing.
func add(x uint64, in []byte) uint64 {
var i int
@ -18,50 +17,34 @@ func add(x uint64, in []byte) uint64 {
// That means we should read 8 uint64 at a time.
for ; i < len(in)-63; i += 64 {
words := (*[8]uint64)(unsafe.Pointer(&in[i]))
x += words[0]
x = (x << 1) | (x >> (64 - 1))
x += words[1]
x = (x << 1) | (x >> (64 - 1))
x += words[2]
x = (x << 1) | (x >> (64 - 1))
x += words[3]
x = (x << 1) | (x >> (64 - 1))
x += words[4]
x = (x << 1) | (x >> (64 - 1))
x += words[5]
x = (x << 1) | (x >> (64 - 1))
x += words[6]
x = (x << 1) | (x >> (64 - 1))
x += words[7]
x = (x << 1) | (x >> (64 - 1))
x = mix(x, words[0])
x = mix(x, words[1])
x = mix(x, words[2])
x = mix(x, words[3])
x = mix(x, words[4])
x = mix(x, words[5])
x = mix(x, words[6])
x = mix(x, words[7])
}
// While we have at least 8 bytes left, convert them to uint64.
for ; i < len(in)-7; i += 8 {
x += *(*uint64)(unsafe.Pointer(&in[i]))
x = (x << 1) | (x >> (64 - 1))
word := *(*uint64)(unsafe.Pointer(&in[i]))
x = mix(x, word)
}
// Hash the remaining bytes.
// At this point we know that there are less than 8 bytes left,
// so we can shift each iteration by 8 bits to assure that hashes
// for tiny data buffers are always unique.
for ; i < len(in); i++ {
x += uint64(in[i])
x = (x << 8) | (x >> (64 - 8))
x = mix(x, uint64(in[i]))
}
// This helps to avoid clashes between different lengths
// of all-zero bytes by making the data length significant.
x += uint64(len(in))
x = mix(x, uint64(len(in)))
return x
}
// mix folds the word b into the running hash x: the two values are added and
// the sum is multiplied by the constant 0x50003. Both steps are ordinary
// uint64 arithmetic, so the result wraps around modulo 2^64.
func mix(x uint64, b uint64) uint64 {
	const factor = 0x50003

	sum := x + b
	return sum * factor
}

View File

@ -7,14 +7,14 @@ import (
"git.akyoto.dev/go/hash"
)
func TestTinyCollisions(t *testing.T) {
hashes := map[uint64][]byte{}
var hashes = map[uint64][]byte{}
func TestTinyCollisions(t *testing.T) {
for size := 1; size < 8; size++ {
tmp := make([]byte, size)
index := 0
for i := 0; i < 10; i++ {
for i := 0; i < 256; i++ {
tmp[index] += 1
h := hash.Bytes(tmp)
previous, found := hashes[h]
@ -23,22 +23,24 @@ func TestTinyCollisions(t *testing.T) {
t.Fatalf("collision between %v and %v:\nhash %064b", previous, tmp, h)
}
hashes[h] = tmp
save := make([]byte, size)
copy(save, tmp)
hashes[h] = save
index = (index + 1) % size
}
}
}
func TestZeroedCollisions(t *testing.T) {
hashes := map[uint64][]byte{}
zero := make([]byte, 8192)
for size := 1; size <= 8192; size++ {
tmp := make([]byte, size)
for size := 1; size <= len(zero); size++ {
tmp := zero[:size]
h := hash.Bytes(tmp)
previous, found := hashes[h]
if found && !bytes.Equal(tmp, previous) {
t.Fatalf("collision between zeroed sizes %d and %d:\nhash %064b", len(previous), size, h)
t.Fatalf("collision between %v and %v:\nhash %064b", previous, tmp, h)
}
hashes[h] = tmp