Improved algorithm: replace add-and-rotate steps with a shared multiplicative mix function

2023-07-23 00:46:39 +02:00
parent 6a0ccab604
commit 9f7388accc
3 changed files with 46 additions and 53 deletions
--- a/Benchmarks_test.go
+++ b/Benchmarks_test.go
@ -1,23 +1,31 @@
 package hash_test

 import (
+	"bytes"
 	"testing"

 	"git.akyoto.dev/go/hash"
 )

-var data = []byte(`<!doctype html>
-<html lang="en">
-	<head>
-		<title>Hash</title>
-	</head>
-	<body>
-		<main>Test</main>
-	</body>
-</html>`)
+func BenchmarkSize(b *testing.B) {
+	b.Run("8", bench(8))
+	b.Run("16", bench(16))
+	b.Run("32", bench(32))
+	b.Run("64", bench(64))
+	b.Run("128", bench(128))
+	b.Run("256", bench(256))
+	b.Run("512", bench(512))
+	b.Run("1024", bench(1024))
+	b.Run("2048", bench(2048))
+	b.Run("4096", bench(4096))
+}

-func BenchmarkBytes(b *testing.B) {
-	for i := 0; i < b.N; i++ {
-		hash.Bytes(data)
+func bench(n int) func(*testing.B) {
+	return func(b *testing.B) {
+		tmp := bytes.Repeat([]byte{'a'}, n)
+
+		for i := 0; i < b.N; i++ {
+			hash.Bytes(tmp)
+		}
 	}
 }
--- a/hash.go
+++ b/hash.go
@ -9,7 +9,6 @@ func Bytes(in []byte) uint64 {
 	return add(0, in)
 }

-// add implements the actual hashing.
 func add(x uint64, in []byte) uint64 {
 	var i int

@ -18,50 +17,34 @@ func add(x uint64, in []byte) uint64 {
 	// That means we should read 8 uint64 at a time.
 	for ; i < len(in)-63; i += 64 {
 		words := (*[8]uint64)(unsafe.Pointer(&in[i]))
-
-		x += words[0]
-		x = (x << 1) | (x >> (64 - 1))
-
-		x += words[1]
-		x = (x << 1) | (x >> (64 - 1))
-
-		x += words[2]
-		x = (x << 1) | (x >> (64 - 1))
-
-		x += words[3]
-		x = (x << 1) | (x >> (64 - 1))
-
-		x += words[4]
-		x = (x << 1) | (x >> (64 - 1))
-
-		x += words[5]
-		x = (x << 1) | (x >> (64 - 1))
-
-		x += words[6]
-		x = (x << 1) | (x >> (64 - 1))
-
-		x += words[7]
-		x = (x << 1) | (x >> (64 - 1))
+		x = mix(x, words[0])
+		x = mix(x, words[1])
+		x = mix(x, words[2])
+		x = mix(x, words[3])
+		x = mix(x, words[4])
+		x = mix(x, words[5])
+		x = mix(x, words[6])
+		x = mix(x, words[7])
 	}

 	// While we have at least 8 bytes left, convert them to uint64.
 	for ; i < len(in)-7; i += 8 {
-		x += *(*uint64)(unsafe.Pointer(&in[i]))
-		x = (x << 1) | (x >> (64 - 1))
+		word := *(*uint64)(unsafe.Pointer(&in[i]))
+		x = mix(x, word)
 	}

 	// Hash the remaining bytes.
-	// At this point we know that there are less than 8 bytes left,
-	// so we can shift each iteration by 8 bits to assure that hashes
-	// for tiny data buffers are always unique.
 	for ; i < len(in); i++ {
-		x += uint64(in[i])
-		x = (x << 8) | (x >> (64 - 8))
+		x = mix(x, uint64(in[i]))
 	}

 	// This helps to avoid clashes between different lengths
 	// of all-zero bytes by making the data length significant.
-	x += uint64(len(in))
+	x = mix(x, uint64(len(in)))

 	return x
 }
+
+func mix(x uint64, b uint64) uint64 {
+	return (x + b) * 0x50003
+}
--- a/hash_test.go
+++ b/hash_test.go
@ -7,14 +7,14 @@ import (
 	"git.akyoto.dev/go/hash"
 )

-func TestTinyCollisions(t *testing.T) {
-	hashes := map[uint64][]byte{}
+var hashes = map[uint64][]byte{}

+func TestTinyCollisions(t *testing.T) {
 	for size := 1; size < 8; size++ {
 		tmp := make([]byte, size)
 		index := 0

-		for i := 0; i < 10; i++ {
+		for i := 0; i < 256; i++ {
 			tmp[index] += 1
 			h := hash.Bytes(tmp)
 			previous, found := hashes[h]
@ -23,22 +23,24 @@ func TestTinyCollisions(t *testing.T) {
 				t.Fatalf("collision between %v and %v:\nhash %064b", previous, tmp, h)
 			}

-			hashes[h] = tmp
+			save := make([]byte, size)
+			copy(save, tmp)
+			hashes[h] = save
 			index = (index + 1) % size
 		}
 	}
 }

 func TestZeroedCollisions(t *testing.T) {
-	hashes := map[uint64][]byte{}
+	zero := make([]byte, 8192)

-	for size := 1; size <= 8192; size++ {
-		tmp := make([]byte, size)
+	for size := 1; size <= len(zero); size++ {
+		tmp := zero[:size]
 		h := hash.Bytes(tmp)
 		previous, found := hashes[h]

 		if found && !bytes.Equal(tmp, previous) {
-			t.Fatalf("collision between zeroed sizes %d and %d:\nhash %064b", len(previous), size, h)
+			t.Fatalf("collision between %v and %v:\nhash %064b", previous, tmp, h)
 		}

 		hashes[h] = tmp