From 6a0ccab6046ed58567e61ad8416e1c733608131c Mon Sep 17 00:00:00 2001
From: Eduard Urbach
Date: Sat, 22 Jul 2023 17:02:22 +0200
Subject: [PATCH] Implemented basic hashing

---
 .editorconfig      |  9 +++++++
 .gitignore         | 24 +++---------------
 Benchmarks_test.go | 23 +++++++++++++++++
 LICENSE            |  2 +-
 go.mod             |  3 +++
 hash.go            | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 hash_test.go       | 52 +++++++++++++++++++++++++++++++++++++++
 7 files changed, 153 insertions(+), 21 deletions(-)
 create mode 100644 .editorconfig
 create mode 100644 Benchmarks_test.go
 create mode 100644 go.mod
 create mode 100644 hash.go
 create mode 100644 hash_test.go

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..3c94c19
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,9 @@
+root = true
+
+[*]
+indent_style = tab
+indent_size = 4
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = false
diff --git a/.gitignore b/.gitignore
index 5cbdfa9..9ef8dad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,25 +1,9 @@
-# ---> Go.AllowList
-# Allowlisting gitignore template for GO projects prevents us
-# from adding various unwanted local files, such as generated
-# files, developer configurations or IDE-specific files etc.
-#
-# Recommended: Go.AllowList.gitignore
-
-# Ignore everything
 *
-
-# But not these files...
-!/.gitignore
-
-!*.go
+!*/
+!.gitignore
+!.editorconfig
 !go.sum
 !go.mod
-
 !README.md
 !LICENSE
-
-# !Makefile
-
-# ...even if they are in subdirectories
-!*/
-
+!*.go
\ No newline at end of file
diff --git a/Benchmarks_test.go b/Benchmarks_test.go
new file mode 100644
index 0000000..e6f67ab
--- /dev/null
+++ b/Benchmarks_test.go
@@ -0,0 +1,23 @@
+package hash_test
+
+import (
+	"testing"
+
+	"git.akyoto.dev/go/hash"
+)
+
+var data = []byte(`
+
+
+	Hash
+
+
+	Test
+
+`)
+
+func BenchmarkBytes(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		hash.Bytes(data)
+	}
+}
diff --git a/LICENSE b/LICENSE
index 45695be..40d5bcd 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 go
+Copyright (c) 2023 Eduard Urbach
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..595f0fe
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module git.akyoto.dev/go/hash
+
+go 1.20
diff --git a/hash.go b/hash.go
new file mode 100644
index 0000000..e20973a
--- /dev/null
+++ b/hash.go
@@ -0,0 +1,61 @@
+package hash
+
+import (
+	"encoding/binary"
+	"math/bits"
+)
+
+// Bytes hashes the given byte slice.
+//
+// The hash is fast and non-cryptographic. Words are always decoded as
+// little-endian, so the result does not depend on the host byte order
+// or on the alignment of the slice in memory.
+func Bytes(in []byte) uint64 {
+	return add(0, in)
+}
+
+// add mixes the bytes of in into the running hash x and returns the result.
+//
+// Every step adds the next chunk of input to x and then rotates x to the
+// left, so that the position of a chunk influences the final value.
+func add(x uint64, in []byte) uint64 {
+	// The slice is consumed below, so remember its full length first.
+	size := uint64(len(in))
+
+	// Cache lines on modern processors are 64 bytes long.
+	// A single uint64 consumes 64 bits (8 bytes).
+	// That means we should read 8 uint64 at a time.
+	for len(in) >= 64 {
+		// Fixed-size view of the current 64-byte block.
+		block := in[:64]
+
+		x = bits.RotateLeft64(x+binary.LittleEndian.Uint64(block[0:8]), 1)
+		x = bits.RotateLeft64(x+binary.LittleEndian.Uint64(block[8:16]), 1)
+		x = bits.RotateLeft64(x+binary.LittleEndian.Uint64(block[16:24]), 1)
+		x = bits.RotateLeft64(x+binary.LittleEndian.Uint64(block[24:32]), 1)
+		x = bits.RotateLeft64(x+binary.LittleEndian.Uint64(block[32:40]), 1)
+		x = bits.RotateLeft64(x+binary.LittleEndian.Uint64(block[40:48]), 1)
+		x = bits.RotateLeft64(x+binary.LittleEndian.Uint64(block[48:56]), 1)
+		x = bits.RotateLeft64(x+binary.LittleEndian.Uint64(block[56:64]), 1)
+
+		in = in[64:]
+	}
+
+	// While we have at least 8 bytes left, convert them to uint64.
+	for len(in) >= 8 {
+		x = bits.RotateLeft64(x+binary.LittleEndian.Uint64(in), 1)
+		in = in[8:]
+	}
+
+	// Hash the remaining bytes.
+	// At this point we know that there are fewer than 8 bytes left.
+	// Rotating by 8 bits per byte gives every byte of an input shorter
+	// than 8 bytes its own lane, so tiny buffers never collide.
+	for _, b := range in {
+		x = bits.RotateLeft64(x+uint64(b), 8)
+	}
+
+	// This helps to avoid clashes between different lengths
+	// of all-zero bytes by making the data length significant.
+	return x + size
+}
diff --git a/hash_test.go b/hash_test.go
new file mode 100644
index 0000000..9139644
--- /dev/null
+++ b/hash_test.go
@@ -0,0 +1,52 @@
+package hash_test
+
+import (
+	"bytes"
+	"testing"
+
+	"git.akyoto.dev/go/hash"
+)
+
+// TestTinyCollisions checks that distinct inputs shorter than 8 bytes
+// never produce the same hash.
+func TestTinyCollisions(t *testing.T) {
+	hashes := map[uint64][]byte{}
+
+	for size := 1; size < 8; size++ {
+		tmp := make([]byte, size)
+		index := 0
+
+		for i := 0; i < 10; i++ {
+			tmp[index]++
+			h := hash.Bytes(tmp)
+			previous, found := hashes[h]
+
+			if found && !bytes.Equal(tmp, previous) {
+				t.Fatalf("collision between %v and %v:\nhash %064b", previous, tmp, h)
+			}
+
+			// Store a copy: tmp is mutated in the next iteration, and an
+			// aliased entry would always compare equal to itself.
+			hashes[h] = bytes.Clone(tmp)
+			index = (index + 1) % size
+		}
+	}
+}
+
+// TestZeroedCollisions checks that all-zero inputs of different lengths
+// never produce the same hash.
+func TestZeroedCollisions(t *testing.T) {
+	hashes := map[uint64][]byte{}
+
+	for size := 1; size <= 8192; size++ {
+		tmp := make([]byte, size)
+		h := hash.Bytes(tmp)
+		previous, found := hashes[h]
+
+		if found && !bytes.Equal(tmp, previous) {
+			t.Fatalf("collision between zeroed sizes %d and %d:\nhash %064b", len(previous), size, h)
+		}
+
+		hashes[h] = tmp
+	}
+}