pax_global_header 0000666 0000000 0000000 00000000064 14566140541 0014520 g ustar 00root root 0000000 0000000 52 comment=651f09dad397dda1f433276133cdf61cdaf8bf2a
golang-github-kalafut-imohash-1.0.3/ 0000775 0000000 0000000 00000000000 14566140541 0017323 5 ustar 00root root 0000000 0000000 golang-github-kalafut-imohash-1.0.3/.gitignore 0000664 0000000 0000000 00000000434 14566140541 0021314 0 ustar 00root root 0000000 0000000 # Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
cmd/imosum/imosum
golang-github-kalafut-imohash-1.0.3/LICENSE 0000664 0000000 0000000 00000002067 14566140541 0020335 0 ustar 00root root 0000000 0000000 The MIT License (MIT)
Copyright (c) 2015 Jim Kalafut
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
golang-github-kalafut-imohash-1.0.3/README.md 0000664 0000000 0000000 00000007274 14566140541 0020614 0 ustar 00root root 0000000 0000000 [](https://pkg.go.dev/github.com/kalafut/imohash)
# imohash
imohash is a fast, constant-time hashing library for Go. It uses file size and
sampling to calculate hashes quickly, regardless of file size.
[imosum](https://github.com/kalafut/imohash/blob/master/cmd/imosum/main.go) is
a sample application to hash files from the command line, similar to md5sum.
### Alternative implementations
* **Java**: https://github.com/dynatrace-oss/hash4j
* **Python**: https://github.com/kalafut/py-imohash
* **Rust**: https://github.com/hiql/imohash
## Installation
`go get github.com/kalafut/imohash/...`
The API is described in the [package documentation](https://pkg.go.dev/github.com/kalafut/imohash).
## Uses
Because imohash only reads a small portion of a file's data, it is very fast and
well suited to file synchronization and deduplication, especially over a fairly
slow network. A need to manage media (photos and video) over Wi-Fi between a NAS
and multiple family computers is how the library was born.
If you just need to check whether two files are the same, and understand the
limitations that sampling imposes (see below), imohash may be a good fit.
## Misuses
Because imohash only reads a small portion of a file's data, it is not suitable
for:
- file verification or integrity monitoring
- cases where fixed-size files are manipulated
- anything cryptographic
## Design
(Note: a more precise description is provided in the
[algorithm description](https://github.com/kalafut/imohash/blob/master/algorithm.md).)
imohash works by hashing small chunks of data from the beginning, middle and
end of a file. It also incorporates the file size into the final 128-bit hash.
This approach is based on a few assumptions which will vary by application.
First, file size alone *tends*<sup>1</sup> to be a pretty good differentiator, especially
as file size increases. And when people do things to files (such as editing
photos), size tends to change. So size is used directly in the hash, and **any
files that have different sizes will have different hashes**.
Size is an effective differentiator but isn't sufficient. It can show that two
files aren't the same, but to increase confidence that like-size files are the
same, a few segments are hashed using
[murmur3](https://en.wikipedia.org/wiki/MurmurHash), a fast and effective
hashing algorithm. By default, 16K chunks from the beginning, middle and end of
the file are used. The ends of files often contain metadata which is more prone
to changing without affecting file size. The middle is for good measure. The
sample size can be changed for your application.
<sup>1</sup> Try `du -a . | sort -nr | less` on a sample of your files to check this assertion.
### Small file exemption
Small files are more likely to collide on size than large ones. They're also
probably more likely to change in subtle ways that sampling will miss (e.g.
editing a large text file). For this reason, imohash will simply hash the entire
file if it is less than 128K. This parameter is also configurable.
## Performance
The standard hash performance metrics make no sense for imohash since it's only
reading a limited set of the data. That said, the real-world performance is
very good. If you are working with large files and/or a slow network,
expect huge speedups. (**spoiler**: reading 48K is quicker than reading 500MB.)
## Name
Inspired by [ILS marker beacons](https://en.wikipedia.org/wiki/Marker_beacon).
## Credits
* The "sparseFingerprints" used in [TMSU](https://github.com/oniony/TMSU) gave me
some confidence in this approach to hashing.
* The [twmb/murmur3](https://github.com/twmb/murmur3) library that does
all of the heavy lifting.
golang-github-kalafut-imohash-1.0.3/algorithm.md 0000664 0000000 0000000 00000007412 14566140541 0021637 0 ustar 00root root 0000000 0000000 ## Introduction
imohash is a file hashing algorithm optimized for large files. It uses
file size and sampling in hash generation. Because it does not process
the whole file, it is not a general purpose hashing algorithm. But for
applications where a hash sample is sufficient, imohash will provide a
high performance hashing, especially for large files over slow
networks.
## Algorithm
imohash generates a 128-bit hash from a fixed length message or file.
This is done in two phases:
1. hash calculation
2. size injection
### Parameters and mode
imohash takes two parameters, as well as the message length:
* sample size (s)
* sampling threshold (t)
* message length (L)
There are two modes of operation: **sampled** and **full**. Mode is
determined as follows:
```
if (s > 0) && (t > 0) && (L >= t) && (t > 2s)
mode = sampled
else
mode = full
```
### Hash calculation
The core hashing routine uses [MurmurHash3](https://code.google.com/p/smhasher/wiki/MurmurHash3) in a 128-bit configuration.
Hashing in *Full* mode is identical to passing the entire
message to MurmurHash3. *Sampled* mode constructs a new message using
three samples from the original:
Message M of length L is an array of bytes, M[0]...M[L-1]. In
full mode, h'=Murmur3(M). In sampled mode, samples are selected and concatenated as follows:
```
middle = floor(L/2)
S0 = M[0:s-1] // samples are s bytes long; ranges are inclusive
S1 = M[middle:middle+s-1]
S2 = M[L-s:L-1]
h' = Murmur3(concat(S0, S1, S2))
```
### Size injection
Size is inserted into the hash directly. This means that two files
that differ in size are guaranteed to have different hashes.
The message size is converted to a variable-length integer (varint)
using base-128 encoding. Consult [Google Protobuf documentation](https://developers.google.com/protocol-buffers/docs/encoding#varints) for more
information on the technique.
The result of encoding will be an array **v** of 1 or more bytes. This
array will replace the first len(v) bytes of h'.
```
h = concat(v, h'[len(v):])
```
h is the final imosum hash.
## Default parameters
The default imohash parameters are:
s = 16384
t = 131072
t was chosen to delay sampling until file size was outside the range
of "small" files, such as text files that might be hand-edited and
escape both size changes and being detected by sampling. s was chosen to
provide a large enough sample to distinguish files of like size, but
still small enough to provide high performance.
An application should adjust these values as necessary.
## Test Vectors
(Note: these have not been independently verified using another implementation.)
To avoid offset errors in testing, the test messages need to not repeat
trivially. To this end, MD5 is used to generate pseudorandom test data, 16 bytes at a time,
by repeatedly updating the hash with 'A'. M(n) shall be a test data n bytes long:
```
M(n):
msg = []
while len(msg) < n:
Md5.Write('A')
msg = msg + Md5.Sum()
return msg[0:n]
// M(16) == 7fc56270e7a70fa81a5935b72eacbe29
// M(1000000) == ... 197c74f51423765786516442fd1c9832
```
Test vectors for imohash of M length n using sample size s and sample
threshold t.
```
s t M(n) I
{16384, 131072, 0, "00000000000000000000000000000000"},
{16384, 131072, 1, "01659e2ec0f3c75bf39e43a41adb5d4f"},
{16384, 131072, 127, "7f47671cc79d4374404b807249f3166e"},
{16384, 131072, 128, "800183e5dbea2e5199ef7c8ea963a463"},
{16384, 131072, 4095, "ff1f770d90d3773949d89880efa17e60"},
{16384, 131072, 4096, "802048c26d66de432dbfc71afca6705d"},
{16384, 131072, 131072, "8080085a3d3af2cb4b3a957811cdf370"},
{16384, 131073, 131072, "808008282d3f3b53e1fd132cc51fcc1d"},
{16384, 131072, 500000, "a0c21e44a0ba3bddee802a9d1c5332ca"},
{50, 131072, 300000, "e0a712edd8815c606344aed13c44adcf"},
```
golang-github-kalafut-imohash-1.0.3/cmd/ 0000775 0000000 0000000 00000000000 14566140541 0020066 5 ustar 00root root 0000000 0000000 golang-github-kalafut-imohash-1.0.3/cmd/imosum/ 0000775 0000000 0000000 00000000000 14566140541 0021377 5 ustar 00root root 0000000 0000000 golang-github-kalafut-imohash-1.0.3/cmd/imosum/main.go 0000664 0000000 0000000 00000000752 14566140541 0022656 0 ustar 00root root 0000000 0000000 // imosum is a sample application using imohash. It will calculate and report
// file hashes in a format similar to md5sum, etc.
package main
import (
"flag"
"fmt"
"log"
"os"
"github.com/kalafut/imohash"
)
// main hashes every file named on the command line with the default imohash
// parameters and prints one "<hash> <name>" line per file.
//
// With no file arguments it prints a short usage line and exits with status 0.
// The first file that cannot be hashed aborts the program through log.Fatal.
func main() {
	flag.Parse()

	if flag.NArg() == 0 {
		fmt.Println("imosum filenames")
		os.Exit(0)
	}

	for _, name := range flag.Args() {
		sum, err := imohash.SumFile(name)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%016x %s\n", sum, name)
	}
}
golang-github-kalafut-imohash-1.0.3/go.mod 0000664 0000000 0000000 00000000123 14566140541 0020425 0 ustar 00root root 0000000 0000000 module github.com/kalafut/imohash
go 1.11
require github.com/twmb/murmur3 v1.1.5
golang-github-kalafut-imohash-1.0.3/go.sum 0000664 0000000 0000000 00000000245 14566140541 0020457 0 ustar 00root root 0000000 0000000 github.com/twmb/murmur3 v1.1.5 h1:i9OLS9fkuLzBXjt6dptlAEyk58fJsSTXbRg3SgVyqgk=
github.com/twmb/murmur3 v1.1.5/go.mod h1:Qq/R7NUyOfr65zD+6Q5IHKsJLwP7exErjN6lyyq3OSQ=
golang-github-kalafut-imohash-1.0.3/imohash.go 0000664 0000000 0000000 00000006046 14566140541 0021310 0 ustar 00root root 0000000 0000000 // Package imohash implements a fast, constant-time hash for files. It is based atop
// murmurhash3 and uses file size and sample data to construct the hash.
//
// For more information, including important caveats on usage, consult https://github.com/kalafut/imohash.
package imohash
import (
"bytes"
"encoding/binary"
"io"
"os"
"github.com/twmb/murmur3"
)
// Digest length and default sampling parameters.
const (
	// Size is the length, in bytes, of an imohash digest.
	Size = 16

	// SampleThreshold is the default sampling threshold. Files smaller than
	// this will be hashed in their entirety.
	SampleThreshold = 128 * 1024

	// SampleSize is the default number of bytes read for each sample.
	SampleSize = 16 * 1024
)

// emptyArray is the all-zero digest returned together with an error.
var emptyArray [Size]byte
// ImoHash bundles the hasher and the sampling parameters used by the Sum and
// SumFile methods. Values are created with New or NewCustom.
type ImoHash struct {
// hasher is the 128-bit murmur3 hash; hashCore resets it before every use.
hasher murmur3.Hash128
// sampleSize is the number of bytes read for each sample; a value below 1
// disables sampling.
sampleSize int
// sampleThreshold is the size in bytes below which input is hashed in full.
sampleThreshold int
// bytesAdded is not referenced by any other code visible in this file.
bytesAdded int
}
// New returns a new ImoHash configured with the package defaults,
// SampleSize and SampleThreshold.
func New() ImoHash {
	imo := NewCustom(SampleSize, SampleThreshold)
	return imo
}
// NewCustom returns a new ImoHash that uses the given sample size and
// sample threshold. If sampleSize < 1 no sampling takes place and the
// entire input is hashed.
func NewCustom(sampleSize, sampleThreshold int) ImoHash {
	return ImoHash{
		sampleSize:      sampleSize,
		sampleThreshold: sampleThreshold,
		hasher:          murmur3.New128(),
	}
}
// SumFile hashes the named file with the default sample parameters.
// It is shorthand for calling SumFile on the value returned by New.
func SumFile(filename string) ([Size]byte, error) {
	defaults := New()
	sum, err := defaults.SumFile(filename)
	return sum, err
}
// Sum hashes a byte slice with the default sample parameters.
// It is shorthand for calling Sum on the value returned by New.
func Sum(data []byte) [Size]byte {
	defaults := New()
	sum := defaults.Sum(data)
	return sum
}
// Sum hashes a byte slice using the ImoHash parameters.
//
// The slice is wrapped in a SectionReader that spans all of data and is then
// handed to hashCore. Sum panics if hashCore reports an error.
func (imo *ImoHash) Sum(data []byte) [Size]byte {
	src := bytes.NewReader(data)
	section := io.NewSectionReader(src, 0, int64(len(data)))

	digest, err := imo.hashCore(section)
	if err != nil {
		panic(err)
	}
	return digest
}
// SumFile hashes a file using the ImoHash parameters.
//
// The file is opened read-only, its size is taken from Stat, and a
// SectionReader covering the whole file is handed to hashCore. If the file
// cannot be opened or stat'ed, the zero digest and the error are returned.
func (imo *ImoHash) SumFile(filename string) ([Size]byte, error) {
	f, err := os.Open(filename)
	if err != nil {
		return emptyArray, err
	}
	// Register the Close only once the open is known to have succeeded, so it
	// is never invoked on a nil *os.File.
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		return emptyArray, err
	}

	sr := io.NewSectionReader(f, 0, fi.Size())
	return imo.hashCore(sr)
}
// hashCore hashes a SectionReader using the ImoHash parameters.
//
// The hasher is reset first. When the section is smaller than sampleThreshold,
// or sampleSize is below 1, the whole section is streamed into the hasher.
// Otherwise three reads of sampleSize bytes are hashed: one at the current
// (initial) position, one at Size()/2, and one starting sampleSize bytes
// before the end.
//
// Finally the section size is written as a uvarint over the leading bytes of
// the murmur3 digest. Any read or seek error yields the zero digest together
// with that error.
func (imo *ImoHash) hashCore(f *io.SectionReader) ([Size]byte, error) {
	imo.hasher.Reset()

	size := f.Size()
	sampled := imo.sampleSize >= 1 && size >= int64(imo.sampleThreshold)

	if !sampled {
		if _, err := io.Copy(imo.hasher, f); err != nil {
			return emptyArray, err
		}
	} else {
		chunk := make([]byte, imo.sampleSize)

		// readSample performs one Read at the current position and feeds the
		// buffer to the hasher.
		// NOTE: the whole buffer is written regardless of the byte count Read
		// reports; this is kept as-is so hash values stay unchanged.
		readSample := func() error {
			if _, err := f.Read(chunk); err != nil {
				return err
			}
			imo.hasher.Write(chunk) // these Writes never fail
			return nil
		}

		// Leading sample.
		if err := readSample(); err != nil {
			return emptyArray, err
		}

		// Middle sample.
		if _, err := f.Seek(size/2, io.SeekStart); err != nil {
			return emptyArray, err
		}
		if err := readSample(); err != nil {
			return emptyArray, err
		}

		// Trailing sample.
		if _, err := f.Seek(int64(-imo.sampleSize), io.SeekEnd); err != nil {
			return emptyArray, err
		}
		if err := readSample(); err != nil {
			return emptyArray, err
		}
	}

	digest := imo.hasher.Sum(nil)
	// Size injection: the uvarint-encoded size overwrites the first bytes.
	binary.PutUvarint(digest, uint64(size))

	var out [Size]byte
	copy(out[:], digest)
	return out, nil
}
golang-github-kalafut-imohash-1.0.3/imohash_test.go 0000664 0000000 0000000 00000010074 14566140541 0022343 0 ustar 00root root 0000000 0000000 package imohash
import (
"bytes"
"flag"
"fmt"
"os"
"path/filepath"
"reflect"
"runtime"
"testing"
)
// tempDir is the scratch directory that TestMain creates before the tests run
// and removes afterwards.
var tempDir string

// TestMain creates a temporary directory for test files, runs the tests,
// removes the directory again and exits with the test result code. If the
// directory cannot be created the run is aborted with exit status 1 instead of
// silently continuing with an empty tempDir.
func TestMain(m *testing.M) {
	flag.Parse()

	// Make a temp area for test files
	dir, err := os.MkdirTemp(os.TempDir(), "imohash_test_data")
	if err != nil {
		fmt.Fprintf(os.Stderr, "imohash: creating temp dir: %v\n", err)
		os.Exit(1)
	}
	tempDir = dir

	ret := m.Run()

	// os.Exit skips deferred calls, so clean up explicitly beforehand.
	os.RemoveAll(tempDir)
	os.Exit(ret)
}
func TestCustom(t *testing.T) {
const sampleFile = "sample"
var hash [Size]byte
var err error
sampleSize := 3
sampleThreshold := 45
imo := NewCustom(sampleSize, sampleThreshold)
// empty file
os.WriteFile(sampleFile, []byte{}, 0666)
hash, err = imo.SumFile(sampleFile)
ok(t, err)
equal(t, hash, [Size]byte{})
// small file
os.WriteFile(sampleFile, []byte("hello"), 0666)
hash, err = imo.SumFile(sampleFile)
ok(t, err)
hashStr := fmt.Sprintf("%x", hash)
equal(t, hashStr, "05d8a7b341bd9b025b1e906a48ae1d19")
/* boundary tests using the custom sample size */
size := sampleThreshold
// test that changing the gaps between sample zones does not affect the hash
data := bytes.Repeat([]byte{'A'}, size)
os.WriteFile(sampleFile, data, 0666)
h1, _ := imo.SumFile(sampleFile)
data[sampleSize] = 'B'
data[size-sampleSize-1] = 'B'
os.WriteFile(sampleFile, data, 0666)
h2, _ := imo.SumFile(sampleFile)
equal(t, h1, h2)
// test that changing a byte on the edge (but within) a sample zone
// does change the hash
data = bytes.Repeat([]byte{'A'}, size)
data[sampleSize-1] = 'B'
os.WriteFile(sampleFile, data, 0666)
h3, _ := imo.SumFile(sampleFile)
notEqual(t, h1, h3)
data = bytes.Repeat([]byte{'A'}, size)
data[size/2] = 'B'
os.WriteFile(sampleFile, data, 0666)
h4, _ := imo.SumFile(sampleFile)
notEqual(t, h1, h4)
notEqual(t, h3, h4)
data = bytes.Repeat([]byte{'A'}, size)
data[size/2+sampleSize-1] = 'B'
os.WriteFile(sampleFile, data, 0666)
h5, _ := imo.SumFile(sampleFile)
notEqual(t, h1, h5)
notEqual(t, h3, h5)
notEqual(t, h4, h5)
data = bytes.Repeat([]byte{'A'}, size)
data[size-sampleSize] = 'B'
os.WriteFile(sampleFile, data, 0666)
h6, _ := imo.SumFile(sampleFile)
notEqual(t, h1, h6)
notEqual(t, h3, h6)
notEqual(t, h4, h6)
notEqual(t, h5, h6)
// test that changing the size changes the hash
data = bytes.Repeat([]byte{'A'}, size+1)
os.WriteFile(sampleFile, data, 0666)
h7, _ := imo.SumFile(sampleFile)
notEqual(t, h1, h7)
notEqual(t, h3, h7)
notEqual(t, h4, h7)
notEqual(t, h5, h7)
notEqual(t, h6, h7)
// test sampleSize < 1
imo = NewCustom(0, size)
data = bytes.Repeat([]byte{'A'}, size)
os.WriteFile(sampleFile, data, 0666)
hash, _ = imo.SumFile(sampleFile)
hashStr = fmt.Sprintf("%x", hash)
equal(t, hashStr, "2d9123b54d37e9b8f94ab37a7eca6f40")
os.Remove(sampleFile)
}
// Test that the top level functions are the same as custom
// functions using the spec defaults.
func TestDefault(t *testing.T) {
const sampleFile = "sample"
var h1, h2 [Size]byte
var testData []byte
for _, size := range []int{100, 131071, 131072, 50000} {
imo := NewCustom(16384, 131072)
testData = M(size)
equal(t, Sum(testData), imo.Sum(testData))
os.WriteFile(sampleFile, []byte{}, 0666)
h1, _ = SumFile(sampleFile)
h2, _ = imo.SumFile(sampleFile)
equal(t, h1, h2)
}
os.Remove(sampleFile)
}
// Testing helpers from: https://github.com/benbjohnson/testing

// ok fails the test if err is not nil. The caller's file name and line number
// are printed together with the error text before tb.FailNow is invoked.
func ok(tb testing.TB, err error) {
	if err == nil {
		return
	}

	_, path, lineNo, _ := runtime.Caller(1)
	fmt.Printf("\033[31m%s:%d: unexpected error: %s\033[39m\n\n", filepath.Base(path), lineNo, err.Error())
	tb.FailNow()
}
// equal fails the test if exp is not deeply equal to act. On a mismatch both
// values are printed with the caller's file name and line number before
// tb.FailNow is invoked.
func equal(tb testing.TB, exp, act interface{}) {
	if reflect.DeepEqual(exp, act) {
		return
	}

	_, path, lineNo, _ := runtime.Caller(1)
	fmt.Printf("\033[31m%s:%d:\n\n\texp: %#v\n\n\tgot: %#v\033[39m\n\n", filepath.Base(path), lineNo, exp, act)
	tb.FailNow()
}
// notEqual fails the test if exp is deeply equal to act. On a match the value
// is printed with the caller's file name and line number before tb.FailNow is
// invoked.
func notEqual(tb testing.TB, exp, act interface{}) {
	if !reflect.DeepEqual(exp, act) {
		return
	}

	_, path, lineNo, _ := runtime.Caller(1)
	fmt.Printf("\033[31m%s:%d:\n\n\texpected mismatch, got matching\n\n\tgot: %#v\033[39m\n\n", filepath.Base(path), lineNo, act)
	tb.FailNow()
}
golang-github-kalafut-imohash-1.0.3/spec_test.go 0000664 0000000 0000000 00000002304 14566140541 0021642 0 ustar 00root root 0000000 0000000 package imohash
import (
"crypto/md5"
"fmt"
"testing"
)
// TestSpec checks the hashes of the M(n) test messages against a table of
// expected values. Each row gives the sample size s, the sample threshold t,
// the message length n and the expected hash as a hex string.
func TestSpec(t *testing.T) {
	vectors := []struct {
		s    int
		t    int
		n    int
		hash string
	}{
		{16384, 131072, 0, "00000000000000000000000000000000"},
		{16384, 131072, 1, "01659e2ec0f3c75bf39e43a41adb5d4f"},
		{16384, 131072, 127, "7f47671cc79d4374404b807249f3166e"},
		{16384, 131072, 128, "800183e5dbea2e5199ef7c8ea963a463"},
		{16384, 131072, 4095, "ff1f770d90d3773949d89880efa17e60"},
		{16384, 131072, 4096, "802048c26d66de432dbfc71afca6705d"},
		{16384, 131072, 131072, "8080085a3d3af2cb4b3a957811cdf370"},
		{16384, 131073, 131072, "808008282d3f3b53e1fd132cc51fcc1d"},
		{16384, 131072, 500000, "a0c21e44a0ba3bddee802a9d1c5332ca"},
		{50, 131072, 300000, "e0a712edd8815c606344aed13c44adcf"},
	}

	for _, v := range vectors {
		imo := NewCustom(v.s, v.t)
		sum := imo.Sum(M(v.n))
		equal(t, fmt.Sprintf("%x", sum), v.hash)
	}
}
// M generates n bytes of pseudo-random data according to the
// method described in the imohash algorithm description.
//
// A single MD5 hasher is fed the byte 'A' once per round and its running
// 16-byte digest is appended to the output until at least n bytes exist; the
// result is then truncated to exactly n bytes.
func M(n int) []byte {
	digest := md5.New()
	out := make([]byte, 0, n)

	for len(out) < n {
		digest.Write([]byte{'A'})
		// Sum appends the current digest to out without resetting the state.
		out = digest.Sum(out)
	}

	return out[:n]
}