simhash-0.0.20161225/0000755000175000017500000000000013027763630012771 5ustar pizzapizzasimhash-0.0.20161225/crc.h0000644000175000017500000000032213027763630013706 0ustar pizzapizza/* * Copyright (c) 2005-2007 Bart Massey * ALL RIGHTS RESERVED * Please see the file COPYING in this directory for license information. */ extern uint32_t hash_crc32(unsigned char *buf, int i0, int nbuf); simhash-0.0.20161225/COPYING0000644000175000017500000000320013027763630014017 0ustar pizzapizzaCopyright © 2005-2009 Bart Massey ALL RIGHTS RESERVED [This program is licensed under the "3-clause ('new') BSD License"] Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder, nor the names of other affiliated organizations, nor the names of other contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. simhash-0.0.20161225/simhash.c0000644000175000017500000002707013027763630014577 0ustar pizzapizza/* * Copyright © 2005-2009 Bart Massey * ALL RIGHTS RESERVED * [This program is licensed under the "3-clause ('new') BSD License"] * Please see the file COPYING in the source * distribution of this software for license terms. */ /* * Generate and compare simple shingleprints * Bart Massey 2005/03 */ /* Bibliography * * Mark Manasse * Microsoft Research Silicon Valley * Finding similar things quickly in large collections * http://research.microsoft.com/research/sv/PageTurner/similarity.htm * * Andrei Z. Broder * On the resemblance and containment of documents * In Compression and Complexity of Sequences (SEQUENCES'97), * pages 21-29. IEEE Computer Society, 1998 * ftp://ftp.digital.com/pub/DEC/SRC/publications/broder/ * positano-final-wpnums.pdf * * Andrei Z. Broder * Some applications of Rabin's fingerprinting method * Published in R. Capocelli, A. De Santis, U. Vaccaro eds. * Sequences II: Methods in Communications, Security, and * Computer Science, Springer-Verlag, 1993. * http://athos.rutgers.edu/~muthu/broder.ps */ #include #include #include #include #include #include #include #include #include "crc.h" #include "heap.h" #include "hash.h" #include #define _GNU_SOURCE #include /* size of a shingle in bytes. should be at least 4 to make CRC work */ int nshingle = 8; int nfeature = 128; /* were the defaults changed? */ int pset = 0; /* do a debugging trace? 
*/ int debug_trace = 0; static struct option long_options[] = { {"write-hashfile", 0, 0, 'w'}, {"match-files", 0, 0, 'm'}, {"compare-hashfile", 0, 0, 'c'}, {"shingle-size", 1, 0, 's'}, {"feature-set-size", 1, 0, 'f'}, {"debug-trace", 1, 0, 'd'}, {0,0,0,0} }; /* HASH FILE VERSION */ #define FILE_VERSION 0xcb01 /* SUFFIX for hash outputs */ #define SUFFIX ".sim" /* if crc is less than top of heap, extract top-of-heap, then insert crc. don't worry about sign bits---doesn't matter here. */ static void crc_insert(uint32_t crc) { if (debug_trace) fprintf(stderr, ">got %x\n", crc); if(nheap == nfeature && crc >= heap[0]) return; if (hash_contains(crc)) { if (debug_trace) fprintf(stderr, ">dup\n"); return; } if(nheap == nfeature) { uint32_t m = heap_extract_max(); assert(hash_delete(m)); if (debug_trace) fprintf(stderr, ">pop %x\n", m); } if (debug_trace) fprintf(stderr, ">push\n"); hash_insert(crc); heap_insert(crc); } /* return true if the file had enough bytes for at least a single shingle. */ static int running_crc(FILE *f) { int i; static unsigned char *buf = 0; if (buf == 0) { buf = (unsigned char *) malloc(nshingle); assert(buf); } for (i = 0; i < nshingle; i++) { int ch = fgetc(f); if (ch == EOF) { fclose(f); return 0; } buf[i] = (unsigned char) ch; } i = 0; while(1) { int ch; crc_insert(hash_crc32(buf, i, nshingle)); ch = fgetc(f); if (ch == EOF) { fclose(f); return 1; } buf[i] = (unsigned char) ch; i = (i + 1) % nshingle; } assert(0); /*NOTREACHED*/ } typedef struct hashinfo { uint16_t nshingle; uint16_t nfeature; uint32_t *feature; } hashinfo; static void free_hashinfo(hashinfo *hi) { free(hi->feature); free(hi); } static hashinfo * get_hashinfo() { hashinfo *hi = malloc(sizeof *hi); uint32_t *crcs = malloc(nheap * sizeof crcs[0]); int i = 0; assert(hi); assert(crcs); hi->nshingle = nshingle; hi->nfeature = nheap; while (nheap > 0) crcs[i++] = heap_extract_max(); hi->feature = crcs; return hi; } static hashinfo * hash_file(FILE *f) { heap_reset(nfeature); 
hash_reset(nfeature); if (!running_crc(f)) return 0; return get_hashinfo(); } static hashinfo * hash_filename(char *filename) { FILE *f = fopen(filename, "r"); hashinfo *hi; if (!f) { perror(filename); exit(1); } hi = hash_file(f); return hi; } static void write_hash(hashinfo *hi, FILE *f) { uint16_t s = htons(FILE_VERSION); /* file/CRC version */ int i; fwrite(&s, sizeof(uint16_t), 1, f); s = htons(hi->nshingle); fwrite(&s, sizeof(uint16_t), 1, f); for(i = 0; i < hi->nfeature; i++) { uint32_t hv = htonl(hi->feature[i]); fwrite(&hv, sizeof(uint32_t), 1, f); } } static void write_hashes(int argc, char **argv) { int i; static char nambuf[MAXPATHLEN + 1]; for(i = 0; i < argc; i++) { hashinfo *hi = hash_filename(argv[i]); FILE *of; if (hi == 0) { fprintf(stderr, "%s: warning: not hashed\n", argv[i]); continue; } strncpy(nambuf, argv[i], MAXPATHLEN - sizeof(SUFFIX)); nambuf[MAXPATHLEN - sizeof(SUFFIX)] = '\0'; strcat(nambuf, SUFFIX); of = fopen(nambuf, "w"); if (!of) { perror(argv[i]); exit(1); } write_hash(hi, of); fclose(of); free_hashinfo(hi); } } /* fills features with the features from f, and returns a pointer to info. A null pointer is returned on error. 
*/ static hashinfo *read_hash(FILE *f) { hashinfo *h = malloc(sizeof(hashinfo)); uint16_t s; int i; uint16_t version; assert(h); fread(&s, sizeof(uint16_t), 1, f); version = ntohs(s); if (version != FILE_VERSION) { fprintf(stderr, "bad file version\n"); return 0; } fread(&s, sizeof(uint16_t), 1, f); h->nshingle = ntohs(s); h->nfeature = 16; h->feature = malloc(h->nfeature * sizeof(int)); assert(h->feature); i = 0; while(1) { int fe; int nread = fread(&fe, sizeof(int), 1, f); if (nread <= 0) { if (ferror(f)) { perror("fread"); return 0; } h->nfeature = i; h->feature = realloc(h->feature, h->nfeature * sizeof(int)); assert(h->feature); return h; } if (i >= h->nfeature) { h->nfeature *= 2; h->feature = realloc(h->feature, h->nfeature * sizeof(int)); assert(h->feature); } h->feature[i++] = ntohl(fe); } h->nfeature = i; abort(); /*NOTREACHED*/ } static hashinfo *read_hashfile(char *name) { FILE *f = fopen(name, "r"); hashinfo *hi; if (!f) { perror(name); exit(1); } hi = read_hash(f); return hi; } /* walk backward until one set runs out, counting the number of elements in the union of the sets. the backward walk is necessary because the common subsets are at the end of the file by construction. bleah. should probably reformat so that it's the other way around, which would mean that one could shorten a shingleprint by truncation. 
*/ static double score(hashinfo *hi1, hashinfo *hi2) { double unionsize; double intersectsize; int i1 = hi1->nfeature - 1; int i2 = hi2->nfeature - 1; int count = 0; int matchcount = 0; while(i1 >= 0 && i2 >= 0) { if (hi1->feature[i1] < hi2->feature[i2]) { --i1; continue; } if(hi1->feature[i1] > hi2->feature[i2]) { --i2; continue; } matchcount++; --i1; --i2; } count = hi1->nfeature; if (count > hi2->nfeature) count = hi2->nfeature; intersectsize = matchcount; unionsize = 2 * count - matchcount; return intersectsize / unionsize; } void print_score(int fieldwidth, double s) { int lead = fieldwidth - 3; int i; for (i = 0; i < lead; i++) printf(" "); if (s == -1) { printf(" ? "); } else if (s == 1.0) { printf("1.0"); } else { printf(".%02d", (int)floor(s * 100)); } } static void compare_hashes(char *name1, char *name2) { hashinfo *hi1, *hi2; hi1 = read_hashfile(name1); if (!hi1) exit(1); hi2 = read_hashfile(name2); if (!hi2) exit(1); if (hi1->nshingle != hi2->nshingle) { fprintf(stderr, "shingle size mismatch\n"); exit(1); } #if 0 /* this isn't normally necessary when things are working properly */ if (hi1->nfeature != hi2->nfeature) fprintf(stderr, "warning: feature set size mismatch %d %d\n", hi1->nfeature, hi2->nfeature); #endif print_score(0, score(hi1, hi2)); printf("\n"); free_hashinfo(hi1); free_hashinfo(hi2); } static int width(int n) { int i = 0; int k = 1; while (k <= n) { k *= 10; i++; } return i; } static void print_index(int fieldwidth, int value) { int n = width(value); int lead = fieldwidth - n; int i; for (i = 0; i < lead; i++) printf(" "); printf("%d", value); } static void match_hashes(int argc, char **argv) { hashinfo **his = malloc(argc * sizeof *his); double **scores = malloc(argc * sizeof *scores); int nfilename = 0; int i, j; int fieldwidth; if (argc <= 0) return; assert(his); assert(scores); /* compute filename hashes */ for (i = 0; i < argc; i++) his[i] = hash_filename(argv[i]); /* build score matrix */ for (i = 0; i < argc; i++) { scores[i] = 
malloc(argc * sizeof **scores); assert(scores[i]); for (j = 0; j < i; j++) if (his[i] && his[j]) scores[i][j] = score(his[i], his[j]); else scores[i][j] = -1; } /* find maximum filename length */ for (i = 0; i < argc; i++) { int n = strlen(argv[i]); if (n > nfilename) nfilename = n; } /* find the field width */ fieldwidth = width(argc); if (fieldwidth < 3) fieldwidth = 3; /* print the first row of indices */ for (i = 0; i <= nfilename + fieldwidth; i++) printf(" "); for (i = 1; i < argc - 1; i++) { print_index(fieldwidth, i); printf(" "); } print_index(fieldwidth, argc - 1); printf("\n"); /* print the rows of the matrix */ for (i = 0; i < argc; i++) { printf("%s", argv[i]); for (j = strlen(argv[i]); j <= nfilename; j++) printf(" "); print_index(fieldwidth, i + 1); if (i > 0) printf(" "); for (j = 0; j < i - 1; j++) { print_score(fieldwidth, scores[i][j]); printf(" "); } if (i > 0) print_score(fieldwidth, scores[i][i - 1]); printf("\n"); } } static void usage(void) { fprintf(stderr, "simhash: usage:\n" "\tsimhash [-s nshingles] [-f nfeatures] [file]\n" "\tsimhash [-s nshingles] [-f nfeatures] [-w|-m] file ...\n" "\tsimhash -c hashfile hashfile\n"); exit(1); } int main(int argc, char **argv) { char mode = '?'; FILE *fin = stdin; /* parse initial arguments */ while(1) { switch(getopt_long(argc, argv, "wmcs:f:d", long_options, 0)) { case 'w': mode = 'w'; continue; case 'm': mode = 'm'; continue; case 'c': mode = 'c'; continue; case 's': nshingle = atoi(optarg); if (nshingle < 4) { fprintf(stderr, "simhash: shingle size must be at least 4\n"); exit(1); } pset = 1; continue; case 'f': nfeature = atoi(optarg); if (nfeature < 1) { fprintf(stderr, "simhash: feature set size must be at least 1\n"); exit(1); } pset = 1; continue; case 'd': debug_trace = 1; } break; } /* actually process */ switch(mode) { case '?': switch (argc - optind) { hashinfo *hi; case 1: hi = hash_filename(argv[optind]); if (!hi) { fprintf(stderr, "%s: not hashable\n", argv[optind]); return -1; } 
write_hash(hi, stdout); free_hashinfo(hi); return 0; case 0: hi = hash_file(fin); if (!hi) { fprintf(stderr, "stdin not hashable\n"); return -1; } write_hash(hi, stdout); free_hashinfo(hi); return 0; } usage(); abort(); /*NOTREACHED*/ case 'w': write_hashes(argc - optind, argv + optind); return 0; case 'c': if (pset) usage(); if (optind != argc - 2) usage(); compare_hashes(argv[optind], argv[optind + 1]); return 0; case 'm': match_hashes(argc - optind, argv + optind); return 0; } abort(); /*NOTREACHED*/ } simhash-0.0.20161225/crc32.c0000644000175000017500000001246113027763630014055 0ustar pizzapizza/* * Copyright (C) 1986 Gary S. Brown. You may use this program, or * code or tables extracted from it, as desired without restriction. */ #include #include "crc.h" /* * First, the polynomial itself and its table of feedback terms. The * polynomial is * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0 * * Note that we take it "backwards" and put the highest-order term in * the lowest-order bit. The X^32 term is "implied"; the LSB is the * X^31 term, etc. The X^0 term (usually shown as "+1") results in * the MSB being 1 * * Note that the usual hardware shift register implementation, which * is what we're using (we're merely optimizing it by doing eight-bit * chunks at a time) shifts bits into the lowest-order term. In our * implementation, that means shifting towards the right. Why do we * do it this way? Because the calculated CRC must be transmitted in * order from highest-order term to lowest-order term. UARTs transmit * characters in order from LSB to MSB. By storing the CRC this way * we hand it to the UART in the order low-byte to high-byte; the UART * sends each low-bit to hight-bit; and the result is transmission bit * by bit from highest- to lowest-order term without requiring any bit * shuffling on our part. Reception works similarly * * The feedback terms table consists of 256, 32-bit entries. 
Notes * * The table can be generated at runtime if desired; code to do so * is shown later. It might not be obvious, but the feedback * terms simply represent the results of eight shift/xor opera * tions for all combinations of data and CRC register values * * The values must be right-shifted by eight bits by the "updcrc * logic; the shift must be unsigned (bring in zeroes). On some * hardware you could probably optimize the shift in assembler by * using byte-swap instructions * polynomial $edb88320 * * * CRC32 code derived from work by Gary S. Brown. */ static unsigned crc32_tab[] = { 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 
  0x206f85b3, 0xb966d409, 0xce61e49f, 0x5edef90e, 0x29d9c998,0xb0d09822,
  0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
  0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739,
  0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
  0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27,
  0x7d079eb1, 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
  0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, 0xfed41b76,
  0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
  0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4,
  0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
  0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3,
  0xa867df55, 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a,
  0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9,
  0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
  0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0,
  0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
  0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae,
  0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
  0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd,
  0xf6b9265b, 0x6fb077e1, 0x18b74777, 0x88085ae6, 0xff0f6a70,
  0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3,
  0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
  0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a,
  0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
  0x47b2cf7f, 0x30b5ffe9, 0xbdbdf21c, 0xcabac28a, 0x53b39330,
  0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
  0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37,
  0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};

/* Compute the CRC of all nbuf bytes of buf, treating buf as a
   circular buffer: processing starts at index i0, wraps from
   nbuf - 1 back to 0, and stops on returning to i0.  The register
   starts at all ones and the result is complemented before it is
   returned.  nbuf must be nonzero (it is used as a modulus) and
   i0 must be a valid index into buf. */
uint32_t hash_crc32(unsigned char *buf, int i0, int nbuf) {
    int i = i0;
    uint32_t crc = ~0U;

    do {
        /* table lookup on the low byte, then shift the register right */
        crc = crc32_tab[(crc ^ buf[i]) & 0xFF] ^ (crc >> 8);
        i = (i + 1) % nbuf;
    } while(i != i0);
    return crc ^ ~0U;
}
simhash-0.0.20161225/heap.h0000644000175000017500000000044113027763630014056 0ustar pizzapizza/*
 * Copyright (c) 2005-2007
Bart Massey * ALL RIGHTS RESERVED * Please see the file COPYING in this directory for license information. */ extern uint32_t *heap; extern int nheap; extern void heap_reset(int); extern uint32_t heap_extract_max(void); extern void heap_insert(uint32_t); simhash-0.0.20161225/test.sh0000644000175000017500000000123213027763630014302 0ustar pizzapizza#!/bin/sh # Copyright (c) 2005-2007 Bart Massey # ALL RIGHTS RESERVED # Please see the file COPYING in this directory for license information. # create files named a and b that you want to compare, then # use this script to try various hash sizes SIZES="64 256 1024 4096 8192" for j in $SIZES; do for i in a b ; do ./simhash -f $j -s 4 $i > $i-$j.sim done done for i in $SIZES; do for j in $SIZES; do HASHVAL=`./simhash -c a-$i.sim b-$j.sim 2>/dev/null` if [ $? -ne 0 ] then echo "" echo -n "./simhash -c a-$i.sim b-$j.sim: " ./simhash -c a-$i.sim b-$j.sim exit $? fi echo -n $HASHVAL "" done echo '' done simhash-0.0.20161225/hash.c0000644000175000017500000000640213027763630014062 0ustar pizzapizza/* * Copyright © 2005-2009 Bart Massey * ALL RIGHTS RESERVED * [This program is licensed under the "3-clause ('new') BSD License"] * Please see the file COPYING in the source * distribution of this software for license terms. */ /* * Simple hash table stop list ala corman-leiserson-rivest. * Bart Massey 2005/03 */ #include #include #include #include #include "hash.h" /* occupancy is out-of-band. 
sigh */ #define EMPTY 0 #define FULL 1 #define DELETED 2 static char *occ; static int *hash; static int nhash; /* for n > 0 */ static int next_pow2(int n) { int m = 1; while (n > 0) { n >>= 1; m <<= 1; } return m; } static void hash_alloc(void) { int i; hash = malloc(nhash * sizeof(int)); assert(hash); occ = malloc(nhash); assert(occ); for (i = 0; i < nhash; i++) occ[i] = EMPTY; } /* The occupancy shouldn't be bad, since we only keep small crcs in the stop list */ void hash_reset(int size) { nhash = 7 * size; nhash = next_pow2(nhash); hash_alloc(); } /* Since the input values are crc's, we don't try to hash them at all! they're plenty random coming in, in principle. */ static int do_hash_insert(uint32_t crc) { int count; uint32_t h = crc; for (count = 0; count < nhash; count++) { int i = h & (nhash - 1); if (occ[i] != FULL) { occ[i] = FULL; hash[i] = crc; return 1; } if (hash[i] == crc) return 1; h += 2 * (nhash / 4) + 1; } return 0; } /* idiot stop-and-copy for deleted references */ static void gc(void) { int i; int *oldhash = hash; char *oldocc = occ; hash_alloc(); for (i = 0; i < nhash; i++) { if (oldocc[i] == FULL) { if(!do_hash_insert(oldhash[i])) { fprintf(stderr, "internal error: gc failed, table full\n"); exit(1); } } } free(oldhash); free(oldocc); } void hash_insert(uint32_t crc) { if (do_hash_insert(crc)) return; gc(); if (do_hash_insert(crc)) return; fprintf(stderr, "internal error: insert failed, table full\n"); abort(); /*NOTREACHED*/ } static int do_hash_contains(uint32_t crc) { int count; uint32_t h = crc; for (count = 0; count < nhash; count++) { int i = h & (nhash - 1); if (occ[i] == EMPTY) return 0; if (occ[i] == FULL && hash[i] == crc) return 1; h += 2 * (nhash / 4) + 1; } return -1; } int hash_contains(uint32_t crc) { int result = do_hash_contains(crc); if (result >= 0) return result; gc(); result = do_hash_contains(crc); if (result >= 0) return result; fprintf(stderr, "internal error: can't find value, table full\n"); abort(); /*NOTREACHED*/ } 
static int do_hash_delete(uint32_t crc) { int count; uint32_t h = crc; for (count = 0; count < nhash; count++) { int i = h & (nhash - 1); if (occ[i] == FULL && hash[i] == crc) { occ[i] = DELETED; return 1; } if (occ[i] == EMPTY) return 0; h += 2 * (nhash / 4) + 1; } return -1; } int hash_delete(uint32_t crc) { int result = do_hash_delete(crc); if (result >= 0) return result; gc(); result = do_hash_delete(crc); if (result >= 0) return result; fprintf(stderr, "internal error: delete failed, table full\n"); abort(); /*NOTREACHED*/ } simhash-0.0.20161225/simhash.man0000644000175000017500000001010613027763630015120 0ustar pizzapizza.TH SIMHASH 1 "3 January 2007" .\" Copyright © 2005-2009 Bart Massey .\" ALL RIGHTS RESERVED .\" [This program is licensed under the "3-clause ('new') BSD License"] .\" Please see the file COPYING in the source .\" distribution of this software for license terms. .SH NAME simhash \- file similarity hash tool .SH SYNOPSIS simhash .BI "[ -s " nshingles " ]" .BI "[ -f " nfeatures " ]" .BI "[ " file " ]" .br simhash .BI "[ -s " nshingles " ]" .BI "[ -f " nfeatures " ]" .BI "-w " file " ..." .br simhash .BI "[ -s " nshingles " ]" .BI "[ -f " nfeatures " ]" .BI "-m " file " ..." .br simhash .BI "-c " "hashfile hashfile" .SH DESCRIPTION .LP This program is used to compute and compare similarity hashes of files. A similarity hash is a chunk of data that has the property that some distance metric between files is proportional to some distance metric between the hashes. Typically the similarity hash will be much smaller than the file itself. .P The algorithm used by .I simhash is Manassas' "shingleprinting" algorithm (see BIBLIOGRAPHY below): take a hash of every \fIm\fP-byte subsequence of the file, and retain the \fIn\fP of these hashes that are numerically smallest. The size of the intersection of the hash sets of two files gives a statistically good estimate of the similarity of the files as a whole. 
.P In its default mode, .I simhash will compute the similarity hash of its file argument (or stdin) and write this hash to its standard output. When invoked with the .B -w argument (see below), .I simhash will compute similarity hashes of all of its file arguments in "batch mode". When invoked with the .B -m argument (see below), .I simhash will compare all the given files using similarity hashes in "match mode". Finally, when invoked with the .B -c argument (see below), .I simhash will report the degree of similarity between two hashes. .SH OPTIONS .TP .BI "-f " "feature-count" When computing a similarity hash, retain at most .I "feature-count" significant hashes from the target file. The default is 128 features. Larger feature counts will give higher resolution in differences between files, will increase the size of the similarity hash proportionally to the feature count, and will increase similarity hash computation time slightly. .TP .BI "-s " "shingle-size" When computing a similarity hash, use hashes of samples consisting of .I "shingle-size" consecutive bytes drawn from the target file. The default is 8 bytes, the minimum is 4 bytes. Larger shingle sizes will emphasize the differences between files more and will slow the similarity hash computation proportionally to the shingle size. .TP .BI "-c " "hashfile1 hashfile2" Display the distance (normalized to the range 0..1) between the similarity hash stored in .I hashfile1 and the similarity hash stored in .IR hashfile2 . .TP .BI "-w " file " ..." Write the similarity hash of each of the .I file arguments to .IR "file.sim" . .TP .BI "-m " file " ..." Compute the similarity hash of each of the .I file arguments, and output a similarity matrix for those files. .SH AUTHOR Bart Massey .SH BUGS This currently uses CRC32 for the hashing. A Rabin Fingerprint should be offered as a slightly slower but more reliable alternative. 
.P The shingleprinting algorithm works for text files and fairly well for other sequential filetypes, but does not work well for image files. The latter both are 2D and often undergo odd transformations. .SH BIBLIOGRAPHY .LP Mark Manasse, Microsoft Research Silicon Valley. Finding similar things quickly in large collections. http://research.microsoft.com/research/sv/PageTurner/similarity.htm .LP Andrei Z. Broder. On the resemblance and containment of documents. In Compression and Complexity of Sequences (SEQUENCES'97), pages 21-29. IEEE Computer Society, 1998. ftp://ftp.digital.com/pub/DEC/SRC/publications/broder/positano-final-wpnums.pdf .LP Andrei Z. Broder. Some applications of Rabin's fingerprinting method. Published in R. Capocelli, A. De Santis, U. Vaccaro eds., Sequences II: Methods in Communications, Security, and Computer Science, Springer-Verlag, 1993. http://athos.rutgers.edu/~muthu/broder.ps simhash-0.0.20161225/hash.h0000644000175000017500000000042613027763630014067 0ustar pizzapizza/* * Copyright (c) 2005-2007 Bart Massey * ALL RIGHTS RESERVED * Please see the file COPYING in this directory for license information. */ extern void hash_reset(int); extern int hash_contains(uint32_t); extern void hash_insert(uint32_t); extern int hash_delete(uint32_t); simhash-0.0.20161225/heap.c0000644000175000017500000000374113027763630014057 0ustar pizzapizza/* * Copyright © 2005-2009 Bart Massey * ALL RIGHTS RESERVED * [This program is licensed under the "3-clause ('new') BSD License"] * Please see the file COPYING in the source * distribution of this software for license terms. 
*/ /* * heap max int priority queue * Bart Massey 2005/03 */ #include #include #include #include "heap.h" uint32_t *heap = 0; int nheap = 0; static int maxheap = 0; void heap_reset(int size) { nheap = 0; maxheap = size; if (heap) free (heap); heap = malloc(size * sizeof(*heap)); assert(heap); } /* push the top of heap down as needed to restore the heap property */ static void downheap(void) { int tmp; int i = 0; while(1) { int left = (i << 1) + 1; int right = left + 1; if (left >= nheap) return; if (right >= nheap) { if (heap[i] < heap[left]) { tmp = heap[left]; heap[left] = heap[i]; heap[i] = tmp; } return; } if (heap[i] >= heap[left] && heap[i] >= heap[right]) return; if (heap[left] > heap[right]) { tmp = heap[left]; heap[left] = heap[i]; heap[i] = tmp; i = left; } else { tmp = heap[right]; heap[right] = heap[i]; heap[i] = tmp; i = right; } } } uint32_t heap_extract_max(void) { uint32_t m; assert(nheap > 0); /* lift the last heap element to the top, replacing the current top element */ m = heap[0]; heap[0] = heap[--nheap]; /* now restore the heap property */ downheap(); /* and return the former top */ return m; } /* lift the last value on the heap up as needed to restore the heap property */ static void upheap(void) { int i = nheap - 1; assert(nheap > 0); while(i > 0) { int tmp; int parent = (i - 1) >> 1; if (heap[parent] >= heap[i]) return; tmp = heap[parent]; heap[parent] = heap[i]; heap[i] = tmp; i = parent; } } void heap_insert(uint32_t v) { assert(nheap < maxheap); heap[nheap++] = v; upheap(); } simhash-0.0.20161225/dump.5c0000644000175000017500000000072613027763630014174 0ustar pizzapizza#!/usr/bin/env nickle # Copyright © 2005-2009 Bart Massey # ALL RIGHTS RESERVED # [This program is licensed under the "3-clause ('new') BSD License"] # Please see the file COPYING in the source # distribution of this software for license terms. 
# hex-dump a shingleprint file import File; while(!end(stdin)) { int ch1 = getbyte(); int ch2 = getbyte(); int ch3 = getbyte(); int ch4 = getbyte(); printf("%02x%02x%02x%02x\n", ch1, ch2, ch3, ch4); } simhash-0.0.20161225/Makefile0000644000175000017500000000074113027763630014433 0ustar pizzapizza# Copyright (c) 2005-2007 Bart Massey # ALL RIGHTS RESERVED # Please see the file COPYING in this directory for license information. DESTDIR=/usr/local BIN=$(DESTDIR)/bin MAN=$(DESTDIR)/man CC=gcc CFLAGS=-g -O4 -Wall -ansi -pedantic OBJS=simhash.o crc32.o heap.o hash.o simhash: $(OBJS) $(CC) $(CFLAGS) -o simhash $(OBJS) -lm clean: -rm -f $(OBJS) simhash install: simhash simhash.man cp simhash $(BIN) cp simhash.man $(MAN)/man1/simhash.1 heap.o: heap.h crc32.o: crc.h