similarity-tester-2.89.orig/0000755000000000000000000000000012540506376013003 5ustar similarity-tester-2.89.orig/aiso.bdy0000644000000000000000000000702012540503627014431 0ustar /* This file is part of the module Arbitrary-In Sorted-Out (AISO). Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: aiso.bdy,v 1.5 2014-10-05 19:32:21 Gebruiker Exp $ */ /* Description: This is the body of a module that builds an arbitrary-in sorted-out data structure, to be used as a heap, a priority queue, etc. See aiso.spc for further information. */ #include #include #include "Malloc.h" static struct aiso_node *root; /* root of tree */ #ifdef AISO_ITER static struct aiso_node *list; /* start of linked list */ #endif /* AISO_ITER */ /* the policy */ static uint64_t aiso_size = 0; static uint64_t acc_mark = 1; #define add_entry() (aiso_size++) #define rem_entry() (aiso_size--) #define reset_access() (acc_mark = 1) #define count_access() (acc_mark <<= 1) #define must_rotate() (acc_mark > aiso_size) int InsertAiso(AISO_TYPE v) { struct aiso_node *new_node; struct aiso_node **hook = &root; #ifdef AISO_ITER struct aiso_node **prev = &list; #endif /* AISO_ITER */ new_node = (struct aiso_node *)TryMalloc(sizeof (struct aiso_node)); if (!new_node) { /* avoid modifying the tree */ return 0; } while (*hook) { struct aiso_node *an = *hook; count_access(); if (AISO_BEFORE(v, an->an_value)) { /* head left */ if (!an->an_left || !must_rotate()) { /* standard action */ hook = &an->an_left; } else { /* change (l A r) B (C) into (l) A (r B C) */ struct aiso_node *anl = an->an_left; an->an_left = anl->an_right; anl->an_right = an; *hook = anl; reset_access(); } } else { /* head right */ if (!an->an_right || !must_rotate()) { /* standard action */ hook = &an->an_right; } else { /* change (A) B (l C r) into (A B l) C (r) */ struct aiso_node *anr = an->an_right; an->an_right = anr->an_left; anr->an_left = an; *hook = anr; reset_access(); } #ifdef AISO_ITER prev = &an->an_next; #endif /* AISO_ITER */ } } new_node->an_left = 0; new_node->an_right = 0; #ifdef AISO_ITER new_node->an_next = *prev; *prev = new_node; #endif /* AISO_ITER */ new_node->an_value = v; *hook = new_node; add_entry(); return 1; } #ifdef AISO_EXTR int ExtractAiso(AISO_TYPE *vp) { struct aiso_node **hook = &root; struct aiso_node *an; if (!root) return 0; while ((an = *hook), an->an_left) { /* head left */ count_access(); if (!must_rotate()) { /* standard action */ hook = &an->an_left; } else { /* change (l A r) B (C) into (l) A (r B C) */ struct aiso_node *anl = an->an_left; an->an_left = anl->an_right; anl->an_right = an; *hook = anl; reset_access(); } } /* found the first */ *vp = an->an_value; *hook = an->an_right; #ifdef AISO_ITER list = an->an_next; #endif /* AISO_ITER */ Free((void *)an); rem_entry(); return 1; } #endif /* AISO_EXTR */ #ifdef AISO_ITER void OpenIter(AisoIter *ip) { *ip = list; } int GetAisoItem(AisoIter *ip, AISO_TYPE *vp) { struct aiso_node *an = *ip; if (!an) return 0; *vp = an->an_value; *ip = an->an_next; return 1; } void CloseIter(AisoIter *ip) { *ip = 0; } #endif /* AISO_ITER */ #ifdef AISO_DEBUG /* requires AISO_FORMAT */ static void pr_inf(int level, char ch, struct aiso_node *an) { int i; if (!an) return; pr_inf(level+1, '/', an->an_right); for (i = 0; i < level; i++) { printf(" "); } printf("%c", ch); printf(AISO_FORMAT, an->an_value); printf("\n"); pr_inf(level+1, '\\', an->an_left); } void pr_tree(void) { pr_inf(0, '-', root); printf("================\n"); } #endif /* AISO_DEBUG */ 
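[Editorial note] The rebalancing policy in aiso.bdy above is compact but easy to misread: count_access() doubles acc_mark at every node visited, and must_rotate() fires as soon as acc_mark exceeds aiso_size, so a rotation is forced roughly every log2(aiso_size) node visits since the last reset_access(). That is the "shaking the tree every 'ln aiso_size' node accesses" described in aiso.spc further down. The following standalone sketch is not part of the package; it merely copies the three policy macros to show after how many visits the rotation triggers for a few tree sizes.

/* Standalone illustration of the aiso.bdy access-counting policy.
   The three macros are copied from aiso.bdy; main() is only a demo. */
#include <stdio.h>
#include <stdint.h>

static uint64_t aiso_size;
static uint64_t acc_mark = 1;

#define reset_access()	(acc_mark = 1)
#define count_access()	(acc_mark <<= 1)
#define must_rotate()	(acc_mark > aiso_size)

int main(void) {
	for (aiso_size = 1; aiso_size <= 1024*1024; aiso_size *= 32) {
		int visits = 0;

		reset_access();
		while (!must_rotate()) {	/* one iteration per node visited */
			count_access();
			visits++;
		}
		printf("size %8llu: rotation forced after %2d visits\n",
			(unsigned long long)aiso_size, visits);
	}
	return 0;
}

For a million-node tree the rotation is forced after 21 visits, which is the long-run 'ln aiso_size' behaviour that aiso.spc promises.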
similarity-tester-2.89.orig/Malloc.c0000644000000000000000000002010312540503627014346 0ustar /* This file is part of the memory management and leak detector MALLOC. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: Malloc.c,v 1.15 2014-01-27 11:22:39 Gebruiker Exp $ */ #include #include #include #include #include "any_int.h" #include "Malloc.h" /*Library module source prelude */ #undef _MALLOC_CODE_ #ifndef lint #define _MALLOC_CODE_ #endif #ifdef LIB #define _MALLOC_CODE_ #endif #ifdef _MALLOC_CODE_ /* Library module source code */ #undef new #define new use_my_new /* don't call Malloc in Malloc.c */ #define my_new(type) ((type *)malloc(sizeof (type))) /* All output goes through designated files, so we block printf, etc. */ #undef printf #define printf use_fprintf #undef putchar #define putchar use_fprintf static void fprintloc(FILE *f, const char *fname, int l_nmb) { fprintf(f, "\"%s\", line %d: ", fname, l_nmb); } static void out_of_memory(const char *fname, int l_nmb, size_t size) { fprintloc(stderr, fname, l_nmb); fprintf(stderr, "Out of memory, requested size = %s bytes\n", any_uint2string(size, 0)); exit(1); } #if defined MEMLEAK || defined MEMCLOBBER /* Both need almost the same information: MEMLEAK obviously needs a list of all blocks still allocated, but MEMCLOBBER needs the same list to find the size of a block given to Free(), in order to clobber it. MEMCLOBBER does not need total, balance and max, but finecombing them out would be too much. */ static vlong_uint total = 0; static vlong_uint balance = 0; static vlong_uint max = 0; struct record { struct record *next; const char *addr; size_t size; const char *fname; int l_nmb; }; #define HASH_SIZE 16381 /* largest prime under 2^16 */ static struct record *record_hash[HASH_SIZE]; #define chain_start(x) record_hash[((unsigned int)(x)%HASH_SIZE)] static void record_alloc(char *addr, size_t size, const char *fname, int l_nmb) { struct record *new; struct record **r_hook = &chain_start(addr); if (addr == 0) return; new = my_new(struct record); new->addr = addr; new->size = size; new->fname = fname; /* no need to copy fname */ new->l_nmb = l_nmb; new->next = *r_hook; *r_hook = new; total += size; balance += size; if (balance > max) { max = balance; } } static struct record ** record_pointer_for_address(const char *addr) { struct record **rp = &chain_start(addr); while (*rp) { if ((*rp)->addr == addr) break; rp = &(*rp)->next; } return rp; } static size_t record_free(char *addr) { struct record **oldp = record_pointer_for_address(addr); struct record *old = *oldp; if (old == 0) return (size_t) -1; *oldp = old->next;/* this loses the struct record; is that a problem? 
*/ balance -= old->size; return old->size; } #endif /* defined MEMLEAK || defined MEMCLOBBER */ void MemClobber(void *p, size_t size) { unsigned char *s = (unsigned char *)p; size_t i; for (i = 0; i < size; i++) { s[i] = 0125; /* 0101 0101 */ } } #ifdef MEMLEAK struct entry { struct entry *next; const char *fname; int l_nmb; unsigned int n_blocks; int var_size; /* all blocks have the same size or not */ size_t size; /* !var_size: the one size; var_size: sum of sizes */ }; static struct entry * compacted_leaks(void) { struct entry *res = 0; int i; for (i = 0; i < HASH_SIZE; i++) { struct record *r = record_hash[i]; while (r) { struct entry *e = res; /* try to find an entry for this location */ while (e) { if ( e->fname == r->fname && e->l_nmb == r->l_nmb ) break; e = e->next; } if (e) { /* update the entry */ if (e->var_size) { e->size += r->size; } else if (e->size != r->size) { /* switch to var_size */ e->var_size = 1; e->size = e->n_blocks*e->size + r->size; } e->n_blocks++; } else { /* create a new entry */ e = my_new(struct entry); e->fname = r->fname; e->l_nmb = r->l_nmb; e->n_blocks = 1; e->var_size = 0; e->size = r->size; e->next = res; res = e; } r = r->next; } } return res; } static int number_of_leaks(const struct entry *e) { int res = 0; while (e != 0) { res++; e = e->next; } return res; } static void report_actual_leaks(FILE *f) { const struct entry *e = compacted_leaks(); int n_leaks = number_of_leaks(e); if (n_leaks == 0) return; fprintf(f, "There %s %d case%s of unreclaimed memory:\n", (n_leaks == 1 ? "was" : "were"), n_leaks, (n_leaks == 1 ? "" : "s") ); while (e) { fprintloc(f, e->fname, e->l_nmb); fprintf(f, "left allocated: %d block%s of size ", e->n_blocks, (e->n_blocks == 1 ? "" : "s") ); if (e->var_size) { /* e->size is the sum of the sizes */ fprintf(f, "%s on average", any_uint2string( (e->size+e->n_blocks/2) / e->n_blocks, 0 )); if (e->n_blocks > 1) { fprintf(f, " = %s", any_uint2string(e->size, 0)); } } else { /* e->size is the single size */ fprintf(f, "%s", any_uint2string(e->size, 0)); if (e->n_blocks > 1) { vlong_uint all = e->size*e->n_blocks; fprintf(f, " = %s", any_uint2string(all, 0)); } } fprintf(f, "\n"); e = e->next; } } void ReportMemoryLeaks(FILE *f) { if (f == 0) f = stderr; report_actual_leaks(f); fprintf(f, "Total memory allocated = %s", any_uint2string(total, 0)); fprintf(f, ", maximum allocated = %s", any_uint2string(max, 0)); fprintf(f, ", garbage left = %s", any_uint2string(balance, 0)); fprintf(f, "\n"); } #else /* no MEMLEAK */ /*ARGSUSED*/ void ReportMemoryLeaks(FILE *f) { } #endif /* MEMLEAK */ void * _leak_malloc(int chk, size_t size, const char *fname, int l_nmb) { void *res = malloc(size); if (chk && res == 0) { out_of_memory(fname, l_nmb, size); /*NOTREACHED*/ } #if defined MEMLEAK || defined MEMCLOBBER record_alloc(res, size, fname, l_nmb); #ifdef MEMCLOBBER MemClobber((char *)res, size); #endif /* MEMCLOBBER */ #endif /* MEMLEAK || MEMCLOBBER */ return res; } void * _leak_calloc(int chk, size_t n, size_t size, const char *fname, int l_nmb) { void *res = calloc(n, size); if (chk && res == 0) { out_of_memory(fname, l_nmb, n*size); /*NOTREACHED*/ } #if defined MEMLEAK || defined MEMCLOBBER record_alloc(res, n*size, fname, l_nmb); #endif /* MEMLEAK || MEMCLOBBER */ return res; } void * _leak_realloc(int chk, void *addr, size_t size, const char *fname, int l_nmb) { void *res; #if defined MEMLEAK || defined MEMCLOBBER size_t old_size = record_free(addr); /* we report first, because the realloc() below may cause a crash */ if ( /* we are not 
reallocating address 0, which is allowed */ addr != 0 && /* the address was never handed out before */ old_size == (size_t) -1 ) { fprintloc(stderr, fname, l_nmb); fprintf(stderr, ">>>> unallocated block reallocated <<<<\n"); } #endif res = realloc(addr, size); if (chk && res == 0) { out_of_memory(fname, l_nmb, size); /*NOTREACHED*/ } #if defined MEMLEAK || defined MEMCLOBBER record_alloc(res, size, fname, l_nmb); #endif /* MEMLEAK || MEMCLOBBER */ #ifdef MEMCLOBBER if (old_size > 0 && size > old_size) { MemClobber(((char *)res)+old_size, size-old_size); } #endif /* MEMCLOBBER */ return res; } /* ARGSUSED */ void _leak_free(void *addr, const char *fname, int l_nmb) { #if defined MEMLEAK || defined MEMCLOBBER size_t old_size = record_free(addr); /* we report first, because the free() below may cause a crash */ if (old_size == (size_t) -1) { fprintloc(stderr, fname, l_nmb); fprintf(stderr, ">>>> unallocated block freed "); fprintf(stderr, "or multiple free of allocated block <<<<\n"); } else { #ifdef MEMCLOBBER MemClobber((char *)addr, old_size); #endif /* MEMCLOBBER */ } #endif /* MEMLEAK || MEMCLOBBER */ free(addr); } char * _new_string(const char *s, const char *fname, int l_nmb) { return strcpy((char *)(_leak_malloc(1, strlen(s)+1, fname, l_nmb)), s); } /* End library module source code */ #endif /* _MALLOC_CODE_ */ #ifdef lint static void satisfy_lint(void *x) { void *v; v = _leak_malloc(0, 0, 0, 0); v = _leak_calloc(0, 0, 0, 0, 0); v = _leak_realloc(0, 0, 0, 0, 0); _leak_free(x, 0, 0); ReportMemoryLeaks(0); MemClobber(v, 0); v = _new_string(0, 0, 0); satisfy_lint(v); } #endif /* lint */ similarity-tester-2.89.orig/tokenarray.h0000644000000000000000000000060512540503627015330 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: tokenarray.h,v 1.6 2015-01-12 09:16:13 dick Exp $ */ /* Interface for the token storage */ extern void Init_Token_Array(void); extern void Store_Token(Token tk); extern size_t Token_Array_Length(void); /* also first free token position */ extern Token *Token_Array; similarity-tester-2.89.orig/lex.h0000644000000000000000000000065012540503627013741 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lex.h,v 2.12 2012-09-30 11:55:19 dick Exp $ */ /* Macros for the *lang.l files */ #define return_tk(tk) {lex_tk_cnt++; lex_token = (tk); return 1;} #define return_ch(ch) {lex_tk_cnt++; lex_token = int2Token((int)(ch)); return 1;} #define return_eol() {lex_nl_cnt++; lex_token = End_Of_Line; return 1;} similarity-tester-2.89.orig/pass1.h0000644000000000000000000000055712540503627014206 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: pass1.h,v 1.7 2012-05-16 07:56:06 dick Exp $ */ /* Reads the input files; stores the tokens in Token Token_Array[] and the input file descriptions in struct text text[]. */ extern void Read_Input_Files(int argc, const char *argv[], int round); similarity-tester-2.89.orig/miralang.l0000644000000000000000000000467712540503627014764 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: miralang.l,v 1.11 2013-04-28 16:30:41 dick Exp $ */ /* Miranda language front end for the similarity tester. 
Author: Emma Norling (ejn@cs.mu.oz.au) Date: Nov 1998 */ #include "token.h" #include "language.h" #include "algollike.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; size_t lex_nl_cnt; size_t lex_tk_cnt; size_t lex_non_ascii_cnt; /* Language-dependent data */ #include "idf.h" static const struct idf reserved[] = { {"abstype", NORM('a')}, {"bool", NORM('b')}, {"char", NORM('c')}, {"const", META('c')}, {"div", NORM('d')}, {"False", NORM('F')}, {"if", NORM('i')}, {"mod", NORM('m')}, {"num", NORM('n')}, {"otherwise", NORM('o')}, {"readvals", NORM('r')}, {"show", NORM('s')}, {"sys_message", META('s')}, {"True", NORM('T')}, {"type", NORM('t')}, {"where", NORM('w')}, {"with", META('w')} }; /* Token sets for module algollike */ const Token Non_Finals[] = { NORM('('), NORM('['), NORM('='), No_Token }; const Token Non_Initials[] = { NORM(')'), NORM(']'), No_Token }; const Token Openers[] = { NORM('('), NORM('['), NORM('='), No_Token }; const Token Closers[] = { NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } size_t Best_Run_Size(const Token *str, size_t size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) AnyQuoted (\\.) StrChar ([^"\n\\]|{AnyQuoted}) ChrChar ([^'\\]|{AnyQuoted}) Idf ([A-Za-z][A-Za-z0-9_']*) %% "||".*$ { /* comment */ } \"{StrChar}*\" { /* strings */ return_ch('"'); } \'{ChrChar}\' { /* characters */ return_ch('\''); } \%{Layout}*include.* { /* skip %include line */ } \%{Layout}*insert.* { /* skip %insert line */ } {Idf} { /* identifier */ return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF)); } \n { /* count newlines */ return_eol(); } {Layout} { /* ignore layout */ } {ASCII95} { /* copy other text */ return_ch(yytext[0]); } . { /* count non-ASCII chars */ lex_non_ascii_cnt++; } %% /* More language-dependent code */ void yystart(void) { BEGIN INITIAL; } similarity-tester-2.89.orig/aiso.spc0000644000000000000000000000603412540503627014444 0ustar /* This file is part of the module Arbitrary-In Sorted-Out (AISO). Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: aiso.spc,v 1.2 2008/02/05 16:48:42 dick Exp $ */ /* Description: This is the specification of a module that builds an arbitrary-in sorted-out data structure, to be used as a heap, a priority queue, etc. Elements can be inserted, the first element extracted and the set scanned at any moment. The module is not generic, in that only one copy of it can be instantiated per program. Instantiation: The module is instantiated as follows. Create a file X.h, where X is arbitrary, which contains at least: - a definition of AISO_TYPE, the type of the object to be stored - a possible definition of AISO_EXTR; see below - a possible definition of AISO_ITER; see below - #include "aiso.spc" This file X.h is to be included in all files that use the aiso package. Create a file X.c which contains at least: - #include "X.h" - a definition of a routine int AISO_BEFORE(AISO_TYPE v, AISO_TYPE w) which yields non-zero if v is to be sorted before w - #include "aiso.bdy" This file X.c compiles into the module object. 
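[Editorial note] To make the instantiation recipe above concrete, here is a minimal hypothetical instantiation; the names intset.h and intset.c and the int element type are invented for this sketch. The package's own instantiation is runs.h/runs.c further down in this archive, which set AISO_TYPE to struct run * and sort on rn_size.

/* intset.h -- hypothetical AISO instantiation for plain ints */
#define AISO_TYPE	int
#define AISO_EXTR	/* we also want ExtractAiso() */
#define AISO_ITER	/* we also want the iterator */
#include "aiso.spc"

/* intset.c -- compiles into the module object */
#include "intset.h"
/* non-zero if v is to be sorted before w: smallest value first */
#define AISO_BEFORE(v,w)	((v) < (w))
#include "aiso.bdy"

Note that runs.c satisfies the AISO_BEFORE requirement with a macro rather than a routine; either form works, since aiso.bdy only uses it as an expression.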
Specification: The module always supplies: int InsertAiso(AISO_TYPE value) inserts value in its proper place; fails if out of memory If AISO_EXTR is defined, the module will also supply: int ExtractAiso(AISO_TYPE *value) yields the first value in the aiso and removes it; fails if empty If AISO_ITER is defined, the module also supplies a type AisoIter which declares an iterator, i.e., a structure that records a position in the ordered set, plus routines for manipulating the iterator, thus enabling the user to scan the ordered set. The iterator should be declared as: AisoIter iter; and is manipulated by the following commands: OpenIter(AisoIter *iter) opens the iterator for scanning the existing set in order int GetAisoItem(AisoIter *iter, AISO_TYPE *value) yields the next value in the iterator; fails if exhausted CloseIter(AisoIter *iter) closes the iterator For the use of AISO_DEBUG see aiso.bdy. Implementation: The AISO implementation is based on a self-adjusting binary tree. Degenerate behaviour of the tree is avoided by shaking the tree every 'ln aiso_size' node accesses. This guarantees ln aiso_size behaviour in the long run, though it is possible for a single operation to take aiso_size node accesses. The iterator is implemented as an additional linear linked list through the tree. This is simpler than and at least as efficient as clever tree-wiring. */ struct aiso_node { struct aiso_node *an_left; struct aiso_node *an_right; #ifdef AISO_ITER struct aiso_node *an_next; #endif /* AISO_ITER */ AISO_TYPE an_value; }; extern int InsertAiso(AISO_TYPE value); #ifdef AISO_EXTR extern int ExtractAiso(AISO_TYPE *value); #endif /* AISO_EXTR */ #ifdef AISO_ITER typedef struct aiso_node *AisoIter; extern void OpenIter(AisoIter *iter); extern int GetAisoItem(AisoIter *iter, AISO_TYPE *value); extern void CloseIter(AisoIter *iter); #endif /* AISO_ITER */ similarity-tester-2.89.orig/LICENSE.txt0000644000000000000000000000303312540503627014621 0ustar Copyright (c) 1986, 2007, Dick Grune, Vrije Universiteit, The Netherlands All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the Vrije Universiteit nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
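[Editorial note] Continuing the hypothetical intset instantiation sketched after the Instantiation section of aiso.spc above, a caller would use the interface exactly as specified there (this sketch assumes AISO_TYPE int with both AISO_EXTR and AISO_ITER defined; intset.h is the invented header from that sketch).

#include <stdio.h>
#include "intset.h"	/* hypothetical instantiation header */

void demo(void) {
	AisoIter iter;
	int v;

	if (!InsertAiso(3) || !InsertAiso(1) || !InsertAiso(2)) {
		fprintf(stderr, "out of memory\n");	/* InsertAiso() fails only then */
		return;
	}

	/* scan the set in sorted order without disturbing it */
	OpenIter(&iter);
	while (GetAisoItem(&iter, &v)) {
		printf("%d\n", v);	/* prints 1, 2, 3 */
	}
	CloseIter(&iter);

	/* or drain it like a priority queue */
	while (ExtractAiso(&v)) {
		printf("extracted %d\n", v);	/* 1, then 2, then 3 */
	}
}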
similarity-tester-2.89.orig/idf.c0000644000000000000000000000320212540503627013702 0ustar
/*	This file is part of the software similarity tester SIM.
	Written by Dick Grune, Vrije Universiteit, Amsterdam.
	$Id: idf.c,v 2.19 2015-01-17 10:20:40 dick Exp $
*/

#include	<string.h>

#include	"system.par"
#include	"token.h"
#include	"idf.h"

Token
idf_in_list(
	const char *str,
	const struct idf list[],
	size_t listsize,
	Token default_token
)
{
	int first = 0;
	int last = (int) (listsize / sizeof (struct idf)) - 1;

	while (first < last) {
		int middle = (first + last) / 2;

		if (strcmp(str, list[middle].id_tag) > 0) {
			first = middle + 1;
		} else {
			last = middle;
		}
	}
	return (strcmp(str, list[first].id_tag) == 0
		? list[first].id_tr
		: default_token
	);
}

#define	HASH(h,ch)	(((h) * 8209) + (ch)*613)

Token
idf_hashed(const char *str)
{
	int32 h = 0;

	/* let's be careful about ranges; if done wrong it's hard to debug */
	while (*str) {
		int ch = *str++ & 0377;

		/* ignore spaces in spaced words */
		if (ch == ' ') continue;
		/* -1 <= h <= 2^31-1 */
		h = HASH(h, ch);
		/* -2^31 <= h <= 2^31-1 */
		if (h < 0) {
			/* -2^31 <= h <= -1 */
			h += 2147483647;	/* 2^31-1 */
			/* -1 <= h <= 2^31-2 */
		} else {
			/* 0 <= h <= 2^31-1 */
		}
		/* -1 <= h <= 2^31-1 */
	}
	/* -1 <= h <= 2^31-1 */
	if (h < 0) {
		/* h = -1 */
		h = 0;
	}
	/* 0 <= h <= 2^31-1 */
	h %= (N_TOKENS - N_REGULAR_TOKENS - 1);
	/* 0 <= h < N_TOKENS - N_REGULAR_TOKENS - 1 */
	h += N_REGULAR_TOKENS;
	/* N_REGULAR_TOKENS <= h < N_TOKENS - 1 */
	return int2Token(h);	/* this avoids the regular tokens and End_Of_Line */
}

void
lower_case(char *str)
{
	char *s;

	for (s = str; *s; s++) {
		if ('A' <= *s && *s <= 'Z') {
			*s += (-'A' + 'a');
		}
	}
}
similarity-tester-2.89.orig/sim.pdf0000644000000000000000000010074712540503627014273 0ustar
[sim.pdf: binary PDF data, not representable as text; omitted]
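[Editorial note] A note on using idf_in_list() from idf.c above: the routine performs a binary search, so the table must stay ordered the way strcmp() orders the id_tag fields, and the third argument is the size of the table in bytes (the routine divides by sizeof (struct idf) itself). Below is a hypothetical front-end fragment modelled on the call in miralang.l earlier in this archive; the NORM() and IDF macros are taken from the package's token/idf headers.

#include	"token.h"
#include	"idf.h"

/* toy reserved-word table; must stay sorted on id_tag for the binary search */
static const struct idf reserved[] = {
	{"begin",	NORM('b')},
	{"end",		NORM('e')},
	{"while",	NORM('w')}
};

static Token
classify(const char *text)
{
	/* IDF is the default token for ordinary identifiers,
	   as in the *lang.l front ends of this package */
	return idf_in_list(text, reserved, sizeof reserved, IDF);
}

For case-insensitive languages the front end can normalise the identifier with lower_case() before the lookup.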
similarity-tester-2.89.orig/VERSION0000644000000000000000000000000512540503627014042 0ustar 2.89 similarity-tester-2.89.orig/runs.c0000644000000000000000000000274012540503627014135 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam.
$Id: runs.c,v 1.8 2015-01-14 16:47:27 dick Exp $ */ #include "sim.h" #include "text.h" #include "runs.h" #include "debug.par" #define AISO_BEFORE(r0,r1) ((r0)->rn_size > (r1)->rn_size) #include "aiso.bdy" static int aiso_overflow; void add_to_runs(struct run *r) { if (InsertAiso(r)) return; /* insert failed */ if (!aiso_overflow) { fprintf(stderr, ">>>> Memory overflow: too many runs found\n"); aiso_overflow = 1; } } #ifdef DB_RUN void db_run_info(const char *msg, const struct run *run, int lines_too) { const struct chunk *cnk0 = &run->rn_chunk0; const struct chunk *cnk1 = &run->rn_chunk1; if (msg) { fprintf(Debug_File, "%s: ", msg); } fprintf(Debug_File, "\"%s\" / \"%s\":\n", cnk0->ch_text->tx_fname, cnk1->ch_text->tx_fname ); fprintf(Debug_File, "from %s %s/%s to %s/%s:", token_name, size_t2string(cnk0->ch_first.ps_tk_cnt), size_t2string(cnk1->ch_first.ps_tk_cnt), size_t2string(cnk0->ch_last.ps_tk_cnt), size_t2string(cnk1->ch_last.ps_tk_cnt) ); if (lines_too) { fprintf(Debug_File, " from lines %s/%s to %s/%s:", size_t2string(cnk0->ch_first.ps_nl_cnt), size_t2string(cnk1->ch_first.ps_nl_cnt), size_t2string(cnk0->ch_last.ps_nl_cnt), size_t2string(cnk1->ch_last.ps_nl_cnt) ); } fprintf(Debug_File, " %s %s%s\n", size_t2string(run->rn_size), token_name, (run->rn_size == 1 ? "" : "s") ); } #endif /* DB_RUN */ similarity-tester-2.89.orig/sim.c0000644000000000000000000001371712540503627013744 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: sim.c,v 2.45 2015-04-29 18:18:22 dick Exp $ */ #include #include #include #include "system.par" #include "settings.par" #include "sim.h" #include "options.h" #include "newargs.h" #include "token.h" #include "language.h" #include "error.h" #include "text.h" #include "runs.h" #include "hash.h" #include "compare.h" #include "pass1.h" #include "pass2.h" #include "pass3.h" #include "percentages.h" #include "stream.h" #include "lang.h" #include "Malloc.h" #include "any_int.h" /* VERSION */ #if 0 /* set to 1 when experimenting */ #undef VERSION #define VERSION __TIMESTAMP__ #endif /* PARAMETERS */ /* command-line parameters */ int Min_Run_Size = DEFAULT_MIN_RUN_SIZE; int Page_Width = DEFAULT_PAGE_WIDTH; int Threshold_Percentage = 1; /* minimum percentage to show */ FILE *Output_File; FILE *Debug_File; /* and their string values, for language files that define their own parameters */ const char *token_name = "token"; const char *min_run_string; const char *threshold_string; const char *progname; /* for error reporting */ static const char *page_width_string; static const char *output_name; /* for reporting */ static const struct option optlist[] = { {'r', "minimum run size", 'N', &min_run_string}, {'w', "page width", 'N', &page_width_string}, {'f', "function-like forms only", ' ', 0}, {'F', "keep function identifiers in tact", ' ', 0}, {'d', "use diff format for output", ' ', 0}, {'T', "terse output", ' ', 0}, {'n', "display headings only", ' ', 0}, {'p', "use percentage format for output", ' ', 0}, {'P', "use percentage format, main contributor only", ' ', 0}, {'t', "threshold level of percentage to show", 'N', &threshold_string}, {'e', "compare each file to each file separately", ' ', 0}, {'s', "do not compare a file to itself", ' ', 0}, {'S', "compare new files to old files only", ' ', 0}, {'R', "recurse into subdirectories", ' ', 0}, {'i', "read arguments (file names) from standard input", ' ', 0}, {'o', "write output to file F", 'F', &output_name}, {'v', "show version number and 
compilation date", ' ', 0}, {'M', "show memory usage info", ' ', 0}, {'-', "lexical scan output only", ' ', 0}, {0, 0, 0, 0} }; static void allow_at_most_one_out_of(const char *opts) { const char *first; for (first = opts; *first; first++) { const char *second; for (second = first + 1; *second; second++) { if (is_set_option(*first) &&is_set_option(*second)) { char msg[256]; sprintf(msg, "options -%c and -%c are incompatible", *first, *second ); fatal(msg); } } } } /* SERVICE ROUTINES */ int is_new_old_separator(const char *s) { if (strcmp(s, "/") == 0) return 1; if (strcmp(s, "|") == 0) return 1; return 0; } const char * size_t2string(size_t s) { return any_uint2string(s, 0); } /* PROGRAM */ static void read_and_compare_files(int argc, const char **argv, int round) { Read_Input_Files(argc, argv, round); Make_Forward_References(); Compare_Files(); Free_Forward_References(); } #ifdef ARG_TEST static void show_args(const char *msg, int argc, const char *argv[]) { fprintf(stdout, "%s: ", msg); int i; for (i = 0; i < argc; i++) { fprintf(stdout, "arg[%d] = %s; ", i, argv[i]); } fprintf(stdout, "\n"); } #endif /* ARG_TEST */ int main(int argc, const char *argv[]) { /* Save program name */ progname = argv[0]; argv++, argc--; /* and skip it */ /* Set the default output and debug streams */ Output_File = stdout; Debug_File = stdout; /* Get command line options */ { int nop = do_options(progname, optlist, argc, argv); argc -= nop, argv += nop; /* and skip them */ } /* Check options compatibility */ allow_at_most_one_out_of("dnpPT"); if (is_set_option('t')) { /* threshold means percentages */ if (!is_set_option('p') && !is_set_option('P')) fatal("option -t requires -p or -P"); } /* Treat the simple options */ if (is_set_option('v')) { fprintf(stdout, "Version %s\n", VERSION); return 0; } if (is_set_option('P')) { set_option('p'); } if (is_set_option('p')) { set_option('e'); set_option('s'); } /* Treat the value options */ if (min_run_string) { Min_Run_Size = atoi(min_run_string); if (Min_Run_Size == 0) fatal("bad or zero run size; form is: -r N"); } if (page_width_string) { Page_Width = atoi(page_width_string); if (Page_Width <= 0) fatal("bad or zero page width"); } if (threshold_string) { Threshold_Percentage = atoi(threshold_string); if ((Threshold_Percentage > 100) || (Threshold_Percentage <= 0)) fatal("threshold must be between 1 and 100"); } if (output_name) { Output_File = fopen(output_name, "w"); if (Output_File == 0) { char *msg = (char *)Malloc(strlen(output_name) + 100); sprintf(msg, "cannot open output file `%s'", output_name); fatal(msg); /*NOTREACHED*/ } } /* Treat the input-determining options */ if (is_set_option('i')) { /* read input file names from standard input */ if (argc != 0) fatal("-i option conflicts with file arguments"); get_new_std_input_args(&argc, &argv); } if (is_set_option('R')) { get_new_recursive_args(&argc, &argv); } /* (argc, argv) now represents new_file* [ / old_file*] */ /* Here the real work starts */ Init_Language(); if (is_set_option('-')) { /* Just the lexical scan */ while (argv[0]) { const char *arg = argv[0]; if (!is_new_old_separator(arg)) { Print_Stream(arg); } argv++; } } else if (is_set_option('p')) { /* Show percentages */ read_and_compare_files(argc, argv, 1); Show_Percentages(); } else { /* Show runs */ read_and_compare_files(argc, argv, 1); Retrieve_Runs(); Show_Runs(); } if (is_set_option('M')) { /* It is not trivial to plug the leaks, because data structures point to each other, and have to be freed in the proper order. 
But it is not impossible either. To do, perhaps. */ ReportMemoryLeaks(stderr); } return 0; } similarity-tester-2.89.orig/pass3.h0000644000000000000000000000035012540503627014177 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: pass3.h,v 1.3 2012-06-05 09:58:53 dick Exp $ */ /* Print the contents of runs */ extern void Show_Runs(void); similarity-tester-2.89.orig/newargs.h0000644000000000000000000000047012540503627014617 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: newargs.h,v 2.4 2012-05-16 07:56:06 dick Exp $ */ extern void get_new_std_input_args(int *argcp, char const **argvp[]); extern void get_new_recursive_args(int *argcp, const char **argvp[]); similarity-tester-2.89.orig/percentages.c0000644000000000000000000001005312540503627015442 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: percentages.c,v 1.18 2015-01-18 15:33:07 dick Exp $ */ #include #include "debug.par" #include "sim.h" #include "text.h" #include "runs.h" #include "options.h" #include "Malloc.h" #include "error.h" #include "percentages.h" struct match { struct match *ma_next; const char *ma_fname0; const char *ma_fname1; size_t ma_size; /* # tokens of file 0 found in file 1 */ size_t ma_size0; /* # tokens in file 0 */ }; static struct match *match_start = 0; /* to be allocated by new() */ static void do_add_to_precentages(struct chunk ch0, struct chunk ch1, size_t size); void add_to_percentages(struct run *r) { /* percentages are only meaningful between different files */ if (r->rn_chunk0.ch_text == r->rn_chunk1.ch_text) return; do_add_to_precentages(r->rn_chunk0, r->rn_chunk1, r->rn_size); do_add_to_precentages(r->rn_chunk1, r->rn_chunk0, r->rn_size); } static void do_add_to_precentages(struct chunk ch0, struct chunk ch1, size_t size) { struct match **match_hook = &match_start; /* look up the (text0, text1) combination in the match list */ while (*match_hook) { struct match *m = *match_hook; if ( m->ma_fname0 == ch0.ch_text->tx_fname && m->ma_fname1 == ch1.ch_text->tx_fname ) { /* found it; now update it */ m->ma_size += size; return; } match_hook = &m->ma_next; } { /* it's not there; make a new entry */ struct match *m = *match_hook = new(struct match); struct text *text0 = ch0.ch_text; struct text *text1 = ch1.ch_text; m->ma_next = 0; m->ma_fname0 = text0->tx_fname; m->ma_fname1 = text1->tx_fname; m->ma_size = size; m->ma_size0 = text0->tx_limit - text0->tx_start; } } /* PRINTING */ /* We want the sorting order all contributors of the file with the highest percentage all contributors of the file with the next lower percentage etc. but this order cannot be specified by a single SORT_BEFORE(). So we sort for percentage, and then reorder during printing. 
*/ /* instantiate sort_match_list(struct match **listhook) */ static float match_percentage(struct match *m) { return (((float)m->ma_size)/((float)m->ma_size0)); } #define SORT_STRUCT match #define SORT_NAME sort_match_list #define SORT_BEFORE(p1,p2) (match_percentage(p1) > match_percentage(p2)) #define SORT_NEXT ma_next #include "sortlist.bdy" static void print_perc_info(struct match *m) { int mp = (int)(match_percentage(m)*100.0); if (mp > 100) { /* this may result from overlapping matches */ mp = 100; } if (mp >= Threshold_Percentage) { fprintf(Output_File, "%s consists for %d %% of %s material\n", m->ma_fname0, mp, m->ma_fname1 ); } } static void print_and_remove_perc_info_for_top_file(struct match **m_hook) { struct match *m = *m_hook; const char *fname = m->ma_fname0; print_perc_info(m); /* always print main contributor */ *m_hook = m->ma_next; Free(m); while ((m = *m_hook)) { if (m->ma_fname0 == fname) { /* print subsequent contributors only if not suppressed by -P */ if (!is_set_option('P')) { print_perc_info(m); } /* remove the struct */ *m_hook = m->ma_next; Free(m); } else { /* skip the struct */ m_hook = &m->ma_next; continue; } } } static void print_percentages(void) { /* destroys the match list while printing */ while (match_start) { print_and_remove_perc_info_for_top_file(&match_start); } } #ifdef DB_PERC static void print_match_list(const char *msg) { fprintf(Debug_File, "\n\n**** DB_PERCENTAGES %s ****\n", msg); struct match *ma; for (ma = match_start; ma; ma = ma->ma_next) { fprintf(Debug_File, "%s < %s, %d/%d=%3.0f%%\n", ma->ma_fname0, ma->ma_fname1, ma->ma_size, ma->ma_size0, match_percentage(ma)*100 ); } fprintf(Debug_File, "\n"); } #endif /* DB_PERC */ void Show_Percentages(void) { #ifdef DB_PERC print_match_list("before sort"); #endif /* DB_PERC */ sort_match_list(&match_start); #ifdef DB_PERC print_match_list("after sort"); #endif /* DB_PERC */ print_percentages(); } similarity-tester-2.89.orig/ForEachFile.c0000644000000000000000000001110012540503627015243 0ustar /* This file is part of the auxiliaries library. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: ForEachFile.c,v 1.19 2014-07-28 09:18:11 Gebruiker Exp $ */ #include #include #include #include #include #include "ForEachFile.h" /*Library module source prelude */ #undef _FOREACHFILE_CODE_ #ifndef lint #define _FOREACHFILE_CODE_ #endif #ifdef LIB #define _FOREACHFILE_CODE_ #endif #ifdef _FOREACHFILE_CODE_ /* Library module source code */ /* LOOP DETECTION */ struct ino_link { struct ino_link *next; long il_ino; long il_device; }; static int in_ino_list(const struct ino_link *inop, const struct stat *st) { while (inop) { #ifdef UNIX if ( inop->il_ino == st->st_ino && inop->il_device == st->st_dev ) return 1; #else #ifdef lint st = st; #endif #endif inop = inop->next; } return 0; } static void link_ino_list( struct ino_link *inop, struct ino_link *ninop, const struct stat *st ) { ninop->next = inop; ninop->il_ino = st->st_ino; ninop->il_device = st->st_dev; } /* TREE SCANNING */ #ifdef S_IFLNK /* system with symbolic links */ #define LSTAT lstat #else /* S_IFLNK */ #define LSTAT Stat #endif /* S_IFLNK */ static void do_FEF( Fchar *fn, void (*proc)(const Fchar *, const char *, const struct stat *), int dev, struct ino_link *inop, Fchar separator, int max_depth ) { struct stat fs; Dir_t *dir; if (proc == 0) return; /* just make sure */ if (LSTAT(fn, &fs) < 0) { (*proc)(fn, strerror(errno), &fs); return; } /* report on file fn */ (*proc)(fn, (char*)0, &fs); if (max_depth == 0) return; if ((fs.st_mode & S_IFMT) != S_IFDIR) return; #ifdef S_IFLNK /* don't follow links */ if ((fs.st_mode & S_IFMT) == S_IFLNK) return; #endif /* treat directory */ if (dev < 0) { /* no device known yet */ dev = fs.st_dev; } if (fs.st_dev != dev) { return; } dir = Opendir(fn); if (dir == 0) { (*proc)(fn, "directory not readable", &fs); } else { /* scan new directory */ int fnl = Fnamelen(fn); Dirent_t *dent; struct ino_link ino; /* worry about loops in the file system */ if (in_ino_list(inop, &fs)) { (*proc)(fn, "loop in file system", &fs); Closedir(dir); return; } link_ino_list(inop, &ino, &fs); /* shape up the directory name */ if (fn[fnl-1] != separator) { /* append separator */ fn[fnl++] = separator; fn[fnl] = '\0'; } /* descend */ while ((dent = Readdir(dir)) != (Dirent_t *)0) { if ( Fnamecmp(dent->d_name, str2Fname(".")) == 0 || Fnamecmp(dent->d_name, str2Fname("..")) == 0 ) continue; if (Fnamecmp(dent->d_name, str2Fname("")) == 0) { (*proc)(fn, "directory contains empty file name", &fs ); continue; } /* append name */ Fnamecat(fn, dent->d_name); do_FEF(fn, proc, dev, &ino, separator, max_depth-1); /* remove name again*/ fn[fnl] = '\0'; } Closedir(dir); } } static Fchar get_separator(const Fchar *fn) { #ifndef MSDOS (void)(fn); /* use fn */ return '/'; #else /* under MSDOS, conform to user's use, or use '\' */ Fchar sep = 0; while (*fn) { if (*fn == '/' || *fn == '\\') { if (sep == 0) { sep = *fn; } else if (sep != *fn) return 0; /* bad mixed use */ } fn++; } return (sep ? sep : '\\'); #endif } static void clean_name(Fchar *fn, Fchar sep) { Fchar *f1 = fn; Fchar *f2 = fn; /* remove multiple separators */ while (*f1) { if (*f1 == sep && *(f1+1) == sep) { f1++; } else { *f2++ = *f1++; } } *f2 = '\0'; /* remove a trailing separator */ if (f2-1 > fn && *(f2-1) == sep) { *(f2-1) = '\0'; } } static void do_ForEachFile( const Fchar *fn, void (*proc)(const Fchar *, const char *, const struct stat *), int max_depth ) { Fchar fname[MAX_FILE_NAME_LENGTH]; Fchar separator; Fnamecpy(fname, (!fn || !*fn) ? 
str2Fname(".") : fn); separator = get_separator(fname); if (!separator) { (*proc)(fname, "both / and \\ used as separators", 0); return; } clean_name(fname, separator); do_FEF(fname, proc, -1, (struct ino_link *)0, separator, max_depth); } /* THE ENTRIES */ void ForEachFile( const Fchar *fn, void (*proc)(const Fchar *, const char *, const struct stat *) ) { do_ForEachFile(fn, proc, -1); /* infinitely deep */ } void ForEachLocalFile( const Fchar *fn, void (*proc)(const Fchar *, const char *, const struct stat *), int depth ) { do_ForEachFile(fn, proc, depth); } /* End library module source code */ #endif /* _FOREACHFILE_CODE_ */ #ifdef lint static void satisfy_lint(void *x) { ForEachFile(0, 0); ForEachLocalFile(0, 0, 0); satisfy_lint(x); } #endif /* lint */ similarity-tester-2.89.orig/any_int.h0000644000000000000000000000257212540503627014617 0ustar /* This file is part of the module ANY_INT. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: any_int.h,v 1.3 2014-09-25 06:58:25 Gebruiker Exp $ */ #ifndef _ANY_INT_H_ #define _ANY_INT_H_ /* Printing size_t and very long ints. Printing integers using *printf requires specifying the format, which requires knowing the exact nature of the integer. But this is not always the case, f.e with size_t or extra-long integers for the accumulation of size_t values. Some systems use %z as a dedicated format to print size_t, but this is not portable since not all compilers know it. These problems are solved by introducing the type vlong_[u]int (see below), defined as the largest [unsigned] machine int type on the system, and routines to convert these to string. The resulting string is transient, but up to N_INDEPENDENT_CALLS calls can be used simultaneously. Since the value is passed to the conversion routines as a typed parameter the C compiler does the conversion (actually widening) for you. */ /* Public entries */ typedef long long int vlong_int; /* largest int in the system */ typedef unsigned long long int vlong_uint; /* largest uint in the system */ /* transient * N_INDEPENDENT_CALLS */ extern const char *any_int2string(vlong_int val, int size); extern const char *any_uint2string(vlong_uint val, int size); #endif /* _ANY_INT_H_ */ similarity-tester-2.89.orig/language.c0000644000000000000000000000115412540503627014727 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: language.c,v 2.3 2013-04-28 16:30:41 dick Exp $ */ /* This is a dummy implementation of the abstract class 'language'. The actual implementation is provided by one of the *lang.l files. */ #include #include #include "token.h" #include "language.h" void Init_Language(void) { abort(); } int May_Be_Start_Of_Run(Token ch) { if (ch == ch) abort(); return 0; } size_t Best_Run_Size(const Token *str, size_t size) { if (str == str || size == size) abort(); return 0; } similarity-tester-2.89.orig/pass1.c0000644000000000000000000000722312540503627014176 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: pass1.c,v 2.27 2015-01-14 16:47:27 dick Exp $ */ #include #include #include "debug.par" #include "sim.h" #include "text.h" #include "token.h" #include "tokenarray.h" #include "lang.h" #include "error.h" #include "options.h" #include "pass1.h" #ifdef DB_TEXT static void db_print_text(const struct text *); #endif static void fprint_count(FILE *f, size_t cnt, const char *); void Read_Input_Files(int argc, const char *argv[], int round /* about printing */) { int n; Init_Text(argc); Init_Token_Array(); /* Initially assume all texts to be new */ Number_of_New_Texts = Number_of_Texts; /* Read the files */ for (n = 0; n < Number_of_Texts; n++) { const char *fname = argv[n]; struct text *txt = &Text[n]; if (round == 1 && !is_set_option('T')) { fprintf(Output_File, "File %s: ", fname); } txt->tx_fname = fname; txt->tx_pos = 0; txt->tx_start = Token_Array_Length(); txt->tx_limit = Token_Array_Length(); if (is_new_old_separator(fname)) { if (round == 1 && !is_set_option('T')) { fprintf(Output_File, "separator\n"); } Number_of_New_Texts = n; } else { if (!Open_Text(First_Pass, txt)) { if (round == 1 && !is_set_option('T')) { fprintf(Output_File, ">>>> cannot open <<<< "); } /* the file has still been opened with a null file for uniformity */ } while (Next_Text_Token_Obtained()) { if (!Token_EQ(lex_token, End_Of_Line)) { Store_Token(lex_token); } } Close_Text(First_Pass, txt); txt->tx_limit = Token_Array_Length(); txt->tx_EOL_terminated = Token_EQ(lex_token, End_Of_Line); /* report */ if (round == 1 && !is_set_option('T')) { fprint_count(Output_File, txt->tx_limit - txt->tx_start, token_name ); fprintf(Output_File, ", "); fprint_count(Output_File, lex_nl_cnt - 1 + (!txt->tx_EOL_terminated ? 1 : 0), "line" ); if (!txt->tx_EOL_terminated) { fprintf(Output_File, " (not NL-terminated)"); } if (lex_non_ascii_cnt) { fprintf(Output_File, ", "); fprint_count(Output_File, lex_non_ascii_cnt, "non-ASCII character" ); } fprintf(Output_File, "\n"); } #ifdef DB_TEXT db_print_text(txt); #endif /* DB_TEXT */ } fflush(Output_File); } /* report total */ if (round == 1 && !is_set_option('T')) { fprintf(Output_File, "Total: "); fprint_count(Output_File, Token_Array_Length() - 1, token_name); fprintf(Output_File, "\n\n"); fflush(Output_File); } } static void fprint_count(FILE *f, size_t cnt, const char *unit) { /* Prints a grammatically correct string "%u %s[s]" for units that form their plural by suffixing -s. */ fprintf(f, "%s %s%s", size_t2string(cnt), unit, (cnt == 1 ? "" : "s")); } #ifdef DB_TEXT static void db_print_text(const struct text *txt) { /* prints a text (in compressed form) */ size_t i; fprintf(Debug_File, "\n\n**** DB_PRINT_TEXT ****\n"); fprintf(Debug_File, "File \"%s\", %s %ss, ", txt->tx_fname, size_t2string(txt->tx_limit - txt->tx_start), token_name ); fprintf(Debug_File, "txt->tx_start = %s, txt->tx_limit = %s\n", size_t2string(txt->tx_start), size_t2string(txt->tx_limit) ); int BoL = 1; for (i = txt->tx_start; i < txt->tx_limit; i++) { if (BoL) { fprintf(Debug_File, "[%s]:", size_t2string(i)); BoL = 0; } fprintf(Debug_File, " "); fprint_token(Debug_File, Token_Array[i]); if ((i - txt->tx_start + 1) % 10 == 0) { fprintf(Debug_File, "\n"); BoL = 1; } } fprintf(Debug_File, "\n"); } #endif /* DB_TEXT */ similarity-tester-2.89.orig/runs.h0000644000000000000000000000230312540503627014135 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: runs.h,v 1.7 2013-04-28 16:30:42 dick Exp $ */ /* Although all other segments of data in this program are described by giving the position of the first in the segment and that of the first not in the segment (so the size is the difference of the two), a `chunk' is given by first and last. This is done because later on we are interested in the actual position of the last token of it, and the position of the first token not in the segment gives no indication about that. */ struct chunk { /* a chunk of text in various representations */ struct text *ch_text; /* pointer to the file */ struct position ch_first; /* first in chunk */ struct position ch_last; /* last in chunk */ }; struct run { /* a 'run' of coincident tokens */ struct chunk rn_chunk0; /* chunk in left file */ struct chunk rn_chunk1; /* chunk in right file */ size_t rn_size; }; #define AISO_TYPE struct run * #define AISO_ITER #include "aiso.spc" extern void add_to_runs(struct run *r); #ifdef DB_RUN extern void db_run_info(const char *msg, const struct run *run, int lines_too); #endif /* DB_RUN */ similarity-tester-2.89.orig/error.c0000644000000000000000000000066512540503627014303 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: error.c,v 2.7 2015-01-22 20:54:30 dick Exp $ */ #include #include #include "sim.h" #include "error.h" void fatal(const char *msg) { #ifdef lint /* prevent non-use messages in lint */ min_run_string = 0; threshold_string = 0; #endif fprintf(stderr, "%s: %s\n", progname, msg); exit(1); } similarity-tester-2.89.orig/text.c0000644000000000000000000001252212540503627014131 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: text.c,v 1.18 2015-01-17 10:20:41 dick Exp $ */ #include #include #include "debug.par" #include "sim.h" #include "token.h" #include "stream.h" #include "lang.h" #include "Malloc.h" #include "options.h" #include "error.h" #include "text.h" struct text *Text; /* to be filled in by Malloc() */ int Number_of_Texts; int Number_of_New_Texts; typedef unsigned short nl_tk_diff_t; struct newline { nl_tk_diff_t nl_tk_diff; /* token position difference */ }; #define NL_START 1024 /* initial newline buffer size */ static struct newline *nl_buff; /* to be filled by Malloc() */ static size_t nl_size; /* size of nl_buff[] */ static size_t nl_free; /* next free position in nl_buff[] */ static size_t nl_next, nl_limit; /* nl_buff[] pointers during pass 2 */ static void store_newline(void); static void init_nl_buff(void); /* TEXT INTERFACE */ static size_t last_tk_cnt; /* token count at newline */ static size_t last_nl_cnt; /* nl counter during pass 2 */ void Init_Text(int nfiles) { /* allocate the array of text descriptors */ if (Text) { Free(Text); Text = 0; } Number_of_Texts = nfiles; Text = (struct text *) Malloc((size_t)(Number_of_Texts*sizeof (struct text))); init_nl_buff(); } int Open_Text(enum Pass pass, struct text *txt) { switch (pass) { case First_Pass: last_tk_cnt = 0; if (nl_buff) { txt->tx_nl_start = nl_free; } break; case Second_Pass: last_tk_cnt = 0; if (nl_buff) { nl_next = txt->tx_nl_start; nl_limit = txt->tx_nl_limit; last_nl_cnt = 1; lex_nl_cnt = 1; lex_tk_cnt = 0; return 1; } break; } return Open_Stream(txt->tx_fname); } int Next_Text_Token_Obtained(void) { if (!Next_Stream_Token_Obtained()) return 0; if (Token_EQ(lex_token, End_Of_Line)) { store_newline(); last_tk_cnt = lex_tk_cnt; } return 1; } int Next_Text_EOL_Obtained(void) { /* get newline info from the buffer or from the file itself */ if (nl_buff) { if (nl_next == nl_limit) return 0; struct newline *nl = &nl_buff[nl_next++]; lex_nl_cnt = ++last_nl_cnt; lex_tk_cnt = (last_tk_cnt += nl->nl_tk_diff); lex_token = End_Of_Line; return 1; } else { int ok; while ( (ok = Next_Stream_Token_Obtained()) && !Token_EQ(lex_token, End_Of_Line) ) { /* skip */ } return ok; } } void Close_Text(enum Pass pass, struct text *txt) { switch (pass) { case First_Pass: if (nl_buff) { if (last_tk_cnt != lex_tk_cnt) { /* there were tokens after the last newline */ store_newline(); } txt->tx_nl_limit = nl_free; } break; case Second_Pass: break; } Close_Stream(); } /* NEWLINE CACHING */ /* To speed up pass2 which is interested in token positions at line ends, the newline buffer keeps this info from pass1. To reduce the size of the newline buffer, the info is kept as the differences of the values at consecutive line ends. This allows unsigned chars to be used rather than integers. The recording of token position differences at End_Of_Line is optional, and is switched off if - there is not room enough for the newline buffer. - a difference would not fit in the field in the struct. Switching off is done by freeing the buffer and setting nl_buff to 0. Anybody using nl_buff should therefore test for nl_buff being zero. 
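A small worked example (figures for illustration only): if the running token count at the first three line ends of a file is 5, 12 and 12 (i.e. the third line holds no tokens of its own), store_newline() records the differences 5, 7 and 0; during pass 2, Next_Text_EOL_Obtained() reconstructs the absolute line and token counts by adding each stored difference to a running total.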
*/ static void abandon_nl_buff(const char *); static void init_nl_buff(void) { /* Allocate the newline buffer, if possible */ nl_size = 0 + NL_START; nl_buff = (struct newline *)TryMalloc(sizeof (struct newline)*nl_size); nl_free = 0; } static void store_newline(void) { if (!nl_buff) return; if (nl_free == nl_size) { /* allocated array is full; try to increase its size */ size_t new_size = nl_size + nl_size/2; if (new_size < nl_free) { abandon_nl_buff("out of address space"); return; } struct newline *new_buff = (struct newline *)TryRealloc( (char *)nl_buff, sizeof (struct newline) * new_size ); if (!new_buff) { abandon_nl_buff("out of memory"); return; } nl_buff = new_buff, nl_size = new_size; } /* now we are sure there is room enough */ { struct newline *nl = &nl_buff[nl_free++]; size_t tk_diff = lex_tk_cnt - last_tk_cnt; nl->nl_tk_diff = (nl_tk_diff_t) tk_diff; if (nl->nl_tk_diff != tk_diff) { abandon_nl_buff("tk_diff does not fit in nl_tk_diff"); } } } static void /*ARGSUSED*/ abandon_nl_buff(const char *msg) { #undef DB_BUFF #ifdef DB_BUFF fprintf(Debug_File, "abandon_nl_buff, %s\n", msg); #endif /* DB_BUFF */ if (nl_buff) { Free((char *)nl_buff); nl_buff = 0; } } #ifdef DB_NL_BUFF void db_print_nl_buff(size_t start, size_t limit) { size_t i; fprintf(Debug_File, "\n**** DB_NL_BUFF ****\n"); if (!nl_buff) { fprintf(Debug_File, ">>>> NO NL_BUFF\n\n"); return; } if (start > nl_free) { fprintf(Debug_File, ">>>> start (%s) > nl_free (%s)\n\n", size_t2string(start), size_t2string(nl_free) ); return; } if (limit > nl_free) { fprintf(Debug_File, ">>>> limit (%s) > nl_free (%s)\n\n", size_t2string(limit), size_t2string(nl_free) ); return; } fprintf(Debug_File, "nl_buff: %s entries:\n", size_t2string(nl_free)); for (i = start; i < limit; i++) { struct newline *nl = &nl_buff[i]; fprintf(Debug_File, "nl_tk_diff = %d\n", nl->nl_tk_diff); } fprintf(Debug_File, "\n"); } #endif /* DB_NL_BUFF */ similarity-tester-2.89.orig/m2lang.l0000644000000000000000000001536112540503627014342 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: m2lang.l,v 2.19 2013-04-28 16:30:41 dick Exp $ */ /* Modula-2 language front end for the similarity tester. Author: Dick Grune */ #include "options.h" #include "token.h" #include "language.h" #include "algollike.h" #include "idf.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; size_t lex_nl_cnt; size_t lex_tk_cnt; size_t lex_non_ascii_cnt; /* Language-dependent data */ /* Most Modula-2 programs start with a number of IMPORTs that look very similar from program to program. These are skipped by ignoring the reserved words IMPLEMENTATION, DEFINITION, MODULE, IMPORT and FROM, having a flag skip_imports, and start reacting only at the first non-ignored reserved word. Also, the nesting comments require a state variable.
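As an illustration (hypothetical input, not part of the tester): in a module that starts with IMPLEMENTATION MODULE Tables; FROM Storage IMPORT ALLOCATE, DEALLOCATE; IMPORT InOut; VAR size: CARDINAL; nothing before the reserved word VAR produces a token: the import-related keywords map to No_Token, and identifiers and punctuation are suppressed while skip_imports is set. VAR is the first non-ignored reserved word, so it clears skip_imports and is passed on normally.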
*/ /* Additional state variables, set in yystart() */ static int skip_imports; static int comment_level; /* Data for module idf */ static const struct idf reserved[] = { {"AND", NORM('&')}, {"ARRAY", NORM('A')}, {"BEGIN", NORM('{')}, {"BY", NORM('B')}, {"CASE", NORM('c')}, {"CONST", NORM('C')}, {"DEFINITION", No_Token}, {"DIV", NORM('/')}, {"DO", NORM('D')}, {"ELSE", NORM('e')}, {"ELSIF", NORM('e')}, {"END", NORM('}')}, {"EXIT", NORM('E')}, {"EXPORT", CTRL('E')}, {"FOR", NORM('F')}, {"FROM", No_Token}, {"IF", NORM('i')}, {"IMPLEMENTATION", No_Token}, {"IMPORT", No_Token}, {"IN", NORM('I')}, {"LOOP", NORM('l')}, {"MOD", NORM('%')}, {"MODULE", No_Token}, {"NOT", NORM('~')}, {"OF", No_Token}, {"OR", NORM('O')}, {"POINTER", NORM('p')}, {"PROCEDURE", NORM('P')}, {"QUALIFIED", NORM('q')}, {"RECORD", NORM('r')}, {"REPEAT", NORM('R')}, {"RETURN", CTRL('r')}, {"SET", NORM('s')}, {"THEN", No_Token}, {"TO", NORM('t')}, {"TYPE", NORM('T')}, {"UNTIL", NORM('u')}, {"VAR", NORM('v')}, {"WHILE", NORM('w')}, {"WITH", NORM('W')}, }; static const struct idf standard[] = { {"ABS", META('a')}, {"ADDRESS", META('A')}, {"ALLOCATE", MTCT('A')}, {"BITSET", META('b')}, {"BOOLEAN", META('B')}, {"CAP", META('c')}, {"CARDINAL", META('C')}, {"CHAR", MTCT('C')}, {"CHR", META('x')}, {"DEALLOCATE", META('d')}, {"DEC", META('D')}, {"EXCL", META('e')}, {"FALSE", META('f')}, {"FLOAT", META('F')}, {"HALT", META('h')}, {"HIGH", META('H')}, {"INC", META('i')}, {"INCL", META('I')}, {"INTEGER", MTCT('I')}, {"LONGCARD", META('L')}, {"LONGINT", META('L')}, {"LONGREAL", META('L')}, {"MAX", META('m')}, {"MIN", META('M')}, {"NEWPROCESS", META('n')}, {"NIL", META('N')}, {"ODD", META('o')}, {"ORD", META('O')}, {"PROC", META('p')}, {"REAL", META('r')}, {"SIZE", META('s')}, {"SYSTEM", META('S')}, {"TRANSFER", META('t')}, {"TRUE", META('T')}, {"TRUNC", MTCT('T')}, {"VAL", META('v')}, {"WORD", META('w')} }; /* Special treatment of identifiers */ static Token idf2token(int hashing) { Token tk; /* the token can be on two lists, reserved and standard */ tk = idf_in_list(yytext, reserved, sizeof reserved, IDF); /* is it one of the keywords to be ignored? */ if (Token_EQ(tk, No_Token)) return tk; /* The statement below is a significant comment on the value of state variables. 
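In other words: the first reserved word that is not one of the ignored import-related keywords marks the end of the import section, so the code below clears skip_imports; plain identifiers, by contrast, are simply dropped for as long as skip_imports is still set.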
*/ if (!Token_EQ(tk, IDF)) { /* reserved word, stop the skipping */ skip_imports = 0; } else { /* it is an identifier but not a reserved word */ if (skip_imports) { /* skip it */ tk = 0; } else { /* look further */ tk = idf_in_list(yytext, standard, sizeof standard, IDF); if (Token_EQ(tk, IDF) && hashing) { /* return a one-Token hash code */ tk = idf_hashed(yytext); } } } return tk; } /* Token sets for module algollike */ const Token Non_Finals[] = { IDF, /* identifier */ NORM('{'), /* also BEGIN */ NORM('('), NORM('['), NORM('A'), /* ARRAY */ NORM('c'), /* CASE */ NORM('C'), /* CONST */ NORM('E'), /* EXIT */ NORM('F'), /* FOR */ NORM('i'), /* IF */ NORM('l'), /* LOOP */ NORM('p'), /* POINTER */ NORM('P'), /* PROCEDURE */ NORM('r'), /* RECORD */ NORM('R'), /* REPEAT */ CTRL('R'), /* RETURN */ NORM('s'), /* SET */ NORM('T'), /* TYPE */ NORM('v'), /* VAR */ NORM('w'), /* WHILE */ NORM('W'), /* WITH */ No_Token }; const Token Non_Initials[] = { NORM('}'), NORM(')'), NORM(']'), NORM(';'), No_Token }; const Token Openers[] = { NORM('{'), NORM('('), NORM('['), No_Token }; const Token Closers[] = { NORM('}'), NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } size_t Best_Run_Size(const Token *str, size_t size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) AnyQuoted (\\.) QuStrChar ([^"\n\\]|{AnyQuoted}) ApoStrChar ([^'\n\\]|{AnyQuoted}) StartComment ("(*") EndComment ("*)") SafeComChar ([^*\n]) UnsafeComChar ("*") Digit ([0-9a-fA-F]) Idf ([A-Za-z][A-Za-z0-9_]*) %% {StartComment} { /* See clang.l */ /* Lex itself is incapable of handling Modula-2's nested comments. So let's help it a bit. */ if (comment_level == 0) { BEGIN Comment; } comment_level++; } {SafeComChar}+ { /* safe comment chunk */ } {UnsafeComChar} { /* unsafe char, read one by one */ } "\n" { /* to break up long comments */ return_eol(); } {EndComment} { /* end-of-comment */ comment_level--; if (comment_level == 0) { BEGIN INITIAL; } } \"{QuStrChar}*\" { /* quoted strings */ return_ch('"'); } \'{ApoStrChar}*\' { /* apostrophed strings */ return_ch('"'); } {Digit}+("B"|"C"|"H")? { /* numeral, passed as an identifier */ return_tk(IDF); } "END"{Layout}*{Idf} { /* ignore identifier after END */ Token tk = idf_in_list("END", reserved, sizeof reserved, No_Token); if (!Token_EQ(tk, No_Token)) return_tk(tk); } {Idf}/"(" { /* identifier in front of ( */ Token tk = idf2token(is_set_option('F')/* hashing option */); if (!Token_EQ(tk, No_Token)) return_tk(tk); } {Idf} { /* identifier */ Token tk = idf2token(0 /* no hashing */); if (!Token_EQ(tk, No_Token)) return_tk(tk); } "<>" { /* <>, special equivalence */ return_ch('#'); } \; { /* semicolon, conditionally ignored */ if (is_set_option('f')) return_ch(yytext[0]); } \n { /* count newlines */ return_eol(); } {Layout} { /* ignore layout */ } {ASCII95} { /* copy other text */ if (!skip_imports) return_ch(yytext[0]); } . { /* count non-ASCII chars */ lex_non_ascii_cnt++; } %% /* More language-dependent code */ void yystart(void) { skip_imports = 1; comment_level = 0; BEGIN INITIAL; } similarity-tester-2.89.orig/pascallang.l0000644000000000000000000001152412540503627015264 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: pascallang.l,v 2.20 2015-01-17 10:20:40 dick Exp $ */ /* PASCAL language front end for the similarity tester. Author: Maarten van der Meulen Date: May 1986 */ #include "options.h" #include "token.h" #include "language.h" #include "algollike.h" #include "idf.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; size_t lex_nl_cnt; size_t lex_tk_cnt; size_t lex_non_ascii_cnt; /* Language-dependent data */ /* Data for module idf */ static const struct idf ppcmd[] = { {"define", META('d')}, {"else", META('e')}, {"endif", META('E')}, {"if", META('i')}, {"ifdef", META('I')}, {"ifndef", META('x')}, {"include", MTCT('I')}, {"line", META('l')}, {"undef", META('u')} }; static const struct idf reserved[] = { {"and", NORM('&')}, {"array", NORM('A')}, {"begin", NORM('{')}, {"case", NORM('c')}, {"const", NORM('C')}, {"div", NORM('/')}, {"do", NORM('D')}, {"downto", NORM('d')}, {"else", NORM('e')}, {"end", NORM('}')}, {"extern", CTRL('E')}, {"file", NORM('F')}, {"for", NORM('f')}, {"function", NORM('p')}, /* Equal to procedure */ {"goto", NORM('g')}, {"if", NORM('i')}, {"in", NORM('I')}, {"label", NORM('l')}, {"mod", NORM('%')}, {"nil", NORM('n')}, {"not", NORM('!')}, {"of", No_Token}, {"or", NORM('|')}, {"packed", NORM('P')}, {"procedure", NORM('p')}, {"program", No_Token}, {"record", NORM('r')}, {"repeat", NORM('R')}, {"set", NORM('s')}, {"then", No_Token}, {"to", NORM('t')}, {"type", NORM('T')}, {"until", NORM('u')}, {"var", NORM('v')}, {"while", NORM('w')}, {"with", NORM('W')} }; /* Special treatment of identifiers */ static Token idf2token(int hashing) { Token tk; lower_case(yytext); /* Pascal is case-insensitive */ tk = idf_in_list(yytext, reserved, sizeof reserved, IDF); if (Token_EQ(tk, IDF) && hashing) { /* return a one-Token hash code */ tk = idf_hashed(yytext); } return tk; } /* Token sets for module algollike */ const Token Non_Finals[] = { IDF, /* identifier */ NORM('{'), /* also begin */ NORM('('), NORM('['), NORM('A'), /* array */ NORM('c'), /* case */ NORM('C'), /* const */ NORM('/'), /* div */ CTRL('E'), /* extern */ NORM('F'), /* file */ NORM('f'), /* for */ NORM('g'), /* goto */ NORM('i'), /* if */ NORM('l'), /* label */ NORM('P'), /* packed */ NORM('p'), /* procedure/function */ NORM('r'), /* record */ NORM('R'), /* repeat */ NORM('s'), /* set */ NORM('T'), /* type */ NORM('v'), /* var */ NORM('w'), /* while */ NORM('W'), /* with */ No_Token }; const Token Non_Initials[] = { NORM(')'), NORM('}'), NORM(';'), No_Token }; const Token Openers[] = { NORM('{'), NORM('('), NORM('['), No_Token }; const Token Closers[] = { NORM('}'), NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } size_t Best_Run_Size(const Token *str, size_t size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) AnyQuoted (\\.) 
StrChar ([^'\n\\]|{AnyQuoted}) StartComment ("{"|"(*") EndComment ("}"|"*)") SafeComChar ([^*}\n]) UnsafeComChar ("*") Digit ([0-9]) Idf ([A-Za-z][A-Za-z0-9_]*) %% {StartComment} { /* See clang.l */ BEGIN Comment; } {SafeComChar}+ { /* safe comment chunk */ } {UnsafeComChar} { /* unsafe char, read one by one */ } "\n" { /* to break up long comments */ return_eol(); } {EndComment} { /* end-of-comment */ BEGIN INITIAL; } \'{StrChar}*\' { /* character strings */ return_ch('"'); } ^#{Layout}*include.* { /* ignore #include lines */ } ^#{Layout}*{Idf} { /* a preprocessor line */ char *idf = yytext+1; /* skip layout in front of preprocessor identifier */ while (*idf == ' ' || *idf == '\t') { idf++; } return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#'))); } {Digit}+ { /* numeral, passed as an identifier */ return_tk(IDF); } {Idf}/"(" { /* identifier in front of ( */ Token tk; tk = idf2token(is_set_option('F')); if (!Token_EQ(tk, No_Token)) return_tk(tk); } {Idf} { /* identifier */ Token tk; tk = idf2token(0 /* no hashing */); if (!Token_EQ(tk, No_Token)) return_tk(tk); } \; { /* semicolon, conditionally ignored */ if (is_set_option('f')) return_ch(yytext[0]); } \n { /* count newlines */ return_eol(); } {Layout} { /* ignore layout */ } {ASCII95} { /* copy other text */ return_ch(yytext[0]); } . { /* count non-ASCII chars */ lex_non_ascii_cnt++; } %% /* More language-dependent code */ void yystart(void) { BEGIN INITIAL; } similarity-tester-2.89.orig/hash.h0000644000000000000000000000062212540503627014073 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: hash.h,v 1.4 2013-04-28 16:30:40 dick Exp $ */ /* Creating and consulting forward_reference[], used to speed up the Longest Substring Allgorithm. */ extern void Make_Forward_References(void); extern void Free_Forward_References(void); extern size_t Forward_Reference(size_t i); similarity-tester-2.89.orig/lang.h0000644000000000000000000000151612540503627014074 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lang.h,v 1.8 2013-04-28 16:30:41 dick Exp $ */ /* The *lang.l files provide two interfaces: language.[ch] static data about the language lang.[ch] dynamic data about the input file's content This is lang.[ch]. */ /* The abstract module 'lang' provides access to the lowest-level token routines and data. The actual implementation derives from one of the *lang.l files. There is a dummy implementation lang.c. */ extern FILE *yyin; extern int yylex(void); extern void yystart(void); extern Token lex_token; /* token produced, or End_Of_Line */ extern size_t lex_nl_cnt; /* line count */ extern size_t lex_tk_cnt; /* token position */ extern size_t lex_non_ascii_cnt; /* # of non-ASCII chars found */ similarity-tester-2.89.orig/lex.c0000644000000000000000000000037512540503627013740 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lex.c,v 1.9 2012-06-08 16:04:28 dick Exp $ */ /* The service macros for the *lang.l files do not require code */ #include "lex.h" similarity-tester-2.89.orig/hash.c0000644000000000000000000003153412540503627014074 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: hash.c,v 2.27 2015-01-17 10:20:40 dick Exp $ */ /* Text is compared by comparing every substring to all substrings to the right of it; this process is in essence quadratic. However, only substrings of length at least 'Min_Run_Size' are of interest, which gives us the possibility to speed up this process by using a hash table. For every position p in the text, we construct an index forward_reference[p] which gives the next position in the text at which a run of Min_Run_Size tokens starts that has the same hash code, as calculated by hash1(). If there is no such run, the index is 0. To construct this array, we use a hash table last_index[] whose size is a prime and which is about 8 times smaller than the text array. The hash table last_index[] is set up such that last_index[i] is the index of the latest token with hash_code i, or 0 if there is none. This results in hash chains of an average length of 8. See Make_Forward_References(). If there is not enough room for a hash table of the proper size (which can be considerable) the hashing is not efficient any more. In that case, the forward reference table is scanned a second time, eliminating from any chain all references to runs that do not hash to the same value under a second hash function, hash2(). For the UNIX manuals this reduced the number of matches from 91.9% to 1.9% (of which 0.06% was genuine). The forward references can be checked with db_forward_reference_check(), which also collects Statistics. These can be compared to the perfect forward references created by db_make_forward_references_perfect(). For the LaTeX sourcces of our book Modern Compiler Desgin, 2nd Ed. the second hashing reduced the total forward chain length from 103555 to 388, whereas the total length for perfect forward references would be 345, all 3 numbers as determined by db_forward_reference_check(). */ #include #include #include "system.par" #include "debug.par" #include "sim.h" #include "text.h" #include "Malloc.h" #include "error.h" #include "any_int.h" #include "token.h" #include "language.h" #include "token.h" #include "tokenarray.h" #include "options.h" #include "hash.h" /* MAIN ENTRIES */ static size_t *forward_reference; /* to be filled by Malloc() */ static size_t n_forward_references; static void make_forward_references_hash1(void); static void clean_forward_references_hash2(void); #ifdef DB_FORW_REF static void db_forward_reference_check(const char *); static void db_make_forward_references_perfect(void); #endif /* DB_FORW_REF */ void Make_Forward_References(void) { /* Constructs the forward references table. */ n_forward_references = Token_Array_Length(); forward_reference = (size_t *)Calloc(n_forward_references, sizeof (size_t)); make_forward_references_hash1(); clean_forward_references_hash2(); #ifdef DB_FORW_REF db_make_forward_references_perfect(); #endif /* DB_FORW_REF */ } size_t Forward_Reference(size_t i) { if (i == 0 || i >= n_forward_references) { fatal("internal error, bad forward reference"); } return forward_reference[i]; } void Free_Forward_References(void) { Free((char *)forward_reference); } /* HASHING */ /* We want a hash function whose time cost does not depend on Min_Run_Size, which is a problem since the size of the object we derive the hash value from *is* equal to Min_Run_Size! Therefore we base the hash function on a sample of at most N_SAMPLES tokens from the input string; this works just as well in practice. 
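A worked example (figures for illustration only): with the default Min_Run_Size of 24 tokens, N_SAMPLES == 24 means that every position of the run is sampled exactly once; with Min_Run_Size == 48 the 24 sample positions are spread roughly evenly over the 48 positions, as computed by init_hash_table() below.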
*/ #define N_SAMPLES 24 static size_t *last_index; static size_t last_index_table_size; /* positions where the N_SAMPLES samples can be found: */ static size_t sample_pos[N_SAMPLES]; /* The prime numbers of the form 4 * i + 3 for some i, all greater than twice the previous one and smaller than 2^40 (for now). */ static const uint64_t prime[] = { #if 0 3, 7, 19, 43, 103, 211, 431, 863, 1747, 3499, 7019, #endif 14051, 28111, 56239, 112507, 225023, 450067, 900139, 1800311, 3600659, 7201351, 14402743, 28805519, 57611039, 115222091, 230444239, 460888499, 921777067, 1843554151, UINT64_C (3687108307), UINT64_C (7374216631), UINT64_C (14748433279), UINT64_C (29496866579), UINT64_C (58993733159), UINT64_C (117987466379), UINT64_C (235974932759), UINT64_C (471949865531), UINT64_C (943899731087) /* 2^40= 1099511627776 */ }; static void init_hash_table(void) { int n; /* find the ideal hash table size */ n = 0; while (prime[n] < Token_Array_Length()) { n++; /* this will always terminate, if prime[] is large enough */ } /* see if we can allocate that much space, and if not, step down */ last_index = 0; while ( /* we have not yet obtained our array */ !last_index && /* and there is still a (prime) size left to try */ n >= 0 ) { last_index_table_size = prime[n]; last_index = (size_t *) TryCalloc(last_index_table_size, sizeof (size_t)); n--; } if (!last_index) { fatal("out of memory"); } /* find sample positions (if Min_Run_Size < N_SAMPLES there will be duplicates) */ for (n = 0; n < N_SAMPLES; n++) { /* straight-line approximation; uninituitive as usual */ sample_pos[n] = ( (2 * n * (Min_Run_Size - 1) + (N_SAMPLES - 1)) / (2 * (N_SAMPLES - 1)) ); } } static size_t hash1(const Token *); static void make_forward_references_hash1(void) { int n; init_hash_table(); /* set up the forward references using the last_index[] hash table */ for (n = 0; n < Number_of_Texts; n++) { struct text *txt = &Text[n]; size_t j; for ( /* all positions in txt ... */ j = txt->tx_start; /* >= 1 */ /* ... except the last Min_Run_Size-1 */ j + Min_Run_Size - 1 < txt->tx_limit; j++ ) { if (May_Be_Start_Of_Run(Token_Array[j])) { /* the hash value is used here for an index */ size_t h = hash1(&Token_Array[j]) % last_index_table_size; if (last_index[h]) { forward_reference[last_index[h]] = j; } last_index[h] = j; } } } Free((char *)last_index); #ifdef DB_FORW_REF db_forward_reference_check("first hashing"); #endif /* DB_FORW_REF */ } static size_t hash1(const Token *p) { /* The function hash1(p) returns a hash code of the Min_Run_Size tokens starting at p; caller guarantees that there are at least Min_Run_Size tokens. Since its value is used as an index in a hash array, it needs to be as smooth as possible. Its type is size_t. */ /* The hash type and its width */ #if 0 #define HASH_T uint64_t #define HASH_W 64 #else #define HASH_T uint32_t /* Turns out to be at least as good */ #define HASH_W 32 #endif HASH_T h_val; int n; /* The hash operation */ #if 1 #define OPERATION ^ #elif 0 #define OPERATION + /* does not seem to make any diff. */ #else #define OPERATION + 613 * /* does not seem to make any diff. 
*/ #endif h_val = 0; for (n = 0; n < N_SAMPLES; n++) { /* left-most bit of h_val is 0 */ /* do a circular left shift over the HASH_W-1 right-most bits */ h_val <<= 1; if ( /* left-most bit of h_val is now 1 */ h_val & (((HASH_T)1)<<(HASH_W-1)) ) { /* move it to the end */ h_val ^= (((HASH_T)1)<<(HASH_W-1)|1); } /* left-most bit of h_val is again 0 */ /* update */ h_val = h_val OPERATION Token2int(p[sample_pos[n]]); /* left-most bit of h_val is still 0 */ } #ifdef DB_HASH size_t h = (size_t)h_val; fprintf(Debug_File, "h_val = %s\n", any_uint2string(h, 0)); #endif /* DB_HASH */ return (size_t)h_val; } static vlong_uint hash2(const Token *); static void clean_forward_references_hash2(void) { size_t i; /* Clean out spurious matches, by a slightly quadratic algorithm. */ for (i = 0; i+Min_Run_Size < Token_Array_Length(); i++) { size_t j = i; vlong_uint h2 = hash2(&Token_Array[i]); /* The hash value h2 is used as a representative.*/ /* Find the first token sequence in the chain with the same secondary hash code ... */ while ( /* there is still a forward reference */ (j = forward_reference[j]) && /* its hash code does not match */ hash2(&Token_Array[j]) != h2 ) { /* continue searching */ } /* ... and short-circuit forward reference to it, or to zero. */ forward_reference[i] = j; } #ifdef DB_FORW_REF db_forward_reference_check("second hashing"); #endif /* DB_FORW_REF */ } static vlong_uint hash2(const Token *p) { /* The function hash2(p) returns a representative code for the Min_Run_Size tokens starting at p; caller guarantees that there are at least Min_Run_Size tokens. Since its value is used as a representative in a comparison, it needs to be as unique as possible. Its type is vlong_uint. */ int pos_last_sample = N_SAMPLES - 1; vlong_uint h_val = 0; /* macro for readability (not relying on C compiler to do in-lining) */ #define extract_Token(pos) ((vlong_uint)Token2int(p[sample_pos[pos]])) #define VLONG_W ((sizeof (vlong_uint))*8) h_val ^= extract_Token(0) << 0; h_val ^= extract_Token(pos_last_sample) << (VLONG_W*1/5); h_val ^= extract_Token(pos_last_sample/2) << (VLONG_W*2/5); h_val ^= extract_Token(pos_last_sample*1/4) << (VLONG_W*3/5); h_val ^= extract_Token(pos_last_sample*3/4) << (VLONG_W*4/5); #ifdef DB_HASH /* print the result */ fprintf(Debug_File, "hash2 = %s\n", any_uint2string(h_val, 0)); #endif /* DB_HASH */ return h_val; } #ifdef DB_FORW_REF static void db_print_forward_references(void) { size_t n; size_t *printed_at = (size_t *)Calloc(Token_Array_Length(), sizeof (size_t)); for (n = 1; n < Token_Array_Length(); n++) { size_t fw = forward_reference[n]; if (fw == 0) continue; fprintf(Debug_File, "FWR[%s]:", any_uint2string(n, 0)); if (printed_at[fw]) { fprintf(Debug_File, " see %s", any_uint2string(printed_at[fw], 0)); } else { while (fw) { fprintf(Debug_File, " %s", any_uint2string(fw, 0)); printed_at[fw] = n; fw = forward_reference[fw]; } } fprintf(Debug_File, "\n"); } Free((void *)printed_at); } static int is_eq_min_run(const Token *p, const Token *q) { /* a full comparison for the tertiary sweep */ size_t n; for (n = 0; n < Min_Run_Size; n++) { if (!Token_EQ(p[n], q[n])) return 0; } return 1; } static void db_make_forward_references_perfect(void) { size_t i; /* Simulate a perfect hash by doing a full comparison over Min_Run_Size, for gathering statistics. 
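After this pass, forward_reference[i] points to the next position whose first Min_Run_Size tokens are literally equal to those at position i (or is 0 if there is none); no hash function can produce shorter chains, so the statistics reported by db_forward_reference_check() below serve as a baseline for the two hashing passes above.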
*/ for (i = 0; i+Min_Run_Size < Token_Array_Length(); i++) { size_t j = i; while ( /* there is still a forward reference */ (j = forward_reference[j]) && /* it does not match over Min_Run_Size */ !is_eq_min_run(&Token_Array[i], &Token_Array[j]) ) { /* continue searching */ } /* short-circuit forward reference to it, or to zero */ forward_reference[i] = j; } /* now we have perfect forward references */ db_forward_reference_check("full Min_Run_Size comparison"); } static size_t db_frw_chain(size_t n, char *crossed_out) { if (forward_reference[n] == 0) { fprintf(Debug_File, ">>>> db_frw_chain() forward_reference[n] == 0 <<<<\n" ); return 0; } size_t n_entries = 0; size_t fw; for (fw = n; fw; fw = forward_reference[fw]) { if (crossed_out[fw]) { fprintf(Debug_File, ">>>> error: forward references cross <<<<\n" ); } n_entries++; crossed_out[fw] = 1; } #ifdef DB_FORW_REF_PRINT fprintf(Debug_File, "chain_start = %s, n_entries = %s\n", any_uint2string(n, 0), any_uint2string(n_entries, 0)); #endif /* DB_FORW_REF_PRINT */ /* return chain length */ return n_entries - 1; } static void db_forward_reference_check(const char *msg) { /* Each forward_reference[n] starts in principle a new chain, and these chains never touch each other. We check this property by marking the positions in each chain in an array; if we meet a marked entry while following a chain, it must have been on an earlier chain and we have an error. We also determine the lengths of the chains, for statistics. */ size_t n; size_t n_frw_chains = 0; /* number of forward ref. chains */ size_t tot_frwc_len = 0; char *crossed_out = (char *)Calloc(Token_Array_Length(), sizeof (char)); fprintf(Debug_File, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg); fprintf(Debug_File, "last_index_table_size = %s\n", any_uint2string(last_index_table_size, 0)); fprintf(Debug_File, "N_SAMPLES = %d\n", N_SAMPLES); if (forward_reference[0]) { fprintf(Debug_File, ">>>> forward_reference[0] is not zero <<<<\n" ); } for (n = 1; n < Token_Array_Length(); n++) { if (forward_reference[n] && !crossed_out[n]) { /* start of a new chain */ n_frw_chains++; tot_frwc_len += db_frw_chain(n, crossed_out); } } #ifdef DB_FORW_REF_PRINT db_print_forward_references(); #endif /* DB_FORW_REF_PRINT */ Free((void *)crossed_out); fprintf(Debug_File, "text length = %s, # forward chains = %s, total frw chain length = %s\n\n", any_uint2string(Token_Array_Length(), 0), any_uint2string(n_frw_chains, 0), any_uint2string(tot_frwc_len, 0) ); } #endif /* DB_FORW_REF */ similarity-tester-2.89.orig/pass2.h0000644000000000000000000000045712540503627014206 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: pass2.h,v 1.3 2012-06-05 09:58:53 dick Exp $ */ /* Determines for each position that is part of a run, at which line number it starts and ends. */ extern void Retrieve_Runs(void); similarity-tester-2.89.orig/token.h0000644000000000000000000000467512540503627014304 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: token.h,v 2.12 2012-06-08 16:04:30 dick Exp $ */ /* Token interface. Since the definition of a token has been a continual source of problems, it is now defined as an ADT 'Token'. There are four classes of tokens: 1. simple tokens; they derive directly from input characters; 2. summary tokens; they summarise keywords, etc.; 3. special tokens: No_Token, IDF, and End_Of_Line; 4. hashed tokens, segments condensed by idf_hashed(). 
The first three classes are called 'regular tokens'. There are also a few 'gap' tokens, tokens not produced by the above mechanisms, for example 0x100. In addition to the type Token and the special tokens, the module defines 1. the constants N_REGULAR_TOKENS number of regular tokens N_TOKENS total number of tokens, including No_Token 2. macros for defining summary tokens (with ranges of their parameters): CTRL(ch) ch in 'A'-'~' NORM(ch) ch in '!'-'~' MTCT(ch) ch in 'A'-'~' META(ch) ch in '!'-'~' These restrictions are not checked. 3. the conversion routines Token2int(c) int2Token(i) */ #include #ifndef _TOKEN_H #define _TOKEN_H #ifdef lint /* For security we want to distinguish tokens from integers. Lint is not good at this, so for checking we use a pointer to a weird data type */ struct for_lint_only {int i;}; typedef struct for_lint_only *Token; #else /* if normal */ typedef unsigned short Token; #endif /* lint/normal */ #define N_TOKENS (1<<16) #define N_REGULAR_TOKENS (1<<9) /* Macros for the composition of tokens */ /* range (gaps unused)*/ #define No_Token int2Token(0) /* 0x0000 */ /* UTF-8 characters */ /* 0x0001-0x00FF */ #define CTRL(ch) int2Token(0x100|((ch)&0x01F)) /* 0x0101-0x011E */ #define NORM(ch) int2Token(0x100|((ch)&0x07F)) /* 0x0121-0x017E */ #define IDF int2Token(0x180) /* 0x0180 */ #define MTCT(ch) int2Token(0x180|((ch)&0x01F)) /* 0x0181-0x019E */ #define META(ch) int2Token(0x180|((ch)&0x07F)) /* 0x01A1-0x01FE */ /* tokens from idf_hashed() */ /* 0x0200-0xFFFE */ #define End_Of_Line int2Token(0xFFFF) /* 0xFFFF */ /* Conversion routines */ #define Token2int(c) ((int)(c)) #define int2Token(i) ((Token)(i)) /* Auxiliaries */ #define is_regular_token(tk) (Token2int(tk) < N_REGULAR_TOKENS) extern int Token_EQ(const Token t1, const Token t2); extern void fprint_token(FILE *ofile, const Token tk); #endif /* _TOKEN_H */ similarity-tester-2.89.orig/Malloc.h0000644000000000000000000001022112540503627014353 0ustar /* This file is part of the memory management and leak detector MALLOC. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: Malloc.h,v 1.9 2014-08-23 15:15:43 Gebruiker Exp $ */ /* requires stdio.h and stdlib.h */ #ifndef _MALLOC_H_ #define _MALLOC_H_ /***** The files Malloc.[ch] provide several functionalities: - checking for "out of memory": to simplify programming - allocating memory using new(type) " " " " - detecting memory leaks: to obtain cleaner programs - clobbering freshly allocated memory: to obtain safer programs The module defines several sets of routines: 1. void *Malloc(size_t s) void *Calloc(size_t n, size_t s) void *Realloc(void *p, size_t s) void Free(void *p) 2. void *TryMalloc(size_t s) void *TryCalloc(size_t n, size_t s) void *TryRealloc(void *p, size_t s) 3. T *new(T) char *new_string(const char *s) 4. void ReportMemoryLeaks(FILE *f) void MemClobber(void *p, size_t size) * The members of the first set act like their Unix counterparts, except that they never return NULL; upon out-of-memory an error message is given on standard error, showing the file name and the line number of the call. Since in almost all cases there is nothing more intelligent to do, this is almost always adequate, and makes for simpler and safer programming. In those rare cases that the program *can* continue when out of memory, the routines in the second set can be used; they act exactly like their Unix counterparts. Note that automatic out-of-memory detection is active, regardless of the -DMEM... flags described below. 
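A typical calling pattern (an illustrative sketch, not code taken from this distribution): p = Malloc(size) can be used without testing p, since failure terminates the program with the file name and line number of the call; q = TryMalloc(size) must be followed by a test of q, so the Try... variants are reserved for the few places that can recover from running out of memory, such as the optional caches elsewhere in the similarity tester.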
* A call of new(T), with T any type, yields a pointer of type T* to a block of type T, allocated using Malloc(). A call of new_string(s), with s a string, yields a pointer to a copy of s, allocated using Malloc(); it is equivalent to strdup() except that it uses Malloc(). * Normally, a call of ReportMemoryLeaks() does nothing, but when Malloc.c is compiled with -DMEMLEAK, it produces a compacted list of allocated but not yet freed blocks on the stream f, with information about where they were allocated. This is useful to get insight into memory use and abuse. * When Malloc.c is compiled with -DMEMCLOBBER, it clobbers all newly allocated memory from Malloc() and Realloc() just after allocation, and all freed memory just before freeing it. An area is clobbered by overwriting it with a wacky bit pattern. This is done in the hope that improper use of memory will cause some evident error somewhere. The routine that performs the clobbering, MemClobber(void *p, size_t size), is available regardless of the -DMEMCLOBBER compilation option. It can be used to create comparison patterns. * Compiled with any of the -DMEM... flags, Malloc will also produce run-time error messages for multiple Free()s of the same block, and Realloc()s on not-allocated blocks. It then allows the program to continue. * The system consumes hardly any time and is fast enough to be kept active all the time. *****/ /* Private entries */ extern void *_leak_malloc(int chk, size_t size, const char *fname, int l_nmb); extern void *_leak_calloc(int chk, size_t n, size_t size, const char *fname, int l_nmb); extern void *_leak_realloc(int chk, void *addr, size_t size, const char *fname, int l_nmb); extern void _leak_free(void *addr, const char *fname, int l_nmb); extern char *_new_string(const char *s, const char *fname, int l_nmb); /* Public entries */ #define Malloc(s) (_leak_malloc(1, (s), __FILE__, __LINE__)) #define Calloc(n,s) (_leak_calloc(1, (n), (s), __FILE__, __LINE__)) #define Realloc(p,s) (_leak_realloc(1, (void *)(p), (s), __FILE__, __LINE__)) #define Free(p) (_leak_free((void *)(p), __FILE__, __LINE__)) #define TryMalloc(s) (_leak_malloc(0, (s), __FILE__, __LINE__)) #define TryCalloc(n,s) (_leak_calloc(0, (n), (s), __FILE__, __LINE__)) #define TryRealloc(p,s) (_leak_realloc(0, (void *)(p), (s), __FILE__, __LINE__)) #define new(type) ((type *)Malloc(sizeof (type))) #define new_string(s) (_new_string((s), __FILE__, __LINE__)) extern void ReportMemoryLeaks(FILE *f); extern void MemClobber(void *p, size_t size); #endif /* _MALLOC_H_ */ similarity-tester-2.89.orig/text.h0000644000000000000000000000340312540503627014134 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: text.h,v 1.7 2015-01-12 09:16:13 dick Exp $ */ /* Implements the access to the lexical scanner. Additionally, the module tries to save newline information, anticipating a second scan which is interested in this information only. 
*/ /* The input files are called "texts" */ struct text { const char *tx_fname; /* the file name */ size_t tx_start; /* index of first token in Token_Array[] belonging to the text */ size_t tx_limit; /* index of first position in Token_Array[] not belonging to the text */ size_t tx_nl_start; /* possibly newline pointer for pass2 */ size_t tx_nl_limit; int tx_EOL_terminated; /* Boolean */ struct position *tx_pos;/* list of positions in this file that are part of a chunk; sorted and updated by Pass 2 */ }; struct position { /* position of first and last token of a chunk */ struct position *ps_next; int ps_type; /* first = 0, last = 1 */ size_t ps_tk_cnt; /* in tokens; set by add_run() in Read_Input_Files() */ size_t ps_nl_cnt; /* same, in line numbers;set by Retrieve_Runs(), used by Show_Runs(), to report line numbers */ }; extern struct text *Text; /* Text[], one for each input file */ extern int Number_of_Texts; /* number of text files */ extern int Number_of_New_Texts; /* number of new text files */ extern void Init_Text(int nfiles); enum Pass {First_Pass, Second_Pass}; extern int Open_Text(enum Pass pass, struct text *txt); extern int Next_Text_Token_Obtained(void); extern int Next_Text_EOL_Obtained(void); extern void Close_Text(enum Pass pass, struct text *txt); #ifdef DB_NL_BUFF extern void db_print_nl_buff(size_t start, size_t limit); #endif similarity-tester-2.89.orig/system.par0000644000000000000000000000067312540503627015035 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: system.par,v 1.3 2014-01-26 13:51:27 dick Exp $ */ /* Operating-system dependent data */ #ifdef MSDOS /* GNU gcc */ #define int32 int /* type of a 32 bits signed int */ #define NULLFILE "nul" # else /* various *NIXes */ #define int32 int /* type of a 32 bits signed int */ #define NULLFILE "/dev/null" #endif similarity-tester-2.89.orig/language.h0000644000000000000000000000136712540503627014742 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: language.h,v 1.8 2013-04-28 16:30:41 dick Exp $ */ /* The *lang.l files provide two interfaces: language.[ch] static data about the language lang.[ch] dynamic data about the input file's content This is language.[ch]. */ /* The abstract class 'language' defines the routines Init_Language(), May_Be_Start_Of_Run() and Best_Run_Size(), which describe some properties of the language. These routines are provided by the *lang.l files. There is a dummy implementation language.c. */ extern void Init_Language(void); extern int May_Be_Start_Of_Run(Token ch); extern size_t Best_Run_Size(const Token *str, size_t size); similarity-tester-2.89.orig/sim.10000644000000000000000000002745012540503627013661 0ustar .\" This file is part of the software similarity tester SIM. .\" Written by Dick Grune, Vrije Universiteit, Amsterdam. .\" $Id: sim.1,v 2.28 2015-01-28 16:46:42 dick Exp $ .\" .TH SIM 1 2015/01/23 .SH NAME sim \- find similarities in C, Java, Pascal, Modula-2, Lisp, Miranda, or text files .SH SYNOPSIS .B sim_c [ .B \-[defFiMnpPRsSTv] .B \-r .I N .B \-t .I N .B \-w .I N .B \-o .I F ] file ... [ [ .B / .B | ] file ... ] .br .B sim_c \&... .br .B sim_java \&... .br .B sim_pasc \&... .br .B sim_m2 \&... .br .B sim_lisp \&... .br .B sim_mira \&... .br .B sim_text \&... .br .SH DESCRIPTION .I Sim_c reads the C files .I file ... 
and looks for segments of text that are similar; two segments of program text are similar if they only differ in layout, comment, identifiers, and the contents of numbers, strings and characters. If any runs of sufficient length are found, they are reported on standard output; the number of significant tokens in the run is given between square brackets. .PP .I Sim_java does the same for Java, .I sim_pasc for Pascal, .I sim_m2 for Modula-2, .I sim_mira for Miranda, and .I sim_lisp for Lisp. .I Sim_text works on arbitrary text and it is occasionally useful on shell scripts. .PP The program can be used for finding copied pieces of code in purportedly unrelated programs (with .B \-s or .BR \-S ), or for finding accidentally duplicated code in larger projects (with .B \-f or .BR \-F ). .PP If a separator .B / or .B | is present in the list of input files, the files are divided into a group of "new" files (before the .BR / or .BR | ) and a group of "old" files; if there is no .BR / or .BR | , all files are "new". Old files are never compared to each other. See also the description of the .B \-s and .B \-S options below. .PP Since the similarity tester needs file names to pinpoint the similarities, it cannot read from standard input. .PP There are the following options: .TP .B \-d The output is in a diff(1)-like format instead of the default 2-column format. .TP .B \-e Each file is compared to each file in isolation; this will find all similarities between all texts involved, regardless of repetitive text (see `Calculating Percentages' below). .TP .B \-f Runs are restricted to segments with balancing parentheses, to isolate potential routine bodies (not in .IR sim_text ). .TP .B \-F The names of routines in calls are required to match exactly (not in .IR sim_text ). .TP .B \-i The names of the files to be compared are read from standard input, including a possible separator .BR / or .BR | ; the file names must be one to a line. This option allows a very large number of file names to be specified; it differs from the \fC@\fP facility provided by some compilers in that it handles file names only, and does not recognize option arguments. .TP .B \-M Memory usage information is displayed on standard error output. .TP .B \-n Similarities found are summarized by file name, position and size, rather than displayed in full. .TP .B "\-o F" The output is written to the file named .IR F . .TP .B \-p The output is given in similarity percentages; see `Calculating Percentages' below; implies \fB\-e\fP and \fB\-s\fP. .TP .B \-P As .B \-p but only the main contributor is shown; implies \fB\-e\fP and \fB\-s\fP. .TP .B "\-r N" The minimum run length is set to .I N units; the default is 24 tokens, except in .IR sim_text , where it is 8 words. .TP .B \-R Directories in the input list are entered recursively, and all files they contain are involved in the comparison. .TP .B \-s The contents of a file are not compared to itself (\-s for "not self"). .TP .B \-S The contents of the new files are compared to the old files only \- not between themselves. .TP .B "\-t N" In combination with the .B \-p or .B \-P options, sets the threshold (in percent) below which similarities will not be reported; the default is 1, except in .IR sim_text , where it is 20. .TP .B \-T A more terse and uniform form of output is produced, which may be more suitable for postprocessing. .TP .B \-v Prints the version number and compilation date on standard output, then stops. 
.TP .B "\-w N" The page width used is set to .I N columns; the default is 80. .TP .B "\-\-" (A secret option, which prints the input as the similarity checker sees it, and then stops.) .PP The .B \-p option results in lines of the form .nf .ft C F consists for x % of G material .ft P .fi meaning that \fCx\fP % of \fCF\fP's text can also be found in \fCG\fP. Note that this relation is not symmetric; it is in fact quite possible for one file to consist for 100 % of text from another file, while the other file consists for only 1 % of text of the first file, if their lengths differ enough. The .B \-P (capital P) option shows the main contributor for each file only. This simplifies the identification of a set of files \fCA[1] ... A[n]\fP, where the concatenation of these files is also present. A threshold can be set using the .B \-t option; note that the granularity of the recognized text is still governed by the .B \-r option or its default. .PP The .B \-r option controls the number of "units" that constitute a run. For the programs that compare programming language code, a unit is a lexical token in the pertinent language; comment and standard preamble material (file inclusion, etc.) is ignored and all strings are considered the same. For .I sim_text a unit is a "word" which is defined as any sequence of one or more letters, digits, or characters over 127 (177 octal), (to accommodate letters such as \(:a, \(/o, etc.). .br .I Sim_text accepts s p a c e d t e x t as normal text. .PP The .B \-s and .B \-S options control which files to compare. Input files are divided into two groups, new and old. In the absence of these control options the programs compare the files thus (for 4 new files and 6 old ones): .nf .ft C n e w / o l d <- first file 1 2 3 4 / 5 6 7 8 9 10 |------------/------------ n 1 | c / e 2 | c c / w 3 | c c c / 4 | c c c c / second / / / / / / / / / / / / / file -> 5 | c c c c / o 6 | c c c c / l 7 | c c c c / d 8 | c c c c / 9 | c c c c / 10 | c c c c / .ft P .fi where the \fCc\fPs represent file comparisons, and the \fC/\fP the demarcation between new and old files. .bp \" KLUGDE ZZ Using the .B \-s option reduces this to: .nf .ft C n e w / o l d <- first file 1 2 3 4 / 5 6 7 8 9 10 |------------/------------ n 1 | / e 2 | c / w 3 | c c / 4 | c c c / second / / / / / / / / / / / / / file -> 5 | c c c c / o 6 | c c c c / l 7 | c c c c / d 8 | c c c c / 9 | c c c c / 10 | c c c c / .ft P .fi The .B \-S option reduces this further to: .nf .ft C n e w / o l d <- first file 1 2 3 4 / 5 6 7 8 9 10 |------------/------------ n 1 | / e 2 | / w 3 | / 4 | / second / / / / / / / / / / / / / file -> 5 | c c c c / o 6 | c c c c / l 7 | c c c c / d 8 | c c c c / 9 | c c c c / 10 | c c c c / .ft P .fi .PP The programs can handle UNICODE file names under Windows. This is relevant only under the .B \-R option, since there is no way to give UNICODE file names from the command line. .SH LIMITATIONS Repetitive input is the bane of similarity checking. If we have a file containing 4 copies of identical text, .nf .ft C A1 A2 A3 A4 .ft P .fi where the numbers serve only to distinguish the identical copies, there are 8 identities: \fCA1=A2\fP, \fCA1=A3\fP, \fCA1=A4\fP, \fCA2=A3\fP, \fCA2=A4\fP, \fCA3=A4\fP, \fCA1A2=A3A4\fP, and \fCA1A2A3=A2A3A4\fP. Of these, only 3 are meaningful: \fCA1=A2\fP, \fCA2=A3\fP, and \fCA3=A4\fP. And for a table with 20 lines identical to each other, not unusual in a program, there are 715 identities, of which at most 19 are meaningful. 
Reporting all 715 of them is clearly unacceptable. .PP To remedy this, finding the identities is performed as follows: For each position in the text, the largest segment is found, of which a non-overlapping copy occurs in the text following it. That segment and its copy are reported and scanning resumes at the position just after the segment. For the above example this results in the identities \fCA1A2=A3A4\fP and \fCA3=A4\fP, which is quite satisfactory, and for \fIN\fP identical segments roughly \fI2 log N\fP messages are given. .PP This also works out well when the four identical segments are in different files: .nf .ft C File1: A1 File2: A2 File3: A3 File4: A4 .ft P .fi Now combined segments like \fCA1A2\fP do not occur, and the algorithm finds the runs \fCA1=A2\fP, \fCA2=A3\fP, and \fCA3=A4\fP, for a total of \fIN-1\fP runs, all informative. .SS Calculating Percentages The above approach is not suitable for obtaining the percentage of a file's content that can be found in another file. This requires comparing in isolation each file pair represented by a \fCc\fP in the matrixes above; this is what the \fB\-e\fP option does. Under the \fB\-e\fP option a segment \fCFile1:A1\fP, recognized in \fCFile2\fP, will again be recognized in \fCFile3\fP and \fCFile4\fP. In the example above it produces the runs .nf .ft C File1:A1=File2:A2 File1:A1=File3:A3 File1:A1=File4:A4 File2:A2=File3:A3 File2:A2=File4:A4 File3:A3=File4:A4 .ft P .fi for a total of \fI\(12N(N-1)\fP runs. .SH TIME AND SPACE REQUIREMENTS Care has been taken to keep the time requirements of all internal processes (almost) linear in the lengths of the input files, by using various tables. If, however, there is not enough memory for the tables, they are discarded in order of unimportance, under which conditions the algorithms revert to their quadratic nature. .PP The time requirements are quadratic in the number of files. This means that, for example, one 64 MB file processes much faster than 8000 8 kB files. .PP The program requires 6 bytes of memory for each token in the input; 2 bytes per newline (not when doing percentages); and about 76 bytes for each run found. .SH EXAMPLES The call .nf .ft C sim_c *.c .ft P .fi highlights duplicate code in the directory. (It is useful to remove generated files first.) A call .nf .ft C sim_c -f -F *.c .ft P .fi can pinpoint them further. .PP A call .nf .ft C sim_text -e -p -s new/* / old/* .ft P .fi compares each file in \fCnew/*\fP to each file in \fCnew/*\fP and \fCold/*\fP, and if any pair has more that 20% in common, that fact is reported. Usually a similarity of 30% or more is significant; lower than 20% is probably coincidence; and in between is doubtful. .PP A call .nf .ft C sim_text -e -n -s -r100 new/* "|" old/* .ft P .fi compares the same files, and reports large common segments. (The .B | can be used as a separator instead of .B / on systems where the .B / as a command-line parameter gets mangled by the command interpreter.) .PP Both approaches are good for plagiarism detection. .SH BUGS Since it uses .I lex(1) on some systems, it may crash on any weird construction in the input that overflows .IR lex 's internal buffers, for example an identifier of several thousand letters long. .SH AUTHOR Dick Grune, Vrije Universiteit, Amsterdam; dick@dickgrune.com. similarity-tester-2.89.orig/stream.c0000644000000000000000000000332012540503627014434 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: stream.c,v 2.13 2015-01-22 14:38:28 dick Exp $ */ #include #include #include #include "system.par" #include "sim.h" #include "options.h" #include "token.h" #include "lang.h" #include "stream.h" static FILE *fopen_regular_file(const char *fname); int Open_Stream(const char *fname) { int ok; lex_nl_cnt = 1; lex_tk_cnt = 0; lex_non_ascii_cnt = 0; /* start the lex machine */ yyin = fopen_regular_file(fname); ok = (yyin != 0); if (!ok) { /* fake a stream, to simplify the rest of the program */ yyin = fopen(NULLFILE, "r"); } yystart(); return ok; } static FILE * fopen_regular_file(const char *fname) { struct stat buf; if (stat(fname, &buf) != 0) return 0; if ((buf.st_mode & S_IFMT) != S_IFREG) return 0; return fopen(fname, "r"); } int Next_Stream_Token_Obtained(void) { return yylex(); } void Close_Stream(void) { if (yyin) { fclose(yyin); yyin = 0; } } void Print_Stream(const char *fname) { fprintf(Output_File, "File %s:", fname); if (!Open_Stream(fname)) { fprintf(Output_File, " cannot open\n"); return; } fprintf(Output_File, " showing the %s stream\n", token_name); lex_token = End_Of_Line; do { if (Token_EQ(lex_token, End_Of_Line)) { fprintf(Output_File, "line # = %s, %s # = %s:\n", size_t2string(lex_nl_cnt), token_name, size_t2string(lex_tk_cnt) ); } else { extern char *yytext; fprintf(Output_File, " %s -> ", yytext); fprint_token(Output_File, lex_token); fprintf(Output_File, "\n"); } } while (Next_Stream_Token_Obtained()); fprintf(Output_File, "\n"); Close_Stream(); } similarity-tester-2.89.orig/ForEachFile.h0000644000000000000000000000224312540503627015260 0ustar /* This file is part of the auxiliaries library. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: ForEachFile.h,v 1.9 2014-07-28 09:18:11 Gebruiker Exp $ */ #ifndef _FOREACHFILE_H_ #define _FOREACHFILE_H_ #include "fname.h" #include #include /**** * ForEachFile(const Fchar *fn, void (*proc)(...): each file reachable from fn is passed to the procedure proc, which is declared as: void proc(const Fchar *fn, const char *msg, const struct stat *fs): the file fn is reached; if msg != NULL, an error prevails the text of which is *msg; otherwise fs points to the stat buffer for fn. * ForEachLocalFile() restricts itself to the directory fn and its local contents. * MAX_FILE_NAME_LENGTH is the maximum length of the file name fn, including directories. ****/ /* Public entries */ #define MAX_FILE_NAME_LENGTH 1024 /* maximum file name length */ extern void ForEachFile( const Fchar *fn, void (*proc)(const Fchar *fn, const char *msg, const struct stat *fs) ); extern void ForEachLocalFile( const Fchar *fn, void (*proc)(const Fchar *fn, const char *msg, const struct stat *fs), int depth ); #endif /* _FOREACHFILE_H_ */ similarity-tester-2.89.orig/idf.h0000644000000000000000000000162112540503627013712 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: idf.h,v 2.12 2015-01-17 10:20:40 dick Exp $ */ /* Idf module: Token idf_in_list(char *str, struct idf l[], sizeof l, Token dflt); looks up a keyword in a list of keywords l, represented as an array of struct idf, and returns its translation as a token; dflt is returned if the keyword is not found. Token idf_hashed(char *str); returns a token unequal to No_Token or End_Of_Line, derived from str through hashing */ /* the struct for keywords etc. 
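For example (a sketch modelled on the language front ends in this package): a front end declares a table like static const struct idf reserved[] = { {"begin", NORM('{')}, {"while", NORM('w')}, ... }; and translates an identifier with idf_in_list(yytext, reserved, sizeof reserved, IDF), which yields the listed translation for a keyword and the default IDF for any other identifier.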
*/ struct idf { char *id_tag; /* an interesting identifier */ Token id_tr; /* with its one-Token translation */ }; /* public functions */ extern Token idf_in_list( const char *str, const struct idf list[], size_t listsize, Token default_token ); extern Token idf_hashed(const char *str); extern void lower_case(char *str); similarity-tester-2.89.orig/any_int.c0000644000000000000000000000341012540503627014602 0ustar /* This file is part of the module ANY_INT. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: any_int.c,v 1.3 2014-07-28 09:18:11 Gebruiker Exp $ */ #include "any_int.h" #define N_INDEPENDENT_CALLS 12 #define MAX_ANY_UINT_DIGITS 40 /* good for 128 bits, including sign */ /*Library module source prelude */ #undef _ANY_UINT_CODE_ #ifndef lint #define _ANY_UINT_CODE_ #endif #ifdef LIB #define _ANY_UINT_CODE_ #endif #ifdef _ANY_UINT_CODE_ /* Library module source code */ /* circular list of buffers */ static char buff[N_INDEPENDENT_CALLS][MAX_ANY_UINT_DIGITS+1]; static int next_buff_cnt = 0; static char * next_buff(void) { if (next_buff_cnt == N_INDEPENDENT_CALLS) next_buff_cnt = 0; return buff[next_buff_cnt++]; } static const char * int2string(vlong_uint val, int neg, int size) { char *res = next_buff() + MAX_ANY_UINT_DIGITS; /* end of new buffer */ *res = '\0'; /* insert EOS */ /* protect size */ if (size < 0 || size > MAX_ANY_UINT_DIGITS) size = 0; do { /* one decimal character, the first always */ *--res = "0123456789ABCDEF"[val % 10]; size--; val = val / 10; } while (val > 0); if (neg) { *--res = '-'; size--; } while (size > 0) { /* fill up to size */ *--res = ' '; size--; } return res; } const char * /* transient * N_INDEPENDENT_CALLS */ any_int2string(vlong_int val, int size) { int neg = 0; if (val < 0) { val = - val; neg = 1; } return int2string((vlong_uint)val, neg, size); } const char * /* transient * N_INDEPENDENT_CALLS */ any_uint2string(vlong_uint val, int size) { return int2string(val, 0, size); } /* End library module source code */ #endif /* _ANY_UINT_CODE_ */ #ifdef lint static void satisfy_lint(void *x) { any_int2string(0, 0); any_uint2string(0, 0); satisfy_lint(x); } #endif /* lint */ similarity-tester-2.89.orig/lisplang.l0000644000000000000000000000457312540503627014776 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lisplang.l,v 2.17 2013-04-28 16:30:41 dick Exp $ */ /* LISP language front end for the similarity tester. 
Author: Gertjan Akkerman Date: Thu, 9 Apr 87 11:15:23 MDT */ #include "token.h" #include "language.h" #include "algollike.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; size_t lex_nl_cnt; size_t lex_tk_cnt; size_t lex_non_ascii_cnt; /* Language-dependent data */ #include "idf.h" static const struct idf reserved[] = { {"append", NORM('a')}, {"append1", NORM('b')}, {"atom", NORM('t')}, {"car", NORM('h')}, {"cdr", NORM('t')}, {"cond", NORM('c')}, {"cons", NORM('s')}, {"defun", NORM('u')}, {"do", NORM('d')}, {"eq", NORM('e')}, {"equal", NORM('e')}, /* See eq */ {"for", NORM('f')}, {"if", NORM('i')}, {"list", NORM('l')}, {"nconc", NORM('n')}, {"rplaca", NORM('A')}, {"rplacd", NORM('D')} }; /* Token sets for module algollike */ const Token Non_Finals[] = { NORM('('), NORM('['), No_Token }; const Token Non_Initials[] = { NORM(')'), NORM(']'), No_Token }; const Token Openers[] = { NORM('('), NORM('['), No_Token }; const Token Closers[] = { NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } size_t Best_Run_Size(const Token *str, size_t size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) AnyQuoted (\\.) StrChar ([^"\n\\]|{AnyQuoted}) ChrChar ([^'\\]|{AnyQuoted}) IdfChar ([-!#$%&*+,/0-9:;<=>?@A-Z\\^_`a-z{}~]) EscIdf (({IdfChar}|\\.)+) QuotIdf ("|"[^\|\n]*"|") Idf ({EscIdf}|{QuotIdf}) %% ";".*$ { /* comment */ } \"{StrChar}*\" { /* strings */ return_ch('"'); } {Idf} { /* identifier */ return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF)); } \n { /* count newlines */ return_eol(); } {Layout} { /* ignore layout */ } {ASCII95} { /* copy other text */ return_ch(yytext[0]); } . { /* count non-ASCII chars */ lex_non_ascii_cnt++; } %% /* More language-dependent code */ void yystart(void) { BEGIN INITIAL; } similarity-tester-2.89.orig/ChangeLog0000644000000000000000000006614412540503627014564 0ustar 2014-02-17 Dick Grune * sim.c (is_new_old_separator): MinGW sometimes (?) interprets the / as a command-line argument as a reference to the MinGW tree, which makes the / unusable as a separator. Even escaping it ("/") does not help. Added the | as a separator. 2014-01-26 Dick Grune * %z from Marcus Brinkmann implemented by a routine size_t2string. 2014-01-26 Dick Grune * Sizes on my present machine (HP 6730b laptop): unsigned short int: 2 unsigned int: 4 unsigned long int: 4 unsigned long long int: 8 2013-05-31 Dick Grune * (hash.c) Better hash2() function. 2013-04-28 Dick Grune * Markus Brinkmann (marcus.brinkmann@ruhr-uni-bochum.de) supplied a 64-bit version. %z does not work on MinGW C. 2012-11-28 Dick Grune * newargs.c (recursive_args): Liqun Chen (liqun.chen@hp.com) submitted a bug report noting that the separator / is expanded under the -R option. Corrected. 2012-09-30 Dick Grune * pass2.c (pass2_txt): Boyd Blackwell (Boyd.Blackwell@anu.edu.au) submitted a bug report in which the line numbers (and runs representations) were way off (75 lines). The input files were characterized by extremely long lines, hundreds of tokens (max. 521). After 2.5 days of debugging the cause was found: 1. since the mapping from token positions to line numbers is stored as the difference of the token positions from one line to the next (see text.c); 2. 
since these differences are stored in unsigned chars to save space; 3. since the nl_buff mechanism is switched off when one of these unsigned characters overflow; and since 521 tokens on one line overflowed this unsigned char, the nl_buff mechanism was shut off. Since when there is no nl_buff information in pass2, pass2 resorts to rereading the input file calling yylex again; 2. since the preceding file had few runs to find line number to, the preceding file was not read to the end, and the rest remained in flex's buffer, so a portion of the preceding file seemed prefixed to the present file, adding 75 lines to it. Remedy: flushing flex's buffer explicitly in pass2_txt(); this is simpler than using flex's YY_BUFFER_STATE mechanism. Advice: get rid of the nl_buff mechanism; it is no longer relevant. 2012-06-09 Dick Grune * lang.h: The *lang.l files are unusual in two respects: 1. they present two interfaces to the rest of the system: language.[ch], static data about the language, and lang.[ch], dynamic data about the input file's content; 2. both interfaces come with multiple implementations, one for each *lang.l file; i.e., they are abstract. This has been sorted out with some difficulty. 2012-05-08 Dick Grune * Changed to 16-bit tokens, for better resolution for sim_text and on -F option, and for UTF-8 input. It was not worth while to save the 8-bit token code: on serious comparisons the increase in memory usage is about 10% (330 000 on a maximum allocation of 3 030 976 for comparing the sources of MCD2). 2009-03-11 Dick Grune * newargs.c: added -R option to follow directories recursively. See recursive_args(). 2008-09-22 * added newargs.[ch], to supply file names from standard input, for those compilers that do not have the @ facility. Implemented without fixed limits. 2008-09-21 * changed default format back to original, and inverted the -v(erbose) option into a -T(erse) option. 2008-03-31 Dick Grune * *.l: the following are not universally recognized; removed. %option nounput %option never-interactive 2008-03-31 Introduced aiso.* and Malloc.? as imported modules. 2007-11-21 Carlos Maziero - output format modified in order to facilitate "grep" filtering - added option "-v" for a more verbose output - added option "-tN" to define a threshold %N (only similarities over N% are shown) - fixed SEGV on writing to the output file - the file list can be informed through STDIN (one file per line, accepts "/" marker); this is useful for compilers that lack the @ facility 2007-08-23 Dick Grune LICENSE.txt added. 2006-11-27 Dick Grune Removal of setbuff() for compatibility. 2005-01-17 Dick Grune Corrections by Jerry James ; ANSIizing, etc. 2004-08-05 Dick Grune Finished the 'percentage' option. 08-Nov-2001 Dick Grune Begun to add a 'percentage' option, which will express the similarity between two files in percents. 27-Sep-2001 Dick Grune Split add_run() off from compare.c into add_run.c, to accommodate different add_run()s, for different types of processing. 27-Nov-1998 Dick Grune Installed a Miranda version supplied by Emma Norling (ejn@cs.mu.oz.au) 23-Feb-1998 Dick Grune Renamed text.l to textlang.l for uniformity and to make room for a possible module text.[ch]. Isolated a module for handling the token array from buff.[ch] to tokenarray.[ch], and renamed buff.[ch] to text.[ch]. 
23-Feb-1998 Dick Grune There is probably not much point in abandoning the nl_buff list when running out of memory for TokenArray[]: each token costs 1 byte for the token and 4 bytes for the entry in forward_references[], a total of 5 bytes. There are about 3 tokens to a line, together requiring 15 bytes, plus 1 byte in nl_buff yields 16 bytes. So releasing nl_buff frees only 1/16 = 6.7 % of memory. Since the code is a bother, I removed it. Note that nl_buff is still abandoned when the number of tokens in a line does not fit in one unsigned char (but that is not very likely to happen). 21-Feb-1998 Dick Grune Printing got into an infinite loop when the last line of the input was not terminated by a newline AND contained tokens that were included in a matching run. This was due to a double bug: 1. the non-terminated line was not registered properly in NextTextTokenObtained() / CloseText(), and 2. the loop in pass 2 which sets the values of pos->ps_nl_cnt was terminated prematurely when the file turned out to be shorter than the list of pos-es indicated. Both bugs were corrected, the first by supplying an extra newline in CloseText() when one is found missing, and the second by rewriting the list-parallel loop in pass 2. 02-Feb-1998 Dick Grune Pascal does not differentiate between strings and characters (strings of one character); this difference has been removed from pascallang.l. 22-Jan-1998 Dick Grune Detection of non-ASCII characters added. Since the lexical analyser itself generates non-ASCII characters, the test must occur earlier. We could replace the input routine of lex by a checking routine, but with several lex-es going around, we want a more lex-independent solution. To allow each language its own restrictions about non-ASCII characters, the check is implemented in the *lang.l files. 28-Nov-1997 Dick Grune Changed the name of the C similarity tester 'sim' to 'sim_c', for uniformity with sim_java, etc. 23-Nov-1997 Dick Grune Java version finished; checked by Matty Huntjens and crew. 24-Jun-1997 Dick Grune Started on a Java version, by copying the C version. 22-Jun-1997 Dick Grune Modern lexical analysers, among which flex, read the entire input into a buffer before they issue the first token. As a result, ftell() no longer gives a usable indication of the position of a token in a file. This pulls the rug from under the nl_buff mechanism in buff.c, which is removed. We loose a valuable optimization this way, but there just seems to be no way to keep it. Note that this has nothing to do with the problem in MS-DOS of character count and fseek position not being synchronized. That problem has been solved on June 14, 1991 (which see) and the code has been running OK since. 18-Jun-1997 Dick Grune The thought has occurred to use McCreight's linear longest common substring algorithm rather than the existing algorithm, which has a small quadratic component. There are a couple of problems with this: 1. We need the longest >non-overlapping< common substring; McCreight provides just the longest. It is not at all clear how to modify the algorithm. 2. Once we have found our LCS, we want to find the one-but-longest; it is far from obvious how to do that in McCreight's algorithm. 3. Once we have found our LCS, we want to take one of its copies out of the game, to suppress duplicate messages. Again, it is difficult to see how to do that, without redoing all the calculations. 4. McCreight's algorithm seems to require about two binary tree nodes per token, say 8 bytes, which is double we use now. 
17-Jun-1997 Dick Grune Did some experimenting with the hash function; it is still pretty bad: the simple-minded second sweep through forward_references easily removes another 80-99% of false hits. Next, a third sweep that does a full comparison will remove another large percentage. So I have left in the second sweep in all cases. There are a couple of questions here: 1. Can we find a better hash function, or will we forever need a second sweep? 2. Does it actually matter, or will we loose on more expensive hashing what we gain by having a better set of forward references in compare.c? 16-Jun-1997 Dick Grune Cleaned up sim.h and renamed aiso.[ch] to runs.[ch] since they are instantiations of the aiso module concerned with runs. Aiso.[spc|bdy] stays aiso.[spc|bdy], of course. 16-Jun-1997 Dick Grune Redid largest_function() in algollike.c. Corrected bug in CheckRun; it now always removes NonFinals from the end, even when it has first applied largest_function(). 15-Jun-1997 Dick Grune Reorganized the layers around the input file. There were and still are three layers: lang, stream and buff. Since the lex_X variables are hoisted unchanged through the levels lang, stream, and buff, to be used by pass1, pass2, etc., they have to be placed in a module of their own. The token-providing module 'lang' has three interfaces: - lang.h, which provides access to the lowest-level token routines, to be used by the next level. - lex.h, which provides the lex variables, to be used by all and sundry. - language.h, which provides language-specific info about tokens, concerning their suitability as initial and final tokens, to be used by higher levels. This structure is not satisfactory, but it is also unreasonable to combine them in one interface. There is no single lang.c; rather it is represented by the various Xlang.c files generated from the Xlang.l files. 14-Jun-1997 Dick Grune Added a Makefile zip entry to parallel the shar entry. 13-Jun-1997 Dick Grune A number of simplifications, in view of better software and bigger machines: - Removed good_realloc from hash.c; I don't think there are any bad reallocs left. - Removed the option to run without forward_references. On a 16Mb machine this means you have at least 2M tokens; using a quadratic algorithm will take 4*10^6 sec. at an impossible rate of 1M actions/sec., which is some 50 days. Forget it. - Renamed lang() to print_stream(), and incorporated it in sim.c - Removed the MSDOS subdirectory mechanism in the Makefile. - Removed the funny and sneaky double parameter expansion in the call of idf_in_list(). 12-Jun-1997 Dick Grune Converted to ANSI C. Removed cport.h. 09-Jan-1995 Dick Grune Decided not to do directories: they usually contain extraneous files and doing sim * is simple enough anyway. 09-Sep-1994 Dick Grune Added system.h to cater for the (few) differences between Unix and DOS. The #define int32 is also supplied there. 05-Sep-1994 Dick Grune Added many prototype declarations using cport.h. Added a depend entry to the Makefile. 31-Aug-1994 Dick Grune All these changes require a 32 bit integer; introduced a #define int32, set from the command line in the Makefile. 25-Aug-1994 Dick Grune It turned out that one of the most often called routines was .rem, from idf_hashed() in idf.c. Moving the % out of the loop chafed off another 6% and reduced the time to 18.4 sec. 19-Aug-1994 Dick Grune With very large files (e.g., concatenated /usr/man/man1/*) the fixed built-in hash table size of 10639 is no longer satisfactory. 
Hash.c now finds a prime about 8 times smaller than the text_size to use for hash table size; this achieves optimal speed-up without gobbling up too much memory. Reduced the time for the above file from 30.2 sec. to 19.6 sec. For checking, the same test was run with all hashing off; it took 20h 27m 19s = 73639 sec. But it worked. 11-Aug-1994 Dick Grune For large values of MinRunSize (>1000) a large part of the time (>two-thirds) was spent in calculating the hash values for each position in the input, since the cost of this calculation was proportional to MinRunSize. We now sample a maximum of 24 tokens from the input string to calculate the hash value, and avoid overflow. On my workstation, this reduces the time for sim_text -r 1000 -n /usr/man/man1/* from 60 sec to 21 sec. 30-Jun-1992 Dick Grune,kamer R4.40,telef. 5778 There was an amazing bug in buff.c where NextTextToken() for pass 2 omitted to set lex_token to EOL when retrieving newline info from nl_buff. Worked until now!?! 23-Sep-1991 Dick Grune Cport.h introduced, CONST and *.spc only. 17-Sep-1991 Dick Grune The position-sorting routine in pass2.c has been made into a separate generic module. 14-Jun-1991 Dick Grune (dick@cs.vu.nl) at dick.cs.vu.nl Replaced the determination of the input position through counting input characters by calls of ftell(); this is cleaner and the other method will never work on MSDOS. 30-May-1989 Dick Grune (dick) at dick Replaced the old top-100 module (which had been extended to top-10000 already anyway) by the new aiso (arbitrary-in sorted-out) module. This caused a considerable speed-up on the Mod2 test bed: %time cumsecs #call ms/call name 17.9 99.20 7209 13.76 _InsertTop 0.3 1.37 7209 0.19 _InsertAiso It turns out that malloc() is not a serious problem, so no special version for the aiso module is required. 23-May-1989 Dick Grune (dick) at dick No more uncommented comment at the end of preprocessor lines, to conform to ANSI C. 23-May-1989 Dick Grune (dick) at dick Added code in the X.l files to (silently) reject characters over 0200. This does not really help, since lex stops on null chars. Ah, well. 19-May-1989 Dick Grune (dick) at dick Made the token as handled by sim into an abstract data type, for aesthetic reasons. Sign extension is still a problem. 03-May-1989 Dick Grune (dick) at dick Optimized lcs() by first checking from the end if a sufficiently long run is present; if in fact only the first 12 tokens match, chances are good that you can reject the run right away by first testing the 20th token, then the 19th, and so on. 21-Apr-1989 Dick Grune (dick) at dick A run of sim_m2 finding 7209 similarities raised the question of the appropriateness of the linear sort in sort_pos(). Profiling showed that in this case sorting takes all of 7.5 % of the total time. Putting the word register in in the right places in sort_pos() lowered this number to 4.6%. 20-Apr-1989 Dick Grune (dick) at dick Moved the test for MayBeStartOfRun() from compare.c (where it is done again and again) to hash.c, where its effect is incorporated in the forward reference chain. 14-Apr-1989 Dick Grune (dick) at dick Replaced elem_of() by bit tables, headers[] and trailers[], to be prefilled from Headers[] and Trailers[] by a call of InitLanguage(). This saves a few percents. 13-Apr-1989 Dick Grune (dick) at dick Implemented the -e and the -S option, by putting yet another loop in compare.c 13-Apr-1989 Dick Grune (dick) at dick The -- option (displaying the tokens) will now handle more than one file. 
20-Jan-1989 Dick Grune (dick) at dick After the modification of 19-Dec-88, 12% of the time went into updating the positions in the chunks, as they were produced by the matching process. This matching process identifies runs (matches) by token position, which has to be recalculated to lseek positions and line numbers. To this end the files are read again, and for each line all positions found were checked to see if they applied to this line; this was a awfully stupid algorithm, but since much more time was spent elsewhere, it did not really matter. With all the saving below, however, it had risen to second position, after yylook() with 35%. Th solution was, to sort the positions in the same order in which they would be met by the reading of the files. The process is then linear. This required some extensive hacking in pass2.c 06-Jan-1989 Dick Grune (dick) at dick The modification below did indeed save 25%. The newline information is now reduced to 2 shorts; 2 chars were not enough, since some lines are longer that 127 bytes, and a char and a short together take as much room as two shorts. 19-Dec-1988 Dick Grune (dick) at dick To avoid reading the files twice (which is still taking 25% of the time), the first pass will now collect newline information for the second pass in a buffer called nl_buff[]. This buffer, and the original token buffer now named TokenArray[], are managed by the file buff.c, which implements a layer between stream.h and pass?.c. This layer provides OpenText(), NextTextToken() and CloseText(), each with a parameter telling which pass it is. 06-Dec-1988 Dick Grune (dick) at dick As an introduction to removing the second pass altogether, the first and second scan were unified, i.e., their input is identical. This also means that the call sim -[12] has now been replaced by one call: sim --. 23-Sep-1988 Dick Grune (dick) at dick Dynamic allocation of line buffers in pass 3. This removes the restriction on the page width. 22-Sep-1988 Dick Grune (dick) at dick In order to give better messages on incorrect calls to sim, the whole option handling has been concentrated in a file option.c and separated from the options and their messages themselves. See sim.c 07-Sep-1988 Dick Grune (dick) at dick For long text sequences (say hundreds of thousands of tokens), the hashing is not really efficient any more since too many spurious matches occur. Therefore, the forward reference table is scanned a second time, eliminating from any chain all references to runs that do not end in the same token. For the UNIX manuals this reduced the number of matches from 91.9% to 1.9% (of which 0.06% were genuine). 30-Aug-1988 Dick Grune (dick) at dick For compatibility, NextTop has been rewritten to yield true or false and to accept a pointer to a run as a parameter. 30-Aug-1988 Dick Grune (dick) at dick When trying to find line-number and lseek position to beginnings and ends of runs found, the whole set of runs was scanned for each line in each file. Now only the runs belonging to that file are scanned; to this end another linked list has been braided through the data structures (tx_chunk). 30-Aug-1988 Dick Grune (dick) at dick The longest-common-substring algorithm was called much too often, mainly because the forward references made by hashing suffered from pollution. If you have say 1000 tokens and a hash range of say 10000, about 5 % of the hashings will be false matches, i.e. 50 matches, which is quite a lot on a natural number of 2 to 3 matches. 
Improved by doing a second check in make_forw_ref(). 12-Jun-1988 Dick Grune (dick) at dick Installed a Lisp version supplied by Gertjan Akkerman. 15-Jan-1988 Dick Grune (dick) at dick Added register declarations all over the place. 14-Jan-1988 Dick Grune (dick) at dick It is often useful to match a piece of code exactly, especially when function names (or, even more so, macro names) are involved. What one would want is having all the letters in the text array, but this is kind of hard, since each entry is one lexical item. This means that under the -F option each letter is a lex item, and normally each tag is a lex item; this requires two lex grammars in one program; no good. So, on the -F flag we hash the identifier into one lex item, which is hopefully characteristic enough. It works. 30-Sep-1987 Dick Grune (dick) at dick Some cosmetics. 31-Aug-1987 Dick Grune (dick) at dick Moved the whole thing to the SUN (while testing on a VAX and a MC68000) 16-Aug-1987 Dick Grune (dick) at dick The test program lang.c is no longer a main program, but rather a subroutine called in main() in sim.c, through the command line option -1 or -2. 23-Apr-1987 Dick Grune (dick) at tjalk Changed the name 'index' into 'elem_of', because of compatibility problems on different Unices. Added a declaration for it in the file algollike.c 10-Mar-1987 Dick Grune (dick) at tjalk Changed the printing of the header of a run so that: - long file names will no longer be truncated - the run length is displayed 27-Jan-1987 Dick Grune (dick) at tjalk Switched it right off again! Getting them in textual order is still more unpleasant, since now you cannot find the important ones if their are more than a few runs. 27-Jan-1987 Dick Grune (dick) at tjalk Going to experiment with leaving out the sorting; just all the runs, in the order we meet them. Should be as good or better. Comparisons of more than 100 runs are very rare anyway, so the fact that those over a 100 are rejected is probably no great help. Getting them in a funny order is a nuisance, however. Down with featurism. Just to be safe, present version saved as 870127.SV 26-Dec-1986 Dick Grune (dick) at tjalk Names of overall parameters in params.h changed to more uniformity. 26-Dec-1986 Dick Grune (dick) at tjalk Since the top package and the instantiation system have grown apart so much, I have integrated the old top package into sim, i.e., done the instantiation by hand. This removes top.g and top.p, and will save outsiders from wondering what is going on here. 23-Dec-1986 Dick Grune (dick) at tjalk Use setbuf to print unbuffered while reading the files (lex core dumps, other mishaps) and print buffered while printing the real output (for speed). 30-Nov-1986 Dick Grune (dick) at tjalk Various small changes in *lang.l: ; ignored conditionally (!options['f']) new format for tokens in struct idf cosmetics: macro Layout, macro UnsafeComChar, no \n in character denotations, more than one char in a char denotations in Pascal, etc. 30-Nov-1986 Dick Grune (dick) at tjalk Added a Modula-2 version. 29-Nov-1986 Dick Grune (dick) at tjalk Restricting tokens to the ASCII95 character set is really too severe: some languages have many more reserved words (COBOL!). Corrected this by adding a couple of '&0377' in strategic places. Added a routine for printing the 8-bit beasties: show_token(). 
15-Aug-1986 Dick Grune (dick) at tjalk Since the ; is superfluous in both C and Pascal, it is now ignored by clang.l and pascallang.l 15-Aug-1986 Dick Grune (dick) at tjalk The code in CheckRun in Xlang.l was incorrect in that it used the wrong criterion for throwing away trailing garbage. I've taken CheckRun etc. out of the Xlang.l-s and turned them into a module "algollike.c". Made a cleaner interface and avoided duplication of code. 02-Jul-1986 Dick Grune (dick) at tjalk Looking backwards in compare.c to see if we are in the middle of a run is an atavism. You can be and still be all right, e.g., if part of the run was rejected as not fitting for a function. Removed from compare.c. 10-Jun-1986 Dick Grune (dick) at tjalk The function hash_code() in hash.c could yield a negative value; corrected. 09-Jun-1986 Dick Grune (dick) at tjalk Changed the name of the file text.h to sim.h. Sim.h is more appropriate and text.h sounds as if it belongs to text.l, with which it has no connection. 04-Jun-1986 Dick Grune (dick) at tjalk After having looked at a couple of hash functions and having done some calculations on the number of duplicates normally encountered in hash functions, I conclude that our function in hash.c is quite good. Removed all the statistics-gathering stuff. Actually, hash_table[] is not the hash table at all; it is a forward reference table; likewise, the real hash table was called last[]. Renamed both. There is a way to keep the hash table local without putting it on the stack: use malloc(). 02-Jun-1986 Dick Grune (dick) at tjalk Added a simple lex file for text: each word is condensed into a hash code which is mapped on the ASCII95 character set. This turns out to be quite effective. 01-Jun-1986 Dick Grune (dick) at tjalk The macros cput(tk) and c_eol() both have a return in them, so any code after them may not be executed -> they have to be last in an entry. But they weren't, in many places; I can't imagine why it all worked nevertheless. They have been renamed return_tk(tk) and return_eol() and the entries have been restructured. 30-May-1986 Dick Grune (dick) at tjalk Moved the string and character entries in clang.l and pascallang.l to a place behind the comment entries, to avoid strings (and characters) being recognized inside comments. I first thought this would not happen, but as Maarten pointed out, if both interpretations have the same length, lex will take the first entry. Now this will happen if the string occupies the whole line that would otherwise be taken as a comment. In short, /* "hallo" */ would return ". 28-May-1986 Dick Grune (dick) at tjalk Added -d option, to display the output in diff(1) format (courtesy of Maarten van der Meulen). Rewrote the lexical parsing of comments (likewise courtesy Maarten van der Meulen). 20-May-1986 Dick Grune (dick) at tjalk Added a routine to convert identifiers to lower case in pascallang.l . 19-May-1986 Dick Grune (dick) at tjalk Added -a option, to quickly check antecedent of a file (courtesy of Maarten van der Meulen). 18-May-1986 Dick Grune (dick) at tjalk Brought everything under RCS/CVS. 18-Mar-1986 Dick Grune (dick) at tjalk Added modifications by Paul Bame (hp-lsd!paul@hp-labs) to have an option -w to set the page width. 21-Feb-1986 Dick Grune (dick) at tjalk Took array last[N_HASH] out of make_hash() in hash.c, due to stack overflow on the Gould (reported by George Walker tekig4!georgew@mcvax.uucp) 16-Feb-1986 Dick Grune (dick) at tjalk Corrected some subtractions that caused unsigned ints to turn pseudo-negative. 
(Reported by jaap@mcvax) 11-Jan-1986 Dick Grune (dick) at tjalk Touched up for distribution. 10-Jan-1986 Dick Grune (dick) at tjalk Fill_line was not called for empty lines, which caused them to be printed as repetitions of the previous line. 24-Dec-1985 Dick Grune (dick) at tjalk Reduced hash table to a single array of indices; it is used only in one place, which makes it very easy to make it (the hash table) optional. General tune-up of everything. This seems to be another stable "final" version. 14-Dec-1985 Dick Grune (dick) at tjalk Some experiments with hash formulas: h = (h OP CST) + *p++ OP CST yields right wrong * 96 - 32 205 562 * 96 - 2 205 560 * 96 205 560 * 97 205 559 << 0 66 3128 << 1 203 555 << 2 205 536 << 7 203 540 Conclusion: it doesn't matter, unless you do it wrong. 01-Oct-1983 Dick Grune (dick) at vu44 Oldest known files. # This file is part of the software similarity tester SIM. # Written by Dick Grune, Vrije Universiteit, Amsterdam. # $Id: ChangeLog,v 2.25 2014-02-17 11:21:39 dick Exp $ # similarity-tester-2.89.orig/fname.c0000644000000000000000000001623712540503627014242 0ustar /* This file is part of the auxiliaries library. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: fname.c,v 1.7 2014-07-28 09:18:12 Gebruiker Exp $ */ /* Support for UNICODE file names in Windows */ /* Two data types are involved in UNICODE file names in Windows: UTF16 strings, the file names as stored by Windows, and UTF8 strings, the names as they are displayed and stored. The conversion between these two proceeds through CodePoints, the 'real' values of the characters, of which UTF16 strings and UTF8 strings are the compressed representations. The module consists of two levels: a set of static routines int UTF8_sequence_to_CodePoint(const UTF8 *s, UTF32 *cp), int UTF16_sequence_to_CodePoint(const UTF16 *s, UTF32 *cp), const UTF8 *CodePoint_to_UTF8_sequence(UTF32 c), const Fchar *CodePoint_to_UTF16_sequence(UTF32 cp) which convert from and to CodePoints; and two global routines const char *Fname2str(const Fchar *fn), const Fchar *str2Fname(const char *s) which convert from Unicode file names to UTF-8 strings and vice versa. Unicode file names are obtained by calls of Opendir() and Readdir(), as defined in fname.h. 
*/ #include "fname.h" /*Library module source prelude */ #undef _FNAME_CODE_ #ifndef lint #define _FNAME_CODE_ #endif #ifdef LIB #define _FNAME_CODE_ #endif #ifdef _FNAME_CODE_ /* Library module source code */ #ifdef _UNICODE typedef uint8_t UTF8; typedef uint16_t UTF16; typedef uint32_t UTF32; #define BAD_CodePoint (UTF32)(-1) /* mask of n left-aligned 1-s in a UTF8 */ #define LMASK(n) (((1<<(n))-1)<<(8-(n))) /* mask of n right-aligned 1-s */ #define RMASK(n) ((1u<<(n))-1) /* SEQUENCE -> CODEPOINT */ static int nmb_leading_ones_in_UTF8(UTF8 c) { int n = 0; while (c&LMASK(1)) { c <<= 1, n++; } return n; } static UTF32 get_UTF8_tail(const UTF8 *s, int n) { UTF32 res = 0; int i; /* scoop up n UTF-8s */ for (i = 0; i < n; i++) { if ((s[i]&LMASK(2)) != LMASK(1)) return BAD_CodePoint; res = (res<<6) + (s[i]&RMASK(6)); } return res; } static int /* number of UTF8s used; cp = BAD_CodePoint for error */ UTF8_sequence_to_CodePoint(const UTF8 *s, UTF32 *cp) { UTF8 head = s[0]; int head_length = 1; const UTF8 *tail = &s[1]; int tail_length; UTF32 tail_value; if ((head&LMASK(1)) == 0) { *cp = head; return head_length; } tail_length = nmb_leading_ones_in_UTF8(head) - 1; if (tail_length < 1 || tail_length > 3) goto error; tail_value = get_UTF8_tail(tail, tail_length); if (tail_value == BAD_CodePoint) goto error; *cp = ((head&RMASK(6-tail_length))<<(tail_length*6)) | tail_value; return head_length+tail_length; error: { int i = head_length; /* skip the head */ /* skip until new head */ while ((s[i]&LMASK(1)) != 0) { i++; } *cp = BAD_CodePoint; return i; } } static int is_in_BMP(UTF32 c) { /* Basic Multilingual Plane */ return c <= 0xD7FF || (0xE000 <= c && c < 0x10000); } static int is_high_surrogate(UTF16 c) { return 0xD800 <= c && c <= 0xDBFF; } static int is_low_surrogate(UTF16 c) { return 0xDC00 <= c && c <= 0xDFFF; } static int /* number of UTF16s used; cp = BAD_CodePoint for error */ UTF16_sequence_to_CodePoint(const UTF16 *s, UTF32 *cp) { /* adapted from code from http://unicode.org/faq/utf_bom.html */ UTF32 plane_number; UTF32 position; if (is_in_BMP(s[0])) { *cp = s[0]; return 1; } /* s[0:1] must be a surrogate pair */ if (!is_high_surrogate(s[0])) goto error; if (!is_low_surrogate(s[1])) goto error; /* get the plane number */ plane_number = (s[0] >> 6) & RMASK(5); plane_number = plane_number + 1; /* to offset it from the BMP */ /* get the position in the plane */ position = ((s[0] & RMASK(6)) << 10) | (s[1] & RMASK(10)); /* combine them */ *cp = plane_number << 16 | position; return 2; error: { int i = 1; /* skip one UTF-16 */ /* skip until acceptable UTF-16 */ while (!is_in_BMP(s[i]) && !is_high_surrogate(s[0])) { i++; } *cp = BAD_CodePoint; return i; } } /* CODEPOINT -> SEQUENCE */ static const UTF8 * /* transient */ CodePoint_to_UTF8_sequence(UTF32 c) { /* adapted from code by user R on stackoverflow.com */ static UTF8 buff[6]; UTF8 *bp = buff; if (c < 0x80) { /* it fits in 7 bits */ *bp++ = (c>>0)&RMASK(7); } else if (c < 0x800) { /* it fits in 11 bits */ *bp++ = 0xC0 | ((c>>6)&RMASK(5)); *bp++ = 0x80 | ((c>>0)&RMASK(6)); } else if (c < 0x10000) { /* it fits in 16 bits */ if (!is_in_BMP(c)) { /* it is in the forbidden zone */ return NULL; } *bp++ = 0xE0 | ((c>>12)&RMASK(4)); *bp++ = 0x80 | ((c>>6)&RMASK(6)); *bp++ = 0x80 | ((c>>0)&RMASK(6)); } else if (c < 0x110000) { /* it fits in 21 bits */ *bp++ = 0xF0 | ((c>>18)&RMASK(3)); *bp++ = 0x80 | ((c>>12)&RMASK(6)); *bp++ = 0x80 | ((c>>6)&RMASK(6)); *bp++ = 0x80 | ((c>>0)&RMASK(6)); } else return NULL; *bp = '\0'; return buff; } static UTF16 * /* 
transient */ CodePoint_to_UTF16_sequence(UTF32 cp) { /* adapted from code from http://unicode.org/faq/utf_bom.html */ static UTF16 res[3]; if (is_in_BMP(cp)) { res[0] = cp; res[1] = '\0'; return res; } if (cp >= 0x10000) { UTF16 position = (UTF16) cp; UTF16 plane_number = ((cp >> 16) & RMASK(5)) - 1; res[0] = 0xD800 | (plane_number << 6) | (position >> 10); res[1] = 0xDC00 | (position & RMASK(10)); res[2] = '\0'; return res; } else return NULL; } const char * /* transient */ Fname2str(const Fchar *fn) { /* converts a Fchar (wchar_t) string to an UTF-8 string */ static UTF8 res[1024]; UTF8 *rp = &res[0]; int i = 0; if (fn == NULL) return NULL; while (fn[i]) { UTF32 cp; const UTF8 *p; /* get Codepoint from one or two Fchar chars */ i += UTF16_sequence_to_CodePoint(&fn[i], &cp); if (cp == BAD_CodePoint) goto error; /* convert code point to UTF8 sequence */ p = CodePoint_to_UTF8_sequence(cp); if (p == NULL) goto error; /* append it to the output */ while (*p) { *rp++ = *p++; } continue; error: *rp++ = '?'; } *rp = '\0'; return (const char *)res; } const Fchar * /* transient */ str2Fname(const char *s) { /* converts a possibly UTF-8 string to an Fchar (wchar_t) string */ static Fchar res[512]; Fchar *rp = &res[0]; int i = 0; if (s == NULL) return NULL; while (s[i]) { UTF32 cp; const Fchar *p; /* get Codepoint from one to four UTF-8s */ i += UTF8_sequence_to_CodePoint((const UTF8 *)&s[i], &cp); if (cp == BAD_CodePoint) goto error; /* convert code point to UTF-16 sequence */ p = CodePoint_to_UTF16_sequence(cp); if (p == NULL) goto error; /* append it to the output */ while (*p) { *rp++ = *p++; } continue; error: *rp++ = '?'; } *rp = '\0'; return res; } /* OTHER UTF-16 ROUTINES */ int Stat(const Fchar *fn, struct stat *st) { /* why on earth does _wstat use a funny struct _stat ? */ return _wstat(fn, (struct _stat *)st); } FILE * Fopen(const Fchar *fn, const char *rb) { /* stream is still char* */ Fchar fn_copy[512]; /* avoid possible transiency of fn */ Fnamecpy(fn_copy, fn); return _tfopen(fn_copy, str2Fname(rb)); } #endif /* _UNICODE */ /* End library module source code */ #endif /* _FNAME_CODE_ */ #ifdef lint static void satisfy_lint(void *x) { /* lint cannot handle Fchar complications */ satisfy_lint(x); } #endif /* lint */ similarity-tester-2.89.orig/TechnReport0000644000000000000000000001725212540503627015166 0ustar CONCISE REPORT ON THE ALGORITHMS IN SIM 970623 INTRODUCTION The general outline of the similarity checker is as follows: 1. the files are read in (pass 1) 2. a forward-reference table is prepared 3. the set of interesting runs is determined 4. the line numbers of the runs are determined (pass 2) 5. the contents of the runs are printed in order (pass 3) To keep the memory requirements (relatively) small, the exact positions of the tokens are not recorded. This necessitates pass 2. See, however, the pertinent chapter. READING THE FILES Each file is tokenized using an lex-generated scanner appropriate for the input. Each token fits in one byte, possibly using all 8 bits. The tokens are stored in the array Token_Array[], which is extended by reallocation if it overflows. See tokenarray.c. Also, to optimize away pass 2, an attempt is made to remember the token positions of all beginnings of lines. The token-positions at BOL are stored in the array nl_buff[], which is also extended by reallocation, if needed. If the attempt fails due to lack of memory, nl_buff[] is abandoned, and pass2 will read the files instead. 
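As an illustration, the bookkeeping described above might look roughly as follows in C. This is only a sketch: the actual code lives in tokenarray.c and text.c, and store_token(), record_bol() and the doubling growth policy are illustrative choices made here, not the routines sim actually uses.

#include <stdlib.h>

typedef unsigned char Token;	/* "each token fits in one byte" */

static Token *Token_Array;	/* all tokens of all files */
static size_t n_tokens, tk_allocated;

static size_t *nl_buff;		/* token positions of the beginnings of lines */
static size_t n_lines, nl_allocated;
static int nl_buff_lost;	/* set when the attempt is abandoned */

/* store one token, extending Token_Array[] by reallocation if it overflows */
static void store_token(Token tk) {
	if (n_tokens == tk_allocated) {
		tk_allocated = tk_allocated ? 2*tk_allocated : 1024;
		Token_Array = realloc(Token_Array, tk_allocated*sizeof (Token));
		if (!Token_Array) exit(1);	/* out of memory */
	}
	Token_Array[n_tokens++] = tk;
}

/* remember the token position of a beginning of line; if memory runs out,
   abandon nl_buff[] and leave the line counting to pass 2 */
static void record_bol(void) {
	size_t *new_buff;

	if (nl_buff_lost) return;
	if (n_lines == nl_allocated) {
		nl_allocated = nl_allocated ? 2*nl_allocated : 256;
		new_buff = realloc(nl_buff, nl_allocated*sizeof (size_t));
		if (!new_buff) {
			/* abandon nl_buff[]; pass 2 will reread the file */
			free(nl_buff); nl_buff = 0; nl_buff_lost = 1;
			return;
		}
		nl_buff = new_buff;
	}
	nl_buff[n_lines++] = n_tokens;	/* token position at BOL */
}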
PREPARING THE FORWARD-REFERENCE TABLE

Text is compared by comparing every substring to all substrings to the
right of it; this process is in essence quadratic. However, only
substrings of length at least 'MinRunSize' are of interest, which gives
us the possibility to speed up this process by using a hash table.

Once the entire text has been read in, a forward-reference table
forward_references[] is made (see hash.c). For every position in the
text, we construct an index which gives the next position in the text
where a run of MinRunSize tokens starts that has the same hash code. If
there is no such run, the index is 0.

To fill in this array, we use a hash table last_index[], such that
last_index[i] is the index of the latest token with hash_code i, or 0 if
there is none. If at a given position p, we find that the text ahead of
us has hash code i, last_index[i] tells us which position in
forward_references[] will have to be updated to p. See
MakeForward_References().

For long text sequences (say hundreds of thousands of tokens), the
hashing is not really efficient any more since too many spurious matches
occur. Therefore, the forward reference table is scanned a second time,
eliminating from any chain all references to runs that do not start with
and end in the same token (actually this is a second hash code). For the
UNIX manuals this reduced the number of matches from 91.9% to 1.9% (of
which 0.06% was genuine).

DETERMINING THE SET OF INTERESTING RUNS

The overall structure of the routine Compare_Files() (see compare.c) is:

	for all new files
		for all texts it must be compared to
			for all positions in the new file
				for all positions in the text
					for ever increasing sizes
						try to match and keep the best

If for a given position in the new file a good run (i.e. one of at least
minimum length) has been found, the run is registered using a call of
add_run(), the run is skipped in the new file and searching continues at
the position after it. This prevents duplicate reports of runs.

Add_run() allocates a struct run for the run (see sim.h) which contains
two struct chunks and a quality description. It fills in the two chunks
with the pertinent info, one for the first file and one for the second
(which may be the same, if the run relates two chunks in the same file).
The run is then entered into the arbitrary-in-sorted-out store AISO (see
aiso.spc and aiso.bdy, a genuine generic abstract data type in C!), in
which it is inserted according to its quality. Both positions (struct
position) in both chunks in the run (so four in total) are each entered
in a linked list starting at the tx_pos field in the struct text of the
appropriate file.

When this is finished, the forward reference table can be deleted. So
the final results of this phase are visible both through the tx_pos
fields and through the aiso interface.

DETERMINING THE EXACT POSITION OF EACH RUN (PASS 2)

The purpose of this pass is to find for each chunk, which up to now is
known by token position only, its starting and ending line number (which
cannot be easily derived from the token position).

For each file that has a non-zero tx_pos field, i.e. that has some
interesting chunks, the positions in the tx_pos list are sorted on
ascending line number (they have been found in essentially arbitrary
order) by sort_pos() in pass2.c. Next we scan the pos list and the file
in parallel, updating the info in a position when we meet it.
A position carries an indication whether it is a starting or an ending
position, since slightly differing calculations have to be done in each
case.

Actually, if the nl_buff[] data structure still exists, the file is not
accessed at all and the data from nl_buff[] is used instead. This is
done transparently in buff.c.

PRINTING THE CONTENTS OF THE RUNS (PASS 3)

Since each struct run has now been completely filled in, this is simple;
the hard work is calculating the page layout. Pass3() accesses the aiso
store and retrieves from it the runs in descending order of importance.
Show_run() opens both files, positions them using the line numbers and
prints the runs.

================================================================

CODE EXCERPT OF THE SOFTWARE SIMILARITY TESTER SIM (980222)

sim:
	get command line options
	check the options
	init language, to precompute tables
	pass1, read the files
	# there is an array Token_Array[] that holds all input tokens
	make forward reference table
	# there is an array forward_references[], with one entry for
	# each token in the input; forward_references[i] gives the
	# token number where a token sequence starts with the same
	# hash value as the one starting at i
	compare various files to find runs
	delete forward reference table
	pass2, find newline positions of found similarities
	pass3, print the similarities

pass1, read the files:
	for each file
		divide the text into tokens
		store all tokens except newlines in Token_Array
		and try to keep a record of the newline positions

make forward reference table:
	# there are two independent hash functions, hash1() and hash2().
	# hash1(i) gives the hash value of the token sequence starting at i
	# likewise for hash2(i)
	set up the forward references using the last_index table:
		# there is an array last_index[], with one entry for each
		# possible hash value; last_index[i] gives the position in
		# forward_references[] at which i was most recently
		# encountered as a hash value
		for each file
			for all positions in file except the last MinRunSize
				set forward_references[] and update last_index[]
	use hash2() to clean out matches:
		for all tokens
			find first token in chain with same hash2 code
			short-circuit forward reference to it

compare:
	for all new files
		for all texts it must be compared to
			for all positions in the new file
				for all positions in the text
					for ever increasing sizes
						try to match and keep the best

try to match and keep the best:
	# using forward_references[], we find a list of positions in
	# which a matching token sequence will start;
	# scanning this list, we measure the maximum length of the
	# match and add the longest match to the run collection

pass2, find positions of found runs:
	for all files:
		sort the positions in the runs
		# we scan the pos list and the file in parallel
		for all positions inside this file
			if it matches a token position in a run
				record line number

pass3, print the similarities:
	for all runs
		# a run consists of two chunks
		open the files that hold the chunks and
			position them at the beginning of the chunk
		display the chunks

similarity-tester-2.89.orig/options.c0000644000000000000000000000502612540503627014641 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam.
$Id: options.c,v 1.10 2012-05-13 09:05:49 dick Exp $ */ #include #include #include "options.h" static char options[128]; static void bad_option( const char *progname, const struct option *optlist, char *msg, int c ); static int opt_value( const char *progname, const struct option *op, const char *arg, const char *argv[] ); static int do_arg( const char *progname, const struct option *optlist, const char *arg, const char *argv[] ); int do_options( const char *progname, const struct option *optlist, int argc, const char *argv[] ) { int skips = 0; while (argc > 0 && argv[0][0] == '-' && argv[0][1] != '\0') { int consumed = do_arg(progname, optlist, &argv[0][1], argv); argc -= consumed, argv += consumed, skips += consumed; } return skips; } void set_option(char ch) { options[(int)ch]++; } int is_set_option(int ch) { return options[ch]; } static int do_arg( const char *progname, const struct option *optlist, const char *arg, const char *argv[] ) { int consumed = 0; while (*arg) { /* treat argument character */ char opc = *arg++; const struct option *op; for (op = optlist; op->op_char; op++) { if (opc == op->op_char) { set_option(opc); if (op->op_indicator != ' ') { consumed = opt_value( progname, op, arg, argv ); } break; } } if (!op->op_char) { bad_option(progname, optlist, "*option -%c unknown", opc ); /*NOTREACHED*/ } if (consumed) break; } if (!consumed) { consumed = 1; } return consumed; } static int opt_value( const char *progname, const struct option *op, const char *arg, const char *argv[] ) { /* locate the option value */ if (*arg) { /* argument is continuation of option */ *op->op_stringp = arg; return 1; } else if (argv[1]) { /* argument follows option */ *op->op_stringp = argv[1]; return 2; } else { bad_option(progname, (struct option *)0, " option -%c requires another argument", op->op_char ); return 0; /*NOTREACHED*/ } } static void bad_option( const char *progname, const struct option *optlist, char *msg, int c ) { fprintf(stderr, "%s: ", progname); fprintf(stderr, &msg[1], c); fprintf(stderr, "\n"); if (msg[0] != ' ') { const struct option *op; fprintf(stderr, "Possible options are:\n"); for (op = optlist; op->op_char; op++) { fprintf(stderr, "\t-%c%c\t%s\n", op->op_char, op->op_indicator, op->op_text ); } } exit(1); } similarity-tester-2.89.orig/fname.h0000644000000000000000000000606212540503627014242 0ustar /* This file is part of the auxiliaries library. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: fname.h,v 1.12 2014-07-28 09:18:12 Gebruiker Exp $ */ /* Support for UNICODE file names */ /* To accommodate UNICODE file names on various platforms, this file defines the types Fchar file name character Dir_t struct for accessing a directory Dirent_t struct for accessing a directory entry and the functions Dir_t* Opendir(const Fchar*); Dirent_t* Readdir(Dir_t*); int Closedir(Dir_t*); Fchar *Fnamecpy(Fchar *dest, Fchar *source); Fchar *Fnamecat(Fchar*, const Fchar*); int Fnamelen(const Fchar*); int Fnamecmp(const Fchar*, const Fchar*); int Stat(const Fchar *fn, struct stat *st); FILE *Fopen(const Fchar *fn, const char *rb); The stream is still char*! int Fclose(FILE*); const char *Fname2str(const Fchar *fn); const Fchar *str2Fname(const char *s); The result of these two routines is transient: is is good only until the next call. The only way to obtain a file name is through readdir; the command line arguments are in ASCII. So a program can be adapted by replacing DIR by Dir_t, and struct dirent by Dirent_t. 
Compiling and correcting using the above replacements until there are no more errors or warnings will then yield an UTF-16 compatible program, as far as the input is concerned. Output is done in UTF-8; there seems to be no way to do output in UFT-16. For details about UTF-16 see fname.c. */ #ifndef _FNAME_H_ #define _FNAME_H_ /* lint cannot handle the weird code Windows throws at it, so even under Windows we claim to have UTF8 */ #ifdef MSDOS #define IS_UTF_16 #endif #ifdef lint #undef IS_UTF_16 #endif #ifdef IS_UTF_16 /* file names in UTF-16 */ #define _UNICODE #include #include #include /* Private entries */ typedef _TCHAR Fchar; typedef _WDIR Dir_t; typedef struct _tdirent Dirent_t; /* Public entries */ #define Opendir _topendir #define Closedir _tclosedir #define Readdir _treaddir #define Fnamecpy wcscpy #define Fnamecat wcscat #define Fnamelen (int)wcslen #define Fnamecmp wcscmp extern const char *Fname2str(const Fchar *fn); /* transient! */ extern const Fchar *str2Fname(const char *s); /* transient! */ extern int Stat(const Fchar *fn, struct stat *st); extern FILE *Fopen(const Fchar *fn, const char *rb);/* stream is still char* */ #define Fclose fclose #else /* not MSDOS */ /* file names are in UTF-8 */ #include #include #include /* life is simple */ /* Public entries */ typedef char Fchar; #define Fnamecpy strcpy #define Fnamecat strcat #define Fnamelen strlen #define Fnamecmp strcmp #define Fname2str(fn) (fn) #define str2Fname(s) (s) #define Stat(fn,st) stat(fn,st) typedef DIR Dir_t; typedef struct dirent Dirent_t; #define Opendir opendir #define Closedir closedir #define Readdir readdir #define Fopen fopen #define Fclose fclose #endif /* MSDOS */ #endif /* _FNAME_H_ */ similarity-tester-2.89.orig/debug.par0000644000000000000000000000142512540503627014573 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: debug.par,v 1.9 2015-01-15 11:31:05 dick Exp $ */ #undef DB_ALL #undef DB_HASH /* print hash value info */ #undef DB_FORW_REF /* check forward references */ #undef DB_FORW_REF_PRINT /* also print forward references */ #undef DB_TEXT /* print all text parts */ #undef DB_POS /* print positions in files */ #undef DB_NL_BUFF /* print the newline count buffer */ #undef DB_RUN /* print all run activity */ #undef DB_PERC /* print the percentaage match list */ #if defined(lint) || defined(DB_ALL) #define DB_HASH #define DB_FORW_REF #define DB_FORW_REF_PRINT #define DB_TEXT #define DB_POS #define DB_NL_BUFF #define DB_RUN #define DB_PERC #endif similarity-tester-2.89.orig/Makefile0000644000000000000000000003125612540503627014446 0ustar # This file is part of the software similarity tester SIM. # Written by Dick Grune, Vrije Universiteit, Amsterdam. # $Id: Makefile,v 2.66 2015-04-29 18:18:21 dick Exp $ # VERSION="\"2.89 of 2015-04-29\"" # E N T R Y P O I N T S help: @echo 'Entry points:' @echo 'test: compile sim_c and run a simple test' @echo '' @echo 'binaries: create all binaries' @echo 'exes: create executables in MSDOS' @echo 'install: install all binaries' @echo '' @echo 'man: create sim.pdf' @echo 'lint: lint sim sources' @echo 'simsim: run sim_c on the sim sources' @echo '' @echo 'fresh: remove created files' # # When you modify any of the following macros, do 'make clean' # # System dependencies # =============== including ../lib/sysidf.mk here # This file is part of the auxiliary libraries. 
# Written by Dick Grune, dick@dickgrune.com # $Id: sysidf.mk,v 1.17 2014-07-28 09:18:13 dick Exp $ # ################################################################ # For UNIX-like systems SYSTEM = UNIX SUBSYSTEM = SOLARIS # Locations DIR = /home/dick BINDIR = $(DIR)/bin.`$(DIR)/bin/arch` MAN1DIR = $(DIR)/man/man1 # Commands COPY = cp -p EXE = # LEX = flex LN = ln ZIP = zip -o ################################################################ # For MSDOS + MinGW SYSTEM = MSDOS SUBSYSTEM = MinGW # Locations DIR = C:/BIN BINDIR = C:/BIN MAN1DIR = C:/BIN # Commands (cp required, since xcopy cannot handle forward slashes) COPY = cp -p EXE = .exe LEX = flex LN = ln ZIP = zip -o ################################################################ # General, compiling: CC = gcc -D$(SYSTEM) -D$(SUBSYSTEM) LINT = lint -ansi -D$(SYSTEM) -D$(SUBSYSTEM) LINTFLAGS = -xh # General, manual: GROFF = groff -man GROFF = man2pdf .SUFFIXES: .1 .3 .pdf .1.pdf: $(GROFF) $< .3.pdf: $(GROFF) $< # =============== end of ../lib/sysidf.mk # Compiling MEMORY = -DMEMLEAK -DMEMCLOBBER CFLAGS = -DVERSION=$(VERSION) $(MEMORY) -O4 LIBFLAGS = # LINTFLAGS = $(MEMORY) -h# -X LOADFLAGS = -s# # strip symbol table LOADER = $(CC) $(LOADFLAGS) # Debugging CFLAGS += -DDEBUG DEBUG_C = debug.c DEBUG_O = debug.o DEBUG_H = debug.h # T E S T P A R A M E T E R S # percentage test TEST_LANG = c TEST_OPT = -p TEST_INP = *.l # text test TEST_LANG = text TEST_OPT = -r 5 TEST_INP = test_seplet # Rumen Stevanov test TEST_LANG = text TEST_OPT = -p TEST_INP = Rumen_Stefanov/new/*.txt # Kuhl test 1 TEST_LANG = c TEST_OPT = -p TEST_INP = Kuhl/simc1.c Kuhl/simc2.c # Kuhl test 2 TEST_LANG = c TEST_OPT = -p TEST_INP = Kuhl/simc2.c Kuhl/simc1.c # -i option test TEST_LANG = c TEST_OPT = -f -r 20 -R -i $@ SIM_C_CFS = $(SIM_CFS) $(ALG_CFS) $(CLANG_CFS) SIM_C_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(CLANG_OBJ) sim_c$(EXE): $(SIM_C_OBJ) $(LOADER) $(SIM_C_OBJ) -o $@ SIM_GRB += clang.c sim_c $(BINDIR)/sim_c$(EXE): sim_c$(EXE) $(COPY) sim_c$(EXE) $@ # The Java Language module: # Java JAVALANG_CFS = javalang.c JAVALANG_OBJ = javalang.o JAVALANG_SRC = javalang.l JAVALANG_FLS = $(JAVALANG_SRC) javalang.c: javalang.l $(LEX) -t javalang.l >$@ SIM_JAVA_CFS = $(SIM_CFS) $(ALG_CFS) $(JAVALANG_CFS) SIM_JAVA_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(JAVALANG_OBJ) sim_java$(EXE): $(SIM_JAVA_OBJ) $(LOADER) $(SIM_JAVA_OBJ) -o $@ SIM_GRB += javalang.c sim_java $(BINDIR)/sim_java$(EXE): sim_java$(EXE) $(COPY) sim_java$(EXE) $@ # The Pascal Language module: # Pascal PASCLANG_CFS = pascallang.c PASCLANG_OBJ = pascallang.o PASCLANG_SRC = pascallang.l PASCLANG_FLS = $(PASCLANG_SRC) pascallang.c: pascallang.l $(LEX) -t pascallang.l >pascallang.c SIM_PASC_CFS = $(SIM_CFS) $(ALG_CFS) $(PASCLANG_CFS) SIM_PASC_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(PASCLANG_OBJ) sim_pasc$(EXE): $(SIM_PASC_OBJ) $(LOADER) $(SIM_PASC_OBJ) -o $@ SIM_GRB += pascallang.c sim_pasc $(BINDIR)/sim_pasc$(EXE): sim_pasc$(EXE) $(COPY) sim_pasc$(EXE) $@ # The Modula-2 Language module: # Modula-2 M2LANG_CFS = m2lang.c M2LANG_OBJ = m2lang.o M2LANG_SRC = m2lang.l M2LANG_FLS = $(M2LANG_SRC) m2lang.c: m2lang.l $(LEX) -t m2lang.l >$@ SIM_M2_CFS = $(SIM_CFS) $(ALG_CFS) $(M2LANG_CFS) SIM_M2_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(M2LANG_OBJ) sim_m2$(EXE): $(SIM_M2_OBJ) $(LOADER) $(SIM_M2_OBJ) -o $@ SIM_GRB += m2lang.c sim_m2 $(BINDIR)/sim_m2$(EXE): sim_m2$(EXE) $(COPY) sim_m2$(EXE) $@ # The Lisp Language module: # Lisp LISPLANG_CFS = lisplang.c LISPLANG_OBJ = lisplang.o LISPLANG_SRC = lisplang.l LISPLANG_FLS = $(LISPLANG_SRC) lisplang.c: lisplang.l $(LEX) -t 
lisplang.l >$@ SIM_LISP_CFS = $(SIM_CFS) $(ALG_CFS) $(LISPLANG_CFS) SIM_LISP_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(LISPLANG_OBJ) sim_lisp$(EXE): $(SIM_LISP_OBJ) $(LOADER) $(SIM_LISP_OBJ) -o $@ SIM_GRB += lisplang.c sim_lisp $(BINDIR)/sim_lisp$(EXE): sim_lisp$(EXE) $(COPY) sim_lisp$(EXE) $@ # The Miranda Language module: # Miranda MIRALANG_CFS = miralang.c MIRALANG_OBJ = miralang.o MIRALANG_SRC = miralang.l MIRALANG_FLS = $(MIRALANG_SRC) miralang.c: miralang.l $(LEX) -t miralang.l >$@ SIM_MIRA_CFS = $(SIM_CFS) $(ALG_CFS) $(MIRALANG_CFS) SIM_MIRA_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(MIRALANG_OBJ) sim_mira$(EXE): $(SIM_MIRA_OBJ) $(LOADER) $(SIM_MIRA_OBJ) -o $@ SIM_GRB += miralang.c sim_mira $(BINDIR)/sim_mira$(EXE): sim_mira$(EXE) $(COPY) sim_mira$(EXE) $@ # The Text module: # Text TEXTLANG_CFS = textlang.c TEXTLANG_OBJ = textlang.o TEXTLANG_SRC = textlang.l TEXTLANG_FLS = $(TEXTLANG_SRC) textlang.c: textlang.l $(LEX) -t textlang.l >$@ SIM_TEXT_CFS = $(SIM_CFS) $(TEXTLANG_CFS) SIM_TEXT_OBJ = $(SIM_OBJ) $(TEXTLANG_OBJ) sim_text$(EXE): $(SIM_TEXT_OBJ) $(LOADER) $(SIM_TEXT_OBJ) -o $@ SIM_GRB += textlang.c sim_text $(BINDIR)/sim_text$(EXE): sim_text$(EXE) $(COPY) sim_text$(EXE) $@ # T E S T S # Some simple tests: .PHONY: sim.res percentages.res sim.res: sim_$(TEST_LANG)$(EXE) # no TEST_INP required, for error tests ./sim_$(TEST_LANG)$(EXE) $(TEST_OPT) $(TEST_INP) stream.res: sim_$(TEST_LANG)$(EXE) ./sim_$(TEST_LANG)$(EXE) -- $(TEST_OPT) $(TEST_INP) >$@ wc $@ $(TEST_INP) percentages.res:sim_$(TEST_LANG)$(EXE) ./sim_$(TEST_LANG)$(EXE) -p $(TEST_OPT) $(TEST_INP) TEST_GRB = stream.res # More simple tests, using the C version only: simsim: sim_c$(EXE) $(SIM_CFS) $(ALG_CFS) ./sim_c$(EXE) -fr 20 $(SIM_CFS) $(ALG_CFS) # Lint lint: $(SIM_SRC) $(ALG_SRC) $(ABS_CFS) $(LINT) $(LINTFLAGS) $(SIM_CFS) $(ALG_CFS) $(ABS_CFS) # O T H E R E N T R I E S # Sets of files: general, modules, main programs, languages CFS = $(SIM_CFS) $(ALG_CFS) \ $(CLANG_CFS) $(JAVALANG_CFS) $(PASCLANG_CFS) $(M2LANG_CFS) \ $(LISPLANG_CFS) $(MIRALANG_CFS) $(TEXTLANG_CFS) OBJ = $(SIM_OBJ) $(ALG_OBJ) \ $(CLANG_OBJ) $(JAVALANG_OBJ) $(PASCLANG_OBJ) $(M2LANG_OBJ) \ $(LISPLANG_OBJ) $(MIRALANG_OBJ) $(TEXTLANG_OBJ) SRC = $(SIM_SRC) $(ALG_SRC) \ $(CLANG_SRC) $(JAVALANG_SRC) $(PASCLANG_SRC) $(M2LANG_SRC) \ $(LISPLANG_SRC) $(MIRALANG_SRC) $(TEXTLANG_SRC) FLS = $(SIM_FLS) $(ALG_FLS) \ $(CLANG_FLS) $(JAVALANG_FLS) $(PASCLANG_FLS) $(M2LANG_FLS) \ $(LISPLANG_FLS) $(MIRALANG_FLS) $(TEXTLANG_FLS) \ sysidf.mk sysidf.msdos sysidf.unix DOC = README sim.1 sim.txt sim.html ChangeLog Answers TechnReport # Documentation man: sim.pdf # Installation install_all: install # just a synonym install: $(MAN1DIR)/sim.1 \ $(BINDIR)/sim_c$(EXE) \ $(BINDIR)/sim_java$(EXE) \ $(BINDIR)/sim_pasc$(EXE) \ $(BINDIR)/sim_m2$(EXE) \ $(BINDIR)/sim_lisp$(EXE) \ $(BINDIR)/sim_mira$(EXE) \ $(BINDIR)/sim_text$(EXE) $(MAN1DIR)/sim.1: sim.1 $(COPY) sim.1 $@ # Clean-up .PHONY: clean fresh clean: -rm -f *.o -rm -f $(SIM_GRB) -rm -f $(TEST_GRB) -rm -f $(DOC_GRB) -rm -f a.out a.exe sim.txt core mon.out fresh: clean -rm -f *.exe # D E P E N D E N C I E S # DO NOT DELETE THIS LINE -- make depend depends on it. 
ForEachFile.o: ForEachFile.c ForEachFile.h fname.h Malloc.o: Malloc.c any_int.h Malloc.h add_run.o: add_run.c sim.h debug.par text.h runs.h aiso.spc percentages.h \ Malloc.h options.h error.h add_run.h algollike.o: algollike.c options.h error.h token.h algollike.h any_int.o: any_int.c any_int.h clang.o: clang.c options.h token.h language.h algollike.h idf.h lex.h \ lang.h compare.o: compare.c sim.h text.h token.h tokenarray.h hash.h language.h \ options.h add_run.h compare.h debug.par count_sim_dup.o: count_sim_dup.c debug.o: debug.c debug.h error.o: error.c sim.h error.h fname.o: fname.c fname.h hash.o: hash.c system.par debug.par sim.h text.h Malloc.h error.h \ any_int.h token.h language.h tokenarray.h options.h hash.h idf.o: idf.c system.par token.h idf.h javalang.o: javalang.c options.h token.h language.h algollike.h idf.h \ lex.h lang.h lang.o: lang.c token.h language.h algollike.h idf.h lex.h lang.h language.o: language.c token.h language.h lex.o: lex.c lex.h lisplang.o: lisplang.c token.h language.h algollike.h lex.h lang.h idf.h m2lang.o: m2lang.c options.h token.h language.h algollike.h idf.h lex.h \ lang.h miralang.o: miralang.c token.h language.h algollike.h lex.h lang.h idf.h newargs.o: newargs.c sim.h ForEachFile.h fname.h Malloc.h error.h \ newargs.h options.o: options.c options.h pascallang.o: pascallang.c options.h token.h language.h algollike.h idf.h \ lex.h lang.h pass1.o: pass1.c debug.par sim.h text.h token.h tokenarray.h lang.h \ error.h options.h pass1.h pass2.o: pass2.c debug.par sim.h token.h text.h lang.h pass2.h \ sortlist.bdy pass3.o: pass3.c system.par debug.par sim.h text.h token.h runs.h \ aiso.spc Malloc.h error.h options.h pass3.h percentages.h percentages.o: percentages.c debug.par sim.h text.h runs.h aiso.spc \ options.h Malloc.h error.h percentages.h sortlist.bdy runs.o: runs.c sim.h text.h runs.h aiso.spc debug.par aiso.bdy Malloc.h sim.o: sim.c system.par settings.par sim.h options.h newargs.h token.h \ language.h error.h text.h runs.h aiso.spc hash.h compare.h pass1.h \ pass2.h pass3.h percentages.h stream.h lang.h Malloc.h any_int.h stream.o: stream.c system.par sim.h options.h token.h lang.h stream.h t.o: t.c text.o: text.c debug.par sim.h token.h stream.h lang.h Malloc.h options.h \ error.h text.h textlang.o: textlang.c sim.h token.h idf.h lex.h lang.h language.h token.o: token.c token.h tokenarray.o: tokenarray.c error.h Malloc.h token.h lang.h tokenarray.h similarity-tester-2.89.orig/options.h0000644000000000000000000000115712540503627014647 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: options.h,v 1.8 2012-05-13 09:05:49 dick Exp $ */ /* Setting and consulting command line options */ struct option { char op_char; /* char as in call */ char *op_text; /* explanatory text */ char op_indicator; /* type indicator, N = int, F = file name */ const char **op_stringp;/* string value to be picked up */ }; extern void set_option(char ch); extern int is_set_option(int ch); extern int do_options( const char *progname, const struct option *optlist, int argc, const char *argv[] ); similarity-tester-2.89.orig/settings.par0000644000000000000000000000044612540503627015347 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: settings.par,v 1.2 2012-06-05 14:58:39 dick Exp $ */ #define DEFAULT_MIN_RUN_SIZE 24 /* default minimum run size */ #define DEFAULT_PAGE_WIDTH 80 /* default page width */ similarity-tester-2.89.orig/pass2.c0000644000000000000000000000744512540503627014205 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: pass2.c,v 2.23 2015-01-12 09:16:13 dick Exp $ */ #include #include "debug.par" #include "sim.h" #include "token.h" #include "text.h" #include "lang.h" #include "pass2.h" #ifdef DB_POS static void db_print_pos_list(const char *, const struct text *); static void db_print_lex(const char *); #endif static void pass2_txt(struct text *txt); void Retrieve_Runs(void) { int n; for (n = 0; n < Number_of_Texts; n++) { pass2_txt(&Text[n]); } } /* begin instantiate static void sort_pos_list(struct position **) */ #define SORT_STRUCT position #define SORT_NAME sort_pos_list #define SORT_BEFORE(p1,p2) ((p1)->ps_tk_cnt < (p2)->ps_tk_cnt) #define SORT_NEXT ps_next #include "sortlist.bdy" /* end instantiate sort_pos_list() */ static void pass2_txt(struct text *txt) { struct position *pos; size_t old_nl_cnt; if (!txt->tx_pos) /* no need to scan the file */ return; /* Open_Text() initializes lex_nl_cnt and lex_tk_cnt */ if (!Open_Text(Second_Pass, txt)) { fprintf(stderr, ">>>> File %s disappeared <<<<\n", txt->tx_fname ); return; } /* Sort the positions so they can be matched to the file; the linked list of struct positions snakes through the struct positions in the struct chunks in the struct runs. */ #ifdef DB_POS db_print_pos_list("before sorting", txt); #endif /* DB_POS */ sort_pos_list(&txt->tx_pos); #ifdef DB_POS db_print_pos_list("after sorting", txt); #endif /* DB_POS */ #ifdef DB_NL_BUFF db_print_nl_buff(txt->tx_nl_start, txt->tx_nl_limit); #endif /* DB_NL_BUFF */ #ifdef DB_POS fprintf(Debug_File, "\n**** DB_PRINT_SCAN of %s ****\n", txt->tx_fname); #endif /* DB_POS */ old_nl_cnt = 1; pos = txt->tx_pos; while (pos) { /* we scan the pos list and the file in parallel */ /* find the corresponding line */ while (pos->ps_tk_cnt >= lex_tk_cnt) { /* pos does not refer to this line, try the next */ /* shift the administration */ old_nl_cnt = lex_nl_cnt; /* and get the next eol position */ if (!Next_Text_EOL_Obtained()) { /* reached end of file without obtaining EOL */ if (!txt->tx_EOL_terminated) { /* that's OK then */ } else { fprintf(stderr, ">>>> File %s modified <<<<\n", txt->tx_fname ); } break; } #ifdef DB_POS db_print_lex(txt->tx_fname); #endif /* DB_POS */ } /* fill in the pos */ switch (pos->ps_type) { case 0: /* first token of run */ pos->ps_nl_cnt = old_nl_cnt; break; case 1: /* last token of run */ pos->ps_nl_cnt = lex_nl_cnt; break; } /* and get the next pos */ pos = pos->ps_next; } #ifdef DB_POS db_print_pos_list("after scanning", txt); #endif /* DB_POS */ /* Flush the flex buffers; it's easier than using YY_BUFFER_STATE. */ while (Next_Text_EOL_Obtained()); Close_Text(Second_Pass, txt); } #ifdef DB_POS static void db_print_pos(const struct position *pos) { fprintf(Debug_File, "pos type = %s; %s count = %u", (pos->ps_type == 0 ? 
"first" : " last"), token_name, pos->ps_tk_cnt ); fprintf(Debug_File, ", line # = "); if (pos->ps_nl_cnt == (size_t) -1) { fprintf(Debug_File, ""); } else { fprintf(Debug_File, "%u", pos->ps_nl_cnt); } fprintf(Debug_File, "\n"); } static void db_print_pos_list(const char *msg, const struct text *txt) { fprintf(Debug_File, "\n**** DB_PRINT_POS_LIST of %s, %s ****\n", txt->tx_fname, msg); const struct position *pos = txt->tx_pos; while (pos) { db_print_pos(pos); pos = pos->ps_next; } fprintf(Debug_File, "\n"); } static void db_print_lex(const char *fn) { fprintf(Debug_File, "%s: lex_tk_cnt = %u, lex_nl_cnt = %u, lex_token = ", fn, lex_tk_cnt, lex_nl_cnt); fprint_token(Debug_File, lex_token); fprintf(Debug_File, "\n"); } #endif /* DB_POS */ similarity-tester-2.89.orig/lang.c0000644000000000000000000000146712540503627014074 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lang.c,v 2.8 2014-01-27 20:50:51 dick Exp $ */ /* This is a dummy implementation of the module 'lang'. Its actual implementation derives from one of the *lang.l files. */ #include #include #include "token.h" #include "language.h" #include "algollike.h" #include "idf.h" #include "lex.h" #include "lang.h" FILE *yyin; int yylex(void) { abort(); #ifdef lint (void)May_Be_Start_Of_Algol_Run(0); (void)Best_Algol_Run_Size(0, 0); (void)idf_in_list(0, 0, 0, 0); (void)idf_hashed(0); #endif return 0; } void yystart(void) { abort(); #ifdef lint Init_Algol_Language(0, 0, 0, 0); #endif } Token lex_token; size_t lex_nl_cnt; size_t lex_tk_cnt; size_t lex_non_ascii_cnt; similarity-tester-2.89.orig/debug.c0000644000000000000000000000337312540503627014237 0ustar /* This file is part of the debugging module DEBUG. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: debug.c,v 1.7 2014-09-25 06:58:25 Gebruiker Exp $ */ #include #include #include #include "debug.h" /*Library module source prelude */ #undef _MODULE_CODE_ #ifndef lint #define _MODULE_CODE_ #endif #ifdef LIB #define _MODULE_CODE_ #endif #ifdef _MODULE_CODE_ /* Library module source code */ #ifdef DEBUG static void wr_char(char ch) { write(2, &ch, 1); } static void wr_num(int b,int v) { if (v >= b) { wr_num(b, v/b); } wr_char("0123456789ABCDEF"[v%b]); } static void wr_str(const char *s) { while (*s) { wr_char(*s++); } } void wr_info(const char *s, int v) { /* print the string */ if (s) { int cnt = 0; while (*s) { int ch = *s++ &0377; /* cut short a possibly corrupted string */ if (cnt++ > 50) { wr_str("..."); break; } /* put not thy faith in chars, signed or unsigned */ if (isprint(ch)) { wr_char(ch); } else { switch (ch) { case '\n': wr_str("\\n"); break; case '\t': wr_str("\\t"); break; case '\r': wr_str("\\r"); break; case '\f': wr_str("\\f"); break; default: wr_char('\\'); wr_char(ch / 0100 % 010 + '0'); wr_char(ch / 010 % 010 + '0'); wr_char(ch / 01 % 010 + '0'); break; } } } } else { wr_str(""); } /* print the value */ wr_char(' '); if (v < 0) { wr_char('-'); v = -v; } wr_num(10, v); wr_char('\n'); } #else /*ARGSUSED*/ void wr_info(const char *s, int v) { } #endif /* DEBUG */ /* End library module source code */ #endif /* _MODULE_CODE_ */ #ifdef lint static void satisfy_lint(void *x) { wr_info((char *)x, 0); satisfy_lint(x); } #endif /* lint */ similarity-tester-2.89.orig/clang.l0000644000000000000000000001240112540503627014236 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: clang.l,v 2.20 2013-04-28 16:30:40 dick Exp $ */ /* C language front end for the similarity tester. Author: Dick Grune */ #include "options.h" #include "token.h" #include "language.h" #include "algollike.h" #include "idf.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; size_t lex_nl_cnt; size_t lex_tk_cnt; size_t lex_non_ascii_cnt; /* Language-dependent data */ /* Data for module idf */ static const struct idf ppcmd[] = { {"define", META('d')}, {"else", META('e')}, {"endif", META('E')}, {"if", META('i')}, {"ifdef", META('I')}, {"ifndef", META('x')}, {"include", MTCT('I')}, {"line", META('l')}, {"undef", META('u')} }; static const struct idf reserved[] = { {"auto", NORM('a')}, {"break", NORM('b')}, {"case", NORM('c')}, {"char", NORM('C')}, {"continue", CTRL('C')}, {"default", NORM('d')}, {"do", NORM('D')}, {"double", CTRL('D')}, {"else", NORM('e')}, {"enum", NORM('E')}, {"extern", CTRL('E')}, {"float", NORM('f')}, {"for", NORM('F')}, {"goto", NORM('g')}, {"if", NORM('i')}, {"int", NORM('I')}, {"long", NORM('l')}, {"register", No_Token}, {"return", NORM('r')}, {"short", NORM('s')}, {"sizeof", NORM('S')}, {"static", CTRL('S')}, {"struct", META('s')}, {"switch", META('S')}, {"typedef", NORM('t')}, {"union", NORM('u')}, {"unsigned", NORM('U')}, {"void", No_Token}, {"while", NORM('w')} }; /* Special treatment of identifiers */ static Token idf2token(int hashing) { Token tk; tk = idf_in_list(yytext, reserved, sizeof reserved, IDF); if (Token_EQ(tk, IDF) && hashing) { /* return a one-Token hash code */ tk = idf_hashed(yytext); } return tk; } /* Token sets for module algollike */ static const Token Non_Finals[] = { IDF, /* identifier */ NORM('{'), NORM('('), NORM('a'), /* auto */ NORM('b'), /* break */ NORM('c'), /* case */ NORM('C'), /* char */ CTRL('C'), /* continue */ NORM('d'), /* default */ NORM('D'), /* do */ CTRL('D'), /* double */ NORM('E'), /* enum */ CTRL('E'), /* extern */ NORM('f'), /* float */ NORM('F'), /* for */ NORM('g'), /* goto */ NORM('i'), /* if */ NORM('I'), /* int */ NORM('l'), /* long */ NORM('r'), /* return */ NORM('s'), /* short */ CTRL('S'), /* static */ META('s'), /* struct */ META('S'), /* switch */ NORM('t'), /* typedef */ NORM('u'), /* union */ NORM('U'), /* unsigned */ NORM('w'), /* while */ No_Token }; static const Token Non_Initials[] = { NORM(')'), NORM('}'), NORM(';'), No_Token }; static const Token Openers[] = { NORM('{'), NORM('('), NORM('['), No_Token }; static const Token Closers[] = { NORM('}'), NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } size_t Best_Run_Size(const Token *str, size_t size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) AnyQuoted (\\.) StrChar ([^"\n\\]|{AnyQuoted}) ChrChar ([^'\n\\]|{AnyQuoted}) StartComment ("/*") EndComment ("*/") SafeComChar ([^*\n]) UnsafeComChar ("*") MSComment ("//"{MSCommentChar}*) MSCommentChar ([^\n]) Digit ([0-9a-fA-F]) Idf ([A-Za-z][A-Za-z0-9_]*) %% {StartComment} { /* We do not have one single pattern to match a comment (although one can be written), for two reasons. The matched string might overflow lex-internal buffers like yysbuf and yytext; and the pattern would be very complicated and impair maintainability. 
So we break up the string into safe chunks and keep track of where we are in a start condition . */ BEGIN Comment; } {SafeComChar}+ { /* safe comment chunk */ } {UnsafeComChar} { /* unsafe char, read one by one */ } "\n" { /* to break up long comments */ return_eol(); } {EndComment} { /* end-of-comment */ BEGIN INITIAL; } {MSComment} { /* ignore */ } \"{StrChar}*\" { /* strings */ return_ch('"'); } \'{ChrChar}+\' { /* characters */ return_ch('\''); } ^#{Layout}*include.* { /* ignore #include lines */ } ^#{Layout}*{Idf} { /* a preprocessor line */ char *idf = yytext+1; /* skip layout in front of preprocessor identifier */ while (*idf == ' ' || *idf == '\t') { idf++; } return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#'))); } (0x)?{Digit}+("l"|"L")? { /* numeral, passed as an identifier */ return_tk(IDF); } {Idf}/"(" { /* identifier in front of ( */ Token tk; tk = idf2token(is_set_option('F')); if (!Token_EQ(tk, No_Token)) return_tk(tk); } {Idf} { /* identifier */ Token tk; tk = idf2token(0 /* no hashing */); if (!Token_EQ(tk, No_Token)) return_tk(tk); } \; { /* semicolon, conditionally ignored */ if (is_set_option('f')) return_ch(yytext[0]); } \n { /* count newlines */ return_eol(); } {Layout} { /* ignore layout */ } {ASCII95} { /* copy other text */ return_ch(yytext[0]); } . { /* count non-ASCII chars */ lex_non_ascii_cnt++; } %% /* More language-dependent code */ void yystart(void) { BEGIN INITIAL; } similarity-tester-2.89.orig/debug.h0000644000000000000000000000154612540503627014244 0ustar /* This file is part of the debugging module DEBUG. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: debug.h,v 1.9 2014-09-25 06:58:26 Gebruiker Exp $ */ #ifndef _DEBUG_H_ #define _DEBUG_H_ /**** The module DEBUG defines one routine, extern void wr_info(const char *str, int val); which, when compiled with a -DDEBUG option, writes the string str, a space character, the value val in decimal, and a newline to standard error output (file descriptor 2), without interfering with other program activities. This allows debugging info to be obtained in the presence of sudden crashes and other nefarious program activity. Compiled without the -DDEBUG option wr_info does nothing. This allows easy switching off of the debugging feature by recompiling debug.c. ****/ /* Public entries */ extern void wr_info(const char *s, int v); #endif /* _DEBUG_H_ */ similarity-tester-2.89.orig/javalang.l0000644000000000000000000001327012540503627014742 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: javalang.l,v 1.14 2013-04-28 16:30:41 dick Exp $ */ /* Java language front end for the similarity tester. 
Author: Dick Grune */ #include "options.h" #include "token.h" #include "language.h" #include "algollike.h" #include "idf.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; size_t lex_nl_cnt; size_t lex_tk_cnt; size_t lex_non_ascii_cnt; /* Language-dependent data */ static const struct idf reserved[] = { {"abstract", NORM('a')}, {"boolean", NORM('b')}, {"break", NORM('B')}, {"byte", CTRL('B')}, {"case", NORM('c')}, {"catch", NORM('C')}, {"char", CTRL('C')}, {"class", META('c')}, {"continue", META('C')}, {"default", NORM('d')}, {"do", NORM('D')}, {"double", CTRL('D')}, {"else", NORM('e')}, {"extends", NORM('E')}, {"false", NORM('g')}, /* Boolean literal */ {"final", NORM('f')}, {"finally", NORM('F')}, {"float", CTRL('F')}, {"for", META('f')}, {"if", NORM('i')}, {"implements", NORM('I')}, {"import", CTRL('I')}, {"instanceof", META('i')}, {"int", META('I')}, {"interface", MTCT('I')}, {"long", NORM('l')}, {"native", NORM('n')}, {"new", NORM('N')}, {"null", CTRL('N')}, /* null literal */ {"package", NORM('p')}, {"private", NORM('P')}, {"protected", CTRL('P')}, {"public", META('p')}, {"return", NORM('r')}, {"short", NORM('s')}, {"static", NORM('S')}, {"super", CTRL('S')}, {"switch", META('s')}, {"synchronized",META('S')}, {"this", NORM('t')}, {"throw", NORM('T')}, {"throws", CTRL('T')}, {"true", META('t')}, /* Boolean literal */ {"void", NORM('v')}, {"volatile", NORM('V')}, {"while", NORM('w')} }; /* Special treatment of identifiers */ static Token idf2token(int hashing) { Token tk; tk = idf_in_list(yytext, reserved, sizeof reserved, IDF); if (Token_EQ(tk, IDF) && hashing) { /* return a one-Token hash code */ tk = idf_hashed(yytext); } return tk; } /* Token sets for module algollike */ const Token Non_Finals[] = { IDF, /* identifier */ NORM('{'), NORM('('), NORM('a'), /* abstract */ NORM('b'), /* boolean */ NORM('B'), /* break */ CTRL('B'), /* byte */ NORM('c'), /* case */ NORM('C'), /* catch */ CTRL('C'), /* char */ META('c'), /* class */ META('C'), /* continue */ NORM('d'), /* default */ NORM('D'), /* do */ CTRL('D'), /* double */ NORM('e'), /* else */ NORM('E'), /* extends */ NORM('f'), /* final */ NORM('F'), /* finally */ CTRL('F'), /* float */ META('f'), /* for */ NORM('i'), /* if */ NORM('I'), /* implements */ CTRL('I'), /* import */ META('i'), /* instanceof */ META('I'), /* int */ MTCT('I'), /* interface */ NORM('l'), /* long */ NORM('n'), /* native */ NORM('N'), /* new */ NORM('p'), /* package */ NORM('P'), /* private */ CTRL('P'), /* protected */ META('p'), /* public */ NORM('r'), /* return */ NORM('s'), /* short */ NORM('S'), /* static */ CTRL('S'), /* super */ META('s'), /* switch */ META('S'), /* synchronized */ NORM('T'), /* throw */ CTRL('T'), /* throws */ NORM('v'), /* void */ NORM('V'), /* volatile */ NORM('w'), /* while */ No_Token }; const Token Non_Initials[] = { NORM(')'), NORM('}'), NORM(';'), No_Token }; const Token Openers[] = { NORM('{'), NORM('('), NORM('['), No_Token }; const Token Closers[] = { NORM('}'), NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } size_t Best_Run_Size(const Token *str, size_t size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) Digit ([0-9a-fA-F]) UniCode (\\u{Digit}{Digit}{Digit}{Digit}) AnyQuoted ((\\.)|{UniCode}) 
StrChar ([^"\n\\]|{AnyQuoted}) ChrChar ([^'\n\\]|{AnyQuoted}) StartComment ("/*") EndComment ("*/") SafeComChar ([^*\n]) UnsafeComChar ("*") SingleLineCom ("//".*) Idf ([A-Za-z][A-Za-z0-9_]*) %% {StartComment} { /* We do not have one single pattern to match a comment (although one can be written), for two reasons. The matched string might overflow lex-internal buffers like yysbuf and yytext; and the pattern would be very complicated and overtax lex. So we break up the string into safe chunks and keep track of where we are in a start condition . */ BEGIN Comment; } {SafeComChar}+ { /* safe comment chunk */ } {UnsafeComChar} { /* unsafe char, read one by one */ } "\n" { /* to break up long comments */ return_eol(); } {EndComment} { /* end-of-comment */ BEGIN INITIAL; } {SingleLineCom}"\n" { /* single-line comment */ return_eol(); } \"{StrChar}*\" { /* strings */ return_ch('"'); } \'{ChrChar}+\' { /* characters */ return_ch('\''); } (0x)?{Digit}+("l"|"L")? { /* numeral, passed as an identifier */ return_tk(IDF); } "import"{Layout}[^;]*; { /* import statement; ignore */ } {Idf}/"(" { /* identifier in front of ( */ Token tk; tk = idf2token(is_set_option('F')); if (!Token_EQ(tk, No_Token)) return_tk(tk); } {Idf} { /* identifier */ Token tk; tk = idf2token(0 /* no hashing */); if (!Token_EQ(tk, No_Token)) return_tk(tk); } \; { /* semicolon, conditionally ignored */ if (is_set_option('f')) return_ch(yytext[0]); } \n { /* count newlines */ return_eol(); } {Layout} { /* ignore layout */ } {ASCII95} { /* copy other text */ return_ch(yytext[0]); } . { /* count non-ASCII chars */ lex_non_ascii_cnt++; } %% /* More language-dependent code */ void yystart(void) { BEGIN INITIAL; } similarity-tester-2.89.orig/sim.h0000644000000000000000000000143512540503627013743 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: sim.h,v 2.17 2014-01-26 21:52:59 dick Exp $ */ #include extern int Min_Run_Size; extern int Page_Width; extern FILE *Output_File; extern FILE *Debug_File; extern const char *token_name; /* possibly modified in *lang.l */ extern int Threshold_Percentage; /* threshold percentage */ extern const char *progname; /* for error reporting */ extern const char *min_run_string; extern const char *threshold_string; extern int is_new_old_separator(const char *s); extern const char *size_t2string(size_t s); /* All output goes through designated files, so we block printf, etc. */ #undef printf #define printf use_fprintf #undef putchar #define putchar use_fprintf similarity-tester-2.89.orig/sortlist.bdy0000644000000000000000000000303512540503627015363 0ustar /* Module: Sort Linked Lists Author: dick@dickgrune.com (Dick Grune, Amstelveen) Version: 2015-01-18 Description: This is the implementation part of a generic routine that sorts linked lists. Instantiation: See sortlist.spc */ #ifndef _SORT_EXTERN_DEFINED static #endif void SORT_NAME(struct SORT_STRUCT **l_hook) { /* by split-sort-merge */ struct SORT_STRUCT *lst = *l_hook; if (lst == 0) return; /* the empty list is sorted */ if (lst->SORT_NEXT == 0) return; /* a 1-element list is sorted */ /* There are at least two elements; split them into two sublists. 
*/ struct SORT_STRUCT *q0 = 0, *q1 = 0; /* starts of the sublists */ struct SORT_STRUCT **q_hook[2]; /* append hooks for the lists */ q_hook[0] = &q0, q_hook[1] = &q1; int q_cnt = 0; /* pertinemt sublist pointer */ while (lst) { /* Detach the head element */ struct SORT_STRUCT *l = lst; lst = lst->SORT_NEXT; l->SORT_NEXT = 0; /* and append it to the pertinent sublist. */ *q_hook[q_cnt] = l; q_hook[q_cnt] = &l->SORT_NEXT; q_cnt = 1 - q_cnt; /* switch pertinent sublist */ } /* Sort recursively. */ SORT_NAME(&q0); SORT_NAME(&q1); /* Merge. */ *l_hook = 0; while (q0 || q1) { /* determine the list with the smallest head element */ struct SORT_STRUCT **h_hook = ( q0 == 0 ? &q1 : q1 == 0 ? &q0 : SORT_BEFORE((q0), (q1)) ? &q0 : &q1 ); /* detach head element */ struct SORT_STRUCT *l = *h_hook; *h_hook = (*h_hook)->SORT_NEXT; l->SORT_NEXT = 0; /* append l to l_hook */ *l_hook = l; l_hook = &l->SORT_NEXT; } } similarity-tester-2.89.orig/README0000644000000000000000000000466412540503627013671 0ustar # This file is part of the software similarity tester SIM. # Written by Dick Grune, Vrije Universiteit, Amsterdam. # $Id: README,v 2.17 2015-04-29 18:18:21 dick Exp $ These programs test for similar or equal stretches in one or more program or text files and can be used to detect common code or plagiarism. See sim.pdf. Checkers are available for C, Java, Pascal, Modula-2, Lisp, Miranda and natural language text. >>>> NEW, Apr 2015: - better percentage computation >>>> NEW, Jan 2014: - 64-bit compatible - works also on 32-bit machines with software 64-bit emulator - accepts | as new-old separator >>>> NEW, June 6, 2012: - greatly improved percentage computation - increased resolution, reducing false positives in sim_text - // comments in C recognized - characters 0200-0377 accepted in sim_text - s p a c e d w o r d s recognized in sim_text - UNICODE file names accepted - manual page in PDF ==== To install on any system with gcc, flex, cp, ln, echo, rm, and wc, or their equivalents, for example UNIX/Linux or MSDOS+MinGW: Unpack the archive sim_2_*.zip To compile and test, edit the Makefile to fit the local situation, and call: make test This will generate one executable called sim_c, the checker for C, and will run two small tests to show sample output. To install, examine the Makefile, edit BINDIR and MAN1DIR to sensible paths, and call make install To change the default run size or the page width, adjust the file settings.par and recompile. ==== To install on MSDOS, if you don't have a C compiler, the archive sim_exe_2_*.zip contains: SIM_C.EXE similarity tester for C SIM_JAVA.EXE similarity tester for Java SIM_PASC.EXE similarity tester for Pascal SIM_M2.EXE similarity tester for Modula-2 SIM_LISP.EXE similarity tester for Lisp SIM_MIRA.EXE similarity tester for Miranda SIM_TEXT.EXE similarity tester for text ==== To extend: To add another language L, write a file Llang.l along the lines of clang.l and the other *lang.l files, extend the Makefile and recompile. All knowledge about a given language L is located in Llang.l; the rest of the programs expect each token to be a 16-bit character. Available at present: clang.l javalang.l pascallang.l m2lang.llisplang.l miralang.l textlang.l Dick Grune email: dick@dickgrune.com http://www.dickgrune.com similarity-tester-2.89.orig/tokenarray.c0000644000000000000000000000243312540503627015324 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: tokenarray.c,v 1.16 2015-01-12 09:16:13 dick Exp $ */ #include #include #include "error.h" #include "Malloc.h" #include "token.h" #include "lang.h" #include "tokenarray.h" #define TK_START 16384 /* initial token array size */ Token *Token_Array; /* to be filled by Malloc() */ static size_t tk_size; /* size of Token_Array[] */ static size_t tk_free; /* next free position in Token_Array[]*/ void Init_Token_Array(void) { if (Token_Array) Free(Token_Array); tk_size = TK_START; Token_Array = (Token *)Malloc(sizeof (Token) * tk_size); tk_free = 1; /* don't use position 0 */ } void Store_Token(Token tk) { if (tk_free == tk_size) { /* allocated array is full; try to increase its size */ size_t new_size = tk_size + tk_size/2; if (new_size < tk_free) fatal("out of address space"); Token *new_array = (Token *)TryRealloc( (char *)Token_Array, sizeof (Token) * new_size ); if (!new_array) { /* we failed */ fatal("out of memory"); } Token_Array = new_array, tk_size = new_size; } /* now we are sure there is room enough */ Token_Array[tk_free++] = tk; } size_t Token_Array_Length(void) { return tk_free; } similarity-tester-2.89.orig/stream.h0000644000000000000000000000072212540503627014444 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: stream.h,v 2.7 2012-06-08 16:04:30 dick Exp $ */ /* Interface of the stream module. Implements the direct interaction with the lexical module. It supplies the routines below. */ extern int Open_Stream(const char *); extern int Next_Stream_Token_Obtained(void); extern void Close_Stream(void); extern void Print_Stream(const char *fname); similarity-tester-2.89.orig/newargs.c0000644000000000000000000000620212540503627014611 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: newargs.c,v 2.9 2013-04-28 16:30:41 dick Exp $ */ #include #include "sim.h" #include "ForEachFile.h" #include "Malloc.h" #include "error.h" #include "newargs.h" #define ARGS_INCR 1024 static char *args; static size_t args_free; static size_t args_size; static void init_args(void) { args = 0; args_free = 0; args_size = 0; } static void add_char_to_args(char ch) { if (args_free == args_size) { /* allocated array is full; increase its size */ size_t new_size = args_size + ARGS_INCR; char *new_args = (char *)Realloc( (char *)args, sizeof (char *) * new_size ); args = new_args, args_size = new_size; } /* now we are sure there is room enough */ args[args_free++] = ch; } static void add_string_to_args(const Fchar *fn) { while (*fn) { add_char_to_args(*fn++); } add_char_to_args('\n'); } static char * std_input(void) { /* in the form (name \n)* \0 */ /* get all of standard input */ int ch; int last_char = '\n'; while (ch = getchar(), ch != EOF) { /* omit duplicate layout (= empty name) */ if (last_char == '\n' && ch == '\n') continue; add_char_to_args((char)ch); last_char = ch; } add_char_to_args('\0'); /* make sure the result conforms to the form above */ if (args[args_free-2] != '\n') fatal("standard input not terminated with newline"); return args; } static int n_names(const char *s) { int cnt = 0; while (*s) { if (*s == '\n') { cnt++; } s++; } return cnt; } static const char ** new_argv(int argc, char *args) { /* converts the layout in args to \0, and constructs an argv list */ const char **argv = (const char **)Malloc((size_t)(argc+1) * sizeof (char *)); char *p = args; char last_char = '\n'; argc = 0; while (*p) { if (last_char == '\n') { /* here a new name starts */ argv[argc++] = p; } last_char = *p; if (*p == '\n') { *p = '\0'; } p++; } argv[argc] = 0; return argv; } void get_new_std_input_args(int *argcp, const char **argvp[]) { init_args(); char *n_args = std_input(); int argc = n_names(n_args); const char **argv = new_argv(argc, n_args); *argcp = argc, *argvp = argv; } static void register_file(const Fchar *fn, const char *msg, const struct stat *fs) { if (msg) { fprintf(stderr, "could not handle file %s: %s\n", fn, msg); return; } if ( /* it is a non-empty regular file */ S_ISREG(fs->st_mode) && fs->st_size > 0 ) { add_string_to_args(fn); } } static char * recursive_args(int argc, const char *argv[]) { if (argc == 0) { ForEachFile(str2Fname("."), register_file); } else { int i; for (i = 0; i < argc; i++) { const char *arg = argv[i]; const Fchar *Farg = str2Fname(arg); if (is_new_old_separator(arg)) { add_string_to_args(Farg); } else { ForEachFile(Farg, register_file); } } } add_char_to_args('\0'); return args; } void get_new_recursive_args(int *argcp, const char **argvp[]) { init_args(); char *n_args = recursive_args(*argcp, *argvp); int argc = n_names(n_args); const char **argv = new_argv(argc, n_args); *argcp = argc, *argvp = argv; } similarity-tester-2.89.orig/compare.h0000644000000000000000000000055012540503627014576 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: compare.h,v 1.3 2012-05-16 07:56:05 dick Exp $ */ /* Compares each new text to the appropriate texts. Stores the runs found in the AISO heap. Runs contain references to positions in the input files. */ extern void Compare_Files(void); similarity-tester-2.89.orig/pass3.c0000644000000000000000000001604212540503627014177 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: pass3.c,v 2.22 2014-01-26 21:52:59 dick Exp $ */ #include #include #include "system.par" #include "debug.par" #include "sim.h" #include "text.h" #include "token.h" #include "runs.h" #include "Malloc.h" #include "error.h" #include "options.h" #include "pass3.h" #include "percentages.h" #ifdef DB_RUN #include "tokenarray.h" static void db_run(const struct run *); #endif static FILE *open_chunk(const struct chunk *); static void fill_line(FILE *, char []); static void clear_line(char []); static void show_run(const struct run *); static void show_2C_line(const char [], const char []); static void show_1C_line(FILE *, const char *); static int pr_head(const struct chunk *); static int prs(const char *); static int pru(size_t); static int unslen(size_t); static int max_line_length; /* Actual maximum line length */ static char *line0; /* by Malloc() */ static char *line1; void Show_Runs(void) { AisoIter iter; struct run *run; #ifdef DB_RUN fprintf(Debug_File, "Starting Show_Runs()\n"); #endif /* DB_RUN */ max_line_length = Page_Width / 2 - 2; line0 = Malloc((size_t)(max_line_length + 1) * sizeof (char)); line1 = Malloc((size_t)(max_line_length + 1) * sizeof (char)); OpenIter(&iter); while (GetAisoItem(&iter, &run)) { #ifdef DB_RUN db_run(run); #endif /* DB_RUN */ show_run(run); fprintf(Output_File, "\n"); } CloseIter(&iter); Free(line0); line0 = 0; Free(line1); line1 = 0; } static void show_run(const struct run *run) { /* The animals came in two by two ... */ const struct chunk *cnk0 = &run->rn_chunk0; const struct chunk *cnk1 = &run->rn_chunk1; size_t nl_cnt0 = cnk0->ch_last.ps_nl_cnt - cnk0->ch_first.ps_nl_cnt; size_t nl_cnt1 = cnk1->ch_last.ps_nl_cnt - cnk1->ch_first.ps_nl_cnt; FILE *f0; FILE *f1; /* display heading of chunk */ if (!is_set_option('d')) { /* no assumptions about the lengths of the file names! */ size_t size = run->rn_size; int pos = 0; pos += pr_head(cnk0); while (pos < max_line_length + 1) { pos += prs(" "); } pos += prs("|"); pos += pr_head(cnk1); while (pos < 2*max_line_length - unslen(size)) { pos += prs(" "); } fprintf(Output_File, "[%s]\n", size_t2string(size)); } else { (void)pr_head(cnk0); fprintf(Output_File, "\n"); (void)pr_head(cnk1); fprintf(Output_File, "\n"); } /* stop if that suffices */ if (is_set_option('n')) return; /* ... had enough so soon ... 
*/ /* open the files that hold the chunks */ f0 = open_chunk(cnk0); f1 = open_chunk(cnk1); /* display the chunks in the required format */ if (!is_set_option('d')) { /* fill 2-column lines and print them */ while (nl_cnt0 != 0 || nl_cnt1 != 0) { if (nl_cnt0) { fill_line(f0, line0); nl_cnt0--; } else { clear_line(line0); } if (nl_cnt1) { fill_line(f1, line1); nl_cnt1--; } else { clear_line(line1); } show_2C_line(line0, line1); } } else { /* display the lines in a diff(1)-like format */ while (nl_cnt0--) { show_1C_line(f0, "<"); } fprintf(Output_File, "---\n"); while (nl_cnt1--) { show_1C_line(f1, ">"); } } /* close the pertinent files */ fclose(f0); fclose(f1); } static int pr_head(const struct chunk *cnk) { int pos = 0; pos += prs(cnk->ch_text->tx_fname); pos += prs(": line "); pos += pru(cnk->ch_first.ps_nl_cnt); pos += prs("-"); pos += pru(cnk->ch_last.ps_nl_cnt - 1); return pos; } static int prs(const char *str) { fprintf(Output_File, "%s", str); return (int) strlen(str); } static int pru(size_t u) { fprintf(Output_File, "%s", size_t2string(u)); return unslen(u); } static int unslen(size_t u) { int res = 1; while (u > 9) { u /= 10, res++; } return res; } static FILE * open_chunk(const struct chunk *cnk) { /* Opens the file in which the chunk resides, positions the file at the beginning of the chunk and returns the file pointer. Note that we use fopen() here, which opens a character stream, rather than Open_Text(), which opens a token stream. */ const char *fname = cnk->ch_text->tx_fname; FILE *f = fopen(fname, "r"); size_t nl_cnt; if (!f) { fprintf(stderr, ">>>> File %s disappeared <<<<\n", fname); f = fopen(NULLFILE, "r"); } nl_cnt = cnk->ch_first.ps_nl_cnt; while (nl_cnt > 1) { int ch = getc(f); if (ch < 0) break; if (ch == '\n') { nl_cnt--; } } return f; } static void fill_line(FILE *f, char ln[]) { /* Reads one line from f and puts it in condensed form in ln. 
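Leading white space is condensed: every eight leading blanks (a tab counts as eight) are replaced by a single blank. In the rest of the line tabs become single blanks, and the text is truncated to max_line_length characters and always null-terminated.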
*/ int indent = 0, lpos = 0; int ch; /* condense and skip initial blank */ while ((ch = getc(f)), ch == ' ' || ch == '\t') { if (ch == '\t') { indent = 8; } else { indent++; } if (indent == 8) { /* every eight blanks give one blank */ if (lpos < max_line_length) { ln[lpos++] = ' '; } indent = 0; } } /* store the rest */ while (ch >= 0 && ch != '\n') { if (ch == '\t') { /* replace tabs by blanks */ ch = ' '; } if (lpos < max_line_length) { ln[lpos++] = (char) ch; } ch = getc(f); } ln[lpos] = '\0'; /* always room for this one */ } static void clear_line(char ln[]) { /* a simple null byte will suffice */ ln[0] = '\0'; } static void show_2C_line(const char ln0[], const char ln1[]) { /* displays the contents of the two lines in a two-column format */ int i; for (i = 0; i < max_line_length && ln0[i] != '\0'; i++) { fputc(ln0[i], Output_File); } for (; i < max_line_length; i++) { fputc(' ', Output_File); } fprintf(Output_File, " |"); for (i = 0; i < max_line_length && ln1[i] != '\0'; i++) { fputc(ln1[i], Output_File); } fprintf(Output_File, "\n"); } static void show_1C_line(FILE *f, const char *marker) { /* displays one line from f, preceded by the marker */ int ch; fprintf(Output_File, "%s", marker); while ((ch = getc(f)), ch > 0 && ch != '\n') { fputc(ch, Output_File); } fputc('\n', Output_File); } #ifdef DB_RUN static void db_chunk(const struct chunk *); static void db_run(const struct run *run) { /* prints detailed data about a run */ const struct chunk *cnk0 = &run->rn_chunk0; const struct chunk *cnk1 = &run->rn_chunk1; db_run_info(0, run, 1); db_chunk(cnk0); db_chunk(cnk1); } static void db_chunk(const struct chunk *cnk) { /* print the tokens in the chunk, with a one-char margin */ size_t i; const struct position *first = &cnk->ch_first; const struct position *last = &cnk->ch_last; size_t start = cnk->ch_text->tx_start; if (first->ps_tk_cnt > 0) { fprintf(Debug_File, "..."); fprint_token(Debug_File, Token_Array[start + first->ps_tk_cnt - 1]); fprintf(Debug_File, " "); } else { /* create same offset as above */ fprintf(Debug_File, " "); } for (i = first->ps_tk_cnt; i <= last->ps_tk_cnt; i++) { fprintf(Debug_File, " "); fprint_token(Debug_File, Token_Array[start + i]); } if (start + last->ps_tk_cnt + 1 < cnk->ch_text->tx_limit) { fprintf(Debug_File, " "); fprint_token(Debug_File, Token_Array[start + last->ps_tk_cnt + 1]); fprintf(Debug_File, "..."); } fprintf(Debug_File, "\n"); } #endif /* DB_RUN */ similarity-tester-2.89.orig/error.h0000644000000000000000000000031612540503627014301 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: error.h,v 1.3 1998/02/03 14:28:23 dick Exp $ */ extern void fatal(const char *msg); similarity-tester-2.89.orig/percentages.h0000644000000000000000000000040312540503627015445 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: percentages.h,v 1.4 2012-06-05 09:58:54 dick Exp $ */ extern void add_to_percentages(struct run *r); extern void Show_Percentages(void); similarity-tester-2.89.orig/algollike.c0000644000000000000000000000654612540503627015121 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: algollike.c,v 2.10 2013-04-28 16:30:40 dick Exp $ */ #include "options.h" #include "error.h" #include "token.h" #include "algollike.h" /* Arrays for fast identification tests for tokens. Each token is identified by its position in the set + 1. 
For example, if tk is the n-th Opener, openers[Token2int(tk)] == n+1. */ static char non_finals[N_REGULAR_TOKENS]; static char non_initials[N_REGULAR_TOKENS]; static char openers[N_REGULAR_TOKENS]; static char closers[N_REGULAR_TOKENS]; /* Init_Language */ static void cvt2bittable(const Token *tl, char bt[]) { /* assumes bt[] is cleared */ int i; int cnt = 1; for (i = 0; !Token_EQ(tl[i], No_Token); i++) { int index = Token2int(tl[i]); if (index < 0 || index >= N_REGULAR_TOKENS) fatal("internal error: bad Token list"); bt[index] = cnt++; } } void Init_Algol_Language( const Token Non_Finals[], const Token Non_Initials[], const Token Openers[], const Token Closers[] ) { /* convert the token sets to bitmaps for speed-up */ cvt2bittable(Non_Initials, non_initials); cvt2bittable(Non_Finals, non_finals); cvt2bittable(Openers, openers); cvt2bittable(Closers, closers); } /* May_Be_Start_Of_Run */ static int pos_in_set(const char set[], const Token tk) { if (!is_regular_token(tk)) return 0; return set[Token2int(tk)]; } int May_Be_Start_Of_Algol_Run(const Token tk) { return pos_in_set(non_initials, tk) == 0; } /* Best_Run_Size */ static size_t largest_routine(const Token *tk_array, size_t size) { /* Returns the size of the longest sequence starting at tk_array[0] and not containing unbalanced parentheses. Does not check the nesting of the parentheses, but then, sim is syntax-free anyway. */ size_t mrb_size = 0; /* most recent balancing size */ size_t pos; int i; int balance_count[N_REGULAR_TOKENS]; /* Overkill: only a fraction of the tokens are balancers; oh well. */ int n_imbalances; /* clear administration */ n_imbalances = 0; for (i = 0; i < N_REGULAR_TOKENS; i++) { balance_count[i] = 0; } /* scan tk_array[] and see how far we get */ for (pos = 0; pos < size; pos++) { Token tk = tk_array[pos]; int pp; /* parenthesis position */ /* account for openers */ if ((pp = pos_in_set(openers, tk))) { if (balance_count[pp] == 0) { /* about to create an imbalance */ n_imbalances++; } balance_count[pp]++; } /* account for closers */ if ((pp = pos_in_set(closers, tk))) { if (balance_count[pp] == 0) { /* this is one Closer too many */ return mrb_size; } balance_count[pp]--; if (balance_count[pp] == 0) { /* we just cleared an imbalance */ n_imbalances--; } } if (n_imbalances == 0) { /* register the balance point */ mrb_size = pos + 1; } } return mrb_size; } size_t Best_Algol_Run_Size(const Token *tk_array, size_t size) { /* Checks the run starting at tk_array[0] with length size for acceptability in the language. Cuts from the end if necessary and returns the accepted length, which may be zero. */ if (is_set_option('f')) { /* reduce to a routine-like form first */ size = largest_routine(tk_array, size); } while ( /* there is trailing garbage */ size != 0 && pos_in_set(non_finals, tk_array[size-1]) ) { /* remove it */ size--; } return size; } similarity-tester-2.89.orig/add_run.c0000644000000000000000000000305112540503627014556 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: add_run.c,v 2.13 2013-04-28 16:30:39 dick Exp $ */ #include "sim.h" #include "debug.par" #include "text.h" #include "runs.h" #include "percentages.h" #include "Malloc.h" #include "options.h" #include "error.h" #include "add_run.h" static void set_chunk( struct chunk *, struct text *, size_t, size_t ); static void set_pos( struct position *, int, struct text *, size_t ); void add_run(struct text *txt0, size_t i0, struct text *txt1, size_t i1, size_t size ) { /* Adds the run of given size to our collection. */ struct run *r = new(struct run); set_chunk(&r->rn_chunk0, txt0, i0 - txt0->tx_start, size); set_chunk(&r->rn_chunk1, txt1, i1 - txt1->tx_start, size); r->rn_size = size; #ifdef DB_RUN db_run_info("Added", r, 0); #endif /* DB_RUN */ if (is_set_option('p')) { add_to_percentages(r); } else { add_to_runs(r); } } static void set_chunk(struct chunk *cnk, struct text *txt, size_t start, size_t size ) { /* Fill the chunk *cnk with info about the piece of text in txt starting at start extending over size tokens. */ cnk->ch_text = txt; set_pos(&cnk->ch_first, 0, txt, start); set_pos(&cnk->ch_last, 1, txt, start + size - 1); } static void set_pos(struct position *pos, int type, struct text *txt, size_t start) { /* Fill a single struct position */ pos->ps_next = txt->tx_pos; txt->tx_pos = pos; pos->ps_type = type; pos->ps_tk_cnt = start; pos->ps_nl_cnt = (size_t) -1; /* uninitialized */ } similarity-tester-2.89.orig/textlang.l0000644000000000000000000000305412540503627015004 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: textlang.l,v 1.17 2015-01-22 20:54:31 dick Exp $ */ /* Text front end for the similarity tester. */ #include "sim.h" #include "token.h" #include "idf.h" #include "lex.h" #include "lang.h" #include "language.h" /* General language front end data */ Token lex_token; size_t lex_nl_cnt; size_t lex_tk_cnt; size_t lex_non_ascii_cnt; /* Language-dependent code */ void Init_Language(void) { if (is_set_option('f') || is_set_option('F')) fatal("options -f or -F not applicable in sim_text"); token_name = "word"; if (!min_run_string) { Min_Run_Size = 8; } if (!threshold_string) { Threshold_Percentage = 20; } } /*ARGSUSED*/ int May_Be_Start_Of_Run(Token tk) { /* any token is acceptable */ return 1; } /*ARGSUSED*/ size_t Best_Run_Size(const Token *str, size_t size) { /* any run size is acceptable */ return size; } static Token word2token(char *word) { /* ignore case */ lower_case(word); return idf_hashed(word); } %} %option noyywrap WordElem ([a-zA-Z0-9\200-\377]) TightWord ({WordElem}+) NonWordElem ([^a-zA-Z0-9\200-\377]) LooseElem ({WordElem}(" ")) SpacedWord ({LooseElem}+{WordElem}) %% {TightWord} { return_tk(word2token(yytext)); } {SpacedWord}/{NonWordElem} { /* the / operator works at the top level only */ return_tk(word2token(yytext)); } \n { /* count newlines */ return_eol(); } . { /* ignore the rest */ } %% /* More language-dependent code */ void yystart(void) { BEGIN INITIAL; } similarity-tester-2.89.orig/algollike.h0000644000000000000000000000251012540503627015111 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: algollike.h,v 1.7 2013-04-28 16:30:40 dick Exp $ */ /* The class Algollike is a subclass of Language. 
It implements the routines void Init_Algol_Language() int May_Be_Start_Of_Algol_Run() and size_t Best_Algol_Run_Size() for ALGOL-like languages, languages in which it is meaningful and useful to isolate function bodies. These routines can be used in Init_Language(), May_Be_Start_Of_Run(), and Best_Run_Size(), required by language.h . It requires the user to define four token sets, represented as Token set[] and terminated by No_Token: Token Non_Finals[] tokens that may not end a chunk Token Non_Initials[] tokens that may not start a chunk Token Openers[] openers of parentheses that must balance in functions Token Closers[] the corresponding closers, in the same order These must be passed to Init_Algol_Language(), in the above order. */ extern void Init_Algol_Language( const Token Non_Finals[], const Token Non_Initials[], const Token Openers[], const Token Closers[] ); /* note the order of the arguments: Non_Finals ~ Openers, etc. */ extern int May_Be_Start_Of_Algol_Run(Token ch); extern size_t Best_Algol_Run_Size(const Token *str, size_t size); similarity-tester-2.89.orig/add_run.h0000644000000000000000000000133212540503627014563 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: add_run.h,v 1.4 2013-04-28 16:30:39 dick Exp $ */ /* Interface between front-end and back-end: all information about runs passes through add_run(). Its parameters are the two chunks, each identified by their struct text and the position of the common segment in Token_Array[], and the number of tokens in the common segment. */ extern void add_run( struct text *txt0, /* text of first chunk */ size_t i0, /* chunk position in Token_Array[] */ struct text *txt1, /* text of second chunk */ size_t i1, /* chunk position in Token_Array[] */ size_t size /* number of tokens in the chunk */ ); similarity-tester-2.89.orig/sortlist.spc0000644000000000000000000000404412540503627015373 0ustar /* Module: Sort Linked Lists Author: dick@cs.vu.nl (Dick Grune @ Vrije Universiteit, Amsterdam) Version: 2015-01-18 Description: This is the specification part of a generic routine that sorts linked lists. The elements in the list are structs, each of which carries a pointer to the next element. Specification: The module supplies: - a routine void SORT_NAME(struct SORT_STRUCT **listhook) where 'listhook' is a pointer to the location that holds the pointer to the list to be sorted. Upon return, the list will be sorted, and the pointer updated. The routine will be defined static when instantiated inline. Instantiation, inline: For each struct list type T, specify: - a definition of SORT_STRUCT, the struct name of the linked structs; - a definition of SORT_NAME, the name of the resulting sort routine; - a definition of a routine int SORT_BEFORE( struct SORT_STRUCT *v, struct SORT_STRUCT *w ) or a definition #define SORT_BEFORE((v,w) which yields non-zero if v is to be sorted before w; - a definition of a field selector SORT_NEXT which names the field that points to the next struct SORT_STRUCT in the list. - #include "sortlist.bdy" Instantiation, separate: For each struct list type T, create a file sort_T.h which contains at least: - a definition of SORT_STRUCT, the struct name of the linked structs; - a definition of SORT_NAME, the name of the resulting sort routine; - #include "sortlist.spc" This file sort_T.h is to be included in all files that use the routine SORT_NAME. 
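For instance, a hypothetical sort_position.h for the struct position lists sorted in pass2.c (which actually uses the inline form with the same definitions) could consist of just:
	#define SORT_STRUCT	position
	#define SORT_NAME	sort_pos_list
	#include "sortlist.spc"
with the corresponding SORT_BEFORE and SORT_NEXT definitions placed in a matching sort_position.c that ends in #include "sortlist.bdy".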
For each struct list type T, create a file sort_T.c which contains at least: - #include "sort_T.h" - a definition of a routine or definition SORT_BEFORE as described above; - a definition of a field selector SORT_NEXT which names the field that points to the next struct SORT_STRUCT in the list; - #include "sortlist.bdy" This file sort_T.c compiles into the module object for SORT_STRUCT. Implementation: Recursive split-sort-merge. */ extern void SORT_NAME(struct SORT_STRUCT **); #define _SORT_EXTERN_DEFINED similarity-tester-2.89.orig/compare.c0000644000000000000000000001111712540503627014572 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: compare.c,v 2.21 2015-01-17 10:20:39 dick Exp $ */ #include "sim.h" #include "text.h" #include "token.h" #include "tokenarray.h" #include "hash.h" #include "language.h" #include "options.h" #include "add_run.h" #include "compare.h" #include "debug.par" static void compare_one_text(int, int, int); static size_t lcs( struct text *, size_t, size_t, size_t, struct text **, size_t * ); /* The overall structure of the routine Compare_Files() is: for all new files for all texts it must be compared to for all positions in the new file for all positions in the text for ever increasing sizes try to match and keep the best */ void Compare_Files(void) { int n; for ( /* all new texts */ n = 0; n < Number_of_New_Texts; n++ ) { int first = ( /* if compare to old only */ is_set_option('S') ? Number_of_New_Texts + 1 : /* else if do not compare to self */ is_set_option('s') ? n + 1 /* else */ : n ); if (is_set_option('e')) { /* from first to Number_of_Texts in steps */ int m; for (m = first; m < Number_of_Texts; m++) { compare_one_text(n, m, m+1); } } else { /* from first to Number_of_Texts in one action */ if (first < Number_of_Texts) { compare_one_text(n, first, Number_of_Texts); } } } } static void compare_one_text( int n, /* text to be compared */ int first, /* first text to be compared to */ int limit /* first text not to be compared to */ ) { size_t i_first = Text[first].tx_start; size_t i_limit = Text[limit-1].tx_limit; struct text *txt0 = &Text[n]; size_t i0 = txt0->tx_start; while ( /* there may be a useful substring */ i0 + Min_Run_Size <= txt0->tx_limit ) { /* see if there really is one */ struct text *txt_best; size_t i_best; size_t size_best = lcs(txt0, i0, i_first, i_limit, &txt_best, &i_best); if (size_best) { /* good run found; enter it */ add_run(txt0, i0, txt_best, i_best, size_best); /* and skip it */ i0 += size_best; } else { /* we try our luck at the next token */ i0++; } } } static size_t lcs( struct text *txt0, /* input: starting position */ size_t i0, size_t i_first, /* no comparison before this pos. */ size_t i_limit, /* no comparison after this pos. */ struct text **tx_bp, /* output: position of best run */ size_t *i_bp ) { /* Finds the longest common substring (not subsequence) in: txt0, starting precisely at i0 and the text from i_first to i_limit-1. Writes the position in tx_bp and i_bp and returns the size. Returns 0 if no common substring is found. */ struct text *txt1; size_t i1; size_t size_best = 0; for ( txt1 = txt0, i1 = i0; i1 && i1 < i_limit; i1 = Forward_Reference(i1) ) { if (i1 < i_first) { /* not in range */ continue; } size_t min_size= (size_best ? size_best+1 : Min_Run_Size); /* bump txt1; we may have to skip a text or two */ while (i1 >= txt1->tx_limit) { txt1++; } /* are we looking at something better than we have got? 
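That is, do at least min_size tokens match when this candidate position is lined up with i0? The test below compares backwards, starting at the min_size-th token (the first token a better-than-best run must reach); if any of these tokens differs the candidate is skipped, otherwise the match is extended forwards.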
*/ { /* comparing backwards */ size_t j0 = i0 + min_size - 1; size_t j1 = i1 + min_size - 1; if ( /* j0 still inside txt0 */ j0 < txt0->tx_limit && /* j1 still inside txt1 */ j1 < txt1->tx_limit && /* j0 and j1 don't overlap */ j0 + min_size <= j1 ) { /* there is room enough for a match */ size_t cnt = min_size; /* text matches for at least min_size tokens? */ while ( cnt && Token_EQ(Token_Array[j0], Token_Array[j1]) ) { cnt--, j0--, j1--; } if (cnt) continue; /* forget it */ } else continue; /* forget it */ } /* yes, we are; how long can we make it? */ size_t new_size = min_size; { /* extending forwards */ size_t j0 = i0 + min_size; size_t j1 = i1 + min_size; while ( /* j0 still inside txt0 */ j0 < txt0->tx_limit && /* j1 still inside txt1 */ j1 < txt1->tx_limit && /* j0 and j1 don't overlap */ j0 + new_size < j1 && /* tokens are the same */ Token_EQ(Token_Array[j0], Token_Array[j1]) ) { j0++, j1++, new_size++; } } /* offer the run to the Language Department which may reject it or may cut its tail */ new_size = ( May_Be_Start_Of_Run(Token_Array[i0]) ? Best_Run_Size(&Token_Array[i0], new_size) : 0 ); if ( /* we still have something acceptable */ new_size >= Min_Run_Size && /* it is better still than what we had */ new_size > size_best ) { /* record it */ *tx_bp = txt1; *i_bp = i1; size_best = new_size; } } return size_best; } similarity-tester-2.89.orig/ToDo0000644000000000000000000000421212540503627013566 0ustar - don't aiso and sortlist do the same thing? - some size_t are sizes, others are positions, indexes - start,limit -> start,length - Bool.h - min_run_string, thresh.. = clean up sim.[ch] - report runs as '... ...' (proper name for Retrieve_Runs()) - unify idf2token() in *lang.l - get rid of static forward references to routines; occurrences: egr static *.c | grep "(" 1 compare.c 7 hash.c 2 pass1.c 4 pass2.c 12 pass3.c 1 stream.c 3 text.c - lex_nl_cnt counts from 1; this requires small, complicating adjustments Done ================================================================ + command line parameter consistency + make sim_text case-indifferent? + sortlist.bdy by split-merge + get rid of the nl_buff mechanism. No, use 16 bits line length. + in hash.c, size_t -> uint64_t? No, unit32_t is just as good. + / misinterpreted by shell; | alternative + register - removed + Run hashing OK: average chain length = 1.5, for sim-ing the sources of MCD2 + Idf hashing OK: smooth distribution when sim-ing the sources of MCD2 + use two-byte tokens to obtain better resolution for sim_text and on -F option and UTF-8 (Johnson, Benjamin (US - Chicago)) + different defaults per program + cleaning up sim.c & names + Microsoft comment (// ... unescaped \n) + emails 2009-2011 (A = I answered, R= they replied) +AR Marcus Brinkmann, separate letters +AR Scott Kuhl, percentages +AR Yaroslav Halchenko, identifying non-existent lines +A Rumen Stefanov, UTF-8 +A Jonathan Martin, UTF-8 +AR UTF-8 (Johnson, Benjamin (US - Chicago)) + better structure between X.h and X.c + clean-up language.h and its sub-class algollike.h + warning in README to correct for non-MSDOS Rejected ================================================================ X remove Miranda X Mon Apr 11 13:23:41 1994: sim_orca X Thu May 13 23:02:46 1993: sim ook voor C++ en Ada X plug memory leaks (and still report memory usage with -M!) not worth the effort X db_ not protected by #ifdef but by compilation to a call to an (empty) routine 1. not conspicuous enough in the code; 2. 
impairs efficiency similarity-tester-2.89.orig/Answers0000644000000000000000000000413012540503627014342 0ustar The software and text similarity tester SIM SIM tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp, Miranda, and natural language. It is used - to detect potentially duplicated code fragments in large software projects, in program text but also in shell scripts and documentation; - to detect plagiarism in software projects, educational and otherwise. SIM is available through ftp. The directory ftp.cs.vu.nl:pub/dick/similarity_tester contains the sources (in C) and the MSDOS .EXEs. The software similarity tester is very efficient and allows us to compare this year's students' work with that collected from many past years (much to the dismay of some, mostly non-CS, students). Students are told in advance that their work is going to be compared, but some are non-believers ... The output of the similarity tester can be processed by a number of shell scripts by Matty Huntjens. These shell scripts take sim output and produce lists of suspect submissions, histograms and the like. The present version of these scripts is very much geared to the local situation at the Vrije Universiteit, though; they are low on portability. Matty Huntjens' email address is matty@cs.vu.nl. We are not afraid that students would try to tune their work to the similarity tester. We reckon if they can do that they can also do the exercise. Since this piece of handicraft does not qualify as research, there are no international papers on it. A paper, titled `Detecting copied submissions in computer science lab work', was published in a local (i.e. Dutch) computer science journal: %A Dick Grune %A Matty Huntjens %T Het detecteren van kopie\(:en bij informatica-practica %J Informatie (in Dutch) %V 31 %N 11 %D Nov 1989 %P 864-867 The ftp directory contains a terse technical report about the internal working of the program. Dick Grune Vrije Universiteit de Boelelaan 1081 1081 HV Amsterdam the Netherlands dick@cs.vu.nl +31 20 444 7744 ---------------------------------------------------------------- With infinitely many exceptions, what you do makes no difference. similarity-tester-2.89.orig/token.c0000644000000000000000000000531312540503627014265 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: token.c,v 2.12 2013-04-28 16:30:43 dick Exp $ */ /* Token interface, implementation part. */ #include #include "token.h" static int Token_in_range(const Token tk, int low, int high) { int tki = Token2int(tk); if (tki < low) return 0; if (tki > high) return 0; return 1; } static int check_and_print( FILE *ofile, const char *name, int ch, int low, int high, int offset ) { int ch1 = ch + offset; if (low <= ch1 && ch1 <= high) { fprintf(ofile, "%s(%c)", name, (char)ch1); return 1; } return 0; } #define is_simple_token(tk) (Token_in_range(tk, 0x0001, 0x00FF)) #define is_CTRL_token(tk) (Token_in_range(tk, 0x0101, 0x011E)) #define is_NORM_token(tk) (Token_in_range(tk, 0x0121, 0x017E)) #define is_MTCT_token(tk) (Token_in_range(tk, 0x0181, 0x019E)) #define is_META_token(tk) (Token_in_range(tk, 0x01A1, 0x01FE)) #define is_hashed_token(tk) (Token_in_range(tk, 0x0200, 0xFFFE)) void fprint_token(FILE *ofile, const Token tk) { /* Prints a regular token in two characters: normal char meta (bit 9 set) ^A cntl $A meta-cntl A printable #A meta and hashed tokens in hexadecimal. 
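/* The fragment below is compiled only when XXXX is defined; it appears to be an older two-character form of the token printing above. */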
*/ int tki = Token2int(tk); int ch = tki & 0x7F; int bit8 = tki & 0x80; if (Token_EQ(tk, No_Token)) {fprintf(ofile, "--"); return;} if (Token_EQ(tk, IDF)) {fprintf(ofile, "IDF"); return;} if (Token_EQ(tk, End_Of_Line)) {fprintf(ofile, "EOL"); return;} if (is_simple_token(tk)) { if ('!' <= ch && ch <= '~') { fprintf(ofile, "%s%c", (bit8 ? "8" : ""), ch); return; } if (0 < ch && ch <= ' ') { fprintf(ofile, "%s%c", (bit8 ? "$" : "^"), ch + '@'); return; } if (ch == 0x7F) { fprintf(ofile, "%s%c", (bit8 ? "$" : "^"), '?'); return; } } if (is_CTRL_token(tk)) { if (check_and_print(ofile, "CTRL", ch, 'A', '~', '@')) return; } if (is_NORM_token(tk)) { if (check_and_print(ofile, "NORM", ch, '!', '~', '\0')) return; } if (is_MTCT_token(tk)) { if (check_and_print(ofile, "MTCT", ch, 'A', '~', '@')) return; } if (is_META_token(tk)) { if (check_and_print(ofile, "META", ch, '!', '~', '\0')) return; } if (is_hashed_token(tk)) { fprintf(ofile, "0x%04x", tki); return; } /* gap token! */ fprintf(ofile, "!0x%04x!", tki); } #ifdef XXXX int ch = tki & 0177; int meta = tki & 0200; if (' ' <= ch && ch <= '~') { fprintf(ofile, "%c%c", (meta ? '#' : ' '), ch); } else { fprintf(ofile, "%c%c", (meta ? '$' : '^'), (ch == 0177 ? '?' : ch + '@') ); } #endif int Token_EQ(const Token t1, const Token t2) { /* to make sure Token_EQ is indeed called with two Token parameters */ return Token2int(t1) == Token2int(t2); } similarity-tester-2.89.orig/option-i.inp0000644000000000000000000000004412540503627015243 0ustar pass1.c pass2.c / pass3.c ../teckel