similarity-tester-2.70.orig/0000755000000000000000000000000012057640104012760 5ustar similarity-tester-2.70.orig/ForEachFile.h0000644000000000000000000000152511750732757015262 0ustar /* This file is part of the auxiliaries library. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: ForEachFile.h,v 1.6 2012-05-04 10:56:47 Gebruiker Exp $ */ #include "fname.h" #include #include extern void ForEachFile( const Fchar *fn, void (*proc)(const Fchar *fn, const char *msg, const struct stat *fs) ); extern void ForEachLocalFile( const Fchar *fn, void (*proc)(const Fchar *fn, const char *msg, const struct stat *fs) ); /* Each file reachable from fn is passed to the procedure proc, which is declared as: void proc(const Fchar *fn, const char *msg, const struct stat *fs) The file fn is reached; if msg != NULL, an error prevails the text of which is *msg; otherwise fs points to the stat buffer for fn. ForEachLocalFile() restricts itself to the directory fn and its local contents. */ similarity-tester-2.70.orig/idf.h0000644000000000000000000000157111764320437013707 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: idf.h,v 2.10 2012-06-08 06:52:15 Gebruiker Exp $ */ /* Idf module: Token idf_in_list(char *str, struct idf l[], sizeof l, Token dflt); looks up a keyword in a list of keywords l, represented as an array of struct idf, and returns its translation as a token; dflt is returned if the keyword is not found. Token idf_hashed(char *str); returns a token unequal to No_Token or End_Of_Line, derived from str through hashing */ /* the struct for keywords etc. 
*/ struct idf { char *id_tag; /* an interesting identifier */ Token id_tr; /* with its one-Token translation */ }; /* public functions */ extern Token idf_in_list( const char *str, const struct idf list[], unsigned int listsize, Token default_token ); extern Token idf_hashed(const char *str); similarity-tester-2.70.orig/debug.par0000644000000000000000000000110311754656626014570 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: debug.par,v 1.5 2012-05-16 07:56:06 Gebruiker Exp $ */ #undef DB_ALL #undef DB_FORW_REF /* print & check forward references */ #undef DB_TEXT /* print all text parts */ #undef DB_POS /* print positions in files */ #undef DB_NL_BUFF /* print the newline count buffer */ #undef DB_RUN /* print all run activity */ #if defined(lint) || defined(DB_ALL) #define DB_FORW_REF #define DB_TEXT #define DB_POS #define DB_NL_BUFF #define DB_RUN #endif similarity-tester-2.70.orig/clang.l0000644000000000000000000001244411764421214014231 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: clang.l,v 2.19 2012-06-08 16:04:28 Gebruiker Exp $ */ /* C language front end for the similarity tester. 
Author: Dick Grune */ #include "options.h" #include "token.h" #include "language.h" #include "algollike.h" #include "idf.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; unsigned int lex_nl_cnt; unsigned int lex_tk_cnt; unsigned int lex_non_ascii_cnt; /* Language-dependent data */ /* Data for module idf */ static const struct idf ppcmd[] = { {"define", META('d')}, {"else", META('e')}, {"endif", META('E')}, {"if", META('i')}, {"ifdef", META('I')}, {"ifndef", META('x')}, {"include", MTCT('I')}, {"line", META('l')}, {"undef", META('u')} }; static const struct idf reserved[] = { {"auto", NORM('a')}, {"break", NORM('b')}, {"case", NORM('c')}, {"char", NORM('C')}, {"continue", CTRL('C')}, {"default", NORM('d')}, {"do", NORM('D')}, {"double", CTRL('D')}, {"else", NORM('e')}, {"enum", NORM('E')}, {"extern", CTRL('E')}, {"float", NORM('f')}, {"for", NORM('F')}, {"goto", NORM('g')}, {"if", NORM('i')}, {"int", NORM('I')}, {"long", NORM('l')}, {"register", No_Token}, {"return", NORM('r')}, {"short", NORM('s')}, {"sizeof", NORM('S')}, {"static", CTRL('S')}, {"struct", META('s')}, {"switch", META('S')}, {"typedef", NORM('t')}, {"union", NORM('u')}, {"unsigned", NORM('U')}, {"void", No_Token}, {"while", NORM('w')} }; /* Special treatment of identifiers */ static Token idf2token(int hashing) { Token tk; tk = idf_in_list(yytext, reserved, sizeof reserved, IDF); if (Token_EQ(tk, IDF) && hashing) { /* return a one-Token hash code */ tk = idf_hashed(yytext); } return tk; } /* Token sets for module algollike */ static const Token Non_Finals[] = { IDF, /* identifier */ NORM('{'), NORM('('), NORM('a'), /* auto */ NORM('b'), /* break */ NORM('c'), /* case */ NORM('C'), /* char */ CTRL('C'), /* continue */ NORM('d'), /* default */ NORM('D'), /* do */ CTRL('D'), /* double */ NORM('E'), /* enum */ CTRL('E'), /* extern */ NORM('f'), /* float */ NORM('F'), /* for */ NORM('g'), /* goto */ NORM('i'), /* if */ NORM('I'), /* int */ NORM('l'), /* long */ 
NORM('r'), /* return */ NORM('s'), /* short */ CTRL('S'), /* static */ META('s'), /* struct */ META('S'), /* switch */ NORM('t'), /* typedef */ NORM('u'), /* union */ NORM('U'), /* unsigned */ NORM('w'), /* while */ No_Token }; static const Token Non_Initials[] = { NORM(')'), NORM('}'), NORM(';'), No_Token }; static const Token Openers[] = { NORM('{'), NORM('('), NORM('['), No_Token }; static const Token Closers[] = { NORM('}'), NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } unsigned int Best_Run_Size(const Token *str, unsigned int size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) AnyQuoted (\\.) StrChar ([^"\n\\]|{AnyQuoted}) ChrChar ([^'\n\\]|{AnyQuoted}) StartComment ("/*") EndComment ("*/") SafeComChar ([^*\n]) UnsafeComChar ("*") MSComment ("//"{MSCommentChar}*) MSCommentChar ([^\n]) Digit ([0-9a-fA-F]) Idf ([A-Za-z][A-Za-z0-9_]*) %% {StartComment} { /* We do not have one single pattern to match a comment (although one can be written), for two reasons. The matched string might overflow lex-internal buffers like yysbuf and yytext; and the pattern would be very complicated and impair maintainability. So we break up the string into safe chunks and keep track of where we are in a start condition . 
*/ BEGIN Comment; } {SafeComChar}+ { /* safe comment chunk */ } {UnsafeComChar} { /* unsafe char, read one by one */ } "\n" { /* to break up long comments */ return_eol(); } {EndComment} { /* end-of-comment */ BEGIN INITIAL; } {MSComment} { /* ignore */ } \"{StrChar}*\" { /* strings */ return_ch('"'); } \'{ChrChar}+\' { /* characters */ return_ch('\''); } ^#{Layout}*include.* { /* ignore #include lines */ } ^#{Layout}*{Idf} { /* a preprocessor line */ char *idf = yytext+1; /* skip layout in front of preprocessor identifier */ while (*idf == ' ' || *idf == '\t') { idf++; } return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#'))); } (0x)?{Digit}+("l"|"L")? { /* numeral, passed as an identifier */ return_tk(IDF); } {Idf}/"(" { /* identifier in front of ( */ Token tk; tk = idf2token(is_set_option('F')); if (!Token_EQ(tk, No_Token)) return_tk(tk); } {Idf} { /* identifier */ Token tk; tk = idf2token(0 /* no hashing */); if (!Token_EQ(tk, No_Token)) return_tk(tk); } \; { /* semicolon, conditionally ignored */ if (is_set_option('f')) return_ch(yytext[0]); } \n { /* count newlines */ return_eol(); } {Layout} { /* ignore layout */ } {ASCII95} { /* copy other text */ return_ch(yytext[0]); } . { /* count non-ASCII chars */ lex_non_ascii_cnt++; } %% /* More language-dependent code */ void yystart(void) { BEGIN INITIAL; } similarity-tester-2.70.orig/newargs.c0000644000000000000000000000615612055474360014611 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: newargs.c,v 2.8 2012-11-28 20:49:52 Gebruiker Exp $ */ #include #include "sim.h" #include "ForEachFile.h" #include "Malloc.h" #include "error.h" #include "newargs.h" #define ARGS_INCR 1024 static char *args; static int args_free; static int args_size; static void init_args(void) { args = 0; args_free = 0; args_size = 0; } static void add_char_to_args(char ch) { if (args_free == args_size) { /* allocated array is full; increase its size */ int new_size = args_size + ARGS_INCR; char *new_args = (char *)Realloc( (char *)args, sizeof (char *) * new_size ); args = new_args, args_size = new_size; } /* now we are sure there is room enough */ args[args_free++] = ch; } static void add_string_to_args(const Fchar *fn) { while (*fn) { add_char_to_args(*fn++); } add_char_to_args('\n'); } static char * std_input(void) { /* in the form (name \n)* \0 */ /* get all of standard input */ int ch; int last_char = '\n'; while (ch = getchar(), ch != EOF) { /* omit duplicate layout (= empty name) */ if (last_char == '\n' && ch == '\n') continue; add_char_to_args(ch); last_char = ch; } add_char_to_args('\0'); /* make sure the result conforms to the form above */ if (args[args_free-2] != '\n') fatal("standard input not terminated with newline"); return args; } static int n_names(const char *s) { int cnt = 0; while (*s) { if (*s == '\n') { cnt++; } s++; } return cnt; } static const char ** new_argv(int argc, char *args) { /* converts the layout in args to \0, and constructs an argv list */ const char **argv = (const char **)Malloc((argc+1) * sizeof (char *)); char *p = args; char last_char = '\n'; argc = 0; while (*p) { if (last_char == '\n') { /* here a new name starts */ argv[argc++] = p; } last_char = *p; if (*p == '\n') { *p = '\0'; } p++; } argv[argc] = 0; return argv; } void get_new_std_input_args(int *argcp, const char **argvp[]) { init_args(); char *n_args = std_input(); int argc = n_names(n_args); const char **argv = new_argv(argc, n_args); *argcp = argc, *argvp = argv; } 
static void register_file(const Fchar *fn, const char *msg, const struct stat *fs) { if (msg) { fprintf(stderr, "could not handle file %s: %s\n", fn, msg); return; } if ( /* it is a non-empty regular file */ S_ISREG(fs->st_mode) && fs->st_size > 0 ) { add_string_to_args(fn); } } static char * recursive_args(int argc, const char *argv[]) { if (argc == 0) { ForEachFile(str2Fname("."), register_file); } else { int i; for (i = 0; i < argc; i++) { const char *arg = argv[i]; const Fchar *Farg = str2Fname(arg); if (is_new_old_separator(arg)) { add_string_to_args(Farg); } else { ForEachFile(Farg, register_file); } } } add_char_to_args('\0'); return args; } void get_new_recursive_args(int *argcp, const char **argvp[]) { init_args(); char *n_args = recursive_args(*argcp, *argvp); int argc = n_names(n_args); const char **argv = new_argv(argc, n_args); *argcp = argc, *argvp = argv; } similarity-tester-2.70.orig/percentages.c0000644000000000000000000000657011763354135015445 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: percentages.c,v 1.13 2012-06-05 09:58:53 Gebruiker Exp $ */ #include #include "sim.h" #include "text.h" #include "runs.h" #include "options.h" #include "Malloc.h" #include "error.h" #include "percentages.h" /* To compute percentages fairly, the input files are read twice. This makes it impossible to use the struct text-s from the presented run as identifications of the files, since their order differs between the first and the second scan. Specific entries from the struct text-s are stored instead. 
*/ struct match { struct match *ma_next; const char *ma_fname0; const char *ma_fname1; unsigned int ma_size; /* # tokens of file 0 found in file 1 */ unsigned int ma_size0; /* # tokens in file 0 */ }; static struct match *match_start; /* to be allocated by new() */ void add_to_percentages(struct run *r) { struct match **match_hook = &match_start; /* percentages are only meaningful between different files */ if (r->rn_chunk0.ch_text == r->rn_chunk1.ch_text) return; /* look (text0, text1) combination up in match list */ while (*match_hook) { struct match *m = *match_hook; if ( m->ma_fname0 == r->rn_chunk0.ch_text->tx_fname && m->ma_fname1 == r->rn_chunk1.ch_text->tx_fname ) { /* found it; now update it */ m->ma_size += r->rn_size; return; } match_hook = &m->ma_next; } { /* it's not there; make a new entry */ struct match *m = *match_hook = new(struct match); struct text *text0 = r->rn_chunk0.ch_text; struct text *text1 = r->rn_chunk1.ch_text; m->ma_next = 0; m->ma_fname0 = text0->tx_fname; m->ma_fname1 = text1->tx_fname; m->ma_size = r->rn_size; m->ma_size0 = text0->tx_limit - text0->tx_start; } } static float match_percentage(struct match *m) { return (m->ma_size*1.0/m->ma_size0); } /* We want the sorting order all contributors of the file with the highest percentage all contributors of the file with the next lower percentage etc. but this order cannot be specified by a single SORT_BEFORE(). So we sort for percentage, and then reorder during printing. 
*/ /* instantiate sort_match_list(struct match **listhook) */ #define SORT_STRUCT match #define SORT_NAME sort_match_list #define SORT_BEFORE(p1,p2) (match_percentage(p1) > match_percentage(p2)) #define SORT_NEXT ma_next #include "sortlist.bdy" static void print_perc_info(struct match *m) { int mp = match_percentage(m)*100.0; if (mp > 100) { /* this may result from overlapping matches */ mp = 100; } if (mp >= Threshold_Percentage) { fprintf(Output_File, "%s consists for %d %% of %s material\n", m->ma_fname0, mp, m->ma_fname1 ); } } static void print_and_remove_perc_info_for_top_file(struct match **m_hook) { struct match *m = *m_hook; const char *fname = m->ma_fname0; print_perc_info(m); *m_hook = m->ma_next; Free(m); while ((m = *m_hook)) { if (m->ma_fname0 == fname) { if (is_set_option('P')) { print_perc_info(m); } /* remove the struct */ *m_hook = m->ma_next; Free(m); } else { /* skip the struct */ m_hook = &m->ma_next; continue; } } } static void print_percentages(void) { while (match_start) { print_and_remove_perc_info_for_top_file(&match_start); } } void Show_Percentages(void) { sort_match_list(&match_start); print_percentages(); } similarity-tester-2.70.orig/Makefile0000644000000000000000000003014212055474357014435 0ustar # This file is part of the software similarity tester SIM. # Written by Dick Grune, Vrije Universiteit, Amsterdam. # $Id: Makefile,v 2.54 2012-11-28 20:49:51 Gebruiker Exp $ # # E N T R Y P O I N T S help: @echo 'Entry points:' @echo 'test: compile sim_c and run a simple test' @echo '' @echo 'binaries: create all binaries' @echo 'exes: create executables in MSDOS' @echo 'install: install all binaries' @echo '' @echo 'lint: lint sim sources' @echo 'simsim: run sim_c on the sim sources' @echo '' @echo 'fresh: remove created files' # # When you modify any of the following macros, do 'make clean' # # System dependencies # =============== including ../lib/sysidf.mk here # This file is part of the auxiliary libraries. 
# Written by Dick Grune, dick@dickgrune.com # $Id: sysidf.mk,v 1.15 2012-06-13 09:59:52 Gebruiker Exp $ # ################################################################ # For UNIX|Linux SYSTEM = UNIX SUBSYSTEM = SOLARIS # Locations DIR = /home/dick BINDIR = $(DIR)/bin.`$(DIR)/bin/arch` MAN1DIR = $(DIR)/man/man1 # Commands COPY = cp -p EXE = # LEX = flex LN = ln ZIP = zip -o ################################################################ # For MSDOS + MinGW SYSTEM = MSDOS SUBSYSTEM = MinGW # Locations DIR = C:/BIN BINDIR = C:/BIN MAN1DIR = C:/BIN # Commands (cp required, since xcopy cannot handle forward slashes) COPY = cp -p EXE = .exe LEX = flex LN = ln ZIP = zip -o ################################################################ # General, compiling: CC = gcc -D$(SYSTEM) -D$(SUBSYSTEM) LINT = lint -ansi -D$(SYSTEM) -D$(SUBSYSTEM) LINTFLAGS = -xh # General, manual: .SUFFIXES: .1 .3 .pdf .1.pdf: man2pdf $< .3.pdf: man2pdf $< # =============== end of ../lib/sysidf.mk # Compiler Options MEMORY = -DMEMLEAK -DMEMCLOBBER CFLAGS = $(MEMORY) -O4 LIBFLAGS = # LINTFLAGS = $(MEMORY) -h# -X # Debugging CFLAGS += -DDEBUG DEBUG_C = debug.c DEBUG_O = debug.o # T E S T P A R A M E T E R S # percentage test TEST_LANG = c TEST_OPT = -p TEST_INP = *.l # text test TEST_LANG = text TEST_OPT = -r 5 TEST_INP = test_seplet # Rumen Stevanov test TEST_LANG = text TEST_OPT = -p TEST_INP = Rumen_Stefanov/new/*.txt # Kuhl test 1 TEST_LANG = c TEST_OPT = -p TEST_INP = Kuhl/simc1.c Kuhl/simc2.c # Kuhl test 2 TEST_LANG = c TEST_OPT = -p TEST_INP = Kuhl/simc2.c Kuhl/simc1.c # regular test TEST_LANG = c TEST_OPT = -r24 TEST_INP = pass3.c # -i option test TEST_LANG = c TEST_OPT = -f -r 20 -R -i $@ SIM_C_CFS = $(SIM_CFS) $(ALG_CFS) $(CLANG_CFS) SIM_C_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(CLANG_OBJ) sim_c$(EXE): $(SIM_C_OBJ) $(CC) $(SIM_C_OBJ) -o $@ SIM_GRB += clang.c sim_c $(BINDIR)/sim_c$(EXE): sim_c$(EXE) $(COPY) sim_c$(EXE) $@ # The Java Language module: Java JAVALANG_CFS = javalang.c JAVALANG_OBJ = 
javalang.o JAVALANG_SRC = javalang.l JAVALANG_FLS = $(JAVALANG_SRC) javalang.c: javalang.l $(LEX) -t javalang.l >$@ SIM_JAVA_CFS = $(SIM_CFS) $(ALG_CFS) $(JAVALANG_CFS) SIM_JAVA_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(JAVALANG_OBJ) sim_java$(EXE): $(SIM_JAVA_OBJ) $(CC) $(SIM_JAVA_OBJ) -o $@ SIM_GRB += javalang.c sim_java $(BINDIR)/sim_java$(EXE): sim_java$(EXE) $(COPY) sim_java$(EXE) $@ # The Pascal Language module: Pascal PASCLANG_CFS = pascallang.c PASCLANG_OBJ = pascallang.o PASCLANG_SRC = pascallang.l PASCLANG_FLS = $(PASCLANG_SRC) pascallang.c: pascallang.l $(LEX) -t pascallang.l >pascallang.c SIM_PASC_CFS = $(SIM_CFS) $(ALG_CFS) $(PASCLANG_CFS) SIM_PASC_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(PASCLANG_OBJ) sim_pasc$(EXE): $(SIM_PASC_OBJ) $(CC) $(SIM_PASC_OBJ) -o $@ SIM_GRB += pascallang.c sim_pasc $(BINDIR)/sim_pasc$(EXE): sim_pasc$(EXE) $(COPY) sim_pasc$(EXE) $@ # The Modula-2 Language module: Modula-2 M2LANG_CFS = m2lang.c M2LANG_OBJ = m2lang.o M2LANG_SRC = m2lang.l M2LANG_FLS = $(M2LANG_SRC) m2lang.c: m2lang.l $(LEX) -t m2lang.l >$@ SIM_M2_CFS = $(SIM_CFS) $(ALG_CFS) $(M2LANG_CFS) SIM_M2_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(M2LANG_OBJ) sim_m2$(EXE): $(SIM_M2_OBJ) $(CC) $(SIM_M2_OBJ) -o $@ SIM_GRB += m2lang.c sim_m2 $(BINDIR)/sim_m2$(EXE): sim_m2$(EXE) $(COPY) sim_m2$(EXE) $@ # The Lisp Language module: Lisp LISPLANG_CFS = lisplang.c LISPLANG_OBJ = lisplang.o LISPLANG_SRC = lisplang.l LISPLANG_FLS = $(LISPLANG_SRC) lisplang.c: lisplang.l $(LEX) -t lisplang.l >$@ SIM_LISP_CFS = $(SIM_CFS) $(ALG_CFS) $(LISPLANG_CFS) SIM_LISP_OBJ = $(SIM_OBJ) $(ALG_OBJ) $(LISPLANG_OBJ) sim_lisp$(EXE): $(SIM_LISP_OBJ) $(CC) $(SIM_LISP_OBJ) -o $@ SIM_GRB += lisplang.c sim_lisp $(BINDIR)/sim_lisp$(EXE): sim_lisp$(EXE) $(COPY) sim_lisp$(EXE) $@ # The Miranda Language module: Miranda MIRALANG_CFS = miralang.c MIRALANG_OBJ = miralang.o MIRALANG_SRC = miralang.l MIRALANG_FLS = $(MIRALANG_SRC) miralang.c: miralang.l $(LEX) -t miralang.l >$@ SIM_MIRA_CFS = $(SIM_CFS) $(ALG_CFS) $(MIRALANG_CFS) SIM_MIRA_OBJ = 
$(SIM_OBJ) $(ALG_OBJ) $(MIRALANG_OBJ) sim_mira$(EXE): $(SIM_MIRA_OBJ) $(CC) $(SIM_MIRA_OBJ) -o $@ SIM_GRB += miralang.c sim_mira $(BINDIR)/sim_mira$(EXE): sim_mira$(EXE) $(COPY) sim_mira$(EXE) $@ # The Text module: Text TEXTLANG_CFS = textlang.c TEXTLANG_OBJ = textlang.o TEXTLANG_SRC = textlang.l TEXTLANG_FLS = $(TEXTLANG_SRC) textlang.c: textlang.l $(LEX) -t textlang.l >$@ SIM_TEXT_CFS = $(SIM_CFS) $(TEXTLANG_CFS) SIM_TEXT_OBJ = $(SIM_OBJ) $(TEXTLANG_OBJ) sim_text$(EXE): $(SIM_TEXT_OBJ) $(CC) $(SIM_TEXT_OBJ) -o $@ SIM_GRB += textlang.c sim_text $(BINDIR)/sim_text$(EXE): sim_text$(EXE) $(COPY) sim_text$(EXE) $@ # T E S T S # Some simple tests: sim.res: sim_$(TEST_LANG)$(EXE) $(TEST_INP) ./sim_$(TEST_LANG)$(EXE) $(TEST_OPT) $(TEST_INP) stream.res: sim_$(TEST_LANG)$(EXE) $(TEST_INP) ./sim_$(TEST_LANG)$(EXE) -- $(TEST_OPT) $(TEST_INP) >$@ wc $@ $(TEST_INP) percentages.res:sim_$(TEST_LANG)$(EXE) $(TEST_INP) ./sim_$(TEST_LANG)$(EXE) -p $(TEST_OPT) $(TEST_INP) TEST_GRB = stream.res # More simple tests, using the C version only: simsim: sim_c$(EXE) $(SIM_CFS) $(ALG_CFS) ./sim_c$(EXE) -fr 20 $(SIM_CFS) $(ALG_CFS) # Lint lint: $(SIM_SRC) $(ALG_SRC) $(ABS_CFS) $(LINT) $(LINTFLAGS) $(SIM_CFS) $(ALG_CFS) $(ABS_CFS) # O T H E R E N T R I E S # Sets of files: general, modules, main programs, languages CFS = $(SIM_CFS) $(ALG_CFS) \ $(CLANG_CFS) $(JAVALANG_CFS) $(PASCLANG_CFS) $(M2LANG_CFS) \ $(LISPLANG_CFS) $(MIRALANG_CFS) $(TEXTLANG_CFS) OBJ = $(SIM_OBJ) $(ALG_OBJ) \ $(CLANG_OBJ) $(JAVALANG_OBJ) $(PASCLANG_OBJ) $(M2LANG_OBJ) \ $(LISPLANG_OBJ) $(MIRALANG_OBJ) $(TEXTLANG_OBJ) SRC = $(SIM_SRC) $(ALG_SRC) \ $(CLANG_SRC) $(JAVALANG_SRC) $(PASCLANG_SRC) $(M2LANG_SRC) \ $(LISPLANG_SRC) $(MIRALANG_SRC) $(TEXTLANG_SRC) FLS = $(SIM_FLS) $(ALG_FLS) \ $(CLANG_FLS) $(JAVALANG_FLS) $(PASCLANG_FLS) $(M2LANG_FLS) \ $(LISPLANG_FLS) $(MIRALANG_FLS) $(TEXTLANG_FLS) \ sysidf.mk sysidf.msdos sysidf.unix DOC = README sim.1 sim.txt sim.html ChangeLog Answers TechnReport # Installation install_all: 
install # just a synonym install: $(MAN1DIR)/sim.1 \ $(BINDIR)/sim_c$(EXE) \ $(BINDIR)/sim_java$(EXE) \ $(BINDIR)/sim_pasc$(EXE) \ $(BINDIR)/sim_m2$(EXE) \ $(BINDIR)/sim_lisp$(EXE) \ $(BINDIR)/sim_mira$(EXE) \ $(BINDIR)/sim_text$(EXE) $(MAN1DIR)/sim.1: sim.1 $(COPY) sim.1 $@ # Clean-up .PHONY: clean fresh clean: -rm -f *.o -rm -f $(SIM_GRB) -rm -f $(TEST_GRB) -rm -f a.out a.exe sim.txt core mon.out fresh: clean -rm -f *.exe # D E P E N D E N C I E S # DO NOT DELETE THIS LINE -- make depend depends on it. ForEachFile.o: ForEachFile.c ForEachFile.h fname.h Malloc.o: Malloc.c Malloc.h add_run.o: add_run.c sim.h debug.par text.h runs.h aiso.spc percentages.h \ Malloc.h options.h error.h add_run.h algollike.o: algollike.c options.h error.h token.h algollike.h language.h clang.o: clang.c options.h algollike.h token.h language.h idf.h lex.h \ lang.h compare.o: compare.c sim.h text.h tokenarray.h token.h hash.h language.h \ options.h add_run.h compare.h debug.par count_sim_dup.o: count_sim_dup.c debug.o: debug.c debug.h error.o: error.c sim.h error.h fname.o: fname.c fname.h hash.o: hash.c system.par debug.par sim.h text.h Malloc.h error.h \ language.h token.h tokenarray.h options.h hash.h idf.o: idf.c system.par token.h idf.h javalang.o: javalang.c options.h algollike.h token.h language.h idf.h \ lex.h lang.h lex.o: lex.c token.h lex.h lisplang.o: lisplang.c algollike.h token.h language.h lex.h lang.h idf.h m2lang.o: m2lang.c options.h algollike.h token.h language.h idf.h lex.h \ lang.h miralang.o: miralang.c algollike.h token.h language.h lex.h lang.h idf.h newargs.o: newargs.c ForEachFile.h fname.h Malloc.h error.h newargs.h options.o: options.c options.h pascallang.o: pascallang.c options.h algollike.h token.h language.h idf.h \ lex.h lang.h pass1.o: pass1.c debug.par sim.h text.h tokenarray.h token.h lex.h \ error.h options.h pass1.h pass2.o: pass2.c debug.par sim.h text.h lex.h token.h pass2.h \ sortlist.bdy pass3.o: pass3.c system.par debug.par sim.h text.h runs.h 
aiso.spc \ Malloc.h error.h options.h pass3.h percentages.h percentages.o: percentages.c sim.h text.h runs.h aiso.spc options.h \ Malloc.h error.h percentages.h sortlist.bdy runs.o: runs.c sim.h text.h runs.h aiso.spc debug.par aiso.bdy Malloc.h sim.o: sim.c system.par settings.par sim.h options.h newargs.h language.h \ token.h error.h text.h runs.h aiso.spc hash.h compare.h pass1.h pass2.h \ pass3.h percentages.h stream.h lex.h Malloc.h stream.o: stream.c system.par token.h lex.h lang.h stream.h text.o: text.c debug.par sim.h token.h stream.h lex.h Malloc.h options.h \ error.h text.h textlang.o: textlang.c sim.h language.h token.h idf.h lex.h lang.h token.o: token.c token.h tokenarray.o: tokenarray.c error.h lex.h token.h Malloc.h tokenarray.h similarity-tester-2.70.orig/debug.h0000644000000000000000000000163611710073551014226 0ustar /* This file is part of the debugging module DEBUG. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: debug.h,v 1.5 2012-01-25 21:43:05 Gebruiker Exp $ */ /* DEBUG defines one routine, extern void wr_info(const char *s, int b, int v); which, when compiled with a -DDEBUG option, writes the string s, a space character, the value v in base b, and a newline to standard error output (file descriptor 2), without interfering with other program activities. The following values for b are accepted: b = 0: the string s only b = 8: octal b = 16: hex b = 128: char otherwise: decimal This allows debugging info to be obtained in the presence of sudden crashes and other nefarious program activity. Compiled without the -DDEBUG option wr_info does nothing. This allows easy switching off of the debugging feature by recompiling debug.c. */ extern void wr_info(const char *s, int b, int v); similarity-tester-2.70.orig/hash.c0000644000000000000000000002320011764320436014053 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: hash.c,v 2.18 2012-06-08 06:52:14 Gebruiker Exp $ */ /* Text is compared by comparing every substring to all substrings to the right of it; this process is in essence quadratic. However, only substrings of length at least 'Min_Run_Size' are of interest, which gives us the possibility to speed up this process by using a hash table. For every position in the text, we construct an index which gives the next position in the text at which a run of Min_Run_Size tokens starts that has the same hash code, as calculated by hash1(). If there is no such run, the index is 0. These forward references are kept in the array forward_reference[]. To construct this array, we use a hash table last_index[] whose size is a prime and which is about 8 times smaller than the text array. The hash table last_index[] is set up such that last_index[i] is the index of the latest token with hash_code i, or 0 if there is none. This results in hash chains of an average length of 8. See Make_Forward_References(). If there is not enough room for a hash table of the proper size (which can be considerable) the hashing is not efficient any more. In that case, the forward reference table is scanned a second time, eliminating from any chain all references to runs that do not hash to the same value under a second hash function, hash2(). For the UNIX manuals this reduced the number of matches from 91.9% to 1.9% (of which 0.06% was genuine). 
*/ #include #include "system.par" #include "debug.par" #include "sim.h" #include "text.h" #include "Malloc.h" #include "error.h" #include "token.h" #include "language.h" #include "token.h" #include "tokenarray.h" #include "options.h" #include "hash.h" /* MAIN ENTRIES */ static unsigned int *forward_reference; /* to be filled by Malloc() */ static int n_forward_references; static void make_forward_references_hash1(void); static void make_forward_references_hash2(void); #ifdef DB_FORW_REF static void db_forward_references(const char *); static void make_forward_references_hash3(void); #endif void Make_Forward_References(void) { /* Constructs the forward references table. */ n_forward_references = Text_Length(); forward_reference = (unsigned int *)Calloc( n_forward_references, sizeof (unsigned int) ); make_forward_references_hash1(); make_forward_references_hash2(); #ifdef DB_FORW_REF make_forward_references_hash3(); #endif } unsigned int Forward_Reference(int i) { if (i <= 0 || i >= n_forward_references) { fatal("internal error, bad forward reference"); } return forward_reference[i]; } void Free_Forward_References(void) { Free((char *)forward_reference); } /* HASHING */ /* We want a hash function whose time cost does not depend on Min_Run_Size, which is a problem since the size of the object we derive the hash value from IS equal to Min_Run_Size! Therefore we base the hash function on a sample of at most N_SAMPLES tokens from the input string; this works at least as well in practice. */ #define N_SAMPLES 24 #define OPERATION ^ /* An alternative algorithm; does not seem to make any difference. 
#define N_SAMPLES 23 #define OPERATION + */ /* Another algorithm; not yet tested #define N_SAMPLES 24 #define OPERATION + 613 * */ static unsigned int *last_index; static unsigned int hash_table_size; static int sample_pos[N_SAMPLES]; static unsigned int prime[] = { /* lots of hopefully suitable primes */ 10639, 21283, 42571, 85147, 170227, 340451, 680959, 1361803, 2723599, 5447171, 10894379, 21788719, 43577399, 87154759, 174309383, 348618827, 697237511, 1394475011 }; static void init_hash_table(void) { int n; /* find the ideal hash table size */ n = 0; while (prime[n] < Text_Length()) { n++; /* this will always terminate, if prime[] is large enough */ } /* see if we can allocate that much space, and if not, step down */ last_index = 0; while (!last_index && n >= 0) { hash_table_size = prime[n]; last_index = (unsigned int *) TryCalloc(hash_table_size, sizeof (unsigned int)); n--; } if (!last_index) { fatal("out of memory"); } /* find sample positions */ for (n = 0; n < N_SAMPLES; n++) { /* straigh-line approximation; uninituitive as usual */ sample_pos[n] = ( (2 * n * (Min_Run_Size - 1) + (N_SAMPLES - 1)) / (2 * (N_SAMPLES - 1)) ); } } static int hash1(const Token *); static void make_forward_references_hash1(void) { int n; init_hash_table(); /* set up the forward references using the last_index hash table */ for (n = 0; n < Number_Of_Texts; n++) { struct text *txt = &Text[n]; unsigned int j; for ( /* all pos'ns in txt except the last Min_Run_Size-1 */ j = txt->tx_start; /* >= 1 */ j + Min_Run_Size - 1 < txt->tx_limit; j++ ) { if (May_Be_Start_Of_Run(Token_Array[j])) { int h = hash1(&Token_Array[j]); if (last_index[h]) { forward_reference[last_index[h]] = j; } last_index[h] = j; } } } Free((char *)last_index); #ifdef DB_FORW_REF db_forward_references("first hashing"); #endif /* DB_FORW_REF */ } static int hash1(const Token *p) { /* hash1(p) returns the hash code of Min_Run_Size tokens starting at p; caller guarantees that there are at least Min_Run_Size tokens. 
*/ int32 h_val; int n; h_val = 0; for (n = 0; n < N_SAMPLES; n++) { h_val = (h_val << 1) OPERATION Token2int(p[sample_pos[n]]); if (h_val & (1<<31)) { h_val ^= (1<<31|1); } } return h_val % hash_table_size; } static int hash2(const Token *); static void make_forward_references_hash2(void) { unsigned int i; /* Clean out spurious matches, by a quadratic algorithm. Note that we do not want to eliminate overlapping sequences in this stage, since we might be removing the wrong copy. */ for (i = 0; i+Min_Run_Size < Text_Length(); i++) { unsigned int j = i; int h2 = hash2(&Token_Array[i]); /* Find the first token sequence in the chain with same secondary hash code. */ while ( /* there is still a forward reference */ (j = forward_reference[j]) && /* its hash code does not match */ hash2(&Token_Array[j]) != h2 ) { /* continue searching */ } /* short-circuit forward reference to it, or to zero */ forward_reference[i] = j; } #ifdef DB_FORW_REF db_forward_references("second hashing"); #endif /* DB_FORW_REF */ } static int hash2(const Token *p) { /* A simple-minded hashing for the secondary sweep; first and last token combined in a short int. 
*/ return (Token2int(p[0]) << 8) + Token2int(p[Min_Run_Size-1]); } #ifdef DB_FORW_REF static int hash3(const Token *, const Token *); static void db_print_forward_references(void) { unsigned int n; unsigned int *printed_at = (unsigned int *)Calloc(Text_Length(), sizeof (unsigned int)); for (n = 1; n < Text_Length(); n++) { unsigned int fw = forward_reference[n]; if (fw == 0) continue; fprintf(Debug_File, "FWR[%d]:", n); if (printed_at[fw]) { fprintf(Debug_File, " see %d", printed_at[fw]); } else { while (fw) { fprintf(Debug_File, " %d", fw); printed_at[fw] = n; fw = forward_reference[fw]; } } fprintf(Debug_File, "\n"); } Free((void *)printed_at); } static void make_forward_references_hash3(void) { unsigned int i; /* Do a third hash to check up on the previous two */ /* This time we use a genuine compare */ for (i = 0; i+Min_Run_Size < Text_Length(); i++) { unsigned int j = i; while ( /* there is still a forward reference */ (j = forward_reference[j]) && /* its hash code does not match */ !hash3(&Token_Array[i], &Token_Array[j]) ) { /* continue searching */ } /* short-circuit forward reference to it, or to zero */ forward_reference[i] = j; } db_forward_references("third hashing"); } static int hash3(const Token *p, const Token *q) { /* a full comparison for the tertiary sweep */ int n; for (n = 0; n < Min_Run_Size; n++) { if (!Token_EQ(p[n], q[n])) return 0; } return 1; } static int db_frw_chain(int n, char *crossed_out) { int chain_len = -1; /* if there are two values, the chain length is still 1 */ int fw; for (fw = n; fw; fw = forward_reference[fw]) { if (crossed_out[fw]) { fprintf(Debug_File, ">>>> error: forward references cross <<<<\n" ); } chain_len++; crossed_out[fw] = 1; } fprintf(Debug_File, "chain_start = %d, chain_len = %d\n", n, chain_len); return chain_len; } static void db_forward_references(const char *msg) { int n; int n_frw_chains = 0; /* number of forward ref. 
chains */ int tot_frwc_len = 0; char *crossed_out; fprintf(Debug_File, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg); fprintf(Debug_File, "hash_table_size = %u\n", hash_table_size); fprintf(Debug_File, "N_SAMPLES = %d\n", N_SAMPLES); crossed_out = (char *)Calloc(Text_Length(), sizeof (char)); /* Each forward_reference[n] starts in principle a new chain, and these chains never touch each other. We check this property by marking the positions in each chain in an array; if we meet a marked entry while following a chain, it must have been on an earlier chain and we have an error. We also determine the lengths of the chains, for statistics. */ if (forward_reference[0]) { fprintf(Debug_File, ">>>> forward_reference[0] is not zero <<<<\n" ); } for (n = 1; n < Text_Length(); n++) { if (forward_reference[n] && !crossed_out[n]) { /* start of a new chain */ n_frw_chains++; tot_frwc_len += db_frw_chain(n, crossed_out); } } db_print_forward_references(); Free((char *)crossed_out); fprintf(Debug_File, "text length = %u, # forward chains = %d, total frw chain length = %d\n\n", Text_Length(), n_frw_chains, tot_frwc_len ); } #endif /* DB_FORW_REF */ similarity-tester-2.70.orig/Malloc.c0000644000000000000000000001661512055472657014362 0ustar /* This file is part of the memory management and leak detector MALLOC. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: Malloc.c,v 1.7 2012-06-13 09:59:52 Gebruiker Exp $ */ #include #include #include #include #include "Malloc.h" #undef new #define new use_my_new /* don't call Malloc in Malloc.c */ #define my_new(type) ((type *)malloc(sizeof (type))) /* All output goes through designated files, so we block printf, etc. 
*/ #undef printf #define printf use_fprintf #undef putchar #define putchar use_fprintf #ifndef lint static void fprintloc(FILE *f, const char *fname, int l_nmb) { fprintf(f, "\"%s\", line %d: ", fname, l_nmb); } static void out_of_memory(const char *fname, int l_nmb, size_t size) { fprintloc(stderr, fname, l_nmb); fprintf(stderr, "Out of memory, requested size = %lld\n", (long long int)size); exit(1); } #if defined MEMLEAK || defined MEMCLOBBER /* Both need almost the same information: MEMLEAK obviously needs a list of all blocks still allocated, but MEMCLOBBER needs the same list to find the size of a block given to Free(), in order to clobber it. MEMCLOBBER does not need total, balance and max, but finecombing them out would be too much. */ static long long int total = 0; static long long int balance = 0; static long long int max = 0; struct record { struct record *next; const char *addr; size_t size; const char *fname; int l_nmb; }; #define HASH_SIZE 16381 /* largest prime under 2^16 */ static struct record *record_hash[HASH_SIZE]; #define chain_start(x) record_hash[((unsigned int)(x)%HASH_SIZE)] static void record_alloc(char *addr, size_t size, const char *fname, int l_nmb) { struct record *new; struct record **r_hook = &chain_start(addr); if (addr == 0) return; new = my_new(struct record); new->addr = addr; new->size = size; new->fname = fname; /* no need to copy fname */ new->l_nmb = l_nmb; new->next = *r_hook; *r_hook = new; total += size; balance += size; if (balance > max) { max = balance; } } static struct record ** record_pointer_for_address(const char *addr) { struct record **rp = &chain_start(addr); while (*rp) { if ((*rp)->addr == addr) break; rp = &(*rp)->next; } return rp; } static size_t record_free(char *addr) { struct record **oldp = record_pointer_for_address(addr); struct record *old = *oldp; if (old == 0) return -1; *oldp = old->next;/* this loses the struct record; is that a problem? 
*/ balance -= old->size; return old->size; } #endif /* defined MEMLEAK || defined MEMCLOBBER */ void MemClobber(void *p, size_t size) { unsigned char *s = (unsigned char *)p; size_t i; for (i = 0; i < size; i++) { s[i] = 0125; /* 0101 0101 */ } } #ifdef MEMLEAK struct entry { struct entry *next; const char *fname; int l_nmb; int n_blocks; int var_size; /* all blocks have the same size or not */ int size; /* !var_size: the one size; var_size: sum of sizes */ }; static struct entry * compacted_leaks(void) { struct entry *res = 0; int i; for (i = 0; i < HASH_SIZE; i++) { struct record *r = record_hash[i]; while (r) { struct entry *e = res; /* try to find an entry for this location */ while (e) { if ( e->fname == r->fname && e->l_nmb == r->l_nmb ) break; e = e->next; } if (e) { /* update the entry */ if (e->var_size) { e->size += r->size; } else if (e->size != r->size) { /* switch to var_size */ e->var_size = 1; e->size = e->n_blocks*e->size + r->size; } e->n_blocks++; } else { /* create a new entry */ e = my_new(struct entry); e->fname = r->fname; e->l_nmb = r->l_nmb; e->n_blocks = 1; e->var_size = 0; e->size = r->size; e->next = res; res = e; } r = r->next; } } return res; } static int number_of_leaks(const struct entry *e) { int res = 0; while (e != 0) { res++; e = e->next; } return res; } static void report_actual_leaks(FILE *f) { const struct entry *e = compacted_leaks(); int n_leaks = number_of_leaks(e); if (n_leaks == 0) return; fprintf(f, "There %s %d case%s of unreclaimed memory:\n", (n_leaks == 1 ? "was" : "were"), n_leaks, (n_leaks == 1 ? "" : "s") ); while (e) { fprintloc(f, e->fname, e->l_nmb); fprintf(f, "left allocated: %d block%s of size ", e->n_blocks, (e->n_blocks == 1 ? "" : "s") ); if (e->var_size) { /* e->size is the sum of the sizes */ fprintf(f, "%d on average", (e->size + e->n_blocks/2) / e->n_blocks ); } else { /* e->size is the single size */ fprintf(f, "%d", e->size); } if (e->n_blocks > 1) { fprintf(f, " = %d", (e->var_size ? 
e->size : e->size*e->n_blocks)); } fprintf(f, "\n"); e = e->next; } } void ReportMemoryLeaks(FILE *f) { if (f == 0) f = stderr; report_actual_leaks(f); fprintf(f, "Total memory allocated= %lld", total); fprintf(f, ", maximum allocated = %lld", max); fprintf(f, ", garbage left = %lld", balance); fprintf(f, "\n"); } #else /* no MEMLEAK */ /*ARGSUSED*/ void ReportMemoryLeaks(FILE *f) { } #endif /* MEMLEAK */ void * _leak_malloc(int chk, size_t size, const char *fname, int l_nmb) { void *res = malloc(size); if (chk && res == 0) { out_of_memory(fname, l_nmb, size); /*NOTREACHED*/ } #if defined MEMLEAK || defined MEMCLOBBER record_alloc(res, size, fname, l_nmb); #ifdef MEMCLOBBER MemClobber((char *)res, size); #endif #endif return res; } void * _leak_calloc(int chk, int n, size_t size, const char *fname, int l_nmb) { void *res = calloc(n, size); if (chk && res == 0) { out_of_memory(fname, l_nmb, n*size); /*NOTREACHED*/ } #if defined MEMLEAK || defined MEMCLOBBER record_alloc(res, n*size, fname, l_nmb); #endif return res; } void * _leak_realloc(int chk, void *addr, size_t size, const char *fname, int l_nmb) { void *res; #if defined MEMLEAK || defined MEMCLOBBER size_t old_size = record_free(addr); /* we report first, because the realloc() below may cause a crash */ if ( /* we are not reallocating address 0, which is allowed */ addr != 0 && /* the address was never handed out before */ old_size == -1 ) { fprintloc(stderr, fname, l_nmb); fprintf(stderr, ">>>> unallocated block reallocated <<<<\n"); } #endif res = realloc(addr, size); if (chk && res == 0) { out_of_memory(fname, l_nmb, size); /*NOTREACHED*/ } #if defined MEMLEAK || defined MEMCLOBBER record_alloc(res, size, fname, l_nmb); #endif #ifdef MEMCLOBBER if (old_size > 0 && size > old_size) { MemClobber(((char *)res)+old_size, size-old_size); } #endif return res; } /* ARGSUSED */ void _leak_free(void *addr, const char *fname, int l_nmb) { #if defined MEMLEAK || defined MEMCLOBBER size_t old_size = record_free(addr); 
/* we report first, because the free() below may cause a crash */ if (old_size == -1) { fprintloc(stderr, fname, l_nmb); fprintf(stderr, ">>>> unallocated block freed "); fprintf(stderr, "or multiple free of allocated block <<<<\n"); } else { #ifdef MEMCLOBBER MemClobber((char *)addr, old_size); #endif } #endif free(addr); } char * _new_string(const char *s, const char *fname, int l_nmb) { return strcpy((char *)(_leak_malloc(1, strlen(s)+1, fname, l_nmb)), s); } #endif /* not lint */ #ifdef lint static void satisfy_lint(void *x) { void *v; v = _leak_malloc(0, 0, 0, 0); v = _leak_calloc(0, 0, 0, 0, 0); v = _leak_realloc(0, 0, 0, 0, 0); _leak_free(x, 0, 0); ReportMemoryLeaks(0); v = _new_string(0, 0, 0); satisfy_lint(v); } #endif similarity-tester-2.70.orig/token.h0000644000000000000000000000470211764421216014261 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: token.h,v 2.12 2012-06-08 16:04:30 Gebruiker Exp $ */ /* Token interface. Since the definition of a token has been a continual source of problems, it is now defined as an ADT 'Token'. There are four classes of tokens: 1. simple tokens; they derive directly from input characters; 2. summary tokens; they summarise keywords, etc.; 3. special tokens: No_Token, IDF, and End_Of_Line; 4. hashed tokens, segments condensed by idf_hashed(). The first three classes are called 'regular tokens'. There are also a few 'gap' tokens, tokens not produced by the above mechanisms, for example 0x100. In addition to the type Token and the special tokens, the module defines 1. the constants N_REGULAR_TOKENS number of regular tokens N_TOKENS total number of tokens, including No_Token 2. macros for defining summary tokens (with ranges of their parameters): CTRL(ch) ch in 'A'-'~' NORM(ch) ch in '!'-'~' MTCT(ch) ch in 'A'-'~' META(ch) ch in '!'-'~' These restrictions are not checked. 3. 
the conversion routines Token2int(c) int2Token(i) */ #include #ifndef _TOKEN_H #define _TOKEN_H #ifdef lint /* For security we want to distinguish tokens from integers. Lint is not good at this, so for checking we use a pointer to a weird data type */ struct for_lint_only {int i;}; typedef struct for_lint_only *Token; #else /* if normal */ typedef unsigned short Token; #endif /* lint/normal */ #define N_TOKENS (1<<16) #define N_REGULAR_TOKENS (1<<9) /* Macros for the composition of tokens */ /* range (gaps unused)*/ #define No_Token int2Token(0) /* 0x0000 */ /* UTF-8 characters */ /* 0x0001-0x00FF */ #define CTRL(ch) int2Token(0x100|((ch)&0x01F)) /* 0x0101-0x011E */ #define NORM(ch) int2Token(0x100|((ch)&0x07F)) /* 0x0121-0x017E */ #define IDF int2Token(0x180) /* 0x0180 */ #define MTCT(ch) int2Token(0x180|((ch)&0x01F)) /* 0x0181-0x019E */ #define META(ch) int2Token(0x180|((ch)&0x07F)) /* 0x01A1-0x01FE */ /* tokens from idf_hashed() */ /* 0x0200-0xFFFE */ #define End_Of_Line int2Token(0xFFFF) /* 0xFFFF */ /* Conversion routines */ #define Token2int(c) ((int)(c)) #define int2Token(i) ((Token)(i)) /* Auxiliaries */ #define is_regular_token(tk) (Token2int(tk) < N_REGULAR_TOKENS) extern int Token_EQ(const Token t1, const Token t2); extern void fprint_token(FILE *ofile, const Token tk); #endif /* _TOKEN_H */ similarity-tester-2.70.orig/stream.h0000644000000000000000000000072711764421216014437 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: stream.h,v 2.7 2012-06-08 16:04:30 Gebruiker Exp $ */ /* Interface of the stream module. Implements the direct interaction with the lexical module. It supplies the routines below. 
*/ extern int Open_Stream(const char *); extern int Next_Stream_Token_Obtained(void); extern void Close_Stream(void); extern void Print_Stream(const char *fname); similarity-tester-2.70.orig/token.c0000644000000000000000000000531512032031447014245 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: token.c,v 2.11 2012-09-30 11:55:19 Gebruiker Exp $ */ /* Token interface, implementation part. */ #include #include "token.h" static int Token_in_range(const Token tk, int low, int high) { int tki = Token2int(tk); if (tki < low) return 0; if (tki > high) return 0; return 1; } static int check_and_print( FILE *ofile, const char *name, char ch, char low, char high, char offset ) { int ch1 = ch + offset; if (low <= ch1 && ch1 <= high) { fprintf(ofile, "%s(%c)", name,ch1); return 1; } return 0; } #define is_simple_token(tk) (Token_in_range(tk, 0x0001, 0x00FF)) #define is_CTRL_token(tk) (Token_in_range(tk, 0x0101, 0x011E)) #define is_NORM_token(tk) (Token_in_range(tk, 0x0121, 0x017E)) #define is_MTCT_token(tk) (Token_in_range(tk, 0x0181, 0x019E)) #define is_META_token(tk) (Token_in_range(tk, 0x01A1, 0x01FE)) #define is_hashed_token(tk) (Token_in_range(tk, 0x0200, 0xFFFE)) void fprint_token(FILE *ofile, const Token tk) { /* Prints a regular token in two characters: normal char meta (bit 9 set) ^A cntl $A meta-cntl A printable #A meta and hashed tokens in hexadecimal. */ int tki = Token2int(tk); int ch = tki & 0x7F; int bit8 = tki & 0x80; if (Token_EQ(tk, No_Token)) {fprintf(ofile, "--"); return;} if (Token_EQ(tk, IDF)) {fprintf(ofile, "IDF"); return;} if (Token_EQ(tk, End_Of_Line)) {fprintf(ofile, "EOL"); return;} if (is_simple_token(tk)) { if ('!' <= ch && ch <= '~') { fprintf(ofile, "%s%c", (bit8 ? "8" : ""), ch); return; } if (0 < ch && ch <= ' ') { fprintf(ofile, "%s%c", (bit8 ? "$" : "^"), ch + '@'); return; } if (ch == 0x7F) { fprintf(ofile, "%s%c", (bit8 ? 
"$" : "^"), '?'); return; } } if (is_CTRL_token(tk)) { if (check_and_print(ofile, "CTRL", ch, 'A', '~', '@')) return; } if (is_NORM_token(tk)) { if (check_and_print(ofile, "NORM", ch, '!', '~', '\0')) return; } if (is_MTCT_token(tk)) { if (check_and_print(ofile, "MTCT", ch, 'A', '~', '@')) return; } if (is_META_token(tk)) { if (check_and_print(ofile, "META", ch, '!', '~', '\0')) return; } if (is_hashed_token(tk)) { fprintf(ofile, "0x%04x", tki); return; } /* gap token! */ fprintf(ofile, "!0x%04x!", tki); } #ifdef XXXX int ch = tki & 0177; int meta = tki & 0200; if (' ' <= ch && ch <= '~') { fprintf(ofile, "%c%c", (meta ? '#' : ' '), ch); } else { fprintf(ofile, "%c%c", (meta ? '$' : '^'), (ch == 0177 ? '?' : ch + '@') ); } #endif int Token_EQ(const Token t1, const Token t2) { /* to make sure Token_EQ is indeed called with two Token parameters */ return Token2int(t1) == Token2int(t2); } similarity-tester-2.70.orig/pascallang.l0000644000000000000000000001207311764421215015251 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: pascallang.l,v 2.18 2012-06-08 16:04:29 Gebruiker Exp $ */ /* PASCAL language front end for the similarity tester. 
	Author:	Maarten van der Meulen
	Date:	May 1986
*/

#include	"options.h"
#include	"token.h"
#include	"language.h"
#include	"algollike.h"
#include	"idf.h"
#include	"lex.h"
#include	"lang.h"

/* General language front end data */
Token lex_token;
unsigned int lex_nl_cnt;
unsigned int lex_tk_cnt;
unsigned int lex_non_ascii_cnt;

/* Language-dependent data */

/* Data for module idf */

/* Preprocessor commands and their tokens; the entries are in alphabetical
   order.  NOTE(review): presumably idf_in_list() relies on that order --
   confirm before adding or moving entries.
*/
static const struct idf ppcmd[] = {
	{"define",	META('d')},
	{"else",	META('e')},
	{"endif",	META('E')},
	{"if",		META('i')},
	{"ifdef",	META('I')},
	{"ifndef",	META('x')},
	{"include",	MTCT('I')},
	{"line",	META('l')},
	{"undef",	META('u')}
};

/* Reserved words (in lower case) and their tokens, in alphabetical order;
   words mapped to No_Token are not passed on by the rules that use
   idf2token().
*/
static const struct idf reserved[] = {
	{"and",		NORM('&')},
	{"array",	NORM('A')},
	{"begin",	NORM('{')},
	{"case",	NORM('c')},
	{"const",	NORM('C')},
	{"div",		NORM('/')},
	{"do",		NORM('D')},
	{"downto",	NORM('d')},
	{"else",	NORM('e')},
	{"end",		NORM('}')},
	{"extern",	CTRL('E')},
	{"file",	NORM('F')},
	{"for",		NORM('f')},
	{"function",	NORM('p')},	/* Equal to procedure */
	{"goto",	NORM('g')},
	{"if",		NORM('i')},
	{"in",		NORM('I')},
	{"label",	NORM('l')},
	{"mod",		NORM('%')},
	{"nil",		NORM('n')},
	{"not",		NORM('!')},
	{"of",		No_Token},
	{"or",		NORM('|')},
	{"packed",	NORM('P')},
	{"procedure",	NORM('p')},
	{"program",	No_Token},
	{"record",	NORM('r')},
	{"repeat",	NORM('R')},
	{"set",		NORM('s')},
	{"then",	No_Token},
	{"to",		NORM('t')},
	{"type",	NORM('T')},
	{"until",	NORM('u')},
	{"var",		NORM('v')},
	{"while",	NORM('w')},
	{"with",	NORM('W')}
};

/* Special treatment of identifiers */

static void
lower_case(char *str) {
	/*	Turns upper case into lower case, since Pascal does not
		distinguish between them.
		The string is modified in place; only 'A'-'Z' are affected.
	*/
	char *s;

	for (s = str; *s; s++) {
		if ('A' <= *s && *s <= 'Z') {
			*s += (-'A' + 'a');
		}
	}
}

static Token
idf2token(int hashing) {
	/*	Translates the identifier in yytext, which is lower-cased in
		place first: a reserved word yields its token from reserved[];
		any other identifier yields IDF, or, if hashing is non-zero,
		the token supplied by idf_hashed().
	*/
	Token tk;

	lower_case(yytext);
	tk = idf_in_list(yytext, reserved, sizeof reserved, IDF);
	if (Token_EQ(tk, IDF) && hashing) {
		/* return a one-Token hash code */
		tk = idf_hashed(yytext);
	}
	return tk;
}

/* Token sets for module algollike */
const Token Non_Finals[] = {
	IDF,		/* identifier */
	NORM('{'),	/* also begin */
	NORM('('),
	NORM('['),
	NORM('A'),	/* array */
	NORM('c'),	/* case */
	NORM('C'),	/* const */
	NORM('/'),	/* div */
	CTRL('E'),	/* extern */
	NORM('F'),	/* file */
	NORM('f'),	/* for */
	NORM('g'),	/* goto */
	NORM('i'),	/* if */
	NORM('l'),	/* label */
	NORM('P'),	/* packed */
	NORM('p'),	/* procedure/function */
	NORM('r'),	/* record */
	NORM('R'),	/* repeat */
	NORM('s'),	/* set */
	NORM('T'),	/* type */
	NORM('v'),	/* var */
	NORM('w'),	/* while */
	NORM('W'),	/* with */
	No_Token
};
const Token Non_Initials[] = {
	NORM(')'),
	NORM('}'),
	NORM(';'),
	No_Token
};
const Token Openers[] = {
	NORM('{'),
	NORM('('),
	NORM('['),
	No_Token
};
const Token Closers[] = {
	NORM('}'),
	NORM(')'),
	NORM(']'),
	No_Token
};

/* Language-dependent code */

/* The three routines below hand the work to the algollike module, which is
   initialized with the four token sets above.
*/
void
Init_Language(void) {
	Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers);
}

int
May_Be_Start_Of_Run(Token ch) {
	return May_Be_Start_Of_Algol_Run(ch);
}

unsigned int
Best_Run_Size(const Token *str, unsigned int size) {
	return Best_Algol_Run_Size(str, size);
}

%}

%option	noyywrap

%Start	Comment

Layout		([ \t\r\f])
ASCII95		([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~])

AnyQuoted	(\\.)
StrChar		([^'\n\\]|{AnyQuoted})

StartComment	("{"|"(*")
EndComment	("}"|"*)")
SafeComChar	([^*}\n])
UnsafeComChar	("*")

Digit		([0-9])
Idf		([A-Za-z][A-Za-z0-9_]*)

%%

{StartComment}	{
		/*	See clang.l */
		/*	NOTE(review): the <Comment> start-condition prefixes
			of the four rules below were lost when this text was
			extracted.  They are restored here: without them
			these rules would also fire outside comments, and
			the "\n" rule would duplicate the \n rule further
			down.
		*/
		BEGIN Comment;
	}

<Comment>{SafeComChar}+	{		/* safe comment chunk */
	}

<Comment>{UnsafeComChar}	{	/* unsafe char, read one by one */
	}

<Comment>"\n"		{		/* to break up long comments */
		return_eol();
	}

<Comment>{EndComment}	{		/* end-of-comment */
		BEGIN INITIAL;
	}

\'{StrChar}*\'	{			/* character strings */
		return_ch('"');
	}

^#{Layout}*include.*	{		/* ignore #include lines */
	}

^#{Layout}*{Idf}	{		/* a preprocessor line */
		char *idf = yytext+1;

		/*	skip layout in front of preprocessor identifier;
			all four characters of {Layout} can occur here,
			not just blank and tab
		*/
		while (	*idf == ' ' || *idf == '\t'
		||	*idf == '\r' || *idf == '\f'
		) {
			idf++;
		}
		return_tk(idf_in_list(idf, ppcmd, sizeof ppcmd, NORM('#')));
	}

{Digit}+	{			/* numeral, passed as an identifier */
		return_tk(IDF);
	}

{Idf}/"("	{			/* identifier in front of ( */
		Token tk;

		tk = idf2token(is_set_option('F'));
		if (!Token_EQ(tk, No_Token)) return_tk(tk);
	}

{Idf}	{				/* identifier */
		Token tk;

		tk = idf2token(0 /* no hashing */);
		if (!Token_EQ(tk, No_Token)) return_tk(tk);
	}

\;	{				/* semicolon, conditionally ignored */
		if (is_set_option('f')) return_ch(yytext[0]);
	}

\n	{				/* count newlines */
		return_eol();
	}

{Layout}	{			/* ignore layout */
	}

{ASCII95}	{			/* copy other text */
		return_ch(yytext[0]);
	}

.	{				/* count non-ASCII chars */
		lex_non_ascii_cnt++;
	}

%%

/* More language-dependent code */

void
yystart(void) {
	/* puts the scanner back in its initial start condition */
	BEGIN INITIAL;
}
similarity-tester-2.70.orig/system.par0000644000000000000000000000066107355036374015030 0ustar
/*	This file is part of the software similarity tester SIM.
	Written by Dick Grune, Vrije Universiteit, Amsterdam.
$Id: system.par,v 1.2 2001-09-28 09:03:55 dick Exp $ */ /* Operating-system dependent data */ #ifdef UNIX #define int32 int /* type of a 32 bits signed int */ #define NULLFILE "/dev/null" #endif #ifdef MSDOS /* GNU gcc */ #define int32 int /* type of a 32 bits signed int */ #define NULLFILE "nul" #endif similarity-tester-2.70.orig/VERSION0000644000000000000000000000000500000000001013766 0ustar 2.70 similarity-tester-2.70.orig/lang.c0000644000000000000000000000107012032031447014040 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lang.c,v 2.6 2012-09-30 11:55:19 Gebruiker Exp $ */ /* This is a dummy implementation of the module 'lang'. Its actual implementation derives from one of the *lang.l files. */ #include #include #include "token.h" #include "lang.h" FILE *yyin; int yylex(void) { abort(); return 0; } void yystart(void) { abort(); } Token lex_token; unsigned int lex_nl_cnt; unsigned int lex_tk_cnt; unsigned int lex_non_ascii_cnt; similarity-tester-2.70.orig/compare.c0000644000000000000000000001140511764320436014562 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: compare.c,v 2.16 2012-06-08 06:52:14 Gebruiker Exp $
*/

#include	"sim.h"
#include	"text.h"
#include	"token.h"
#include	"tokenarray.h"
#include	"hash.h"
#include	"language.h"
#include	"options.h"
#include	"add_run.h"
#include	"compare.h"
#include	"debug.par"

static void compare_one_text(int, int, int);
static unsigned int lcs(
	struct text *, unsigned int, struct text **, unsigned int *,
	unsigned int, unsigned int
);

/*	The overall structure of the routine Compare_Files() is:

	for all new files
		for all texts it must be compared to
			for all positions in the new file
				for all positions in the text
					for ever increasing sizes
						try to match and keep the best
*/

/*	Compares each of the first Number_Of_New_Texts texts with a range of
	texts that starts at an option-dependent index and ends at
	Number_Of_Texts:
	    'S' set:	start at Number_Of_New_Texts + 1
	    's' set:	start at n + 1 (the text after the one being compared)
	    otherwise:	start at n (the text itself is included)
	With option 'e' the range is handed to compare_one_text() one text
	at a time; otherwise the whole range goes in a single call.
*/
void
Compare_Files(void) {
	int n;

	for (	/* all new texts */
		n = 0; n < Number_Of_New_Texts; n++
	) {
		int first =
			(	/* if compare to old only */
				is_set_option('S') ? Number_Of_New_Texts + 1
			:	/* else if do not compare to self */
				is_set_option('s') ? n + 1
			/* else */
			:	n
			);

		if (is_set_option('e')) {
			/* from first to Number_Of_Texts in steps */
			int m;

			for (m = first; m < Number_Of_Texts; m++) {
				compare_one_text(n, m, m+1);
			}
		}
		else {
			/* from first to Number_Of_Texts in one action */
			/* the guard keeps Text[first] inside the array */
			if (first < Number_Of_Texts) {
				compare_one_text(n, first, Number_Of_Texts);
			}
		}
	}
}

/*	Scans text n from its first token onwards and, at each position,
	asks lcs() for the best run in the token range covered by the texts
	first .. limit-1.  A run that is found is passed to add_run() and
	the scan continues just behind it; otherwise the scan advances by
	one token.
*/
static void
compare_one_text(
	int n,				/* text to be compared */
	int first,			/* first text to be compared to */
	int limit			/* first text not to be compared to */
) {
	/* token positions delimiting the texts first .. limit-1 */
	unsigned int i_first = Text[first].tx_start;
	unsigned int i_limit = Text[limit-1].tx_limit;
	struct text *txt0 = &Text[n];
	unsigned int i0 = txt0->tx_start;

	while (	/* there may be a useful substring */
		i0 + Min_Run_Size <= txt0->tx_limit
	) {
		/* see if there really is one */
		struct text *txt_best;
		unsigned int i_best;
		unsigned int size_best =
			lcs(txt0, i0, &txt_best, &i_best, i_first, i_limit);

		/* txt_best and i_best are only written by lcs() when it
		   returns a non-zero size
		*/
		if (size_best) {
			/* good run found; enter it */
			add_run(txt0, i0, txt_best, i_best, size_best);
			/* and skip it */
			i0 += size_best;
		}
		else {
			/* we try our luck at the next token */
			i0++;
		}
	}
}

static unsigned int
lcs(	struct text *txt0,		/* input: starting position */
	unsigned int i0,
	struct text **tbp,		/* output: position of best run */
	unsigned int *ibp,
	unsigned int i_first,		/* no comparison before this pos. */
	unsigned int i_limit		/* no comparison after this pos. */
) {
	/*	Finds the longest common substring (not subsequence) in:
			txt0, starting precisely at i0 and
			the text from i_first to i_limit-1.
		Writes the position in tbp and ibp and returns the size.
		Returns 0 if no common substring is found.

		Candidate positions are obtained by following
		Forward_Reference() from i0; the loop stops when that yields
		0 or a position at or beyond i_limit.  Every 'continue' below
		re-evaluates the loop condition and so moves on to the next
		candidate.
	*/
	struct text *txt1 = txt0;
	unsigned int i1 = i0;
	unsigned int size_best = 0;

	while (	/* there is a next opportunity */
		(i1 = Forward_Reference(i1))
	&&	/* it is still in range */
		i1 < i_limit
	) {
		/* a new candidate must beat the best run found so far */
		unsigned int min_size= (size_best ? size_best+1 : Min_Run_Size);

		if (i1 < i_first) {	/* not in range */
			continue;
		}

		/* bump txt1; we may have to skip a text or two */
		while (i1 >= txt1->tx_limit) {
			txt1++;
		}

		/* are we looking at something better than we have got? */
		{	/* comparing backwards */
			/* j0 and j1 start at the last token of a run of
			   min_size tokens and walk back towards i0 and i1
			*/
			unsigned int j0 = i0 + min_size - 1;
			unsigned int j1 = i1 + min_size - 1;

			if (	/* j0 still inside txt0 */
				j0 < txt0->tx_limit
			&&	/* j1 still inside txt1 */
				j1 < txt1->tx_limit
			&&	/* j0 and j1 don't overlap */
				j0 + min_size <= j1
			) {
				/* there is room enough for a match */
				int cnt = min_size;

				/* text matches for at least min_size tokens? */
				while (	cnt
				&&	Token_EQ(Token_Array[j0],
						 Token_Array[j1])
				) {
					cnt--, j0--, j1--;
				}
				if (cnt) continue;	/* forget it */
			}
			else continue;			/* forget it */
		}

		/* yes, we are; how long can we make it? */
		unsigned int new_size = min_size;
		{	/* extending forwards */
			unsigned int j0 = i0 + min_size;
			unsigned int j1 = i1 + min_size;

			while (	/* j0 still inside txt0 */
				j0 < txt0->tx_limit
			&&	/* j1 still inside txt1 */
				j1 < txt1->tx_limit
			&&	/* j0 and j1 don't overlap */
				j0 + new_size < j1
			&&	/* tokens are the same */
				Token_EQ(Token_Array[j0], Token_Array[j1])
			) {
				j0++, j1++, new_size++;
			}
		}

		/*	offer the run to the Language Department which may
			reject it or may cut its tail
		*/
		new_size = (	May_Be_Start_Of_Run(Token_Array[i0])
			   ?	Best_Run_Size(&Token_Array[i0], new_size)
			   :	0
			   );

		if (	/* we still have something acceptable */
			new_size >= Min_Run_Size
		&&	/* it is better still than what we had */
			new_size > size_best
		) {
			/* record it */
			*tbp = txt1;
			*ibp = i1;
			size_best = new_size;
		}
	}

	return size_best;
}
similarity-tester-2.70.orig/sortlist.spc0000644000000000000000000000407010103670706015354 0ustar
/*
	Module:	Sort Linked Lists
	Author:	dick@cs.vu.nl (Dick Grune @ Vrije Universiteit, Amsterdam)
	Version:	Tue Sep 17 17:32:33 1991

Description:
	This is the specification part of a generic routine that sorts
	linked lists.
	The elements in the list are structs, each of which carries a pointer
	to the next element.
Instantiation, inline: For each struct list type T, specify: - a definition of SORT_STRUCT, the struct name of the linked structs - a definition of SORT_NAME, the name of the resulting sort routine - a definition of a routine int SORT_BEFORE( struct SORT_STRUCT *v, struct SORT_STRUCT *w ) which yields non-zero if v is to be sorted before w - a definition of a field selector SORT_NEXT which names the field that points to the next struct SORT_STRUCT in the list - #include "sortlist.bdy" Instantiation, separate: For each struct list type T, create a file sortT.h which contains at least: - a definition of SORT_STRUCT, the struct name of the linked structs - a definition of SORT_NAME, the name of the resulting sort routine - #include "sortlist.spc" This file sortT.h is to be included in all files that use the routine SORT_NAME. For each struct list type T, create a file sortT.c which contains at least: - #include "sortT.h" - a definition of a routine int SORT_BEFORE( struct SORT_STRUCT *v, struct SORT_STRUCT *w ) which yields non-zero if v is to be sorted before w - a definition of a field selector SORT_NEXT which names the field that points to the next struct SORT_STRUCT in the list - #include "sortlist.bdy" This file sortT.c compiles into the module object for SORT_STRUCT. Specification: The module supplies: - void SORT_NAME(struct SORT_STRUCT **listhook) where 'listhook' is a pointer to the location that holds the pointer to the list to be sorted. Upon return, the list will be sorted, and the pointer updated. The routine will be defined static when instantiated inline. Implementation: Linear insert sort:-(. */ extern void SORT_NAME(struct SORT_STRUCT **); #define _SORT_EXTERN_DEFINED similarity-tester-2.70.orig/algollike.h0000644000000000000000000000253711764320436015112 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: algollike.h,v 1.6 2012-06-08 06:52:14 Gebruiker Exp $ */ /* The class Algollike is a subclass of Language. It implements the routines void Init_Algol_Language() int May_Be_Start_Of_Algol_Run() and unsigned int Best_Algol_Run_Size() for ALGOL-like languages, languages in which it is meaningful and useful to isolate function bodies. These routines can be used in Init_Language(), May_Be_Start_Of_Run(), and Best_Run_Size(), required by language.h . It requires the user to define four token sets, represented as Token set[] and terminated by No_Token: Token Non_Finals[] tokens that may not end a chunk Token Non_Initials[] tokens that may not start a chunk Token Openers[] openers of parentheses that must balance in functions Token Closers[] the corresponding closers, in the same order These must be passed to Init_Algol_Language(), in the above order. */ extern void Init_Algol_Language( const Token Non_Finals[], const Token Non_Initials[], const Token Openers[], const Token Closers[] ); /* note the order of the arguments: Non_Finals ~ Openers, etc. */ extern int May_Be_Start_Of_Algol_Run(Token ch); extern unsigned int Best_Algol_Run_Size(const Token *str, unsigned int size); similarity-tester-2.70.orig/tokenarray.c0000644000000000000000000000244311764421216015313 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: tokenarray.c,v 1.13 2012-06-08 16:04:30 Gebruiker Exp $ */ #include "error.h" #include "Malloc.h" #include "token.h" #include "lang.h" #include "tokenarray.h" #define TK_INCR 10000 /* increment of token array size */ Token *Token_Array; /* to be filled by Malloc() */ static unsigned int tk_size; /* size of Token_Array[] */ static unsigned int tk_free; /* next free position in Token_Array[]*/ void Init_Token_Array(void) { if (Token_Array) Free(Token_Array); tk_size = TK_INCR; Token_Array = (Token *)Malloc(sizeof (Token) * tk_size); tk_free = 1; /* don't use position 0 */ } void Store_Token(Token tk) { if (tk_free == tk_size) { /* allocated array is full; try to increase its size */ unsigned int new_size = tk_size + TK_INCR; Token *new_array = (Token *)TryRealloc( (char *)Token_Array, sizeof (Token) * new_size ); if (!new_array) { /* we failed */ fatal("out of memory"); } if (new_size < tk_free) fatal("internal error: TK_INCR causes numeric overflow"); Token_Array = new_array, tk_size = new_size; } /* now we are sure there is room enough */ Token_Array[tk_free++] = tk; } unsigned int Text_Length(void) { return tk_free; } similarity-tester-2.70.orig/textlang.l0000644000000000000000000000257311764421216014777 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: textlang.l,v 1.13 2012-06-08 16:04:30 Gebruiker Exp $ */ /* Text front end for the similarity tester. 
*/ #include "sim.h" #include "token.h" #include "idf.h" #include "lex.h" #include "lang.h" #include "language.h" /* General language front end data */ Token lex_token; unsigned int lex_nl_cnt; unsigned int lex_tk_cnt; unsigned int lex_non_ascii_cnt; /* Language-dependent code */ void Init_Language(void) { token_name = "word"; if (!min_run_string) { Min_Run_Size = 8; } if (!threshold_string) { Threshold_Percentage = 20; } } /*ARGSUSED*/ int May_Be_Start_Of_Run(Token tk) { /* any token is acceptable */ return 1; } /*ARGSUSED*/ unsigned int Best_Run_Size(const Token *str, unsigned int size) { /* any run size is acceptable */ return size; } %} %option noyywrap WordElem ([-a-zA-Z\200-\377]) TightWord ({WordElem}+) NonWordElem ([^-a-zA-Z\200-\377]) LooseElem ({WordElem}(" ")) SpacedWord ({LooseElem}+{WordElem}) %% {TightWord} { return_tk(idf_hashed(yytext)); } {SpacedWord}/{NonWordElem} { /* the / operator works at the top level only */ return_tk(idf_hashed(yytext)); } \n { /* count newlines */ return_eol(); } . { /* ignore the rest */ } %% /* More language-dependent code */ void yystart(void) { BEGIN INITIAL; } similarity-tester-2.70.orig/error.c0000644000000000000000000000066211763354134014271 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: error.c,v 2.6 2012-06-05 09:58:52 Gebruiker Exp $ */ #include #include #include "sim.h" #include "error.h" void fatal(const char *msg) { #ifdef lint /* prevent non-use messages */ min_run_string = 0; threshold_string = 0; #endif fprintf(stderr, "%s: %s\n", progname, msg); exit(1); } similarity-tester-2.70.orig/sim.h0000644000000000000000000000137612055474360013737 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: sim.h,v 2.15 2012-11-28 20:49:52 Gebruiker Exp $ */ #include extern unsigned int Min_Run_Size; extern int Page_Width; extern FILE *Output_File; extern FILE *Debug_File; extern const char *token_name; /* for possible mod in *lang.l */ extern int Threshold_Percentage; /* threshold percentage */ extern const char *progname; /* for error reporting */ extern const char *min_run_string; extern const char *threshold_string; extern int is_new_old_separator(const char *s); /* All output goes through designated files, so we block printf, etc. */ #undef printf #define printf use_fprintf #undef putchar #define putchar use_fprintf similarity-tester-2.70.orig/newargs.h0000644000000000000000000000047511754656626014630 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: newargs.h,v 2.4 2012-05-16 07:56:06 Gebruiker Exp $ */ extern void get_new_std_input_args(int *argcp, char const **argvp[]); extern void get_new_recursive_args(int *argcp, const char **argvp[]); similarity-tester-2.70.orig/algollike.c0000644000000000000000000000661511763714036015110 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: algollike.c,v 2.9 2012-06-06 17:49:18 Gebruiker Exp $ */ #include "options.h" #include "error.h" #include "token.h" #include "algollike.h" /* Arrays for fast identification tests for tokens. Each token is identified by its position in the set + 1. For example, if tk is the n-th Opener, openers[Token2int(tk)] == n+1. 
*/ static char non_finals[N_REGULAR_TOKENS]; static char non_initials[N_REGULAR_TOKENS]; static char openers[N_REGULAR_TOKENS]; static char closers[N_REGULAR_TOKENS]; /* Init_Language */ static void cvt2bittable(const Token *tl, char bt[]) { /* assumes bt[] is cleared */ int i; int cnt = 1; for (i = 0; !Token_EQ(tl[i], No_Token); i++) { int index = Token2int(tl[i]); if (index < 0 || index >= N_REGULAR_TOKENS) fatal("internal error: bad Token list"); bt[index] = cnt++; } } void Init_Algol_Language( const Token Non_Finals[], const Token Non_Initials[], const Token Openers[], const Token Closers[] ) { /* convert the token sets to bitmaps for speed-up */ cvt2bittable(Non_Initials, non_initials); cvt2bittable(Non_Finals, non_finals); cvt2bittable(Openers, openers); cvt2bittable(Closers, closers); } /* May_Be_Start_Of_Run */ static int pos_in_set(const char set[], const Token tk) { if (!is_regular_token(tk)) return 0; return set[Token2int(tk)]; } int May_Be_Start_Of_Algol_Run(const Token tk) { return pos_in_set(non_initials, tk) == 0; } /* Best_Run_Size */ static unsigned int largest_routine(const Token *tk_array, unsigned int size) { /* Returns the size of the longest sequence starting at tk_array[0] and not containing unbalanced parentheses. Does not check the nesting of the parentheses, but then, sim is syntax-free anyway. */ unsigned int mrb_size = 0; /* most recent balancing size */ unsigned int pos; int i; int balance_count[N_REGULAR_TOKENS]; /* Overkill: only a fraction of the tokens are balancers; oh well. 
*/ int n_imbalances; /* clear administration */ n_imbalances = 0; for (i = 0; i < N_REGULAR_TOKENS; i++) { balance_count[i] = 0; } /* scan tk_array[] and see how far we get */ for (pos = 0; pos < size; pos++) { Token tk = tk_array[pos]; int pp; /* parenthesis position */ /* account for openers */ if ((pp = pos_in_set(openers, tk))) { if (balance_count[pp] == 0) { /* about to create an imbalance */ n_imbalances++; } balance_count[pp]++; } /* account for closers */ if ((pp = pos_in_set(closers, tk))) { if (balance_count[pp] == 0) { /* this is one Closer too many */ return mrb_size; } balance_count[pp]--; if (balance_count[pp] == 0) { /* we just cleared an imbalance */ n_imbalances--; } } if (n_imbalances == 0) { /* register the balance point */ mrb_size = pos + 1; } } return mrb_size; } unsigned int Best_Algol_Run_Size(const Token *tk_array, unsigned int size) { /* Checks the run starting at tk_array[0] with length size for acceptability in the language. Cuts from the end if necessary and returns the accepted length, which may be zero. */ if (is_set_option('f')) { /* reduce to a routine-like form first */ size = largest_routine(tk_array, size); } while ( /* there is trailing garbage */ size != 0 && pos_in_set(non_finals, tk_array[size-1]) ) { /* remove it */ size--; } return size; } similarity-tester-2.70.orig/pass1.c0000644000000000000000000000643612055474360014173 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: pass1.c,v 2.21 2012-11-28 20:49:52 Gebruiker Exp $
*/

/* NOTE(review): the next two directives carry no header name in this copy;
   the code below needs at least the declarations of FILE, fprintf and
   fflush -- confirm against the pristine source.
*/
#include
#include

#include	"debug.par"
#include	"sim.h"
#include	"text.h"
#include	"token.h"
#include	"tokenarray.h"
#include	"lang.h"
#include	"error.h"
#include	"options.h"
#include	"pass1.h"

#ifdef	DB_TEXT
static void db_print_text(const struct text *);
#endif
static void fprint_count(FILE *f, unsigned int cnt, const char *);

/*	Reads all argc files named in argv[] into Token_Array[] and fills in
	the Text[] descriptors (file name, start and limit token positions).
	An argument recognized by is_new_old_separator() is not read; its
	index becomes Number_Of_New_Texts.  The per-file and total reports
	are written to Output_File only when round == 1 and option 'T' is
	not set.
*/
void
Read_Input_Files(int argc, const char *argv[], int round) {
	int n;

	Init_Text(argc);
	Init_Token_Array();

	/* Assume all texts to be new */
	Number_Of_New_Texts = Number_Of_Texts;

	/* Read the files */
	for (n = 0; n < Number_Of_Texts; n++) {
		const char *fname = argv[n];
		struct text *txt = &Text[n];

		if (round == 1 && !is_set_option('T')) {
			fprintf(Output_File, "File %s: ", fname);
		}

		txt->tx_fname = fname;
		txt->tx_pos = 0;
		/* empty range for now; the limit is updated after reading */
		txt->tx_start = txt->tx_limit = Text_Length();
		if (is_new_old_separator(fname)) {
			if (round == 1 && !is_set_option('T')) {
				fprintf(Output_File, "separator\n");
			}
			Number_Of_New_Texts = n;
		}
		else {
			if (!Open_Text(First, txt)) {
				if (round == 1 && !is_set_option('T')) {
					fprintf(Output_File,
						">>>> cannot open <<<< ");
				}
				/*	the file has still been opened
					with a null file for uniformity
				*/
			}
			/* End_Of_Line tokens are not stored */
			while (Next_Text_Token_Obtained(First)) {
				if (!Token_EQ(lex_token, End_Of_Line)) {
					Store_Token(lex_token);
				}
			}
			Close_Text(First, txt);
			txt->tx_limit = Text_Length();

			/* report */
			if (round == 1 && !is_set_option('T')) {
				fprint_count(Output_File,
					txt->tx_limit - txt->tx_start,
					token_name
				);
				fprintf(Output_File, ", ");
				fprint_count(Output_File,
					lex_nl_cnt-1, "line");
				if (lex_non_ascii_cnt) {
					fprintf(Output_File, ", ");
					fprint_count(Output_File,
						lex_non_ascii_cnt,
						"non-ASCII character"
					);
				}
				fprintf(Output_File, "\n");
			}

#ifdef	DB_TEXT
			db_print_text(txt);
#endif	/* DB_TEXT */
		}
		fflush(Output_File);
	}

	/* report total */
	if (round == 1 && !is_set_option('T')) {
		fprintf(Output_File, "Total: ");
		/* minus 1: Text_Length() counts the unused position 0 */
		fprint_count(Output_File, Text_Length() - 1, token_name);
		fprintf(Output_File, "\n\n");
		fflush(Output_File);
	}
}

static void
fprint_count(FILE *f, unsigned int cnt, const char *unit) {
	/*	Prints a grammatically correct string "%u %s[s]"
		for units that form their plural by suffixing -s.
	*/
	fprintf(f, "%u %s%s", cnt, unit, (cnt == 1 ? "" : "s"));
}

#ifdef	DB_TEXT

/*	Debug aid, compiled only under DB_TEXT: writes the tokens of txt to
	Debug_File, ten per output line, each line prefixed with the
	position of its first token.
*/
static void
db_print_text(const struct text *txt) {
	/* prints a text (in compressed form) */
	int i;

	fprintf(Debug_File, "\n\n**** DB_PRINT_TEXT ****\n");

	fprintf(Debug_File, "File \"%s\", %u %ss, ",
		txt->tx_fname, txt->tx_limit - txt->tx_start, token_name
	);
	fprintf(Debug_File, "txt->tx_start = %u, txt->tx_limit = %u\n",
		txt->tx_start, txt->tx_limit
	);

	int BoL = 1;	/* at beginning of an output line */
	for (i = txt->tx_start; i < txt->tx_limit; i++) {
		if (BoL) {
			fprintf(Debug_File, "[%d]:", i);
			BoL = 0;
		}
		fprintf(Debug_File, " ");
		fprint_token(Debug_File, Token_Array[i]);
		if ((i - txt->tx_start + 1) % 10 == 0) {
			fprintf(Debug_File, "\n");
			BoL = 1;
		}
	}
	fprintf(Debug_File, "\n");
}

#endif	/* DB_TEXT */
similarity-tester-2.70.orig/language.c0000644000000000000000000000117511764421214014716 0ustar
/*	This file is part of the software similarity tester SIM.
	Written by Dick Grune, Vrije Universiteit, Amsterdam.
	$Id: language.c,v 2.2 2012-06-08 16:04:28 Gebruiker Exp $
*/

/*
	This is a dummy implementation of the abstract class 'language'.
	The actual implementation is provided by one of the *lang.l files.
*/

/* NOTE(review): header names are missing from the next two directives in
   this copy; abort() needs a declaration -- confirm against the pristine
   source.
*/
#include
#include

#include	"token.h"
#include	"language.h"

/* Placeholder: aborts when called. */
void
Init_Language(void) {
	abort();
}

/* Placeholder: the self-comparison only serves to use the parameter. */
int
May_Be_Start_Of_Run(Token ch) {
	if (ch == ch) abort();
	return 0;
}

/* Placeholder: the self-comparisons only serve to use the parameters. */
unsigned int
Best_Run_Size(const Token *str, unsigned int size) {
	if (str == str || size == size) abort();
	return 0;
}
similarity-tester-2.70.orig/ForEachFile.c0000644000000000000000000001071211750732757015253 0ustar
/*	This file is part of the auxiliaries library.
	Written by Dick Grune, Vrije Universiteit, Amsterdam.
$Id: ForEachFile.c,v 1.14 2012-05-04 10:56:47 Gebruiker Exp $ */ #include #include #include #include #include #include "ForEachFile.h" #define MAX_NL 256 /* maximum file name length */ struct ino_link { /* to detect loop in file system */ struct ino_link *next; long il_ino; long il_device; }; static void do_FEF( Fchar *fn, void (*proc)(const Fchar *, const char *, const struct stat *), int dev, struct ino_link *inop, Fchar separator, int max_depth ); static Fchar get_separator(const Fchar *fn) { #ifndef MSDOS (void)(fn); /* use fn */ return '/'; #else /* under MSDOS, conform to user's use, or use '\' */ Fchar sep = 0; while (*fn) { if (*fn == '/' || *fn == '\\') { if (sep == 0) { sep = *fn; } else if (sep != *fn) return 0; /* bad mixed use */ } fn++; } return (sep ? sep : '\\'); #endif } static void clean_name(Fchar *fn, Fchar sep) { Fchar *f1 = fn; Fchar *f2 = fn; /* remove multiple separators */ while (*f1) { if (*f1 == sep && *(f1+1) == sep) { f1++; } else { *f2++ = *f1++; } } *f2 = '\0'; /* remove a trailing separator */ if (f2-1 > fn && *(f2-1) == sep) { *(f2-1) = '\0'; } } static void do_ForEachFile( const Fchar *fn, void (*proc)(const Fchar *, const char *, const struct stat *), int max_depth ) { Fchar fname[MAX_NL]; Fchar separator; Fnamecpy(fname, (!fn || !*fn) ? 
str2Fname(".") : fn); separator = get_separator(fname); if (!separator) { (*proc)(fname, "both / and \\ used as separators", 0); return; } clean_name(fname, separator); do_FEF(fname, proc, -1, (struct ino_link *)0, separator, max_depth); } static int in_ino_list(const struct ino_link *inop, const struct stat *st); static void link_ino_list( struct ino_link *inop, struct ino_link *ninop, const struct stat *st ); void ForEachFile( const Fchar *fn, void (*proc)(const Fchar *, const char *, const struct stat *) ) { do_ForEachFile(fn, proc, -1); /* infinitely deep */ } void ForEachLocalFile( const Fchar *fn, void (*proc)(const Fchar *, const char *, const struct stat *) ) { do_ForEachFile(fn, proc, 1); /* one level deep */ } #ifdef S_IFLNK /* system with symbolic links */ #define LSTAT lstat #else /* S_IFLNK */ #define LSTAT Stat #endif /* S_IFLNK */ static void do_FEF( Fchar *fn, void (*proc)(const Fchar *, const char *, const struct stat *), int dev, struct ino_link *inop, Fchar separator, int max_depth ) { struct stat fs; Dir_t *dir; if (LSTAT(fn, &fs) < 0) { (*proc)(fn, strerror(errno), &fs); return; } /* report on file fn */ (*proc)(fn, (char*)0, &fs); if (max_depth == 0) return; if ((fs.st_mode & S_IFMT) != S_IFDIR) return; #ifdef S_IFLNK /* don't follow links */ if ((fs.st_mode & S_IFMT) == S_IFLNK) return; #endif /* treat directory */ if (dev < 0) { /* no device known yet */ dev = fs.st_dev; } if (fs.st_dev != dev) { return; } dir = Opendir(fn); if (dir == 0) { (*proc)(fn, "directory not readable", &fs); } else { /* scan new directory */ int fnl = Fnamelen(fn); Dirent_t *dent; struct ino_link ino; /* worry about loops in the file system */ if (in_ino_list(inop, &fs)) { (*proc)(fn, "loop in file system", &fs); Closedir(dir); return; } link_ino_list(inop, &ino, &fs); /* shape up the directory name */ if (fn[fnl-1] != separator) { /* append separator */ fn[fnl++] = separator; fn[fnl] = '\0'; } /* descend */ while ((dent = Readdir(dir)) != (Dirent_t *)0) { if ( 
Fnamecmp(dent->d_name, str2Fname(".")) == 0 || Fnamecmp(dent->d_name, str2Fname("..")) == 0 ) continue; if (Fnamecmp(dent->d_name, str2Fname("")) == 0) { (*proc)(fn, "directory contains empty file name", &fs ); continue; } /* append name */ Fnamecat(fn, dent->d_name); do_FEF(fn, proc, dev, &ino, separator, max_depth-1); /* remove name again*/ fn[fnl] = '\0'; } Closedir(dir); } } static int in_ino_list(const struct ino_link *inop, const struct stat *st) { while (inop) { #ifdef UNIX if ( inop->il_ino == st->st_ino && inop->il_device == st->st_dev ) return 1; #else #ifdef lint st = st; #endif #endif inop = inop->next; } return 0; } static void link_ino_list( struct ino_link *inop, struct ino_link *ninop, const struct stat *st ) { ninop->next = inop; ninop->il_ino = st->st_ino; ninop->il_device = st->st_dev; } similarity-tester-2.70.orig/runs.h0000644000000000000000000000231611754656626014145 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: runs.h,v 1.6 2012-05-16 07:56:06 Gebruiker Exp $ */ /* Although all other segments of data in this program are described by giving the position of the first in the segment and that of the first not in the segment (so the size is the difference of the two), a `chunk' is given by first and last. This is done because later on we are interested in the actual position of the last token of it, and the position of the first token not in the segment gives no indication about that. 
*/ struct chunk { /* a chunk of text in various representations */ struct text *ch_text; /* pointer to the file */ struct position ch_first; /* first in chunk */ struct position ch_last; /* last in chunk */ }; struct run { /* a 'run' of coincident tokens */ struct chunk rn_chunk0; /* chunk in left file */ struct chunk rn_chunk1; /* chunk in right file */ unsigned int rn_size; }; #define AISO_TYPE struct run * #define AISO_ITER #include "aiso.spc" extern void add_to_runs(struct run *r); #ifdef DB_RUN extern void db_run_info(const char *msg, const struct run *run, int lines_too); #endif /* DB_RUN */ similarity-tester-2.70.orig/error.h0000644000000000000000000000031606465624606014301 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: error.h,v 1.3 1998/02/03 14:28:23 dick Exp $ */ extern void fatal(const char *msg); similarity-tester-2.70.orig/text.c0000644000000000000000000001230012055474360014113 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: text.c,v 1.11 2012-11-28 20:49:52 Gebruiker Exp $
*/

/* NOTE(review): the header name is missing from the next directive in this
   copy -- confirm against the pristine source.
*/
#include

#include	"debug.par"
#include	"sim.h"
#include	"token.h"
#include	"stream.h"
#include	"lang.h"
#include	"Malloc.h"
#include	"options.h"
#include	"error.h"
#include	"text.h"

struct text *Text;			/* to be filled in by Malloc() */
int Number_Of_Texts;			/* number of text files */
int Number_Of_New_Texts;		/* number of new text files */

struct newline {
	unsigned char nl_tk_diff;	/* token position difference */
};

#define	NL_INCR		1000		/* increment of newline buffer size */

static struct newline *nl_buff;		/* to be filled by Malloc() */
static unsigned int nl_size;		/* size of nl_buff[] */
static unsigned int nl_free;		/* next free position in nl_buff[] */

static unsigned int nl_next, nl_limit;	/* nl_buff[] pointers during pass 2 */

static void store_newline(void);
static void init_nl_buff(void);

/*							TEXT INTERFACE */

static unsigned int last_tk_cnt;	/* token count at newline */
static unsigned int last_nl_cnt;	/* nl counter during pass 2 */

/*	Allocates Text[] for nfiles descriptors, releasing an earlier array
	if there is one, and (re)creates the newline buffer.
*/
void
Init_Text(int nfiles) {
	/* allocate the array of text descriptors */
	if (Text) {
		Free(Text);
		Text = 0;
	}
	Number_Of_Texts = nfiles;
	Text = (struct text *)
		Malloc((unsigned int)(Number_Of_Texts*sizeof (struct text)));

	init_nl_buff();
}

/*	Prepares txt for reading in the given pass.
	First pass:  remembers where the newline info of txt starts in
	nl_buff[] (if the buffer exists) and opens the stream.
	Second pass: if the buffer exists, sets up replay from it and returns
	1 without opening the stream; otherwise opens the stream.
	Returns the result of Open_Stream() when the stream is opened.
*/
int
Open_Text(enum Pass pass, struct text *txt) {
	switch (pass) {
	case First:
		last_tk_cnt = 0;
		if (nl_buff) {
			txt->tx_nl_start = nl_free;
		}
		break;

	case Second:
		last_tk_cnt = 0;
		if (nl_buff) {
			nl_next = txt->tx_nl_start;
			nl_limit = txt->tx_nl_limit;
			last_nl_cnt = 1;
			lex_nl_cnt = 1;
			lex_tk_cnt = 0;
			return 1;
		}
		break;
	}

	return Open_Stream(txt->tx_fname);
}

/*	Delivers the next token in lex_token; returns 0 when there is none.
	First pass:  every token of the stream is delivered, and each
	End_Of_Line is also recorded through store_newline().
	Second pass: only End_Of_Line tokens are delivered, taken from
	nl_buff[] when it exists and from the stream otherwise.
*/
int
Next_Text_Token_Obtained(enum Pass pass) {
	int ok = 0;	/* gcc does not understand enum Pass */

	switch (pass) {
	case First:
		ok = Next_Stream_Token_Obtained();
		if (Token_EQ(lex_token, End_Of_Line)) {
			store_newline();
			last_tk_cnt = lex_tk_cnt;
		}
		break;

	case Second:
		/* get newline info from the buffer or from the file itself */
		if (nl_buff) {
			if (nl_next == nl_limit) {
				ok = 0;
			}
			else {
				struct newline *nl = &nl_buff[nl_next++];

				/* rebuild the counters from the difference */
				lex_nl_cnt = ++last_nl_cnt;
				lex_tk_cnt = (last_tk_cnt += nl->nl_tk_diff);
				lex_token = End_Of_Line;
				ok = 1;
			}
		}
		else {
			while (	(ok = Next_Stream_Token_Obtained())
			&&	!Token_EQ(lex_token, End_Of_Line)
			) {
				/* skip */
			}
		}
		break;
	}

	return ok;
}

/*	Ends the reading of txt.  In the first pass the extent of the newline
	info of txt in nl_buff[] is completed.
	NOTE(review): Close_Stream() is also called in the second pass when
	Open_Text() returned early without calling Open_Stream() -- confirm
	that the stream module tolerates this.
*/
void
Close_Text(enum Pass pass, struct text *txt) {
	switch (pass) {
	case First:
		if (nl_buff) {
			if (last_tk_cnt != lex_tk_cnt) {
				/* there were tokens after the last newline */
				store_newline();
			}
			txt->tx_nl_limit = nl_free;
		}
		break;
	case Second:
		break;
	}
	Close_Stream();
}

/*							NEWLINE CACHING */

/*	To speed up pass2 which is interested in token positions at line ends,
	the newline buffer keeps this info from pass1. To reduce the size of
	the newline buffer, the info is kept as the differences of the values
	at consecutive line ends. This allows unsigned chars to be used rather
	than integers.

	The recording of token position differences at End_Of_Line is optional,
	and is switched off if
	-	there is not room enough for the newline buffer.
	-	a difference would not fit in the field in the struct.
	Switching off is done by freeing the buffer and setting nl_buff to 0.
	Anybody using nl_buff should therefore test for nl_buff being zero.
*/

static void abandon_nl_buff(void);

static void
init_nl_buff(void) {
	/* Allocate the newline buffer, if possible */
	nl_size = 0 + NL_INCR;
	nl_buff = (struct newline *)TryMalloc(sizeof (struct newline)*nl_size);
	nl_free = 0;
}

/*	Appends the token count difference since the previous line end to
	nl_buff[], growing the buffer by NL_INCR entries when it is full.
	Abandons the buffer when it cannot be grown or when the difference
	does not fit in an entry.  Does nothing if there is no buffer.
*/
static void
store_newline(void) {
	if (!nl_buff) return;

	if (nl_free == nl_size) {
		/* allocated array is full; try to increase its size */
		unsigned int new_size = nl_size + NL_INCR;
		struct newline *new_buff = (struct newline *)TryRealloc(
			(char *)nl_buff,
			sizeof (struct newline) * new_size
		);

		if (!new_buff) {
			/* we failed */
			abandon_nl_buff();
			return;
		}
		nl_buff = new_buff, nl_size = new_size;
	}

	/* now we are sure there is room enough */
	{
		struct newline *nl = &nl_buff[nl_free++];
		unsigned int tk_diff = lex_tk_cnt - last_tk_cnt;

		/* store, then read back to detect truncation */
		nl->nl_tk_diff = tk_diff;
		if (nl->nl_tk_diff != tk_diff) {
			/* tk_diff does not fit in nl_tk_diff */
			abandon_nl_buff();
		}
	}
}

/*	Releases the newline buffer and marks it as absent. */
static void
abandon_nl_buff(void) {
	if (nl_buff) {
		Free((char *)nl_buff);
		nl_buff = 0;
	}
}

#ifdef	DB_NL_BUFF

/*	Debug aid, compiled only under DB_NL_BUFF: writes the entries
	start .. limit-1 of nl_buff[] to Debug_File, after checking that
	both bounds lie within the part of the buffer in use.
*/
void
db_print_nl_buff(unsigned int start, unsigned int limit) {
	int i;

	fprintf(Debug_File, "\n**** DB_NL_BUFF ****\n");
	if (!nl_buff) {
		fprintf(Debug_File, ">>>> NO NL_BUFF\n\n");
		return;
	}

	if (start > nl_free) {
		fprintf(Debug_File, ">>>> start (%u) > nl_free (%u)\n\n",
			start, nl_free
		);
		return;
	}
	if (limit > nl_free) {
		fprintf(Debug_File, ">>>> limit (%u) > nl_free (%u)\n\n",
			limit, nl_free
		);
		return;
	}

	fprintf(Debug_File, "nl_buff: %u entries:\n", nl_free);
	for (i = start; i < limit; i++) {
		struct newline *nl = &nl_buff[i];

		fprintf(Debug_File, "nl_tk_diff = %d\n", nl->nl_tk_diff);
	}
	fprintf(Debug_File, "\n");
}

#endif	/* DB_NL_BUFF */
similarity-tester-2.70.orig/runs.c0000644000000000000000000000247711763354136014137 0ustar
/*	This file is part of the software similarity tester SIM.
	Written by Dick Grune, Vrije Universiteit, Amsterdam.
$Id: runs.c,v 1.6 2012-06-05 09:58:54 Gebruiker Exp $ */ #include "sim.h" #include "text.h" #include "runs.h" #include "debug.par" #define AISO_BEFORE(r0,r1) ((r0)->rn_size > (r1)->rn_size) #include "aiso.bdy" static int aiso_overflow; void add_to_runs(struct run *r) { if (InsertAiso(r)) return; if (!aiso_overflow) { fprintf(stderr, ">>>> Memory overflow: too many runs found\n"); aiso_overflow = 1; } } #ifdef DB_RUN void db_run_info(const char *msg, const struct run *run, int lines_too) { const struct chunk *cnk0 = &run->rn_chunk0; const struct chunk *cnk1 = &run->rn_chunk1; if (msg) { fprintf(Debug_File, "%s: ", msg); } fprintf(Debug_File, "File %s / file %s:\n", cnk0->ch_text->tx_fname, cnk1->ch_text->tx_fname ); fprintf(Debug_File, "from %s %u/%u to %u/%u:", token_name, cnk0->ch_first.ps_tk_cnt, cnk1->ch_first.ps_tk_cnt, cnk0->ch_last.ps_tk_cnt, cnk1->ch_last.ps_tk_cnt ); if (lines_too) { fprintf(Debug_File, " from lines %u/%u to %u/%u:", cnk0->ch_first.ps_nl_cnt, cnk1->ch_first.ps_nl_cnt, cnk0->ch_last.ps_nl_cnt, cnk1->ch_last.ps_nl_cnt ); } fprintf(Debug_File, " %u %s%s\n", run->rn_size, token_name, (run->rn_size == 1 ? "" : "s") ); } #endif /* DB_RUN */ similarity-tester-2.70.orig/sim.10000644000000000000000000001710112055474360013641 0ustar .\" This file is part of the software similarity tester SIM. .\" Written by Dick Grune, Vrije Universiteit, Amsterdam. .\" $Id: sim.1,v 2.22 2012-11-28 20:49:52 Gebruiker Exp $ .\" .TH SIM 1 2012/05/02 .SH NAME sim \- find similarities in C, Java, Pascal, Modula-2, Lisp, Miranda, or text files .SH SYNOPSIS .B sim_c [ .B \-[defFiMnpPRsST] .B \-r .I N .B \-t .I N .B \-w .I N .B \-o .I F ] file ... [ .B / [ file ... ] ] .br .B sim_c \&... .br .B sim_java \&... .br .B sim_pasc \&... .br .B sim_m2 \&... .br .B sim_lisp \&... .br .B sim_mira \&... .br .B sim_text \&... .br .SH DESCRIPTION .I Sim_c reads the C files .I file ... 
and looks for segments of text that are similar; two segments of program text are similar if they only differ in layout, comment, identifiers and the contents of numbers, strings and characters. If any runs of sufficient length are found, they are reported on standard output; the number of significant tokens in the run is given between square brackets. .PP .I Sim_java does the same for Java, .I sim_pasc for Pascal, .I sim_m2 for Modula-2, .I sim_mira for Miranda, and .I sim_lisp for Lisp. .I Sim_text works on arbitrary text; it is occasionally useful on shell scripts. .PP The program can be used for finding copied pieces of code in purportedly unrelated programs (with .B \-s or .BR \-S ), or for finding accidentally duplicated code in larger projects (with .BR \-f ). .PP If a .B / is present between the input files, the latter are divided into a group of "new" files (before the .BR / ) and a group of "old" files; if there is no .BR / , all files are "new". Old files are never compared to each other. .PP Since the similarity tester reads the files several times, it cannot read from standard input. .PP There are the following options: .TP .B \-d The output is in a diff(1)-like format instead of the default 2-column format. .TP .B \-e Each file is compared to each file in isolation; this will find all similarities between all texts involved, regardless of duplicates. .TP .B \-f Runs are restricted to segments with balancing parentheses, to isolate potential routine bodies (not in text). .TP .B \-F The names of routines in calls are required to match exactly (not in text). .TP .B \-i The names of the files to be compared are read from standard input, including a possible .BR / ; the file names must be one to a line. This option allows a very large number of file names to be specified; it differs from the @ facility provided by some compilers in that it handles file names only, and does not recognize option arguments. 
.TP .B \-M Memory usage information is displayed on standard error output. .TP .B \-n Similarities found are only summarized, not displayed. .TP .B "\-o F" The output is written to the file named .IR F . .TP .B \-p The output is given in similarity percentages; see below; implies \fB\-e\fP and \fB\-s\fP. .TP .B \-P As .B \-p but more extensive; implies \fB\-e\fP and \fB\-s\fP. .TP .B "\-r N" The minimum run length is set to .I N units; the default is 24 tokens, except in .IR sim_text , where it is 8 words. .TP .B \-R Directories in the input list are entered recursively, and all files they contain are involved in the comparison. .TP .B \-s The contents of a file are not compared to itself (\-s for "not self"). .TP .B \-S The contents of the new files are compared to the old files only \- not between themselves. .TP .B "\-t N" In combination with the .B \-p option, sets the threshold (in percents) below which similarities will not be reported; the default is 1, except in .IR sim_text , where it is 20. .TP .B \-T A more terse and uniform form of output is produced, which may be more suitable for postprocessing. .TP .B "\-w N" The page width used is set to .I N columns; the default is 80. .TP .B "\-\-" (A secret option, which prints the input as the similarity checker sees it, and then stops.) .PP The .B \-p option results in lines of the form .nf .ft C F consists for x % of G material .ft P .fi meaning that \fCx\fP % of \fCF\fP's text can also be found in \fCG\fP. Note that this relation is not symmetric; it is in fact quite possible for one file to consist for 100 % of text from another file, while the other file consists for only 1 % of text of the first file, if their lengths differ enough. Each file is reported only once in the position of the \&\fCF\fP in the above line. This simplifies the identification of a set of files .IR "A[1] ... A[n]" , where the concatenation of these files is also present. 
This restriction can be lifted by using the .B \-P option instead. A threshold can be set using the .B \-t option; this option is ignored under \fB\-P\fP. Note that the granularity of the recognized text is still governed by the .B \-r option or its default. .PP .I Sim_text accepts s p a c e d t e x t as normal text. .PP The program can handle UNICODE file names under Windows. This is relevant only under the .B \-R option, since there is no way to give UNICODE file names from the command line. .PP Care has been taken to keep all internal processes linear in the length of the input, with the exception of the matching process which is almost linear, using a hash table; various other tables are used for speed-up. If, however, there is not enough memory for the tables, they are discarded in order of unimportance, under which conditions the algorithms revert to their quadratic nature. .SH EXAMPLES The call .nf .ft C sim_c *.c .ft P .fi highlights duplicate code in the directory. (It is useful to remove generated files first.) A call .nf .ft C sim_c -f -F *.c .ft P .fi can pinpoint them further. .PP A call .nf .ft C sim_text -e -p -s new/* / old/* .ft P .fi compares each file in \fCnew/*\fP to each subsequent file in \fCnew/*\fP and \fCold/*\fP, and if any pair has more than 20% in common, that fact is reported. Usually a similarity of 30% or more is significant; lower than 20% is probably coincidence; and in between is doubtful. .PP A call .nf .ft C sim_text -e -n -s -r100 new/* / old/* .ft P .fi compares the same files, and reports large common segments. Both approaches are good for plagiarism detection. .SH LIMITATIONS Repetitive input is the bane of similarity checking. If we have a file containing 4 copies of similar text, .nf A1 A2 A3 A4 .fi where the numbers serve only to distinguish the similar copies, there are 7 similarities: A1=A2, A1=A3, A1=A4, A2=A3, A2=A4, A3=A4, and A1A2=A3A4, even discarding the overlapping A1A2A3=A2A3A4.
Of these, only 3 are meaningful: A1=A2, A2=A3, and A3=A4. And for a table with 20 lines similar to each other, not unusual in a program, there are 715 similarities, of which at most 19 are meaningful. Reporting all 715 of them is clearly unacceptable. .PP To remedy this, finding the similarities is performed as follows: For each position in the text, the largest segment is found, of which a non-overlapping copy occurs in the text following it. That segment and its copy are reported and scanning resumes at the position just after the segment. For the above example this results in the similarities A1A2=A3A4 and A3=A4, which is quite satisfactory, and for N similar segments roughly \fIlog N\fP messages are given. .PP A drawback of this heuristic is that the output is sensitive to the order of the input files. If we have two files .nf file1 = A1, file2 = A2A3 .fi then the order "file1 file2" gives "A1=A2, A2=A3" and "file2 file1" gives "A2=A3, A3=A1"; but both reports convey the same information. .SH BUGS Since it uses .I lex(1) on some systems, it may crash on any weird construction that overflows .IR lex 's internal buffers. .SH AUTHOR Dick Grune, Vrije Universiteit, Amsterdam; dick@dickgrune.com. similarity-tester-2.70.orig/lang.h0000644000000000000000000000154511764602256014071 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lang.h,v 1.7 2012-06-09 08:09:18 Gebruiker Exp $ */ /* The *lang.l files provide two interfaces: language.[ch] static data about the language lang.[ch] dynamic data about the input file's content This is lang.[ch]. */ /* The abstract module 'lang' provides access to the lowest-level token routines and data. The actual implementation derives from one of the *lang.l files. There is a dummy implementation lang.c. 
*/ extern FILE *yyin; extern int yylex(void); extern void yystart(void); extern Token lex_token; /* token produced, or End_Of_Line */ extern unsigned int lex_nl_cnt; /* line count */ extern unsigned int lex_tk_cnt; /* token position */ extern unsigned int lex_non_ascii_cnt; /* # of non-ASCII chars found */ similarity-tester-2.70.orig/add_run.h0000644000000000000000000000135611763354134014562 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: add_run.h,v 1.3 2012-06-05 09:58:52 Gebruiker Exp $ */ /* Interface between front-end and back-end: all information about runs passes through add_run(). Its parameters are the two chunks, each identified by their struct text and the position of the common segment in Token_Array[], and the number of tokens in the common segment. */ extern void add_run( struct text *txt0, /* text of first chunk */ unsigned int i0, /* chunk position in Token_Array[] */ struct text *txt1, /* text of second chunk */ unsigned int i1, /* chunk position in Token_Array[] */ unsigned int size /* number of tokens in the chunk */ ); similarity-tester-2.70.orig/sim.pdf0000644000000000000000000007420112055474451014257 0ustar %PDF-1.4 %äðíø 9 0 obj <> stream xÚÅZKsÛH¾ï¯P͉ª²²Ù|­/;›Iª2µN2±/[q EÉœ¡H…8™_¿@M6EŠÖŒSµåƒÙ/4€FÐZ}Y¹+þÜU(VaÛñ*=¬þ}·zñÚ]A+ðVw;ru·ýhݾ¹¹·ÜûõúÓݯ+é…¶ƒË'#/^‹•ëØ±ãê OÜxšúöç›W<ÍØ7®PšüÀãÞ0îF¶'x³{Çq¦\×vÂ~†+Ëízãy’Ë‹¤ÎÛÍSš<©Ö"O¡¾ý¿'kÏ_ä +™²'à>‡‹üÉÀÞžä/ŠMþTk‘¿(bþŽàgøö_¡\À— gn0FÍEÎ\ÇaÖbƾãþná˵=ÿ¾Ü‘_¥æ2_®ö¬„‰•áê k†¼§Y#û§æ2kB߀„­ÖÀå2k±íİæ®5—YóôEP±sÊš´)k£x [˜ñô—W·/?¼y÷æÝÛ©w4BêíT +ø$q+×·š¶ÎË}C Ú>R\ðÔIÚÂT‘¡Œ¬7;= GQNWZuW6ôUñpÓÓAš³¾i°ÈÊ}ûÀë•2¡sWuå–ù`Ã>摌¤n3æ ÄKÖ´wk†Ø Óc×^S(WJÂÞ^úPIXYÍðx§‘ù¾TšNͳq¯­þÀ5YÙÐ4 ×™°’GÐïó¯´‚ºY×-ÒTÖ˜é‘æK§ÄT³@Ë8ã5£mìéusLX~Æ=1²wÔZr2É¡þk2ëá\„·”t`Þ¸¬Yüh’©û‹ãfŠ1¯@“úkbÆþ ŒB…Å%qD0Z£:®}ÇšÇÒÖ±Ž9§X~}ÈÄYv=`WÊ]X‰ä2»8‡ÖhvÏà…`̨ǫDÙƒWé9¶,ó*EÌkz^ózí»Ö|°ö¤Á²ò®RÀ&ýe5’Sé)g3•AØò¼y Ü õ @¿(Z“Û·çepwZã° ó8EôL•#˜YÏÜGBDΙûèÆ¡íy—Âv(–¹Ä9´Fs9;ð°â>…k¨j ¡žèá?ÆÄÏy['õwé\ÓgÞÒ å® 
M18M“&¯Ê¤(¾Óp×d»®à)L´yÈŠ‚#fÀÎÀƒiµ×2Õ RqlÄTh ‹ÝøŽ£œ#ø;ê…-·ÔM@º¨t±‰×UÇ\ϯ}~«¨£Æ½mFÝ*àâÞ]mœq»²ÎФíéƒÍIl¹·sˆY“L,Ý¿8nær»þ¢TõL¢ ¸ñRê·Sêp {d¶æªPÅo§ B†r±?IÓr°À¤mw,ò”Ô„mCÅfõ àÐ>cº IPi`ýž¥-ÇÄsZĬúòRËnVÚ~/'Y߯‹Àÿº† "dÎÞ_ÉÑMîù²$ô1¨ûTŽuÖô¨Æ‹œsF#Àš¼DBŸŒ¦¯¨ÕÏCl•þ`èÜæ_áD¶š†:™ŠçPç¾®º#}VZ;fÀ‘hCøi*Ÿo m„eö8]ï†C†ðÓ\þàx#üÏ*º×÷L-Å‚Ê==ŒÐáùÃðí°ßm‰¡ªrwÔÖê€Ï9u„þ¯ÍkCoPÛmx˜Ë]iå8j®‰Ìð?­N»⻬¦j@ó½H W,{QÑá $艽ÇZ–þ¬--‡L!#ë]±=×ú(3‚É5IùÏÆHíÑs¨l)«Ï’%”“°CC…NCÎm^¦Ù HïËP=¼iµËâôw”1 l«Œ”¹M ^è£lÙ7&eYq)Rï®®Ld”§(§01uRÀÙÁÓ®* iƼ–Ý6háØBànþ9u®&Õ'}ëvÌ©*ˆ3Äñ!»b¹Or¢„º)ëU/0›"§ôI³_’V/õ“ކ$¬?°m¶Kº‚§ŠMZÝ¡ìã±§ònTýäb‰³e‰_õ¶&£h¨oÄŽ[Fñ`²ªmÿgçו´.o*p÷ph×¼òAS|Ìñ.cŸ~3ÂÞDwŽpˆý¬.u„{ô¢1’A 8åg •¦eÛ+Êwël vߘ'#Ð<_ó»eÍ jB +.ð1·­ó”JÐVʆÿ'%À'œ¡Æ>'ER¦„ƒÐ£¢ú\U> Bt&¼ƒÅV×_ðÎ#mˆ.m^f'z¥øÏ°¨Ö»·Ø%8Rž£¾×^Õóf|¿X‚“§GÈz îê]N}éòZ£>rÀÒ‚‹7x`aeß’´%¤8#±üaç—H†½Äð©$Cö(ð1¸óaTù6†ûŒ-Ò|°'íÉ¡ÏðäaHžüJ§EÇ ©Ð¿#ÙHÓ䟋ä3ÓrÄG‡!uå>zïß,»”¡¥ª\]ÓÒãSªJž® þ'Ô_€U`…/ŒA›9Ó¡ÐÂótàiŒuÅQñ;µTÀìÌÕÛÑ݈w}’1 íšÇ ]Ï,¥ºèöšæ¨ ÃºæÚP7Ÿ™®—b׿¨½KÒ¼È[ªÅB¯ô‹t£î/ÀhS˜J^ô[(kW¨ØF¼Œ}Ę c‰ñ›fG=*GÂbGLØ=/¹=Í@iµ/ó?3è[¼ït5ùøë›åëw“ªšAU×$û>5¤HÍ\ ÒmÞ‹„4«B–ŸË¢J’`6µÎW z<_œrYœÛ“ŸhxT¢ùG>-BuÝáÓÿä*78ôù¼« á]æÐÚ#ˆÖfÚCW'ÀË|Gãœ)ϹG-=ã‡v4¤S±ÖèEEèã3¤=©Æëg$týáÆ±_©S]ÙS §žŽX=ì‰Ø^ÓÂüب!3‹D‘°yé¯1²¹œÒŒW³i™(žºÿNÊ(lÏÛ&¾/þ…“z¿|R?ÏèÈ•F^ÿ”ŠŽ³5šþ‰âsÇ7ïPé{ $+4A|šgÏËbåï$ÑPüœ—ðbJ›/>°šüÉÛ n˜±uÈËü€ªì°’ 7A½~IÏÞöDL ô5YKíÌO\|£òN~j4þ]¬‡»2oñ**Zš—>EÄ­ô–Bêõ³ÝudßÒì¨'—3ìDÁð㣙‡‰•Æ»øû¸´#;îÛgN`ŽZãàœ…r~€oº¦C岨ÃZ˜†CM ?ño5ù`û¢ý‡e[ø²‚´­jŠŽ€‹þø_ùCµÓÇWþ⢦o±sÍhG Çtµ¾¡<`„ .V%®HšeÆ€_‹q\½€£Ç&^ä°[ŸéFC¦{‚Í&/º”@FX>_™Í…¡kú†ïñ¯úÂÊ8ö@¤ÇƒFúbv¸8Y±ÓÚLˆrýá1iþ·¬#qtgZ ßé_V^ÔøHq6-Z»v_D¨/ù»¾»ÚÀms\Õ/W¼pü½© ß\‰«/öoÿøEm3 endstream endobj 10 0 obj <>/ProcSet[/PDF/Text/ImageC/ImageB/ImageI]>> endobj 12 0 obj <> stream xÚµZYÜ6~ß_Ñ/Æj‚nY¢¨+Æ>L²v0Al'ö»@l,d‰Ó-DG[GƳ¿~뢎îö ,ž"‹Å:¾*öæýÆßxðçobµ‰£ÔM7y½ùávóø™¿Vlnï`\éÍmñ‡óúæùÇsuõööçb×ÃÏÏF? 
æwÚóqÞ.ˆhæÏód–žgÅnÚ¥dtA€J\e‡oæj“·Í•ŠÁpÑc¯rÚ;,µ3Øi¹çž7ž¯+Ós#ë¦Uê#4 ùªå^ùZ;mUpÏêë¶©ÎOªâÄ,™‹c.â+b™Ñ´/ýîj§Ç xŒû+¬Â‘,uoª¿®Tä˜Þ½ÀÜ@….\ÝÎÜäËØ‹›&óâÓ\¾A "b’õ®l²¡l©S9÷åpàaäÓQ¾ï¹Jóá2=Ç \ò\Èp{Ä}·¼]oèš#{ATéLàk‚æ§ÊÌÕ.Gùˆ@>@8‰zawÕ2¿ùD‡2Ǧ«/ë²ÊºrG´í|àŒ±d–,0¢ªX"N/“a*Ún0Å“Á,Ì]6V÷–²¢¿å¶ù›£, 'b…‹»R‘0h¥á§·›÷Woî7¸*ÅóA&jh‚XÚfµy½ù|±N¡/<‘¢Á|.ÜKäÆVz·ÂƒéDUÊ“£(ï’Àª4u½ðkööÓ’z}µÓaèÔ-ÒµÁt½T³¦àÊØ”wmWsÃÖ"2ØÓŽÃq¸ŽäcyìÀ€HcnŠ-Ï_ˆ‰µSgØxàÆtñkjú±²w•™¶¦Óx“0i“¶x¿ÄÉMß—Íþ¢²G_÷û/RôÉœ³½\ä}Y~CçØ[ËhoTp2•gRéÊÕV0_\TìÔ*vÞVcÝôOV÷#Z‘\%?N]­OXrÙî–¨7(/|޼³G™m I4ÞqD¦üɱ+þeAhÙÈ3éO¼†µƒÈ9“£ÿë¦ãzcÝJ9 $·b,ÄôC{ì]q°kI¸½d„Õ†~£ öÓ• fjÁÚÂõX‚.«²±&ñÌÿ’¶ñòÑâB\­&xvµ Uˆ½/{\;þ£ªP÷n?â®åO\Ô¨{™Uç {§§=j“5 V¨ ˆË†sš"^‹g>œ/vKËè#^h9[Dnj§=;_$tU £ï…´µ¤ªyÖpWVõ-×fw»v$Á€êä[û€BkŸúäÞîÕ.öCçE;˜G¤FZ— ÷ª¬ÓÇ…\öt0Ü?Ôµº2Âí’YºS¿¼\ú.õYV ¼ËòkïÁ4®Š›ËG¦2Hß?޵LbÆujPŠÌœ|⃼Så‘,q'_1«‘Š€ Õ28ÐÁtË-¶ÜK`‰aiÆ•hîÚ„OtÉB†ƒ ‚ B€‘&øÜñˆ[¬-©%lÙÃú“Òê]/£3}0RÎSËŽÇ+Ó쇃ì^”0=0B‚iÚq€‹bå<Í&¸3­*‹ÊÇ+èrr`>öç†kx½XÝX±WZ²Ñì»yιôê­ùG5'u'#Î{ó^« !º¿È¶ò´K(ê:#IŽ„Œó±BÖhÓŸÐ\LÐpž’–™›ä¥Ÿ!b¨Á¯%¢¿ÛÀZÜò‡ÿAç¸.Шu ]ÍÛ j¹©?£.­´E]qíA* y@°±8]‹­”9½ÌšÂ'Ž@'›¬Á¼ oabùc0„Ún—Ø\á:ÖH'âØMü‰Uy7„ˆ˜&Þ0ŠX°1ŽÒË1D¢g÷óY÷õë÷¥N}P6ý`²Oæ)й ±ÁD­µ¼Ð`ÔÄNt'—éV~êÆ_LøŒ X=X‘þ„&fHX¨<ÈÝŸ¹^î›–bZD€} 1Y+J£póßÂa¸æ¥7‰=ñ&:ˆÅ›pºgße$W£à¢„ps |žƒœåí¾)ÿËô'bq„$Ê~ ¸ «{«óê5üI~¤LZè¥pWˆÜ}ÏZNŒ\°9dî`ÓKˆJìæNDøq† •’³5 ÌYÅ,c#[ AYÇõÒîƒ¤Ó ^ŽÂ…å …m[ä<–^cNÀúåu°dþ@%¨H¦„Ò‰`Ñ[e•’D°g­[t° dË ë\ šq<=ÈÆ˜DxÂuÙ®lÇžW¶ÖÎëOÃÙNðÏGš¢k“D™)vãQ\ãÍÝ–g$I†W%Fv;y?»¨uLDjò`mê¶{Xíi¹ˆ«µÛ©W&NôeŸg]q†@ËÉÔŒoµçS=õ]3P¤­íŸ¦.ï…É¡ÔÓ0>«ö-8«C-jÒ±±0ÝÀ±ðÐNJZŠÍ?fE,·:6Œ±†_QÄKÏ“9Oÿ}ýü×_ž¾¾˜¹°‡ž3ߨ-gzïÊýOΑównþÙðøPîü/SéÅ81¥µÄÕbÊœæ3KðÜ^6D‰gD7›Ä‚ÞÕ:ÛÞŒ,¡;n¾7FÚôÓ*ý.!fCh—k!ðëØ²“LÂNÒ_Â&F‡è%ÁްIzhNØKÚcìPEÜIpíU_Ý2ú!zÐ{”R%¹üw=–IU@ÏgÏÂ/ÈÓ|ùB|8ù‰8¸˜jH!V›Þ1hÿó$cê&v ]ôÙ&ýø®7ïG31ð 6öýÐU_±3º°óE£ÛXò#¼R‘뇋Twl“rH×ÓÊ€öÇŒL “Ä.J3<‡ÛÒùá1Ø7å=â …´€žÐíÐdíïë~¹y~s{•†p?iäÜÞ¼|ñi·÷JŽnpÒ“‹³o3ðQ,$¶À"À¬‘á oNsò°Âœ“?éÑe¥)ÀþäžÞ]yê!›MîÉxÒd±H{Ù–e¥äšaš¶CG~6Œ’S’NøŠr¸¥>|Ç›­ÆµÏ¬¾VRRjIA­fÛ‘)à àü KPÕt=ô¦›òfÚ·fˆƒ%±Ž4–ˆGµæEí!¨Á§ÜNûZ2[‰W•ø”ú=/xíÿãZmOxÁ–1:Öµ­«E¿Zôst‡?¤é2à;‚âbÒ}áè), IWæàŒSê®î¨pqʬzÎK‚¡’s#èOÙߨ ¸_DöUÌ×÷<"\àº'ŠaŒù„²×µí”<3Ë$æRΔ•_ #-1ŽB0Î\Êã 
V组]=|a2û¦£€C-Ï À•‘% ÝÃîL³ mp/ŸY¹€*ËLŒýpEI†Ì—«·í~Ï|IT>ÁOeÄþÊcæ³ËYŠW³=eM%¸_ñl"fSXX7;²¼‚°Íú±±áD 3çnèö* y®¤k ü* (£·ŸÁµ[X0ƒ{J)í¯ðu͈±Îzk´+ñʨ[¸þ3„äÖ”/@ØâåEÒôóKÍ‚ ¶HS™¦8pÁ¸ïtèòÏz”ç«Ç^øØS4®ÂÀMm~PÙ¼Öoû°]û endstream endobj 13 0 obj <>/ProcSet[/PDF/Text/ImageC/ImageB/ImageI]>> endobj 15 0 obj <> stream xÚÅVKoã6¾÷W½TlE$õ°°P-Úlw¶ØÓKÓ#Ó¶v-ÉKRqòï;á,;nÓöTø ’3óÍ{ÆÁ×€ üXPð È˸ ê6x» ®nX·\Ë5Ðy,W¿‡·ï¾Ù}ý±ü¤¢ˆ¿ ÌÓ„!i.rGÜI½QÆFó4å¡Q›VuÏCÿÐú®û¡[Íð,Â~Mo‡mS#ë¯,”DíúnÞG¼#¾•ÞÉý¾é6$R÷{$=Ó­æÀR׃öZšŽ0ìVùƒz²£»ƒ=ÁGó¢ÈÃåVZç[Ìw^Ž£#²[R†°Ïm‘ZÑA«½3«×V­ˆó(kjÙu^¿N3´Ê£I{f:=ŠilÓ{·>cœåÚ*ýBà$öñ on¢„„ÈI@ˆ"”¤b 61¨'ÙîwžÉn1…xBswÖ_0ÒDö|¦i(°Vy–ŠUüºUêµa¾×U:£Ç“" GU_‡ÆŽ¸ÒÎ_xÓ˜µ¬m¯ŸÑ+$Ftyvn¦á/t÷vÑÛIV ‘u?l¶»g§áê&›Ú‚•e,¸o¨œ, !m"+˜˜OzˆçS³@FÜPR* T´iQ¯ê(7sγXð)N®‚,²E¸ÒÒU郬ñû_ jVúl ßV º1ÂçôŒ ®€êžúÁî{ʶ€@tXX{E*l?"Œrzåj Ø'õŠšîyŸ°t§ ¶RÆÃ÷žóàœ}‘¸­¶G:\Âv×Òl‰Nò0?;¿c‘~pãªÑ+ÏÞwÆê¡¦‰Ü4ýœüÉ?¨·¬Çãá/\P0‚M¾]ºÅø4¾3>½8½Ói#ÑNQº“;L‡B­BióÏiª"à¿[þôë§W“õC3.ħ‡Nͨ<~Ã=¨›Ï¾Fî:¿j4ŒxÕX?"«â¯W²}CR+÷½;,Â/„Œë¾%“Y!b0wžÁô¤z>/ProcSet[/PDF/Text/ImageC/ImageB/ImageI]>> endobj 3 0 obj <> endobj 11 0 obj <> endobj 14 0 obj <> endobj 17 0 obj <> endobj 2 0 obj <> endobj 1 0 obj <> endobj 18 0 obj <> stream xÚ]’ÁŽ‚0†ï}ŠÝl6Ñ„ ®Y6Ñ5¢€epI–Ò<øö;Ó¢0_f:?zy±)T;rï`zYÂÈ›VÕ†þf$ð \[Åü€×­§•}Ë®ÒÌËw•ÞWpï++÷߇÷|wôÅÇù´õ¼†Æeœîx0­‹MyFè Õô> stream xÚ­y XײÓí†Ê¤ÈÒ[\CD£ÄD£¨qßqEvdßF@¶f©™aß—a`XPÀ…¨¸k\5ƸD‰ÞÄh4šÓxÈ}ÿ3€š÷¿Þûò}ïññ5ݧOUýªêWU§E”•%‰úÏuq]<阙‹–;³Ü˜è…õÏÉÖÂ[”ð¶Hx§—À‰…7¬Þ¡(Qú r¥:X®ß´\‡“˰¶”•HÄȶªÇ›à8nÜø™¡a±›ýü£Fzrpš<Ùy¬Ãøqã&;¸ûDlöòqXäåïìEþ rp õÚìë0rŠTTØGï¿íèéá÷ɨ±Ñ›£ü–ûDúDlññv˜å°Ø#ØÇ¡KgÇ®ëÌÐà0Y”O„âPoŸˆ¢kÿ¾âáÔˆ^c{;[M¥¦ÓóD ­–[¯§6Pö”¨L…ˆúRŠ¡zS}¨þ” 5€H ¢l))õÅR¯Sv”=õ&õõ6ÅQƒ©!ÔPj5œz—A¤FSc¨÷(Gê}jåDM > &R“(gêCj õ 5ƒšIÍ¢>¥fSs¨¹Ôµ€ZH-¢SK¨¥Ô2j9åJ­ VR«¨ÕÔj-µŽZOÍïE‰(+ê„h³è¿z=¶`huÒz õPkëDëgÉ^z½“ñaN÷^×ûQŸ}Òúœê[ÖïWÞß¡¿ªÿŸ6ú X9 e`ï%ƒøAƒ®ÛæJ]¥G_›üÚvv=Ëæ¿þáëGÞýÆ»!vjû8ûŽ7¿¹ó­9oÉßžòöáw†¿ÓÌçÊùU| ßé8ø­Ág‡l²[PÙ×Á,X›Q†Qôôò=wï¶XX! 
cmÆ}#¹óYÛ^NEP”š¤ŒM„&ª8ºº¶ÈPÙè¿Ã•[ î>iÊ&׌0`Æbf5¦x,™t _¡7Êa.D×kéL¦ýê¦~ÃÌô\9ìåÑäûòù.m›®¦¹ šÉó-´‹:¯‘Çuè)‹¬éû7ZÏrû¡&F Œ†ÞªŽN„PPh#ª“U‰ I ¦i¡½Ë‚ Ýˆ¾Óh´%V,¼;ÿœôºIÌØ—•èÉ㊯‹ ¦±mrœ†ü€½ô¦Z£†`ˆéÖ’·š?ç¶Cã6C”§Z qL@UbYEEñö+Z]qß5XÄý c_¨ˆ”y—‚~ç¤÷)Qï?½(ü“öSXK¿U‚F«Ôz‚˜!Xä:ôï L#?3‹ˆ÷¾Ù÷5—ÕP“œ¬ŒÛá›_¢`l:¾~ÜÝú½º±;(~ÿ›ày«Ö)S`½Ò^úõ÷Áã&Àr¯Øø¨ÃZ`Þý ”+nüþ«çõýðö®\%4¤U‡z«“#!dùq¥i£]ÁˆôÆÛFÑÑÏÑrúÈrw¸t¸±™9:=d2yÉ9rn¤¥*R×Ι6>…E¦í1%[Š™6Ë÷Æý¿úèò¥cw…ж¥ý¬ï´Føš3Óµ»Î•:%'B¨íå¹É¹\.èÒ³ó“àÈFÏ+@¡Ï”:è2S«g®æ…%#^ù±hIÇ¥·gýb9´]àË· öÁOmo¡~2£T@žÂ-ödäÞMþ±aáá%aœ s33µ:­–$‘”<¦Î\¸() Ô bR3”y7¿A}8ÁëËŸ¹DŽ‹Ñw8•=è¶–àAÈ6-ô ;Ñ 4 *Èè™,¥^APrš³K×à^IX Î05÷?‚­Oâ·æç“™®Ïâ_îâÈæ­HÏÂU9ê¿Y/A&žÝŒB©Tð uYì¯#á €}Oóųð@l›Ø­¬"]™IpüñËHzõÊGbøŽ!(à•](”zw¡à®eÏD¶x½¡ò3s³u:;­V“ÀÂÐÅîn Iyeù×®!šÀ¹ä¸(•ŠQ-ZÂÇ¥Ëh²7<Ýz*F+±µ‘•!!‘‘!!•‘µµ••µ1UUG…7ÄÂ|>кÃ'”ÛTÛð?þ9Ã.-Ñb#ÏIÎã !=WŸË—eÆ;£h×M”}S,lB—X¸žvÝÿë ¿L.ö€%0#ÜÓ1pvÚTø¦§;µLÛýÑ¥èýp¾)ßósÝùŒ«pÁ~øë KM±¿ÈOÂ]8gà\öá2Ô÷rŽ êá\lÙÈ\W˜ a:Ì‘/ˆ»}ŒÆ!îäaxh$N×± ¶vy¼ëЗ4ä·TÅVùs)j*În:ÌO»?šÌÃÇk§¬X„Wà;¹œ7Ù¶"«IÐÇw§=–b íFOØÕÇ/xnL8:æÝiK¦cjÍcí7ˆçÌûÏ4ÓrDæÄC’zY¸{ðúͪMŠ&R )šT lc¤ry”p{$7šæŒÂoÏqç7Áôÿ if#ìdà !‘ îãîÍC"dóÓO8"n,¼:´¼_ô¦Yð ·]!£Éì´ŸZîÂm ’´èÛ¡µõ´¼¬x·šGë ì™q\2‰•”$|²s³]L„ÑT¤z[wŽzýŠÑ7mrqä V·±Õ-/©)´•‘>^2ÑF2ƒÍ}‚¬¾ùò½>­1åq¹ š™,0µ&¦-Û»L_=yâÌÏorp¼îÌ÷mÝú4Þý®BFúqþ]äý#q“]diœ8tÃ2n+¬Ô…í`öd×–˜#‹dqÁÉÞs{~OÈÞ½yé3dÄô—ã¹°.vÓFÏQ0Áƒ~Eˆ_úÇ)hi­8ÎàŸ°q°‚NÇ6Á°°C[ySõ °ê7—lÌL_˜7žÀu± í!4Áæ_ _µúW–ëâW4Ýò¦í­§1Féo¯~Yz/¦SÌ’mê—Ô”—Õå«ëˆ©U½ògÿkµYÚ<]v©È¥È¾96‹Žv¼'®³Ee`0Ä@ßù‹$!¢£Ë ˆ~Æ«ØÂî'ñ|ç?$ñÝO y²ƒÜˆzÑÃiË_jˆ: Ï#Ê$¾'1P¡ÑFq[%q¤“Rdm8½ºÐ˜ac±-¶ýÙñ1>«ÝÞš»º=f'0?>þ%âiÈOÄ ðh<¯ÇK‡?An|;œÉ:ST•‰ú5@1“› $[ɶ–¼šäåùi~æãבšƒ\ÑP4}ˆÞAÓp/¬x?Õiž?,âgïc!” [Ü®QD3sÉ-$Fâ·.ñuðY\£kA ÉÿX‚K2áµfažAT×á"†t,dÓ ÔYq*EZöêüÙ.vWüJ !ì ín]£¹]]gé†ÖF˜ªxÐksµº}è ;’Cø=ÒD*@iì a¤{jÔ_€»÷iZÉúXz $dÉëlrI߯Zc™Ä¦c Qàœí2Û"þ&:~ÆCÂïB/ièz]V‡Jî7LZµ`3ó?%³÷ËΟ¯™ÛïßÀ¯s½‚*%Wtr?¾³’–>!ä÷ $¼-Aö?Þù·'_Æâ|f¾ÆâA7uV=ªÐRúA«ãÜY«'#~õ•™;Þ3Û},6ârï(ú™Å^hvB‹‘ó¯ˆGL.d%rr…f[ ê0‹€YŒ×ì@y§íhõŽSÁqü®.T…Àƒ¾œGÛèÈ’sJP²]ëbçÛ¸æÀ`¤÷ðâø x‘Ÿæ‘Y„¸[‚ô¡¸¡cTI?7é¹A½$h âÐH´-Ço!'<ǯýéÀvY$ììFê¡äs8/;8k÷Òœ‰0°8f‰ßºè°åËÞ%õ¬c ú¤Õï¡·ï u&4Ƥ7‰ÑÛWÙçB¾–´¢… ·€r vîºÖÕ…ÓST¤Û¹2ãqd%l1‹ZuKŒŽ ‘ìûk]Wq 
˜~Ç™öüíU¡†PÙæw—‹KîðàÒ…¼ËÚl‹;a)ÝR§ îj›Çð!Âé_!ÑŸ¸Íš¸ÍË p]Ê¢ï->Cyl„D><³€™ÜÂvzþÌÕ›%~ªqàé=µËˆßR‰Í³‘=æ;Ô•ߊ‘¢c›lTAmê‹`PÑaà6ŒpGƒv¿®¾~¿Úb6= Úªy¡QK#ÅŸ½23’sÁ>ôÙéyHÓÑßNÛyQò<†šô_®]_€eb‹¤?±ô=B%ݨ˻ÂUšñTI8€&5‡vž´ëb¿nÔ“kÄ(¾c<›œ¯ÑÆ•> ª•AS¤©Pi5Ãt–ýÕJèŒ4­&]!Œí¼o—“¤ÕdSYe¼°Ÿ®†¶ÇhÍ:u@À: ú1¸…ñÓhRVdUÎU"¯îÅ>è.‹6á1„~æã9ø=üö$¡äˆÇ¡Ùh>ƒ& þÿÀŽÁoþ€ Q>z÷Ë›·ÑÐY8—âwœßç»3}g:m!•;¨í±E£ó,š*AýõíÃ_Gþ€ßáñÓ¿0çËüvî¼Ük=à©P ‹*º æŽM€ÁëÜSx%3Q‡Œ¨°7w iõ´Ç¿T‘¶"Êr¶’OCäõ¹ºZ0ó#²Râá³#?Â#ù‡‰ìýÊ«§à*ó –|‡Gpø‹W¨Ó¨¿H|y±‹zÔA™ˆ¦¥C+4lÞéA0æƒW‹O‹ßÅv`®–ܨɄ¼DžÐu²&U1Ë?6ÖAÊÁ„{‰×íbϬ«_QFZ§vBúãjlV…ŸA_žYU%½„ v,~ͼ¨Èû0·¿pøÔU} FüaæF­„? £ŒGº?4šÔ$ÿåvÁµž-ãIÀˆ' æáÃC3®†å)&] íݽêÔí©5¾åá¹ÀÌZ;Ó™ i«ö/;¬:£iÔä¦*Ì$Bj,Ýè­šG:c­.+´z}ý!óæ3~wHÚûú¯¨/½çÝp6*÷%ƒÕäräÒ$j4¡ì;(Û$îpì˜Ìv"I –‘·M,ªGc ª W]®9D1ØÍ0„ó² `ß¾Ç]Ã= wX’„8©°4=OW e@^. (Ä»ˆ5™ õF7t˜-\Çâ!£?ë†û˜ÐrNô<¿•¹€ÏôÿÆýi<1˜¼Î!~¼W‚$zä@3ƒ.Ñõ°×’~šõêÍþëu~/’Ã"µ¼c°YTÖ±^Ü!n°…MÕµ_j3¤öÒ„/Ó…[·kÊà ‡¤¨j^йÅ» :E!a9}í10Y6WÍŸõ›º7?KŒ+ôAñÏì²b2TPºŒŒBÆæY~·¼Òg‘âgƒ‰¼Òc»sw‘¬Ò‡,è–÷…Æ1 P¥y£;=íðB¢&] öÛCU—¼9„M:OËúcPù~ÆOí0ßé‚' r2Ídþ•k/CzZ¾/rëì°Ó'ë·1#+½M2ìÐG™Ý7í»ïvI蛟m¡>Íb”õlóègcï7ãwþOÿåi_òtBG2‹ßñºÿϱ£½Ð;2<È[Í(ÐXõ£ÓÝ}FÛÖ+ëï"çó—LÒàCèÖ7lºWµG0å§ ¿åF¸ÒË"òåÞ ¹Ñ$÷vù5yïZ[¼žT‚Ùnó‚ [«jJ ¤ÌO«ß ­nØŸG†ÂýǼÇó¾´t÷¡5ªOU ƒ§oŽX îÌG÷#NsDÊ9Ø×\ÖÆ¤¢ÙôØ™›7®ól8ÀÁ‘Ö›hr¦%EåFy¹0Ôˆ|[mŸžGÅ7¥ñhºÐŸEh¸þà»l£Y»{[Œf 3YÄÐ?|öÙžWhò­_õé¤Uƒ9Lÿ·ÝWô£ËHb¦-çhžhùñï*TØÖ´G_F3.7´»—þ!Gh:û<¬à¡Z[WdÊί¨iæ6Œˆâ!L‘’êE@f¼YH‡ŠîÐU0÷Ο¿ÂCS\SDEãŽÆŒRÂ; ×È•irHb¶$ç–eU”$ÕysÑà®ððô¨÷ÐË€qœ=ûÜ>¦-‰ñ±àÏH;(*ð1G/Ù¸ ¼˜™V1½Ïï¯rÐppMͪše°"`¤jÓ·˜xw>Ͼ;ûà!öílÚ£gNHn!+˜¸rÖÚQ|÷®p¡^ç‘Ïy1êõ ÛTƒö ®ºª]½ãåÉöBú÷‹GÏœÎ[¹œÃ ô«Ö­&â5ÛäJÃ-05&”–3¸/ÙêàÇ LË9qk+êÿÔöòÓOÏIï¡O…Å,v°¬8›£ ¡k|.£N{XWÙÐÐ-cž2c«Ò…l]Dyh,,<Ìn®1•×q$V°‡±ø!²k_¯»íOæŸ#톯eç±–¯äXHÁÞ˜}°UWn)c!ªDõ[¤K°P™&W‰”’Šc;!ê_ÐìGŠÏ‰Ñ§óÙ¿r`É¿ÿ\Ñó¦Uf´íŽò+Ýz¡û,.§ÿ7±ð3 ·]ò6 pÛºˆ‘~û„t «¤û»îþ¤õRwéÞÿ Å_­Ðs%®¼Pbo*¹IgÜ~S,¢½6â)îÆ"ByìïcH=èÿÛ¯HÊápìÆn÷ò°}{à4Â.8\¹§z_[y3ìݲªMU›`ø€¬”¹Ë6º‡¬µ6·ãUf4Ö(Ø·F˜zªÝ}Ríú±çz¢àb–—~ÛéÿܤÚv]UõÁî!¤Ûô)t׺ò›—%pôlÅKïO÷h)E,Фkî¯=Ì¥S°5îÿ©³ ï>†°Rð´@úK¡²G›Ñ¦hCO%úVxŠÅsþ½tᘌög-½ŸXãyúc S_ǃ¢øhÙm¬eô(ˆÅƒiŒ •E&&†¹3cñR)ú^¸ùõõÖVÀßzîõ§çžÞ’^A£ïYÔ‹Æü‹ˆ—î}¯b|GúAëo,ÇÌ 
Ó¨4ÑI΄–ÇKMùuuAµ›8é•à77a1–“…Ie¶õ¦ˆÏP¬)ê3éäV±Få8˜É¬ñ÷šÊ9ƒß±˜ïUšômÏÛu»îv=´Ú²2SZ¶Çc¯¦œ@Oïûâ+oOʨóÎöÈ&Aåš7·Ž3 U-·8Ô 2œtAÚä¢^÷Pñ[÷L‘Fjob’J¹-Y©ò©ñ„x£Ï\Wü‹C«eŒôN]xâ«¢µ)Ï# (ÔŒ>4ˆ~?Þ4‰‘‹å“úP3ŠFH4“ƒF)Ù÷¸–Æî%5^¬¹x†;±‚v öñ ?•pÏ;ëÖçù#¼uu¢Ñ°—ýÆ^„h<âe¿1›þ`ýœÎʶv}ßóä*¨Æóx4=Æ¡±œ4æ(ì©©µœ˜W’îMÒ bát‹Í©(Ù}ÎÂ}QêMTäÝV ÷5iò¢HK¬V*åÃq¶¶BuŠ‚®¾j”’.9X㪎 óÒ[–Ÿ†tEA)}È.3A—Vž“žk™‹ºÅ•ZN2ÿ ÝaK˜ ,âdDœ,d£.Æò~£Æ ! uZâ8œc7Õ)ó5¤nÛWï‘¢Y£Ž é–ÖÅêÜÄ E^pVÌDœf7Ò,ßÒŸ¯îÒM¹JÛ5]€|U~ ²ÅØe$èS,ºé23 CvpSºÜr˾ô]úöTQ¾ £f”Änß¹se•¹±¥¨Í¢u¸:˜Ì9«t‰–ËÕÙ‰qÛÓ‹¦ÚMý-9—h‘iEE9†®Õšyê¸-nÚ0Ëê  Õ´-y€Y»÷pßõË–ª·€}]¥3k«j[Õ•–¬•éS Hge¬­?õÕ7#ì²åq“ääø0ËQïvíQ±z—f»eéJˆÕEÕvƒ‡±ô;AF\úX˜@ ®:}P[ Ï=릓YÄ›5Å[ ËnÓ¨Ó’ÞÅ;,FFe®: VÕe\·¶[Ÿ{ô3ËLæFwŠìÒ‰²¡³€¥|Ç”šÓ³PÌTÅGE˃7¶úàNC˱Êc–.LÂkFQ« }n o ÃÙœ<‹s˜‚ˆåð%‰O$óp棲&’v ’Hp]gÿdË·¿4û„,¢ º))AK¬#$ɸÏwÌXi‰Ì(Ì,A!y™F ö̧Í}Î÷5æôëwÞЯ?×ÇÊÙЯ7ºõÚÿûGb endstream endobj 21 0 obj [575 0 0 575 0 0 0 0 0 0 0 0 0 0 0 0 0 0 869 818 831 882 756 724 904 900 436 0 0 692 1092 900 864 786 0 862 639 800 885 0 0 869 869 0 319 0 319 0 0 0 559 0 511 639 527 351 0 0 319 351 0 319 958 639 575 639 0 474 454 447 0 607 831 607] endobj 22 0 obj <> stream xÚ•W TTåö?ÃÀ9GÃQOBÙ ¾@P2Í7Š7ó &‚2¡ È[D™Ù3# OEyŠ Oá¾ó‘Õ-+³UY™ÝìáÍ«Ù>ãGÝÿ7=î½ë¿V‹ÅwÖ|g{ÿöo?¾}dŒ­-#“ÉþKW¯Ù4Ùgåâ Ó=­;Ïúó[£%ÛNzš‘ÆÈ¤gl$•\eû ÃÈFZW›áten°®îtQþݱ•Éøä ðôœ9ÕÓs†Oܶ´„èȨDçIanÎÓgÏž5Åy†§çlçE[#¢Ã´±Î+µ‰Q[µ‰ôÇgÿ¸°èˆÄ4çIs£·Í™6-%%eªvëö©q ‘óݦ8§D'F9ûElHHŽwö‹Mt^¥Ýá<ˆzêàÃ'n붤Ĉç•qá ±§b3ƒYį°õ“omd‚-&–m±M°I¶a˜Œ’ÇŒg&0Æ•™Ä¸1îŒ3•™Æx2Ó™™ŒãÍ<ËÌfæ0s™ùÌbf ³”ñe–1Ë™™•Ì*f5³†YËø3똗˜fÈL¶adŒ-³ˆ9)›$«³q±É´ùH>Sþ©í*[í%»@»ïÙ@¶“Óð£øpþÓ!¹C¤¡Oõúú“ž8jï`gj˜Û° aZEïðyÃX=b߈#ÁA%éÒ{Ä\ýÈ¿JvUò“K&Ë¡  ÁÈWí(Ý©J†¼¹;û‡ÿ2Õ1oä‚O9U¡ª†‚òÂý<= 8/àØVQv' /Ò¨V`‚ au)Ù/îÜ“•»vO\Y±ítW÷qd?»*·Cµ;Zæ ýÿz,HC~Nö¢ NÒ˜“§±¶û®ø7S9ÔT Ö ³ÕTíüýd5¥x4&KŒ¬ÆpQ*:XcâŽã‚z•–±g„Æm—â‹€¯€òýj þXä¢t^¾v*¼d?Aƒ\8¥©žŸ“ÞàÝt%ô\æ5ø mÕë¯ö¶÷Ýxõ;øé¿ƒ€1ÜŸƒ¾øuŸ’JL#®$Œlĉd–ª•Çá¾©ï ¯xäH9h±Läà'·ì¡‘CFd#tnêVd )#`Ýçþ¼?c®Õ|ººîB«•%ÛðÒößûœBúÛoy2NŽVK¬Èn¦ƒƒÝ`³U#K5²®ƒžìé¹ Ö“,-èe¿ŸÄK4P£Ò Ç7Ì 4µ'ÐTzZd7éˆ ‚6¹üô.§xôá¯Ç'‘~Æ‘r© ²GªÏªË‚$p" VéK,$ qºí:=í|‚5þ ­óôŒ. Òi¦X³Á5¼$9¬ë¿[œR–W NU°oÿÞbl’ˆ£qÀ,Ûb¾ß@Ë€ó#’x. 
² +aûA‡ÎHùgäxÖâ-d•ŒÉ4biL'åõß,Ùe4§#P\«–®sPc456›à(ðuÖúA¸6`€è:î TêtF$ñýn”ãY¶c ,B¹¥¿p ™H{YI#j2‰¼LÂЙŒÇLGND­ªß–\ÖÀ⣛?|åJÎ5ø®š??ù÷ŽËo×]„«ðN\û ­/îË`iþ ­oÈâ¥I/YKûͽ}ØûkL›¨AìÄs.eQü{8m&}Næ¨û½G’ý|ÂQØ+§°ŸfÑø,ùd²Za¹ŸÕ’ bc 6´ã1˜îÎèñ>ålÁŸlÄ¡z›æ­ò‡çÈl5NN?vêMx‹¿Gl>#kTý#~³ÚB­vvÞý5ØÈ)ûp['¡1º)ŠÖëbx‚Þ>™t%¿øŸÊ¿:P³v«“Á°Û™§‰MÚ «!ûZ^§¾4çCÁÎw×5† 8оýK—ÃÕîïñïçw+ß—\¤™ØVì.;u½òƒêc¸]KÆìÝújàCA­_¬õ0²v‡lr\Û³¼x£µúL§†åǃ«“zƒNìAÙv^yçËÔJ]MòþŒºxæ7ÆûŽW-2¥*Ý ¡ øí›¨&˹TÈ>¨*“i_QOã[Awr©;8û ÚÝS+ß”ÏùÄóÅÝAS÷£×e"ˡ՘h™%üìÍ&“\;‘-F»’Opð7ú?"ö¬åpñ"]ÎZô>úÃñPò| a&Oyøa0$õ¹%IúIø+¹;Œ£™) "2â£ÇÑçU’×ÿhTS8âDû˜ ÃÐïsš\ÃUS‚µOX:åRª4VÀ¹_Za46›› ­–DV« ‚ÐðùIõPÃmМeÐçåêuÄ‘ø8ÒZ´»imbS $Ì ¢¨Ô]È%BfïÒëctN)údC ð±¶ÓÜ ]m7r‰–p8D•M…&3ÎÆ8G\L |NÃvÐI£»ýÖÀ¤áÆý†S²Á1Vvþ-Wìƒs¥é ±øj‘ 3ëãÂýáe+ÆZ8õ»õù@Q’Õä¨#Ù„}vŸ[1ºSŒÓc4õËìHеOdx×wƒu%ÃNtæ»üÈæ² G]–c1 —Ç?²yp™Ìþ%ãoúÖÏ’-Ù«üb3~ÎfÒùÜ–­"®>†-M8d>½U&KCo[ øê5¶ª ×fB¦z•Ê¡–§·×í%o7?¢ÈŸ¹©9õHcSu§ ŽÑÔ5-l2–½Y»@MoÈ%:R” d"?G¤½£R& bGM'ŸŽÓ…)Üäi™ šOªè¨‚3*úÌtÝ—Û. o—µÒ†±˜vì—& i;ôÉÍ?¶õá ×’e‹GxâôÍœŠÏC¾¨‚V²* hÄ3ãWlLÝ@‹L èÚsðIÔÍP©+×YkÙê³$oÁµç¬v ¨ÃÒ³]B‡/Ú‰ÜÊ=Ц–~äÈÒ; w¥ÂÕ$€Sö¶¾BµºŒ›IÇ]ǯ§à$zE_†×7Öñø†-1s™i›×ÑA•(‡½«†×àl}[C§XÒ =¦&–zx8·WÇ9-²&j<ç[¹´c…š8¢†ÆÜ÷û^nYЗêðòš@F’§~pGoœÑõ°ò`*ä©Ó­©²#y£_Ìr*2Ö€N§ÔÇmMmÆÎêãõííe €CÀDF£x> û¾ˆÛZd´Š˜Û8\ŽC¤5ÂBÍurßWÃ6»M=­W2Ü ï#'’a÷8´ù°¸NUP¤ÏÈÊI…T>´9µþXSuGgd÷<7Âh‰Š&U~ïÜmôtø=ÉPd”w0QZ/ÅVÖ.°vÅWÃ5{MÇNé­VVrÊ{%ÒÛÂÉÈÚШ¨ÐÐæ¨®Î–'©ºDÓ"Ðç#;©J†ªüñ¦´JÐp•Pûõ5ù É|ÿX.0Ãgµjä˜2öª(¨†ý|sü‘Ȉ„m‘~—¢?ýâÛÓURG=oÛÕúØTWss× réö9òÈÈ1Ѳ\øC ûû(ä&ú‘ÑÕÕ£?A!ûÒ‘ÄoÀÕÈdéŸvð_ÙKŸ’0àãQã%SkÛEýQëŒàPv«÷ʉš”ù*RûW_—q÷7tLPyB@`|pÊiæð¿˜¼òd8²ÐŠUÙ;€–&óŸ …RXÝÿ¬/zß8Q¼àÛýñ¥6WÕdXï¿WÞøÝV÷ -é=Z›ÊÁ´-ÂQr©æ->5í™EžóœHœÈˆ/]ñyœ{ëmÙdY!%-í[jè†S•'½ÚYÝ Ð¹£&ª:ÖÁFÞ C”´½;õ"Žm‘dçÂÚR puw~*À!£±¢¢½ãta ðoŸ_EÈØ¥ËÕ;A!g³ÊÁ‡øy4Î'w[ï@÷}E‡'ÏoÈõBX¿%1W~ßû{-G¨ûÿé;w·RËqüºsÔ:ó”ÆMø•€v™ó{°’úçªñá”7Þ­CÅ*ƒ!mÄò››RšªÛϼÔ Rv/…°ÀÄ´ìs[¤§›Z;µ×p](Á‘( /-/‘¿r©®CklÁ:c†1ïñôðñàð@ïO/* ~+¡ß^£ÕVµ­*½=ª5÷ÆN^y»%÷z&¬áNƒ$ÕXˆWW^Eì|<1È'†"0Aõ~:†î7›zbNCð8ÐE ])ÍQGiØÃb ç•SÀþ´Oý[ÄU²û4CEù}zí’#œó­¤kv_x_uF³[³=n-\­Uaýˆ'.è¯á ®A“&éøHÊêýüji„(«—NÉ%³d/Vì-º ¼È%èc !¦dë-x¨A—£¦ó‘>?›vt³#Qb½ÝY‘}E?ŽÕ!&ë q•CŸ~û²„rz9UB‰¹¤š¶òkƒúë,ÕÖËÖ^((ß»ï¢U¢~«aû6)ɪ¿¸×¥.× 
GŠ ‹Çí΋lŒ!@¿U«5í 2¹ê@J²ò­RsÉ:ÇqXjÚ¬ßlؼe“)z@=ú‡¥ïÝ]Nu°·¤¨ò!–9>$eÚ£ÜÝh¼²é’ÿ‰¶ß'Ë+ ‹n‚ýÔòMqVlP¦/Û“Ÿ úüüYŽnÈÙµ‹l¼a…>&&Äô ¹ÆÑ^¸‚Œ&®é©‘°œh šN ê^×W Œ&©…jZÆ¢}ߣÌmÈ“vá¶ÞxÙt¼©[œÊ,áY¥Òšb ©.)eIp'Eö‰ö"{{dÚS µUe?Äh¯”£þC^½ endstream endobj 23 0 obj [778] endobj 24 0 obj <> stream xÚcd`aa`ddä ÷ówòÐvö Ž44‰Øäý ùõáçlÖÒ ?dÈ2ýcþ!Î"÷Ћå?“,ÃGAùŽD¾‘j@‚Q^ˆ…‘‘£¬¦ÛÀÀXÏÀÀÈ9¿ ²(3=£DA#YSÁÐÒÒ\GÁÈÀÀRÁ17µ(391OÁ7±$#57±ÈÉQÎOÎL-©TаÉ())°Ò×///×KÌ-ÖË/J·ÓÔQ(Ï,ÉPJ-N-*KMQpËÏ+QðKÌMU€¸]B9çç”–¤)øæ§¤åÅtÃ2&FFÍ|?ç~¯ÚÌxïÇwæŸêß«DçÌíž?¿²»JþÏF¶ªÊîòò¹ÝsäùŠÿ´_Èö[z»‹ù|ÎI<\=<Ü›¹6sožÍÃļߊ÷¦f± endstream endobj 25 0 obj [562 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 409 409 0 0 0 0 307 0 0 511 511 0 0 0 0 0 0 0 0 0 0 0 0 0 0 743 0 0 0 0 653 0 0 0 0 0 0 0 743 0 0 0 0 562 0 0 0 0 0 0 0 307 0 307 0 0 0 511 0 460 0 460 0 460 0 307 307 0 256 818 562 511 511 0 422 409 332 0 460 0 464] endobj 26 0 obj <> stream xÚ­WkT“g¶þB0|*Å Íjè´_Ð:Š-Zl=Zjµµ«8*jAŠ wˆ!\Cnäž{ !Ü#áAÁkÁëTm;Õ#µÚÎeUÏs¦Óéúqæœ7 Óžµ:kÍZs+‹ì½ßg?ϳßÍ üý ƒ´yÛ¶ø=ñ¯n‰Mؾ&Ò÷ÎÛÙ´ñGïtõ<ï¯ï‹ ïK~^Šé}ÞŸú:Æÿý^"Ǝžgð"ü$?ë{.ÁÅéÁ„?ƒA–VAdä«##_ßRÄ«àçde ÂÂÓW†­‰ŠZözddTØæ‚L~NzZaXlš ;³ M€ÿÈÛS”ž“)¨ ;[ à½õÚkeee«Ó ŠWñ³6­Œ+Ëd‡íÎ,Îä—ff„m-*„íL+È ›«}õܯ-E¼A&?,¶(#“_ˆë $«‰5ŒÍÄVb'±‡8@|D¤éD&‘Eä0òýù~¥D9>D1ŸX@,"‚‰b)±‚xXK¼Ml"Þ#ÞÇßÜFÄ;ˆX!Žˆ'vá8 Ä^b?‘H¼äG0â÷Œ ŒÆI¿U~f*ü÷ÌcλÏzƒ¥ Xà$ýIÏü-óo-ر06 L}&ô™† %Aõ‹ž_tÏ« òNÒi®æ©Ï]ÞPzyªËÅà?@ß^>ÿ€‰º¼ËØ(™µ…Ô©ÕB…î;~ø Õ­îzOÓ€c d­”t:e‰hOùor€<N.X¡ÁÚy}ÌÑ[ fü9»*)ZÎjú¶å?ꮡ í„Q›°2@GŠkÊÛ]M_Ð €OÑ$lÎàÑK*VqbX•R(—Ô@÷*k+=-K’&I’BGÄû²@ Å nê6;Û …tUÔ ùü²ìäÓå}Ôá4ZÞ‰~My#hcë{Z(bœ‰Úh `PÔî½/·½8é @ ”êtêõ`$­j£’’ðÖZz)íGoϧ âk~Ó”èÞ6´Ë ¤Åd²p÷á8ÓìMdÃdõµŠqþ'©" •jµ’‹ ÔZe¢lë èãgh*ét !&…² øò´ù¡í”?AhþÊ@óƒ‰ø×Ø.akQ^nAÊöcžÞn*Èk·Èƒ- oÊ4›­7l`›Ò¤¤”¸’ªMË¢…•NªƒJRâUc–Ñ>zÁL!Gž'—mR>‡«LÜ¡aãÐé‘ÓK­uÐì+g‰¡R/Ö—•zp‚ÅP£7a$m%·­.äp¡.ŠêF.ôò§Lñ;öø×ŸŸô4¡Õßüé* 6‰6^ØH‡GÑ)1e–Ôá{§Pà@SuŸX©R))(z­Ebƒ„,<¼d§Dfî¼ñŠGA'GyoR@ÿzíúðø^ZqÚþèõÒhLY-˜) F»­³éx7ô@g™»²½ÂS28ÝzÏîc"×ÇDµPt(cÛ6(‚ ›Ð”e˨É)Tkµ:2hZ”äñnkaLN?ÃD h;õTu¶VªÓUâL}µrwÛ==ÝKÍ.Øå ×+õYð@ÙQÓø æ r–~k^Y±$U*ÁH:ÀHõúŽ 
\jq@S#¹46m{ê4‹£séŒB‚Z¡”’ù4“}”žšGg²‚¦ÃpI£4rŒ1½©Ÿ‰n¡gźQSÅãB¥Z¨j:5拯W˜/F½¹ù"âqô&½̤]J :ª¢â}!ÿM 7²®#‡­×ªï…P4Ÿõƒ++Š5åNÈ™ â½ÉÐtEsŒíJ(ræ«Rb+´L€¼š :P¨«èfz9ò¥ü …sd1rOGF‡—Ö80Yªñe¬÷ÀVÇLõsO3<Œ«Óæ´Àû%»áú%c—ž4:ô6µ¹š›Ÿ«ÉËËÉ\ªC¹´FWÃ=§µ•á2ªµ ­œö›ÙÁ¡ã¼Ê¾BéŬÙÜ6h¶^¹¯4­FFsf:8¢„Ãâ´¤\ª«¨¶h¬ÜîcFOOŸg}‡B/áî6hÛ¡Œ`15 ¶×ÁAy3ÌUz­B1IÌêÉÏtd'¸‰ÞdN/˜ÞÈÆØZÀDt6¥ÂŒª–¦ìád|ò¾¾ HzÍÀ¯.ìwí›àY4.e½zLÄ9WeÓ4VØEöÈ"#7®xýÝwÿ…‚£æ£ zƒÏ«A‰½R«UWí§39¸¯ÜW¥Ym¡¬ 7ØjíŽöžÚºþ½×«ÎaV#&zs¡KqòHß#1Ft–›ókT†œúÊ8IÞ¿û‡‡:?ØLÁB]¢qfuâ㈩Ñg$´qðLjAÆçÞùLô—é$ö Vèõ&“Ñlj=éÓ:µ‰U‚1U•Óò¿Åst­´¤¤¨»¿ðáxÖOQE¾¨Þ霧ô$Z…OjJíSÓl I]ù·ˆy1¿ð¯»,èa¶ 6©Ëû«­+7Ñ;WBî¢?O²íIý¥}ðÏœ›ÈÍ19±Á›ÉÖJ³ŒÊEŽNE† —å5äáæ1ét½|ÝHôÜ»ÇátëàyYÙÉô.y^ì[ š,ŸìæÂ¸Ío:å¾22~ÈÓîýiØÌu*­Wš*u¡{ÃØýn£´‡LïÞg؉J8„‘Ú…G;Ú\c©ý;ègi6ž!ËÃ'·ÿ=÷ÕÖ›åv9¥Æ¥Ëe›W½ŽQ ÷ò‡'¸p¾ãûö3ÇoŒ÷ ƒ†µõ‡ž$á¹Ðœ…u ©˜HŽ„ìnøÄx¦ù^Ü+9røÄŽ¥ôËËéè5gÖ=þb¤ŸžàÂ@í°µƒ´¸ØCèÍ@åå,¡£ ²T<ÿw¹¼A. ÿù¼›!ߣ?g—®—$*K‹rBÓ“Ó¡ Ä<{‹ÍÑ d]uó‘Š|QFzÅê{¸óÛæ;ˆáá˜ê1ø¦9ð% « ÉÇ¢.^g*ߟ Ãé%ëF?Àà†N:ä äd§Ð1ò¼1ø{µwì\8Óæ·Žº/œ>äåšõr.¦æÇ* ý2=wiôbÚý/z€^fÏòA €E%/佺 6‘¾ƒ íu½uý×Ñ;œÚ.LÏfòb¦ûðªz¥jV#uXo&gMoû·@Úí*¶Hi¡¢¼àMŽ8W‚D™j)”‘m‡Î=êE¯˜¨?¨¼¶»Ì&*å•ñ[äÎ–Îæ.ʧ‡4WÍ”×>[[ö”}9ù«lãŸae˜ÀIŽ í ßG¯PE¯    ¾ÙØt¨† ùk‘%ËCd[ëÍ)ªúeöŒ<µœ2Rj©lklw :±žèÕýTȰ醨;ôP:»nÁ{ÎBl*ux#|Èj´?¹ûm`õ£ð¹:ëù|•ª«2è78ë~Zܦ~¶A&ö¦Ÿû¯³O©.ÐÉ+*¤¿»H¸žÌñ¹8ËÈÈ+ÎKhÍÓ+ó²û$žB_  @lUXmGŒyX>EÉ´‰Ø+ï¥`¢ç›ŽÏÌ –zlù㹞ÔÕÙt¸òg–oót<žµ|Ílù2ž(#]Y$“‘× 'ÑÛ#è¿ÛñŠ•.u9§Ð‘)§‹qþÀôz_a§Ëàð“]Õ×ô³»…Tä&ÉÊK.'Ùø@®ØK?/ð%ãúLÞÜÐñõ¹x?éj¤ãd’¤êÃÒC:µô#aLé†8ÐàíµÒÙm«qC=Ù^jóù¥¹§#ÔèhìnÙ=&mò»³è…–¿/7*!ïß3Ò€,ã9Îq¡Í>d;f?UÕÖë<×qqßuÝÐ "ƒJ\Þ-vTlhp±è4{€gÁÍ…®šÀ@jÿú–Àùhú¹ÿ N|S endstream endobj 27 0 obj [525 0 0 525 0 0 0 0 525 0 0 525 525 525 525 525 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 525 525 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 525 0 525 0 525 525 525 525 0 0 525 0 0 525 525 525 525 525 0 525 525 525 0 0 525 525] endobj 28 0 obj <> stream xÚ…V{XåŸa—Ùi¶1(›!K¥T$´RË â¥HÔoe ÂÊ‚Àʲ°Ë•‹ð® @r‡E.+¨„w‚y2uW+ƒŽ>Qç”Ö1ÓÔ:=ïx>;ç̺öGç™ýæùf¿ïßû{¿÷š’Ë)š¦•a‹—-;>84<üÅÇ“—ŠH¶¸÷îBWñ)JE‹O»ˆ¼L|Bþ4EÑS<¥‘úÕÃ1^៑÷‰^”œ¦Ù´,˜ì¬Ý®‹‹Õè}ý¢Ÿ÷}qÚ´W&øLó JTë⢣’|C£ôub”^š$ø†i£ãÔút_¿×4zý†é“& ÿ¨Ä­.væó| 
qzïRuŠZ—¦Žñ¯MÒû.ŠJTû:Aû;oÁÚÄ ©zµÎ7T£Ö%IhÝGSc¨¨‰òùt$EEË⨹ÎÅ@S”Œb¨á”’ò FPž”5Žò£fSs¨¹Ôµ€ ¡B©EÔbj õF…S˨•Ô*j” ES.Òä&]A¦\<\ÞvÙ-ó‘™e•/•·»ú¸&»ö3þL’b¤¢Lagç±Ã^ÖâöŠ[¯ñý_*çl×õ/¯Ë0I<Ï•‹«È#$å•FV“HoSVišZa[ƒ©žÅ¥¸æÏW(§Úp´ ·ØðYÝuÊvêé'ƒïzreïmÝålCV½O|ca.iûužwÑÆâBÈgu™| ”7”í`ñ?sëάZµnIPP÷’ÞÞî3ü”¹\®±ýÈ‘ÆöªªÆ ‘‘Æ ¹Y&^æ.Çœzçmõ[³gì;Þ³ïôçΕ–#:ª«×Fµ¹‚ò[sšL®Cs›¼.ˆÞÄð‰ê?x·sÖ¸uM}Qí™[¡ئê†Zês+‹Mºú{’k:ã]U^]lìŠæÉˆà­m¦O`—O%쉇øM<$™ÞØšL¨sÞٕŬ!'=K㎂ê|¤—xjIn~N|Š é#>+N.y’|6Aüž=°§’‡]%Ÿ·±Jlé£O‰2q¶pU]ÐÕ¥)Í‹L®4š.¨”€ž4®FO^¶q݉mëÕº„„uÉ»÷µvvòÊ©æ»ú&ú´8A&FÜõàÊëLNÒS!?« ‡|õë|ïüÌ€ÍlzmFß åµÛ«YežU¬°Ò½vÜo—‰IxšÃEcð âBäãÈ“$„,ºCž@”ßÂ'1„'RÁÍA7ôñöúûž'ãˆ_q›!a´ŠJë& í2iP˜—§éˆ®–¸?Gž"o…?.|‡:šv²©`pf¤œðP þ@ Öp+¾h½eõúоæ*†ÚUi¢Ën®9áxÉ>`-Íb„8 6nÛ\Êb‚âⲿ“çÖŒ9‰æÌ¾,•¦n¨0™š¶Ö˜ª¡Ø¿}0WP#ó¶U0ž.ä¯Jë[ù±N6q¯(@i1é±°ØÐµ{÷ Ðóu7Ž)c%@xÉJ÷Û1åªL\‡4gI‚>6ç rŒ«ƒ€}9¸ï»ãTàSæý°I0i³[€m1×ïê"ÈìÅdÔdâþýôBaïÍúI^°¢«Õë};n·«²ðß½\,dàû™ºÖûUÉ"ÄíÌý"ÏyXÕá( dT=ñ»ÖÖ­‘ȦÇO *2âÛ~TY§àP»e‹ëåsaSgÎ]|ñê}÷ï çiŒ:‹[éíØ"Õÿ5 ãÌÐÌ›7ÉH2|ü8âA”·Ç"‡ÞǾmu$)¤Ãæ-ùÆÔ•Ë6ia6ÌîÍüš5åjúÏ|Ø€MWùëP±ÍŠžVÚo|#Ãí"árHzDyŒ¹ü)tðE•1]M;ë,g_…ÌWg¾;…_>Nq?-‡5nÛX&+Y’à}é¶0µmÒ~‡4Ç2ÎW ÞòBõ­ùvÕ5´‰Or÷y!£ÙãJlä±Gä|(ÆhFu[,–[R›µÚÔT­¶9Õbin¶ðOgKÈ’÷zÅO bGÞË&Ü#ó/IlädáV¯“§p¸ãR "-ŽáŠ; ìz':3¼/›"6M Ò.¹â–±.P¨z®’éd¦Æ]'e‰óÕÑ[ûNœ¬Ñ.æ‰!ÆY8œøèº,æöÊïñªK¯CDœÑøÐ<Ù±U’\,&pèéÿ#‘¹¿ñ$ªë¤æërýôâIy‰ ýüÚw|ñ³ó!þ¡³ƒ%ÑYq’Ã+Nô’Uq:ÖsgNŸ<7pzΔñóBæÍÐ\9Γ±òƒQ‡7íé×Q…7ŸÞˆô¸VucWqúwÙÞ³üQ¶Ceʼnvœÿê(ê¤&ñý™ð1Þ”Roh¶¹©®½)÷ÐTa%¼™¾U]ÚëìÇÄe¨“Ûô¡Ö ’È'5^ŸY1ך`UÝÆXÁéË ¥ºmIe ª`?Û¶»ó:ÿÔ$–&›2¶ft>h»vg×M¢ìâüHs¤ôÕÂ’WõÓb˜ô¯Œf³ªk=›¿ÈåìºuñüjÛ¡»\ØP´#X ¤„™ ëà+¡ÎTÕÝ{²Ø"q¸û«ã‚ê6üL8óúm÷ wÁú“tÚIJöÐ9l=õõOý’qNi~Ú”!'_![˜ñbû_m³&¾$-ƒ/ ]ºQìLø UP™1ÈŽ2+]'æÉÄw¥Àf©ã¥@º@LzŠÄ×Nhz˜Uä„k SÔ–ó—܃éç¶ìÐÀF)Ù¼Ô-ùÑ„/2°%Ì <áÚÃ4´H ô`üm»:„³äj™¾4ëø€ÒÊkëSMi3kb¶­è&ÃÊ6We4@tCi]YóƒÐíbœ  ŽCý\05íë5ÌYÇfh‚ÆÎºíÚöðrü“üÀ t#~dyÉ5zè)s²&Ù_ø=®WmêÃ/‚aCÎèbp%ÎÚÁôr…ÕÍ>ÜêînwŒw“§k݇‰ÍÿÐ% endstream endobj 4 0 obj <> endobj 29 0 obj <> endobj 5 0 obj <> endobj 30 0 obj <> endobj 6 0 obj <> endobj 31 0 obj <> endobj 7 0 obj <> endobj 32 0 obj <> endobj 8 0 obj <> endobj 33 0 obj <> endobj xref 0 34 0000000000 65535 f 0000008349 00000 n 0000008234 
00000 n 0000007907 00000 n 0000028536 00000 n 0000028858 00000 n 0000029171 00000 n 0000029476 00000 n 0000029789 00000 n 0000000015 00000 n 0000003224 00000 n 0000007984 00000 n 0000003348 00000 n 0000006562 00000 n 0000008063 00000 n 0000006686 00000 n 0000007801 00000 n 0000008142 00000 n 0000008395 00000 n 0000008802 00000 n 0000009191 00000 n 0000016713 00000 n 0000016964 00000 n 0000021198 00000 n 0000021220 00000 n 0000021620 00000 n 0000021912 00000 n 0000025637 00000 n 0000025883 00000 n 0000028681 00000 n 0000028987 00000 n 0000029297 00000 n 0000029605 00000 n 0000029918 00000 n trailer <> startxref 30094 %%EOF similarity-tester-2.70.orig/pass2.h0000644000000000000000000000046411763354135014176 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: pass2.h,v 1.3 2012-06-05 09:58:53 Gebruiker Exp $ */ /* Determines for each position that is part of a run, at which line number it starts and ends. */ extern void Retrieve_Runs(void); similarity-tester-2.70.orig/fname.c0000644000000000000000000001542611750732757014241 0ustar /* This file is part of the auxiliaries library. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: fname.c,v 1.5 2012-05-04 10:56:47 Gebruiker Exp $ */ /* Support for UNICODE file names in Windows */ /* Two data types are involved in UNICODE file names in Windows: UTF16 strings, the file names as stored by Windows, and UTF8 strings, the names as they are displayed and stored. The conversion between these two proceeds through CodePoints, the 'real' vales of the characters, of which UTF16 strings and UTF8 strings are the compressed representations. 
/* Code units of the two encodings, and the decoded code point. */
typedef uint8_t UTF8;
typedef uint16_t UTF16;
typedef uint32_t UTF32;

/* Marker stored in *cp by the decoders when no code point was decoded. */
#define	BAD_CodePoint		((UTF32)(-1))

/* mask of n left-aligned 1-s in a UTF8 */
#define	LMASK(n)		(((1<<(n))-1)<<(8-(n)))
/* mask of n right-aligned 1-s */
#define	RMASK(n)		((1u<<(n))-1)

/* SEQUENCE -> CODEPOINT */

/*	Returns the number of consecutive 1-bits at the most significant
	end of c.
*/
static int
nmb_leading_ones_in_UTF8(UTF8 c) {
	int n = 0;

	while (c&LMASK(1)) {
		c <<= 1, n++;
	}
	return n;
}

/*	Combines the low 6 bits of each of the n bytes at s, first byte
	most significant. Returns BAD_CodePoint if any of these bytes does
	not have the form 10xxxxxx.
*/
static UTF32
get_UTF8_tail(const UTF8 *s, int n) {
	UTF32 res = 0;
	int i;

	/* scoop up n UTF-8s */
	for (i = 0; i < n; i++) {
		if ((s[i]&LMASK(2)) != LMASK(1)) return BAD_CodePoint;
		res = (res<<6) + (s[i]&RMASK(6));
	}
	return res;
}

/*	Decodes the UTF-8 sequence starting at s into *cp.
	Returns the number of UTF8s used, which is always at least 1.
	For a malformed sequence *cp is set to BAD_CodePoint and the
	result covers the offending head plus the continuation bytes
	(10xxxxxx) directly following it, so the caller resumes at the
	next possible head.
	NOTE: overlong forms and encoded surrogates are not rejected here.
*/
static int		/* number of UTF8s used; cp = BAD_CodePoint for error */
UTF8_sequence_to_CodePoint(const UTF8 *s, UTF32 *cp) {
	UTF8 head = s[0];
	int head_length = 1;
	const UTF8 *tail = &s[1];
	int tail_length;
	UTF32 tail_value;

	if ((head&LMASK(1)) == 0) {
		/* plain 7-bit character */
		*cp = head;
		return head_length;
	}

	/* the number of leading 1-s in the head is the sequence length */
	tail_length = nmb_leading_ones_in_UTF8(head) - 1;
	if (tail_length < 1 || tail_length > 3) goto error;

	tail_value = get_UTF8_tail(tail, tail_length);
	if (tail_value == BAD_CodePoint) goto error;

	*cp = ((head&RMASK(6-tail_length))<<(tail_length*6)) | tail_value;
	return head_length+tail_length;

error:
	{	int i = head_length;	/* skip the head */

		/*	Skip until new head: only continuation bytes are
			passed over. Stopping at every other byte (not just
			at 7-bit ones) keeps a well-formed multi-byte
			character that follows the damage from being
			swallowed.
		*/
		while ((s[i]&LMASK(2)) == LMASK(1)) {
			i++;
		}
		*cp = BAD_CodePoint;
		return i;
	}
}

/*	True for code points that are stored in a single UTF16: the
	values below 0x10000 outside the surrogate range 0xD800-0xDFFF.
*/
static int
is_in_BMP(UTF32 c) {	/* Basic Multilingual Plane */
	return c <= 0xD7FF || (0xE000 <= c && c < 0x10000);
}

/* True if c is the first half of a surrogate pair. */
static int
is_high_surrogate(UTF16 c) {
	return 0xD800 <= c && c <= 0xDBFF;
}

/* True if c is the second half of a surrogate pair. */
static int
is_low_surrogate(UTF16 c) {
	return 0xDC00 <= c && c <= 0xDFFF;
}

/*	Decodes the UTF-16 sequence starting at s into *cp.
	Returns the number of UTF16s used, which is always at least 1.
	For a malformed sequence *cp is set to BAD_CodePoint and the
	result covers the offending unit plus any low surrogates directly
	following it, so the caller resumes at a unit that can start a
	character.
*/
static int		/* number of UTF16s used; cp = BAD_CodePoint for error */
UTF16_sequence_to_CodePoint(const UTF16 *s, UTF32 *cp) {
	/* adapted from code from http://unicode.org/faq/utf_bom.html */
	UTF32 plane_number;
	UTF32 position;

	if (is_in_BMP(s[0])) {
		*cp = s[0];
		return 1;
	}

	/* s[0:1] must be a surrogate pair */
	if (!is_high_surrogate(s[0])) goto error;
	if (!is_low_surrogate(s[1])) goto error;

	/* get the plane number */
	plane_number = (s[0] >> 6) & RMASK(5);
	plane_number = plane_number + 1; /* to offset it from the BMP */

	/* get the position in the plane */
	position = ((s[0] & RMASK(6)) << 10) | (s[1] & RMASK(10));

	/* combine them */
	*cp = plane_number << 16 | position;
	return 2;

error:
	{	int i = 1;		/* skip one UTF-16 */

		/*	Skip until acceptable UTF-16. The unit under
			inspection is s[i]; testing s[0] here made a stray
			low surrogate eat the valid pairs behind it.
		*/
		while (!is_in_BMP(s[i]) && !is_high_surrogate(s[i])) {
			i++;
		}
		*cp = BAD_CodePoint;
		return i;
	}
}

/* CODEPOINT -> SEQUENCE */

/*	Encodes the code point c as a '\0'-terminated UTF-8 sequence.
	The result lives in a static buffer that is overwritten by the
	next call. Returns NULL for values in the surrogate range and for
	values of 0x110000 and up.
*/
static const UTF8 *					/* transient */
CodePoint_to_UTF8_sequence(UTF32 c) {
	/* adapted from code by user R on stackoverflow.com */
	static UTF8 buff[6];
	UTF8 *bp = buff;

	if (c < 0x80) {		/* it fits in 7 bits */
		*bp++ = (c>>0)&RMASK(7);
	}
	else if (c < 0x800) {	/* it fits in 11 bits */
		*bp++ = 0xC0 | ((c>>6)&RMASK(5));
		*bp++ = 0x80 | ((c>>0)&RMASK(6));
	}
	else if (c < 0x10000) {	/* it fits in 16 bits */
		if (!is_in_BMP(c)) {
			/* it is in the forbidden zone */
			return NULL;
		}
		*bp++ = 0xE0 | ((c>>12)&RMASK(4));
		*bp++ = 0x80 | ((c>>6)&RMASK(6));
		*bp++ = 0x80 | ((c>>0)&RMASK(6));
	}
	else if (c < 0x110000) {	/* it fits in 21 bits */
		*bp++ = 0xF0 | ((c>>18)&RMASK(3));
		*bp++ = 0x80 | ((c>>12)&RMASK(6));
		*bp++ = 0x80 | ((c>>6)&RMASK(6));
		*bp++ = 0x80 | ((c>>0)&RMASK(6));
	}
	else	return NULL;

	*bp = '\0';
	return buff;
}
cp) { /* adapted from code from http://unicode.org/faq/utf_bom.html */ static UTF16 res[3]; if (is_in_BMP(cp)) { res[0] = cp; res[1] = '\0'; return res; } if (cp >= 0x10000) { UTF16 position = (UTF16) cp; UTF16 plane_number = ((cp >> 16) & RMASK(5)) - 1; res[0] = 0xD800 | (plane_number << 6) | (position >> 10); res[1] = 0xDC00 | (position & RMASK(10)); res[2] = '\0'; return res; } else return NULL; } const char * /* transient */ Fname2str(const Fchar *fn) { /* converts a Fchar (wchar_t) string to an UTF-8 string */ static UTF8 res[1024]; UTF8 *rp = &res[0]; int i = 0; if (fn == NULL) return NULL; while (fn[i]) { UTF32 cp; const UTF8 *p; /* get Codepoint from one or two Fchar chars */ i += UTF16_sequence_to_CodePoint(&fn[i], &cp); if (cp == BAD_CodePoint) goto error; /* convert code point to UTF8 sequence */ p = CodePoint_to_UTF8_sequence(cp); if (p == NULL) goto error; /* append it to the output */ while (*p) { *rp++ = *p++; } continue; error: *rp++ = '?'; } *rp = '\0'; return (const char *)res; } const Fchar * /* transient */ str2Fname(const char *s) { /* converts a possibly UTF-8 string to an Fchar (wchar_t) string */ static Fchar res[512]; Fchar *rp = &res[0]; int i = 0; if (s == NULL) return NULL; while (s[i]) { UTF32 cp; const Fchar *p; /* get Codepoint from one to four UTF-8s */ i += UTF8_sequence_to_CodePoint((const UTF8 *)&s[i], &cp); if (cp == BAD_CodePoint) goto error; /* convert code point to UTF-16 sequence */ p = CodePoint_to_UTF16_sequence(cp); if (p == NULL) goto error; /* append it to the output */ while (*p) { *rp++ = *p++; } continue; error: *rp++ = '?'; } *rp = '\0'; return res; } /* OTHER UTF-16 ROUTINES */ int Stat(const Fchar *fn, struct stat *st) { /* why on earth does _wstat use a funny struct _stat ? 
*/ return _wstat(fn, (struct _stat *)st); } FILE * Fopen(const Fchar *fn, const char *rb) { /* stream is still char* */ Fchar fn_copy[512]; /* avoid possible transiency of fn */ Fnamecpy(fn_copy, fn); return _tfopen(fn_copy, str2Fname(rb)); } #endif /* _UNICODE */ similarity-tester-2.70.orig/lisplang.l0000644000000000000000000000463611764421215014763 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lisplang.l,v 2.16 2012-06-08 16:04:29 Gebruiker Exp $ */ /* LISP language front end for the similarity tester. Author: Gertjan Akkerman Date: Thu, 9 Apr 87 11:15:23 MDT */ #include "token.h" #include "language.h" #include "algollike.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; unsigned int lex_nl_cnt; unsigned int lex_tk_cnt; unsigned int lex_non_ascii_cnt; /* Language-dependent data */ #include "idf.h" static const struct idf reserved[] = { {"append", NORM('a')}, {"append1", NORM('b')}, {"atom", NORM('t')}, {"car", NORM('h')}, {"cdr", NORM('t')}, {"cond", NORM('c')}, {"cons", NORM('s')}, {"defun", NORM('u')}, {"do", NORM('d')}, {"eq", NORM('e')}, {"equal", NORM('e')}, /* See eq */ {"for", NORM('f')}, {"if", NORM('i')}, {"list", NORM('l')}, {"nconc", NORM('n')}, {"rplaca", NORM('A')}, {"rplacd", NORM('D')} }; /* Token sets for module algollike */ const Token Non_Finals[] = { NORM('('), NORM('['), No_Token }; const Token Non_Initials[] = { NORM(')'), NORM(']'), No_Token }; const Token Openers[] = { NORM('('), NORM('['), No_Token }; const Token Closers[] = { NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } unsigned int Best_Run_Size(const Token *str, unsigned int size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) 
ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) AnyQuoted (\\.) StrChar ([^"\n\\]|{AnyQuoted}) ChrChar ([^'\\]|{AnyQuoted}) IdfChar ([-!#$%&*+,/0-9:;<=>?@A-Z\\^_`a-z{}~]) EscIdf (({IdfChar}|\\.)+) QuotIdf ("|"[^\|\n]*"|") Idf ({EscIdf}|{QuotIdf}) %% ";".*$ { /* comment */ } \"{StrChar}*\" { /* strings */ return_ch('"'); } {Idf} { /* identifier */ return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF)); } \n { /* count newlines */ return_eol(); } {Layout} { /* ignore layout */ } {ASCII95} { /* copy other text */ return_ch(yytext[0]); } . { /* count non-ASCII chars */ lex_non_ascii_cnt++; } %% /* More language-dependent code */ void yystart(void) { BEGIN INITIAL; } similarity-tester-2.70.orig/tokenarray.h0000644000000000000000000000061111764320441015311 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: tokenarray.h,v 1.4 2012-06-08 06:52:17 Gebruiker Exp $ */ /* Interface for the token storage */ extern void Init_Token_Array(void); extern void Store_Token(Token tk); extern unsigned int Text_Length(void); /* also first free token position */ extern Token *Token_Array; similarity-tester-2.70.orig/javalang.l0000644000000000000000000001333311764421214014726 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: javalang.l,v 1.13 2012-06-08 16:04:28 Gebruiker Exp $ */ /* Java language front end for the similarity tester. 
Author: Dick Grune */ #include "options.h" #include "token.h" #include "language.h" #include "algollike.h" #include "idf.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; unsigned int lex_nl_cnt; unsigned int lex_tk_cnt; unsigned int lex_non_ascii_cnt; /* Language-dependent data */ static const struct idf reserved[] = { {"abstract", NORM('a')}, {"boolean", NORM('b')}, {"break", NORM('B')}, {"byte", CTRL('B')}, {"case", NORM('c')}, {"catch", NORM('C')}, {"char", CTRL('C')}, {"class", META('c')}, {"continue", META('C')}, {"default", NORM('d')}, {"do", NORM('D')}, {"double", CTRL('D')}, {"else", NORM('e')}, {"extends", NORM('E')}, {"false", NORM('g')}, /* Boolean literal */ {"final", NORM('f')}, {"finally", NORM('F')}, {"float", CTRL('F')}, {"for", META('f')}, {"if", NORM('i')}, {"implements", NORM('I')}, {"import", CTRL('I')}, {"instanceof", META('i')}, {"int", META('I')}, {"interface", MTCT('I')}, {"long", NORM('l')}, {"native", NORM('n')}, {"new", NORM('N')}, {"null", CTRL('N')}, /* null literal */ {"package", NORM('p')}, {"private", NORM('P')}, {"protected", CTRL('P')}, {"public", META('p')}, {"return", NORM('r')}, {"short", NORM('s')}, {"static", NORM('S')}, {"super", CTRL('S')}, {"switch", META('s')}, {"synchronized",META('S')}, {"this", NORM('t')}, {"throw", NORM('T')}, {"throws", CTRL('T')}, {"true", META('t')}, /* Boolean literal */ {"void", NORM('v')}, {"volatile", NORM('V')}, {"while", NORM('w')} }; /* Special treatment of identifiers */ static Token idf2token(int hashing) { Token tk; tk = idf_in_list(yytext, reserved, sizeof reserved, IDF); if (Token_EQ(tk, IDF) && hashing) { /* return a one-Token hash code */ tk = idf_hashed(yytext); } return tk; } /* Token sets for module algollike */ const Token Non_Finals[] = { IDF, /* identifier */ NORM('{'), NORM('('), NORM('a'), /* abstract */ NORM('b'), /* boolean */ NORM('B'), /* break */ CTRL('B'), /* byte */ NORM('c'), /* case */ NORM('C'), /* catch */ CTRL('C'), /* 
char */ META('c'), /* class */ META('C'), /* continue */ NORM('d'), /* default */ NORM('D'), /* do */ CTRL('D'), /* double */ NORM('e'), /* else */ NORM('E'), /* extends */ NORM('f'), /* final */ NORM('F'), /* finally */ CTRL('F'), /* float */ META('f'), /* for */ NORM('i'), /* if */ NORM('I'), /* implements */ CTRL('I'), /* import */ META('i'), /* instanceof */ META('I'), /* int */ MTCT('I'), /* interface */ NORM('l'), /* long */ NORM('n'), /* native */ NORM('N'), /* new */ NORM('p'), /* package */ NORM('P'), /* private */ CTRL('P'), /* protected */ META('p'), /* public */ NORM('r'), /* return */ NORM('s'), /* short */ NORM('S'), /* static */ CTRL('S'), /* super */ META('s'), /* switch */ META('S'), /* synchronized */ NORM('T'), /* throw */ CTRL('T'), /* throws */ NORM('v'), /* void */ NORM('V'), /* volatile */ NORM('w'), /* while */ No_Token }; const Token Non_Initials[] = { NORM(')'), NORM('}'), NORM(';'), No_Token }; const Token Openers[] = { NORM('{'), NORM('('), NORM('['), No_Token }; const Token Closers[] = { NORM('}'), NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } unsigned int Best_Run_Size(const Token *str, unsigned int size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) Digit ([0-9a-fA-F]) UniCode (\\u{Digit}{Digit}{Digit}{Digit}) AnyQuoted ((\\.)|{UniCode}) StrChar ([^"\n\\]|{AnyQuoted}) ChrChar ([^'\n\\]|{AnyQuoted}) StartComment ("/*") EndComment ("*/") SafeComChar ([^*\n]) UnsafeComChar ("*") SingleLineCom ("//".*) Idf ([A-Za-z][A-Za-z0-9_]*) %% {StartComment} { /* We do not have one single pattern to match a comment (although one can be written), for two reasons. 
The matched string might overflow lex-internal buffers like yysbuf
			and yytext; and the pattern would be very complicated
			and overtax lex.
			So we break up the string into safe chunks and keep
			track of where we are in a start condition <Comment>.
		*/
		BEGIN Comment;
	}

<Comment>{SafeComChar}+	{
		/* safe comment chunk; only active inside a comment */
	}

<Comment>{UnsafeComChar}	{
		/* unsafe char, read one by one */
	}

<Comment>"\n"	{
		/* to break up long comments */
		return_eol();
	}

<Comment>{EndComment}	{
		/* end-of-comment */
		BEGIN INITIAL;
	}

{SingleLineCom}"\n"	{
		/* single-line comment */
		return_eol();
	}

\"{StrChar}*\"	{
		/* strings */
		return_ch('"');
	}

\'{ChrChar}+\'	{
		/* characters */
		return_ch('\'');
	}

(0x)?{Digit}+("l"|"L")?	{
		/* numeral, passed as an identifier */
		return_tk(IDF);
	}

"import"{Layout}[^;]*;	{
		/* import statement; ignore */
	}

{Idf}/"("	{
		/* identifier in front of ( */
		Token tk;

		tk = idf2token(is_set_option('F'));
		if (!Token_EQ(tk, No_Token)) return_tk(tk);
	}

{Idf}	{
		/* identifier */
		Token tk;

		tk = idf2token(0 /* no hashing */);
		if (!Token_EQ(tk, No_Token)) return_tk(tk);
	}

\;	{
		/* semicolon, conditionally ignored */
		if (is_set_option('f')) return_ch(yytext[0]);
	}

\n	{
		/* count newlines */
		return_eol();
	}

{Layout}	{
		/* ignore layout */
	}

{ASCII95}	{
		/* copy other text */
		return_ch(yytext[0]);
	}

.	{
		/* count non-ASCII chars */
		lex_non_ascii_cnt++;
	}

%%

/* More language-dependent code */

void
yystart(void) {
	/* puts the scanner back in its initial start condition */
	BEGIN INITIAL;
}
similarity-tester-2.70.orig/percentages.h0000644000000000000000000000041011763354136015436 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: percentages.h,v 1.4 2012-06-05 09:58:54 Gebruiker Exp $ */
extern void add_to_percentages(struct run *r);
extern void Show_Percentages(void);
similarity-tester-2.70.orig/pass3.c0000644000000000000000000001611612032015166014157 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam.
$Id: pass3.c,v 2.20 2012-06-08 06:52:16 Gebruiker Exp $ */ #include #include #include "system.par" #include "debug.par" #include "sim.h" #include "text.h" #include "token.h" #include "runs.h" #include "Malloc.h" #include "error.h" #include "options.h" #include "pass3.h" #include "percentages.h" #ifdef DB_RUN #include "tokenarray.h" static void db_run(const struct run *); #endif static FILE *open_chunk(const struct chunk *); static void fill_line(FILE *, char []); static void clear_line(char []); static void show_run(const struct run *); static void show_2C_line(const char [], const char []); static void show_1C_line(FILE *, const char *); static int pr_head(const struct chunk *); static int prs(const char *); static int pru(unsigned int); static int unslen(unsigned int); static int max_line_length; /* Actual maximum line length */ static char *line0; /* by Malloc() */ static char *line1; void Show_Runs(void) { AisoIter iter; struct run *run; #ifdef DB_RUN fprintf(Debug_File, "Starting Show_Runs()\n"); #endif /* DB_RUN */ max_line_length = Page_Width / 2 - 2; line0 = Malloc((unsigned int)((max_line_length + 1) * sizeof (char))); line1 = Malloc((unsigned int)((max_line_length + 1) * sizeof (char))); OpenIter(&iter); while (GetAisoItem(&iter, &run)) { #ifdef DB_RUN db_run(run); #endif /* DB_RUN */ show_run(run); fprintf(Output_File, "\n"); } CloseIter(&iter); Free(line0); line0 = 0; Free(line1); line1 = 0; } static void show_run(const struct run *run) { /* The animals came in two by two ... */ const struct chunk *cnk0 = &run->rn_chunk0; const struct chunk *cnk1 = &run->rn_chunk1; unsigned int nl_cnt0 = cnk0->ch_last.ps_nl_cnt - cnk0->ch_first.ps_nl_cnt; unsigned int nl_cnt1 = cnk1->ch_last.ps_nl_cnt - cnk1->ch_first.ps_nl_cnt; FILE *f0; FILE *f1; /* display heading of chunk */ if (!is_set_option('d')) { /* no assumptions about the lengths of the file names! 
*/ unsigned int size = run->rn_size; int pos = 0; pos += pr_head(cnk0); while (pos < max_line_length + 1) { pos += prs(" "); } pos += prs("|"); pos += pr_head(cnk1); while (pos < 2*max_line_length - unslen(size)) { pos += prs(" "); } fprintf(Output_File, "[%u]\n", size); } else { (void)pr_head(cnk0); fprintf(Output_File, "\n"); (void)pr_head(cnk1); fprintf(Output_File, "\n"); } /* stop if that suffices */ if (is_set_option('n')) return; /* ... had enough so soon ... */ /* open the files that hold the chunks */ f0 = open_chunk(cnk0); f1 = open_chunk(cnk1); /* display the chunks in the required format */ if (!is_set_option('d')) { /* fill 2-column lines and print them */ while (nl_cnt0 != 0 || nl_cnt1 != 0) { if (nl_cnt0) { fill_line(f0, line0); nl_cnt0--; } else { clear_line(line0); } if (nl_cnt1) { fill_line(f1, line1); nl_cnt1--; } else { clear_line(line1); } show_2C_line(line0, line1); } } else { /* display the lines in a diff(1)-like format */ while (nl_cnt0--) { show_1C_line(f0, "<"); } fprintf(Output_File, "---\n"); while (nl_cnt1--) { show_1C_line(f1, ">"); } } /* close the pertinent files */ fclose(f0); fclose(f1); } static int pr_head(const struct chunk *cnk) { int pos = 0; pos += prs(cnk->ch_text->tx_fname); pos += prs(": line "); pos += pru(cnk->ch_first.ps_nl_cnt); pos += prs("-"); pos += pru(cnk->ch_last.ps_nl_cnt - 1); return pos; } static int prs(const char *str) { fprintf(Output_File, "%s", str); return strlen(str); } static int pru(unsigned int u) { fprintf(Output_File, "%u", u); return unslen(u); } static int unslen(unsigned int u) { int res = 1; while (u > 9) { u /= 10, res++; } return res; } static FILE * open_chunk(const struct chunk *cnk) { /* Opens the file in which the chunk resides, positions the file at the beginning of the chunk and returns the file pointer. Note that we use fopen() here, which opens a character stream, rather than Open_Text(), which opens a token stream. 
*/ const char *fname = cnk->ch_text->tx_fname; FILE *f = fopen(fname, "r"); unsigned int nl_cnt; if (!f) { fprintf(stderr, ">>>> File %s disappeared <<<<\n", fname); f = fopen(NULLFILE, "r"); } nl_cnt = cnk->ch_first.ps_nl_cnt; while (nl_cnt > 1) { int ch = getc(f); if (ch < 0) break; if (ch == '\n') { nl_cnt--; } } return f; } static void fill_line(FILE *f, char ln[]) { /* Reads one line from f and puts it in condensed form in ln. */ int indent = 0, lpos = 0; int ch; /* condense and skip initial blank */ while ((ch = getc(f)), ch == ' ' || ch == '\t') { if (ch == '\t') { indent = 8; } else { indent++; } if (indent == 8) { /* every eight blanks give one blank */ if (lpos < max_line_length) { ln[lpos++] = ' '; } indent = 0; } } /* store the rest */ while (ch >= 0 && ch != '\n') { if (ch == '\t') { /* replace tabs by blanks */ ch = ' '; } if (lpos < max_line_length) { ln[lpos++] = ch; } ch = getc(f); } ln[lpos] = '\0'; /* always room for this one */ } static void clear_line(char ln[]) { /* a simple null byte will suffice */ ln[0] = '\0'; } static void show_2C_line(const char ln0[], const char ln1[]) { /* displays the contents of the two lines in a two-column format */ int i; for (i = 0; i < max_line_length && ln0[i] != '\0'; i++) { fputc(ln0[i], Output_File); } for (; i < max_line_length; i++) { fputc(' ', Output_File); } fprintf(Output_File, " |"); for (i = 0; i < max_line_length && ln1[i] != '\0'; i++) { fputc(ln1[i], Output_File); } fprintf(Output_File, "\n"); } static void show_1C_line(FILE *f, const char *marker) { /* displays one line from f, preceded by the marker */ int ch; fprintf(Output_File, "%s", marker); while ((ch = getc(f)), ch > 0 && ch != '\n') { fputc(ch, Output_File); } fputc('\n', Output_File); } #ifdef DB_RUN static void db_chunk(const struct chunk *); static void db_run(const struct run *run) { /* prints detailed data about a run */ const struct chunk *cnk0 = &run->rn_chunk0; const struct chunk *cnk1 = &run->rn_chunk1; db_run_info(0, run, 1); 
db_chunk(cnk0); db_chunk(cnk1); } static void db_chunk(const struct chunk *cnk) { /* print the tokens in the chunk, with a one-char margin */ unsigned int i; const struct position *first = &cnk->ch_first; const struct position *last = &cnk->ch_last; unsigned int start = cnk->ch_text->tx_start; if (first->ps_tk_cnt > 0) { fprintf(Debug_File, "..."); fprint_token(Debug_File, Token_Array[start + first->ps_tk_cnt - 1]); fprintf(Debug_File, " "); } else { /* create same offset as above */ fprintf(Debug_File, " "); } for (i = first->ps_tk_cnt; i <= last->ps_tk_cnt; i++) { fprintf(Debug_File, " "); fprint_token(Debug_File, Token_Array[start + i]); } if (start + last->ps_tk_cnt + 1 < cnk->ch_text->tx_limit) { fprintf(Debug_File, " "); fprint_token(Debug_File, Token_Array[start + last->ps_tk_cnt + 1]); fprintf(Debug_File, "..."); } fprintf(Debug_File, "\n"); } #endif /* DB_RUN */ similarity-tester-2.70.orig/text.h0000644000000000000000000000312311763354137014127 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: text.h,v 1.4 2012-06-05 09:58:55 Gebruiker Exp $ */ /* Implements the access to the lexical scanner. Additionally, the module tries to save newline information, anticipating a second scan which is interested in this information only. 
*/ struct text { const char *tx_fname; /* the file name */ struct position *tx_pos;/* list of positions in this file that are part of a chunk; sorted and updated by Pass 2 */ unsigned int tx_start; /* positions in Token_Array[] for the text */ unsigned int tx_limit; unsigned int tx_nl_start;/* possibly newline pointer for pass2 */ unsigned int tx_nl_limit; }; struct position { /* position of first and last token of a chunk */ struct position *ps_next; int ps_type; /* first = 0, last = 1 */ unsigned int ps_tk_cnt; /* in tokens; set by add_run() in Read_Input_Files() */ unsigned int ps_nl_cnt; /* same, in line numbers;set by Retrieve_Runs(), used by Show_Runs(), to report line numbers */ }; extern struct text *Text; /* Text[], one for each input file */ extern int Number_Of_Texts; /* number of text files */ extern int Number_Of_New_Texts; /* number of new text files */ extern void Init_Text(int nfiles); enum Pass {First, Second}; extern int Open_Text(enum Pass pass, struct text *txt); extern int Next_Text_Token_Obtained(enum Pass pass); extern void Close_Text(enum Pass pass, struct text *txt); #ifdef DB_NL_BUFF extern void db_print_nl_buff(unsigned int start, unsigned int limit); #endif similarity-tester-2.70.orig/compare.h0000644000000000000000000000055511754656625014606 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: compare.h,v 1.3 2012-05-16 07:56:05 Gebruiker Exp $ */ /* Compares each new text to the appropriate texts. Stores the runs found in the AISO heap. Runs contain references to positions in the input files. */ extern void Compare_Files(void); similarity-tester-2.70.orig/settings.par0000644000000000000000000000045311763417237015342 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: settings.par,v 1.2 2012-06-05 14:58:39 Gebruiker Exp $ */ #define DEFAULT_MIN_RUN_SIZE 24 /* default minimum run size */ #define DEFAULT_PAGE_WIDTH 80 /* default page width */ similarity-tester-2.70.orig/pass2.c0000644000000000000000000000761212032031447014157 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: pass2.c,v 2.20 2012-09-30 11:55:19 Gebruiker Exp $ */ #include #include "debug.par" #include "sim.h" #include "token.h" #include "text.h" #include "lang.h" #include "pass2.h" #undef DB_POS #ifdef DB_POS static void db_print_pos_list(const char *, const struct text *); static void db_print_lex(const char *); #endif static void pass2_txt(struct text *txt); static int next_eol_obtained(void); void Retrieve_Runs(void) { int n; for (n = 0; n < Number_Of_Texts; n++) { pass2_txt(&Text[n]); } } /* begin instantiate static void sort_pos_list(struct position **) */ #define SORT_STRUCT position #define SORT_NAME sort_pos_list #define SORT_BEFORE(p1,p2) ((p1)->ps_tk_cnt < (p2)->ps_tk_cnt) #define SORT_NEXT ps_next #include "sortlist.bdy" /* end instantiate sort_pos_list() */ static void pass2_txt(struct text *txt) { struct position *pos; unsigned int old_nl_cnt; if (!txt->tx_pos) /* no need to scan the file */ return; /* Open_Text() initializes lex_nl_cnt and lex_tk_cnt */ if (!Open_Text(Second, txt)) { fprintf(stderr, ">>>> File %s disappeared <<<<\n", txt->tx_fname ); return; } /* Sort the positions so they can be matched to the file; the linked list of struct positions snakes through the struct positions in the struct chunks in the struct runs. 
*/ #ifdef DB_POS db_print_pos_list("before sorting", txt); #endif /* DB_POS */ sort_pos_list(&txt->tx_pos); #ifdef DB_POS db_print_pos_list("after sorting", txt); #endif /* DB_POS */ #ifdef DB_NL_BUFF db_print_nl_buff(txt->tx_nl_start, txt->tx_nl_limit); #endif /* DB_NL_BUFF */ #ifdef DB_POS fprintf(Debug_File, "\n**** DB_PRINT_SCAN of %s ****\n", txt->tx_fname); #endif /* DB_POS */ old_nl_cnt = 1; pos = txt->tx_pos; while (pos) { /* we scan the pos list and the file in parallel */ /* find the corresponding line */ while (pos->ps_tk_cnt > lex_tk_cnt) { /* was >= ZZ */ /* pos does not refer to this line, try the next */ /* shift the administration */ old_nl_cnt = lex_nl_cnt; /* and get the next eol position */ if (!next_eol_obtained()) { /* ouch! not enough lines! */ fprintf(stderr, ">>>> File %s modified <<<<\n", txt->tx_fname ); break; } #ifdef DB_POS db_print_lex(txt->tx_fname); #endif /* DB_POS */ } /* fill in the pos */ switch (pos->ps_type) { case 0: /* first token of run */ pos->ps_nl_cnt = old_nl_cnt; break; case 1: /* last token of run */ pos->ps_nl_cnt = lex_nl_cnt; break; } /* and get the next pos */ pos = pos->ps_next; } #ifdef DB_POS db_print_pos_list("after scanning", txt); #endif /* DB_POS */ /* Flush the flex buffers; it's easier than using YY_BUFFER_STATE. */ while (Next_Text_Token_Obtained(Second)); Close_Text(Second, txt); } static int next_eol_obtained(void) { while (Next_Text_Token_Obtained(Second)) { if (Token_EQ(lex_token, End_Of_Line)) return 1; } return 0; } #ifdef DB_POS static void db_print_pos(const struct position *pos) { fprintf(Debug_File, "pos type = %s; %s count = %u", (pos->ps_type == 0 ? 
"first" : " last"), token_name, pos->ps_tk_cnt ); fprintf(Debug_File, ", line # = "); if (pos->ps_nl_cnt == -1) { fprintf(Debug_File, ""); } else { fprintf(Debug_File, "%u", pos->ps_nl_cnt); } fprintf(Debug_File, "\n"); } static void db_print_pos_list(const char *msg, const struct text *txt) { fprintf(Debug_File, "\n**** DB_PRINT_POS_LIST of %s, %s ****\n", txt->tx_fname, msg); const struct position *pos = txt->tx_pos; while (pos) { db_print_pos(pos); pos = pos->ps_next; } fprintf(Debug_File, "\n"); } static void db_print_lex(const char *fn) { fprintf(Debug_File, "%s: lex_tk_cnt = %u, lex_nl_cnt = %u, lex_token = ", fn, lex_tk_cnt, lex_nl_cnt); fprint_token(Debug_File, lex_token); fprintf(Debug_File, "\n"); } #endif /* DB_POS */ similarity-tester-2.70.orig/hash.h0000644000000000000000000000063211753674555014100 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: hash.h,v 1.3 2012-05-13 09:05:49 Gebruiker Exp $ */ /* Creating and consulting forward_reference[], used to speed up the Longest Substring Allgorithm. */ extern void Make_Forward_References(void); extern void Free_Forward_References(void); extern unsigned int Forward_Reference(int i); similarity-tester-2.70.orig/README0000644000000000000000000000442411764602255013655 0ustar # This file is part of the software similarity tester SIM. # Written by Dick Grune, Vrije Universiteit, Amsterdam. # $Id: README,v 2.13 2012-06-09 08:09:17 Gebruiker Exp $ These programs test for similar or equal stretches in one or more program or text files and can be used to detect common code or plagiarism. See sim.pdf. Checkers are available for C, Java, Pascal, Modula-2, Lisp, Miranda and natural language text. 
>>>> NEW, June 6, 2012: - greatly improved percentage computation - increased resolution, reducing false positives in sim_text - // comments in C recognized - characters 0200-0377 accepted in sim_text - s p a c e d w o r d s recognized in sim_text - UNICODE file names accepted - manual page in PDF >>>> NEW, March 11, 2009: - -R option to follow directories recursively ==== To install on UNIX/Linux, or on MSDOS if you have a C compiler: Unpack the archive sim_2_*.zip To compile and test, edit the Makefile to comment out MSDOS entries and/or change directory names, as appropriate, and call make test This will generate one executable called sim_c, the checker for C, and will run two small tests to show sample output. To install, examine the Makefile, edit BINDIR and MAN1DIR to sensible paths, and call make install To change the default run size or the page width, adjust the file settings.par and recompile. ==== To install on MSDOS, if you don't have a C compiler, sim_exe_2_*.zip contains: SIM_C.EXE similarity tester for C SIM_JAVA.EXE similarity tester for Java SIM_PASC.EXE similarity tester for Pascal SIM_M2.EXE similarity tester for Modula-2 SIM_LISP.EXE similarity tester for Lisp SIM_MIRA.EXE similarity tester for Miranda SIM_TEXT.EXE similarity tester for text ==== To extend: To add another language L, write a file Llang.l along the lines of clang.l and the other *lang.l files, extend the Makefile and recompile. All knowledge about a given language L is located in Llang.l; the rest of the programs expect each token to be a 16-bit character. Available at present: clang.l javalang.l pascallang.l m2lang.l lisplang.l miralang.l text.l Dick Grune Vrije Universiteit de Boelelaan 1081 1081 HV Amsterdam the Netherlands email: dick@dickgrune.com http://www.dickgrune.com similarity-tester-2.70.orig/aiso.bdy0000644000000000000000000000676211766061017014434 0ustar /* This file is part of the module Arbitrary-In Sorted-Out (AISO). 
Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: aiso.bdy,v 1.4 2012-05-08 08:43:56 Gebruiker Exp $ */ /* Description: This is the body of a module that builds an arbitrary-in sorted-out data structure, to be used as a heap, a priority queue, etc. See aiso.spc for further information. */ #include #include "Malloc.h" static struct aiso_node *root; /* root of tree */ #ifdef AISO_ITER static struct aiso_node *list; /* start of linked list */ #endif /* AISO_ITER */ /* the policy */ static int aiso_size = 0; static int acc_mark = 1; #define add_entry() (aiso_size++) #define rem_entry() (aiso_size--) #define reset_access() (acc_mark = 1) #define count_access() (acc_mark <<= 1) #define must_rotate() (acc_mark > aiso_size) int InsertAiso(AISO_TYPE v) { struct aiso_node *new_node; struct aiso_node **hook = &root; #ifdef AISO_ITER struct aiso_node **prev = &list; #endif /* AISO_ITER */ new_node = (struct aiso_node *)TryMalloc(sizeof (struct aiso_node)); if (!new_node) { /* avoid modifying the tree */ return 0; } while (*hook) { struct aiso_node *an = *hook; count_access(); if (AISO_BEFORE(v, an->an_value)) { /* head left */ if (!an->an_left || !must_rotate()) { /* standard action */ hook = &an->an_left; } else { /* change (l A r) B (C) into (l) A (r B C) */ struct aiso_node *anl = an->an_left; an->an_left = anl->an_right; anl->an_right = an; *hook = anl; reset_access(); } } else { /* head right */ if (!an->an_right || !must_rotate()) { /* standard action */ hook = &an->an_right; } else { /* change (A) B (l C r) into (A B l) C (r) */ struct aiso_node *anr = an->an_right; an->an_right = anr->an_left; anr->an_left = an; *hook = anr; reset_access(); } #ifdef AISO_ITER prev = &an->an_next; #endif /* AISO_ITER */ } } new_node->an_left = 0; new_node->an_right = 0; #ifdef AISO_ITER new_node->an_next = *prev; *prev = new_node; #endif /* AISO_ITER */ new_node->an_value = v; *hook = new_node; add_entry(); return 1; } #ifdef AISO_EXTR int ExtractAiso(AISO_TYPE *vp) { struct 
aiso_node **hook = &root; struct aiso_node *an; if (!root) return 0; while ((an = *hook), an->an_left) { /* head left */ count_access(); if (!must_rotate()) { /* standard action */ hook = &an->an_left; } else { /* change (l A r) B (C) into (l) A (r B C) */ struct aiso_node *anl = an->an_left; an->an_left = anl->an_right; anl->an_right = an; *hook = anl; reset_access(); } } /* found the first */ *vp = an->an_value; *hook = an->an_right; #ifdef AISO_ITER list = an->an_next; #endif /* AISO_ITER */ Free((void *)an); rem_entry(); return 1; } #endif /* AISO_EXTR */ #ifdef AISO_ITER void OpenIter(AisoIter *ip) { *ip = list; } int GetAisoItem(AisoIter *ip, AISO_TYPE *vp) { struct aiso_node *an = *ip; if (!an) return 0; *vp = an->an_value; *ip = an->an_next; return 1; } void CloseIter(AisoIter *ip) { *ip = 0; } #endif /* AISO_ITER */ #ifdef AISO_DEBUG /* requires AISO_FORMAT */ static void pr_inf(int level, char ch, struct aiso_node *an) { int i; if (!an) return; pr_inf(level+1, '/', an->an_right); for (i = 0; i < level; i++) { printf(" "); } printf("%c", ch); printf(AISO_FORMAT, an->an_value); printf("\n"); pr_inf(level+1, '\\', an->an_left); } void pr_tree(void) { pr_inf(0, '-', root); printf("================\n"); } #endif /* AISO_DEBUG */ similarity-tester-2.70.orig/idf.c0000644000000000000000000000273612032013676013677 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: idf.c,v 2.16 2012-05-09 11:50:37 Gebruiker Exp $ */ #include #include "system.par" #include "token.h" #include "idf.h" Token idf_in_list( const char *str, const struct idf list[], unsigned int listsize, Token default_token ) { int first = 0; int last = (listsize / sizeof (struct idf)) - 1; while (first < last) { int middle = (first + last) / 2; if (strcmp(str, list[middle].id_tag) > 0) { first = middle + 1; } else { last = middle; } } return (strcmp(str, list[first].id_tag) == 0 ? 
list[first].id_tr : default_token ); } #define HASH(h,ch) (((h) * 8209) + (ch)*613) Token idf_hashed(const char *str) { int32 h = 0; /* let's be careful about ranges; if done wrong it's hard to debug */ while (*str) { int ch = *str++ & 0377; if (ch == ' ') continue; /* -1 <= h <= 2^31-1 */ h = HASH(h, ch); /* -2^31 <= h <= 2^31-1 */ if (h < 0) { /* -2^31 <= h <= -1 */ h += 2147483647; /* 2^31-1 */ /* -1 <= h <= 2^31-2 */ } else { /* 0 <= h <= 2^31-1 */ } /* -1 <= h <= 2^31-1 */ } /* -1 <= h <= 2^31-1 */ if (h < 0) { /* h = -1 */ h = 0; } /* 0 <= h <= 2^31-1 */ h %= (N_TOKENS - N_REGULAR_TOKENS - 1); /* 0 <= h < N_TOKENS - N_REGULAR_TOKENS - 1 */ h += N_REGULAR_TOKENS; /* N_REGULAR_TOKENS <= h < N_TOKENS - 1 */ return int2Token(h); /* this avoids the regular tokens and End_Of_Line */ } similarity-tester-2.70.orig/miralang.l0000644000000000000000000000474211764421215014742 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: miralang.l,v 1.10 2012-06-08 16:04:29 Gebruiker Exp $ */ /* Miranda language front end for the similarity tester. 
Author: Emma Norling (ejn@cs.mu.oz.au) Date: Nov 1998 */ #include "token.h" #include "language.h" #include "algollike.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; unsigned int lex_nl_cnt; unsigned int lex_tk_cnt; unsigned int lex_non_ascii_cnt; /* Language-dependent data */ #include "idf.h" static const struct idf reserved[] = { {"abstype", NORM('a')}, {"bool", NORM('b')}, {"char", NORM('c')}, {"const", META('c')}, {"div", NORM('d')}, {"False", NORM('F')}, {"if", NORM('i')}, {"mod", NORM('m')}, {"num", NORM('n')}, {"otherwise", NORM('o')}, {"readvals", NORM('r')}, {"show", NORM('s')}, {"sys_message", META('s')}, {"True", NORM('T')}, {"type", NORM('t')}, {"where", NORM('w')}, {"with", META('w')} }; /* Token sets for module algollike */ const Token Non_Finals[] = { NORM('('), NORM('['), NORM('='), No_Token }; const Token Non_Initials[] = { NORM(')'), NORM(']'), No_Token }; const Token Openers[] = { NORM('('), NORM('['), NORM('='), No_Token }; const Token Closers[] = { NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } unsigned int Best_Run_Size(const Token *str, unsigned int size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) AnyQuoted (\\.) 
StrChar		([^"\n\\]|{AnyQuoted})
ChrChar		([^'\\]|{AnyQuoted})

Idf		([A-Za-z][A-Za-z0-9_']*)

%%

"||".*$	{ /* comment */ }

\"{StrChar}*\"	{ /* strings */ return_ch('"'); }

\'{ChrChar}\'	{ /* characters */ return_ch('\''); }

\%{Layout}*include.*	{ /* skip %include line */ }

\%{Layout}*insert.*	{ /* skip %insert line */ }

{Idf}	{ /* identifier */ return_tk(idf_in_list(yytext, reserved, sizeof reserved, IDF)); }

\n	{ /* count newlines */ return_eol(); }

{Layout}	{ /* ignore layout */ }

{ASCII95}	{ /* copy other text */ return_ch(yytext[0]); }

.	{ /* count non-ASCII chars */ lex_non_ascii_cnt++; }

%%

/* More language-dependent code */

/*	Puts the scanner back in its INITIAL start condition.
	NOTE(review): presumably called before each new input file is
	scanned; confirm against the callers.
*/
void
yystart(void) {
	BEGIN INITIAL;
}
similarity-tester-2.70.orig/pass1.h0000644000000000000000000000056411754656626014210 0ustar
/*	This file is part of the software similarity tester SIM.
	Written by Dick Grune, Vrije Universiteit, Amsterdam.
	$Id: pass1.h,v 1.7 2012-05-16 07:56:06 Gebruiker Exp $
*/

/*	Reads the input files; stores the tokens in Token Token_Array[]
	and the input file descriptions in struct text text[].
*/
extern void Read_Input_Files(int argc, const char *argv[], int round);
similarity-tester-2.70.orig/Answers0000644000000000000000000000413006627520064014333 0ustar
The software and text similarity tester SIM SIM tests lexical similarity in texts in C, Java, Pascal, Modula-2, Lisp, Miranda, and natural language. It is used - to detect potentially duplicated code fragments in large software projects, in program text but also in shell scripts and documentation; - to detect plagiarism in software projects, educational and otherwise. SIM is available through ftp. The directory ftp.cs.vu.nl:pub/dick/similarity_tester contains the sources (in C) and the MSDOS .EXEs. The software similarity tester is very efficient and allows us to compare this year's students' work with that collected from many past years (much to the dismay of some, mostly non-CS, students).
Students are told in advance that their work is going to be compared, but some are non-believers ... The output of the similarity tester can be processed by a number of shell scripts by Matty Huntjens. These shell scripts take sim output and produce lists of suspect submissions, histograms and the like. The present version of these scripts is very much geared to the local situation at the Vrije Universiteit, though; they are low on portability. Matty Huntjens' email address is matty@cs.vu.nl. We are not afraid that students would try to tune their work to the similarity tester. We reckon if they can do that they can also do the exercise. Since this piece of handicraft does not qualify as research, there are no international papers on it. A paper, titled `Detecting copied submissions in computer science lab work', was published in a local (i.e. Dutch) computer science journal: %A Dick Grune %A Matty Huntjens %T Het detecteren van kopie\(:en bij informatica-practica %J Informatie (in Dutch) %V 31 %N 11 %D Nov 1989 %P 864-867 The ftp directory contains a terse technical report about the internal working of the program. Dick Grune Vrije Universiteit de Boelelaan 1081 1081 HV Amsterdam the Netherlands dick@cs.vu.nl +31 20 444 7744 ---------------------------------------------------------------- With infinitely many exceptions, what you do makes no difference. similarity-tester-2.70.orig/ChangeLog0000644000000000000000000006456412055474357014566 0ustar 2012-11-28 Dick Grune * newargs.c (recursive_args): Liqun Chen (liqun.chen@hp.com) submitted a bug report noting that the separator / is expanded under the -R option. Corrected. 2012-09-30 Dick Grune * pass2.c (pass2_txt): Boyd Blackwell (Boyd.Blackwell@anu.edu.au) submitted a bug report in which the line numbers (and runs representations) were way off (75 lines). The input files were characterized by extremely long lines, hundreds of tokens (max. 521). After 2.5 days of debugging the cause was found: 1. 
since the mapping from token positions to line numbers is stored as the difference of the token positions from one line to the next (see text.c); 2. since these differences are stored in unsigned chars to save space; 3. since the nl_buff mechanism is switched off when one of these unsigned characters overflow; and since 521 tokens on one line overflowed this unsigned char, the nl_buff mechanism was shut off. Since when there is no nl_buff information in pass2, pass2 resorts to rereading the input file calling yylex again; 2. since the preceding file had few runs to find line number to, the preceding file was not read to the end, and the rest remained in flex's buffer, so a portion of the preceding file seemed prefixed to the present file, adding 75 lines to it. Remedy: flushing flex's buffer explicitly in pass2_txt(); this is simpler than using flex's YY_BUFFER_STATE mechanism. Advice: get rid of the nl_buff mechanism; it is no longer relevant. 2012-06-09 Dick Grune * lang.h: The *lang.l files are unusual in two respects: 1. they present two interfaces to the rest of the system: language.[ch], static data about the language, and lang.[ch], dynamic data about the input file's content; 2. both interfaces come with multiple implementations, one for each *lang.l file; i.e., they are abstract. This has been sorted out with some difficulty. 2012-05-08 Dick Grune * Changed to 16-bit tokens, for better resolution for sim_text and on -F option, and for UTF-8 input. It was not worth while to save the 8-bit token code: on serious comparisons the increase in memory usage is about 10% (330 000 on a maximum allocation of 3 030 976 for comparing the sources of MCD2). 2009-03-11 Dick Grune * newargs.c: added -R option to follow directories recursively. See recursive_args(). 2008-09-22 * added newargs.[ch], to supply file names from standard input, for those compilers that do not have the @ facility. Implemented without fixed limits. 
2008-09-21 * changed default format back to original, and inverted the -v(erbose) option into a -T(erse) option. 2008-03-31 Dick Grune * *.l: the following are not universally recognized; removed. %option nounput %option never-interactive 2008-03-31 Introduced aiso.* and Malloc.? as imported modules. 2007-11-21 Carlos Maziero - output format modified in order to facilitate "grep" filtering - added option "-v" for a more verbose output - added option "-tN" to define a threshold %N (only similarities over N% are shown) - fixed SEGV on writing to the output file - the file list can be informed through STDIN (one file per line, accepts "/" marker); this is useful for compilers that lack the @ facility 2007-08-23 Dick Grune LICENSE.txt added. 2006-11-27 Dick Grune Removal of setbuff() for compatibility. 2005-01-17 Dick Grune Corrections by Jerry James ; ANSIizing, etc. 2004-08-05 Dick Grune Finished the 'percentage' option. 08-Nov-2001 Dick Grune Begun to add a 'percentage' option, which will express the similarity between two files in percents. 27-Sep-2001 Dick Grune Split add_run() off from compare.c into add_run.c, to accommodate different add_run()s, for different types of processing. 27-Nov-1998 Dick Grune Installed a Miranda version supplied by Emma Norling (ejn@cs.mu.oz.au) 23-Feb-1998 Dick Grune Renamed text.l to textlang.l for uniformity and to make room for a possible module text.[ch]. Isolated a module for handling the token array from buff.[ch] to tokenarray.[ch], and renamed buff.[ch] to text.[ch]. 23-Feb-1998 Dick Grune There is probably not much point in abandoning the nl_buff list when running out of memory for TokenArray[]: each token costs 1 byte for the token and 4 bytes for the entry in forward_references[], a total of 5 bytes. There are about 3 tokens to a line, together requiring 15 bytes, plus 1 byte in nl_buff yields 16 bytes. So releasing nl_buff frees only 1/16 = 6.7 % of memory. Since the code is a bother, I removed it. 
Note that nl_buff is still abandoned when the number of tokens in a line does not fit in one unsigned char (but that is not very likely to happen). 21-Feb-1998 Dick Grune Printing got into an infinite loop when the last line of the input was not terminated by a newline AND contained tokens that were included in a matching run. This was due to a double bug: 1. the non-terminated line was not registered properly in NextTextTokenObtained() / CloseText(), and 2. the loop in pass 2 which sets the values of pos->ps_nl_cnt was terminated prematurely when the file turned out to be shorter than the list of pos-es indicated. Both bugs were corrected, the first by supplying an extra newline in CloseText() when one is found missing, and the second by rewriting the list-parallel loop in pass 2. 02-Feb-1998 Dick Grune Pascal does not differentiate between strings and characters (strings of one character); this difference has been removed from pascallang.l. 22-Jan-1998 Dick Grune Detection of non-ASCII characters added. Since the lexical analyser itself generates non-ASCII characters, the test must occur earlier. We could replace the input routine of lex by a checking routine, but with several lex-es going around, we want a more lex-independent solution. To allow each language its own restrictions about non-ASCII characters, the check is implemented in the *lang.l files. 28-Nov-1997 Dick Grune Changed the name of the C similarity tester 'sim' to 'sim_c', for uniformity with sim_java, etc. 23-Nov-1997 Dick Grune Java version finished; checked by Matty Huntjens and crew. 24-Jun-1997 Dick Grune Started on a Java version, by copying the C version. 22-Jun-1997 Dick Grune Modern lexical analysers, among which flex, read the entire input into a buffer before they issue the first token. As a result, ftell() no longer gives a usable indication of the position of a token in a file. This pulls the rug from under the nl_buff mechanism in buff.c, which is removed. 
We lose a valuable optimization this way, but there just seems to be no way to keep it. Note that this has nothing to do with the problem in MS-DOS of character count and fseek position not being synchronized. That problem has been solved on June 14, 1991 (which see) and the code has been running OK since. 18-Jun-1997 Dick Grune The thought has occurred to use McCreight's linear longest common substring algorithm rather than the existing algorithm, which has a small quadratic component. There are a couple of problems with this: 1. We need the longest >non-overlapping< common substring; McCreight provides just the longest. It is not at all clear how to modify the algorithm. 2. Once we have found our LCS, we want to find the one-but-longest; it is far from obvious how to do that in McCreight's algorithm. 3. Once we have found our LCS, we want to take one of its copies out of the game, to suppress duplicate messages. Again, it is difficult to see how to do that, without redoing all the calculations. 4. McCreight's algorithm seems to require about two binary tree nodes per token, say 8 bytes, which is double what we use now. 17-Jun-1997 Dick Grune Did some experimenting with the hash function; it is still pretty bad: the simple-minded second sweep through forward_references easily removes another 80-99% of false hits. Next, a third sweep that does a full comparison will remove another large percentage. So I have left in the second sweep in all cases. There are a couple of questions here: 1. Can we find a better hash function, or will we forever need a second sweep? 2. Does it actually matter, or will we lose on more expensive hashing what we gain by having a better set of forward references in compare.c? 16-Jun-1997 Dick Grune Cleaned up sim.h and renamed aiso.[ch] to runs.[ch] since they are instantiations of the aiso module concerned with runs. Aiso.[spc|bdy] stays aiso.[spc|bdy], of course. 16-Jun-1997 Dick Grune Redid largest_function() in algollike.c. 
Corrected bug in CheckRun; it now always removes NonFinals from the end, even when it has first applied largest_function(). 15-Jun-1997 Dick Grune Reorganized the layers around the input file. There were and still are three layers: lang, stream and buff. Since the lex_X variables are hoisted unchanged through the levels lang, stream, and buff, to be used by pass1, pass2, etc., they have to be placed in a module of their own. The token-providing module 'lang' has three interfaces: - lang.h, which provides access to the lowest-level token routines, to be used by the next level. - lex.h, which provides the lex variables, to be used by all and sundry. - language.h, which provides language-specific info about tokens, concerning their suitability as initial and final tokens, to be used by higher levels. This structure is not satisfactory, but it is also unreasonable to combine them in one interface. There is no single lang.c; rather it is represented by the various Xlang.c files generated from the Xlang.l files. 14-Jun-1997 Dick Grune Added a Makefile zip entry to parallel the shar entry. 13-Jun-1997 Dick Grune A number of simplifications, in view of better software and bigger machines: - Removed good_realloc from hash.c; I don't think there are any bad reallocs left. - Removed the option to run without forward_references. On a 16Mb machine this means you have at least 2M tokens; using a quadratic algorithm will take 4*10^6 sec. at an impossible rate of 1M actions/sec., which is some 50 days. Forget it. - Renamed lang() to print_stream(), and incorporated it in sim.c - Removed the MSDOS subdirectory mechanism in the Makefile. - Removed the funny and sneaky double parameter expansion in the call of idf_in_list(). 12-Jun-1997 Dick Grune Converted to ANSI C. Removed cport.h. 09-Jan-1995 Dick Grune Decided not to do directories: they usually contain extraneous files and doing sim * is simple enough anyway. 
09-Sep-1994 Dick Grune Added system.h to cater for the (few) differences between Unix and DOS. The #define int32 is also supplied there. 05-Sep-1994 Dick Grune Added many prototype declarations using cport.h. Added a depend entry to the Makefile. 31-Aug-1994 Dick Grune All these changes require a 32 bit integer; introduced a #define int32, set from the command line in the Makefile. 25-Aug-1994 Dick Grune It turned out that one of the most often called routines was .rem, from idf_hashed() in idf.c. Moving the % out of the loop chafed off another 6% and reduced the time to 18.4 sec. 19-Aug-1994 Dick Grune With very large files (e.g., concatenated /usr/man/man1/*) the fixed built-in hash table size of 10639 is no longer satisfactory. Hash.c now finds a prime about 8 times smaller than the text_size to use for hash table size; this achieves optimal speed-up without gobbling up too much memory. Reduced the time for the above file from 30.2 sec. to 19.6 sec. For checking, the same test was run with all hashing off; it took 20h 27m 19s = 73639 sec. But it worked. 11-Aug-1994 Dick Grune For large values of MinRunSize (>1000) a large part of the time (>two-thirds) was spent in calculating the hash values for each position in the input, since the cost of this calculation was proportional to MinRunSize. We now sample a maximum of 24 tokens from the input string to calculate the hash value, and avoid overflow. On my workstation, this reduces the time for sim_text -r 1000 -n /usr/man/man1/* from 60 sec to 21 sec. 30-Jun-1992 Dick Grune,kamer R4.40,telef. 5778 There was an amazing bug in buff.c where NextTextToken() for pass 2 omitted to set lex_token to EOL when retrieving newline info from nl_buff. Worked until now!?! 23-Sep-1991 Dick Grune Cport.h introduced, CONST and *.spc only. 17-Sep-1991 Dick Grune The position-sorting routine in pass2.c has been made into a separate generic module. 
14-Jun-1991 Dick Grune (dick@cs.vu.nl) at dick.cs.vu.nl Replaced the determination of the input position through counting input characters by calls of ftell(); this is cleaner and the other method will never work on MSDOS. 30-May-1989 Dick Grune (dick) at dick Replaced the old top-100 module (which had been extended to top-10000 already anyway) by the new aiso (arbitrary-in sorted-out) module. This caused a considerable speed-up on the Mod2 test bed: %time cumsecs #call ms/call name 17.9 99.20 7209 13.76 _InsertTop 0.3 1.37 7209 0.19 _InsertAiso It turns out that malloc() is not a serious problem, so no special version for the aiso module is required. 23-May-1989 Dick Grune (dick) at dick No more uncommented comment at the end of preprocessor lines, to conform to ANSI C. 23-May-1989 Dick Grune (dick) at dick Added code in the X.l files to (silently) reject characters over 0200. This does not really help, since lex stops on null chars. Ah, well. 19-May-1989 Dick Grune (dick) at dick Made the token as handled by sim into an abstract data type, for aesthetic reasons. Sign extension is still a problem. 03-May-1989 Dick Grune (dick) at dick Optimized lcs() by first checking from the end if a sufficiently long run is present; if in fact only the first 12 tokens match, chances are good that you can reject the run right away by first testing the 20th token, then the 19th, and so on. 21-Apr-1989 Dick Grune (dick) at dick A run of sim_m2 finding 7209 similarities raised the question of the appropriateness of the linear sort in sort_pos(). Profiling showed that in this case sorting takes all of 7.5 % of the total time. Putting the word register in in the right places in sort_pos() lowered this number to 4.6%. 20-Apr-1989 Dick Grune (dick) at dick Moved the test for MayBeStartOfRun() from compare.c (where it is done again and again) to hash.c, where its effect is incorporated in the forward reference chain. 
14-Apr-1989 Dick Grune (dick) at dick Replaced elem_of() by bit tables, headers[] and trailers[], to be prefilled from Headers[] and Trailers[] by a call of InitLanguage(). This saves a few percent. 13-Apr-1989 Dick Grune (dick) at dick Implemented the -e and the -S option, by putting yet another loop in compare.c 13-Apr-1989 Dick Grune (dick) at dick The -- option (displaying the tokens) will now handle more than one file. 20-Jan-1989 Dick Grune (dick) at dick After the modification of 19-Dec-88, 12% of the time went into updating the positions in the chunks, as they were produced by the matching process. This matching process identifies runs (matches) by token position, which has to be recalculated to lseek positions and line numbers. To this end the files are read again, and for each line all positions found were checked to see if they applied to this line; this was an awfully stupid algorithm, but since much more time was spent elsewhere, it did not really matter. With all the saving below, however, it had risen to second position, after yylook() with 35%. The solution was to sort the positions in the same order in which they would be met by the reading of the files. The process is then linear. This required some extensive hacking in pass2.c 06-Jan-1989 Dick Grune (dick) at dick The modification below did indeed save 25%. The newline information is now reduced to 2 shorts; 2 chars were not enough, since some lines are longer than 127 bytes, and a char and a short together take as much room as two shorts. 19-Dec-1988 Dick Grune (dick) at dick To avoid reading the files twice (which is still taking 25% of the time), the first pass will now collect newline information for the second pass in a buffer called nl_buff[]. This buffer, and the original token buffer now named TokenArray[], are managed by the file buff.c, which implements a layer between stream.h and pass?.c. 
This layer provides OpenText(), NextTextToken() and CloseText(), each with a parameter telling which pass it is. 06-Dec-1988 Dick Grune (dick) at dick As an introduction to removing the second pass altogether, the first and second scan were unified, i.e., their input is identical. This also means that the call sim -[12] has now been replaced by one call: sim --. 23-Sep-1988 Dick Grune (dick) at dick Dynamic allocation of line buffers in pass 3. This removes the restriction on the page width. 22-Sep-1988 Dick Grune (dick) at dick In order to give better messages on incorrect calls to sim, the whole option handling has been concentrated in a file option.c and separated from the options and their messages themselves. See sim.c 07-Sep-1988 Dick Grune (dick) at dick For long text sequences (say hundreds of thousands of tokens), the hashing is not really efficient any more since too many spurious matches occur. Therefore, the forward reference table is scanned a second time, eliminating from any chain all references to runs that do not end in the same token. For the UNIX manuals this reduced the number of matches from 91.9% to 1.9% (of which 0.06% were genuine). 30-Aug-1988 Dick Grune (dick) at dick For compatibility, NextTop has been rewritten to yield true or false and to accept a pointer to a run as a parameter. 30-Aug-1988 Dick Grune (dick) at dick When trying to find line-number and lseek position to beginnings and ends of runs found, the whole set of runs was scanned for each line in each file. Now only the runs belonging to that file are scanned; to this end another linked list has been braided through the data structures (tx_chunk). 30-Aug-1988 Dick Grune (dick) at dick The longest-common-substring algorithm was called much too often, mainly because the forward references made by hashing suffered from pollution. If you have say 1000 tokens and a hash range of say 10000, about 5 % of the hashings will be false matches, i.e. 
50 matches, which is quite a lot on a natural number of 2 to 3 matches. Improved by doing a second check in make_forw_ref(). 12-Jun-1988 Dick Grune (dick) at dick Installed a Lisp version supplied by Gertjan Akkerman. 15-Jan-1988 Dick Grune (dick) at dick Added register declarations all over the place. 14-Jan-1988 Dick Grune (dick) at dick It is often useful to match a piece of code exactly, especially when function names (or, even more so, macro names) are involved. What one would want is having all the letters in the text array, but this is kind of hard, since each entry is one lexical item. This means that under the -F option each letter is a lex item, and normally each tag is a lex item; this requires two lex grammars in one program; no good. So, on the -F flag we hash the identifier into one lex item, which is hopefully characteristic enough. It works. 30-Sep-1987 Dick Grune (dick) at dick Some cosmetics. 31-Aug-1987 Dick Grune (dick) at dick Moved the whole thing to the SUN (while testing on a VAX and a MC68000) 16-Aug-1987 Dick Grune (dick) at dick The test program lang.c is no longer a main program, but rather a subroutine called in main() in sim.c, through the command line option -1 or -2. 23-Apr-1987 Dick Grune (dick) at tjalk Changed the name 'index' into 'elem_of', because of compatibility problems on different Unices. Added a declaration for it in the file algollike.c 10-Mar-1987 Dick Grune (dick) at tjalk Changed the printing of the header of a run so that: - long file names will no longer be truncated - the run length is displayed 27-Jan-1987 Dick Grune (dick) at tjalk Switched it right off again! Getting them in textual order is still more unpleasant, since now you cannot find the important ones if there are more than a few runs. 27-Jan-1987 Dick Grune (dick) at tjalk Going to experiment with leaving out the sorting; just all the runs, in the order we meet them. Should be as good or better. 
Comparisons of more than 100 runs are very rare anyway, so the fact that those over a 100 are rejected is probably no great help. Getting them in a funny order is a nuisance, however. Down with featurism. Just to be safe, present version saved as 870127.SV 26-Dec-1986 Dick Grune (dick) at tjalk Names of overall parameters in params.h changed to more uniformity. 26-Dec-1986 Dick Grune (dick) at tjalk Since the top package and the instantiation system have grown apart so much, I have integrated the old top package into sim, i.e., done the instantiation by hand. This removes top.g and top.p, and will save outsiders from wondering what is going on here. 23-Dec-1986 Dick Grune (dick) at tjalk Use setbuf to print unbuffered while reading the files (lex core dumps, other mishaps) and print buffered while printing the real output (for speed). 30-Nov-1986 Dick Grune (dick) at tjalk Various small changes in *lang.l: ; ignored conditionally (!options['f']) new format for tokens in struct idf cosmetics: macro Layout, macro UnsafeComChar, no \n in character denotations, more than one char in a char denotations in Pascal, etc. 30-Nov-1986 Dick Grune (dick) at tjalk Added a Modula-2 version. 29-Nov-1986 Dick Grune (dick) at tjalk Restricting tokens to the ASCII95 character set is really too severe: some languages have many more reserved words (COBOL!). Corrected this by adding a couple of '&0377' in strategic places. Added a routine for printing the 8-bit beasties: show_token(). 15-Aug-1986 Dick Grune (dick) at tjalk Since the ; is superfluous in both C and Pascal, it is now ignored by clang.l and pascallang.l 15-Aug-1986 Dick Grune (dick) at tjalk The code in CheckRun in Xlang.l was incorrect in that it used the wrong criterion for throwing away trailing garbage. I've taken CheckRun etc. out of the Xlang.l-s and turned them into a module "algollike.c". Made a cleaner interface and avoided duplication of code. 
02-Jul-1986 Dick Grune (dick) at tjalk Looking backwards in compare.c to see if we are in the middle of a run is an atavism. You can be and still be all right, e.g., if part of the run was rejected as not fitting for a function. Removed from compare.c. 10-Jun-1986 Dick Grune (dick) at tjalk The function hash_code() in hash.c could yield a negative value; corrected. 09-Jun-1986 Dick Grune (dick) at tjalk Changed the name of the file text.h to sim.h. Sim.h is more appropriate and text.h sounds as if it belongs to text.l, with which it has no connection. 04-Jun-1986 Dick Grune (dick) at tjalk After having looked at a couple of hash functions and having done some calculations on the number of duplicates normally encountered in hash functions, I conclude that our function in hash.c is quite good. Removed all the statistics-gathering stuff. Actually, hash_table[] is not the hash table at all; it is a forward reference table; likewise, the real hash table was called last[]. Renamed both. There is a way to keep the hash table local without putting it on the stack: use malloc(). 02-Jun-1986 Dick Grune (dick) at tjalk Added a simple lex file for text: each word is condensed into a hash code which is mapped on the ASCII95 character set. This turns out to be quite effective. 01-Jun-1986 Dick Grune (dick) at tjalk The macros cput(tk) and c_eol() both have a return in them, so any code after them may not be executed -> they have to be last in an entry. But they weren't, in many places; I can't imagine why it all worked nevertheless. They have been renamed return_tk(tk) and return_eol() and the entries have been restructured. 30-May-1986 Dick Grune (dick) at tjalk Moved the string and character entries in clang.l and pascallang.l to a place behind the comment entries, to avoid strings (and characters) being recognized inside comments. I first thought this would not happen, but as Maarten pointed out, if both interpretations have the same length, lex will take the first entry. 
Now this will happen if the string occupies the whole line that would otherwise be taken as a comment. In short, /* "hallo" */ would return ". 28-May-1986 Dick Grune (dick) at tjalk Added -d option, to display the output in diff(1) format (courtesy of Maarten van der Meulen). Rewrote the lexical parsing of comments (likewise courtesy Maarten van der Meulen). 20-May-1986 Dick Grune (dick) at tjalk Added a routine to convert identifiers to lower case in pascallang.l . 19-May-1986 Dick Grune (dick) at tjalk Added -a option, to quickly check antecedent of a file (courtesy of Maarten van der Meulen). 18-May-1986 Dick Grune (dick) at tjalk Brought everything under RCS/CVS. 18-Mar-1986 Dick Grune (dick) at tjalk Added modifications by Paul Bame (hp-lsd!paul@hp-labs) to have an option -w to set the page width. 21-Feb-1986 Dick Grune (dick) at tjalk Took array last[N_HASH] out of make_hash() in hash.c, due to stack overflow on the Gould (reported by George Walker tekig4!georgew@mcvax.uucp) 16-Feb-1986 Dick Grune (dick) at tjalk Corrected some subtractions that caused unsigned ints to turn pseudo-negative. (Reported by jaap@mcvax) 11-Jan-1986 Dick Grune (dick) at tjalk Touched up for distribution. 10-Jan-1986 Dick Grune (dick) at tjalk Fill_line was not called for empty lines, which caused them to be printed as repetitions of the previous line. 24-Dec-1985 Dick Grune (dick) at tjalk Reduced hash table to a single array of indices; it is used only in one place, which makes it very easy to make it (the hash table) optional. General tune-up of everything. This seems to be another stable "final" version. 14-Dec-1985 Dick Grune (dick) at tjalk Some experiments with hash formulas: h = (h OP CST) + *p++ OP CST yields right wrong * 96 - 32 205 562 * 96 - 2 205 560 * 96 205 560 * 97 205 559 << 0 66 3128 << 1 203 555 << 2 205 536 << 7 203 540 Conclusion: it doesn't matter, unless you do it wrong. 01-Oct-1983 Dick Grune (dick) at vu44 Oldest known files. 
# This file is part of the software similarity tester SIM. # Written by Dick Grune, Vrije Universiteit, Amsterdam. # $Id: ChangeLog,v 2.21 2012-11-28 20:49:51 Gebruiker Exp $ # similarity-tester-2.70.orig/add_run.c0000644000000000000000000000313311763354134014550 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: add_run.c,v 2.12 2012-06-05 09:58:52 Gebruiker Exp $ */ #include "sim.h" #include "debug.par" #include "text.h" #include "runs.h" #include "percentages.h" #include "Malloc.h" #include "options.h" #include "error.h" #include "add_run.h" static void set_chunk( struct chunk *, struct text *, unsigned int, unsigned int ); static void set_pos( struct position *, int, struct text *, unsigned int ); void add_run(struct text *txt0, unsigned int i0, struct text *txt1, unsigned int i1, unsigned int size ) { /* Adds the run of given size to our collection. */ struct run *r = new(struct run); set_chunk(&r->rn_chunk0, txt0, i0 - txt0->tx_start, size); set_chunk(&r->rn_chunk1, txt1, i1 - txt1->tx_start, size); r->rn_size = size; #ifdef DB_RUN db_run_info("Added", r, 0); #endif /* DB_RUN */ if (is_set_option('p')) { add_to_percentages(r); } else { add_to_runs(r); } } static void set_chunk(struct chunk *cnk, struct text *txt, unsigned int start, unsigned int size ) { /* Fill the chunk *cnk with info about the piece of text in txt starting at start extending over size tokens. */ cnk->ch_text = txt; set_pos(&cnk->ch_first, 0, txt, start); set_pos(&cnk->ch_last, 1, txt, start + size - 1); } static void set_pos(struct position *pos, int type, struct text *txt, unsigned int start) { /* Fill a single struct position */ pos->ps_next = txt->tx_pos; txt->tx_pos = pos; pos->ps_type = type; pos->ps_tk_cnt = start; pos->ps_nl_cnt = -1; /* uninitialized */ } similarity-tester-2.70.orig/lex.h0000644000000000000000000000065512032031447013724 0ustar /* This file is part of the software similarity tester SIM. 
Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lex.h,v 2.12 2012-09-30 11:55:19 Gebruiker Exp $ */ /* Macros for the *lang.l files */ #define return_tk(tk) {lex_tk_cnt++; lex_token = (tk); return 1;} #define return_ch(ch) {lex_tk_cnt++; lex_token = int2Token((int)(ch)); return 1;} #define return_eol() {lex_nl_cnt++; lex_token = End_Of_Line; return 1;} similarity-tester-2.70.orig/LICENSE.txt0000644000000000000000000000303310663264234014611 0ustar Copyright (c) 1986, 2007, Dick Grune, Vrije Universiteit, The Netherlands All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the Vrije Universiteit nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
similarity-tester-2.70.orig/pass3.h0000644000000000000000000000035511763354135014176 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: pass3.h,v 1.3 2012-06-05 09:58:53 Gebruiker Exp $ */ /* Print the contents of runs */ extern void Show_Runs(void); similarity-tester-2.70.orig/options.h0000644000000000000000000000116411754252416014636 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: options.h,v 1.8 2012-05-13 09:05:49 Gebruiker Exp $ */ /* Setting and consulting command line options */ struct option { char op_char; /* char as in call */ char *op_text; /* explanatory text */ char op_indicator; /* type indicator, N = int, F = file name */ const char **op_stringp;/* string value to be picked up */ }; extern void set_option(char ch); extern int is_set_option(int ch); extern int do_options( const char *progname, const struct option *optlist, int argc, const char *argv[] ); similarity-tester-2.70.orig/lex.c0000644000000000000000000000040211764421214013713 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: lex.c,v 1.9 2012-06-08 16:04:28 Gebruiker Exp $ */ /* The service macros for the *lang.l files do not require code */ #include "lex.h" similarity-tester-2.70.orig/sim.c0000644000000000000000000001251112055474360013723 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: sim.c,v 2.32 2012-11-28 20:49:52 Gebruiker Exp $ */ #include #include #include #include "system.par" #include "settings.par" #include "sim.h" #include "options.h" #include "newargs.h" #include "token.h" #include "language.h" #include "error.h" #include "text.h" #include "runs.h" #include "hash.h" #include "compare.h" #include "pass1.h" #include "pass2.h" #include "pass3.h" #include "percentages.h" #include "stream.h" #include "lang.h" #include "Malloc.h" /* command-line parameters */ unsigned int Min_Run_Size = DEFAULT_MIN_RUN_SIZE; int Page_Width = DEFAULT_PAGE_WIDTH; int Threshold_Percentage = 1; /* minimum percentage to show */ FILE *Output_File; FILE *Debug_File; /* and their string values, for language files that define their own parameters */ const char *token_name = "token"; const char *min_run_string; const char *threshold_string; const char *progname; /* for error reporting */ static const char *page_width_string; static const char *output_name; /* for reporting */ static const struct option optlist[] = { {'r', "minimum run size", 'N', &min_run_string}, {'w', "page width", 'N', &page_width_string}, {'f', "function-like forms only", ' ', 0}, {'F', "keep function identifiers in tact", ' ', 0}, {'d', "use diff format for output", ' ', 0}, {'T', "terse output", ' ', 0}, {'n', "display headings only", ' ', 0}, {'p', "use percentage format for output", ' ', 0}, {'P', "use percentage format, showing all combinations", ' ', 0}, {'t', "threshold level of percentage to show", 'N', &threshold_string}, {'e', "compare each file to each file separately", ' ', 0}, {'s', "do not compare a file to itself", ' ', 0}, {'S', "compare new files to old files only", ' ', 0}, {'R', "recurse into subdirectories", ' ', 0}, {'i', "read arguments (file names) from standard input", ' ', 0}, {'o', "write output to file F", 'F', &output_name}, {'M', "show memory usage info", ' ', 0}, {'-', "lexical scan output only", ' ', 0}, {0, 0, 0, 0} }; static void read_and_compare_files(int 
argc, const char **argv, int round) { Read_Input_Files(argc, argv, round); Make_Forward_References(); Compare_Files(); Free_Forward_References(); } int is_new_old_separator(const char *s) { return strcmp(s, "/") == 0; } static void reverse_new_input_files(int argc, const char *argv[]) { int txt_first = 0; int txt_last; /* find the end of the new files */ for (txt_last = 0; txt_last < argc; txt_last++) { if (is_new_old_separator(argv[txt_last])) break; } txt_last--; /* swap the names from the outer sides on */ while (txt_first < txt_last) { const char *tmp = argv[txt_first]; argv[txt_first] = argv[txt_last]; argv[txt_last] = tmp; txt_first++, txt_last--; } } int main(int argc, const char *argv[]) { /* Save program name */ progname = argv[0]; argv++, argc--; /* and skip it */ /* Set the default output and debug streams */ Output_File = stdout; Debug_File = stdout; /* Get command line options */ { int nop = do_options(progname, optlist, argc, argv); argc -= nop, argv += nop; /* and skip them */ } /* Treat the value options */ if (min_run_string) { Min_Run_Size = strtoul(min_run_string, NULL, 10); if (Min_Run_Size == 0) fatal("bad or zero run size; form is: -r N"); } if (page_width_string) { Page_Width = atoi(page_width_string); if (Page_Width == 0) fatal("bad or zero page width; form is: -w N"); } if (threshold_string) { Threshold_Percentage = atoi(threshold_string); if ((Threshold_Percentage > 100) || (Threshold_Percentage <= 0)) fatal("threshold must be between 1 and 100"); } if (output_name) { Output_File = fopen(output_name, "w"); if (Output_File == 0) { char msg[500]; sprintf(msg, "cannot open output file %s", output_name); fatal(msg); /*NOTREACHED*/ } } if (is_set_option('P')) { Threshold_Percentage = 1; set_option('p'); } if (is_set_option('p')) { set_option('e'); set_option('s'); } /* Treat the input-determining options */ if (is_set_option('i')) { /* read input file names from standard input */ if (argc != 0) fatal("-i option conflicts with file arguments"); 
get_new_std_input_args(&argc, &argv); } if (is_set_option('R')) { get_new_recursive_args(&argc, &argv); } /* (argc, argv) now represents new_file* [ / old_file*] */ /* Here the real work starts */ Init_Language(); if (is_set_option('-')) { /* Just the lexical scan */ while (argv[0]) { const char *arg = argv[0]; if (!is_new_old_separator(arg)) { Print_Stream(arg); } argv++; } } else if (is_set_option('p')) { /* Show percentages */ /* To compute the percentages fairly, the input files are read twice, once in command line order, and once with the new files in reverse order. */ read_and_compare_files(argc, argv, 1); reverse_new_input_files(argc, argv); read_and_compare_files(argc, argv, 2); Show_Percentages(); } else { /* Show runs */ read_and_compare_files(argc, argv, 1); Retrieve_Runs(); Show_Runs(); } if (is_set_option('M')) { /* It is not trivial to plug the leaks, because data structures point to each other, and have to be freed in the proper order. But it is not impossible either. To do, perhaps. */ ReportMemoryLeaks(stderr); } return 0; } similarity-tester-2.70.orig/fname.h0000644000000000000000000000555111766062230014231 0ustar /* This file is part of the auxiliaries library. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: fname.h,v 1.8 2012-06-13 09:59:52 Gebruiker Exp $ */ /* Support for UNICODE file names */ /* To accommodate UNICODE file names on various platforms, this file defines the types Fchar file name character Dir_t struct for accessing a directory Dirent_t struct for accessing a directory entry and the functions Dir_t* Opendir(const Fchar*); Dirent_t* Readdir(Dir_t*); int Closedir(Dir_t*); Fchar *Fnamecpy(Fchar *dest, Fchar *source); Fchar *Fnamecat(Fchar*, const Fchar*); size_t Fnamelen(const Fchar*); int Fnamecmp(const Fchar*, const Fchar*); int Stat(const Fchar *fn, struct stat *st); FILE *Fopen(const Fchar *fn, const char *rb); The stream is still char*! 
int Fclose(FILE*); const char *Fname2str(const Fchar *fn); const Fchar *str2Fname(const char *s); The result of these two routines is transient: it is good only until the next call. The only way to obtain a file name is through readdir; the command line arguments are in ASCII. So a program can be adapted by replacing DIR by Dir_t, and struct dirent by Dirent_t. Compiling and correcting using the above replacements until there are no more errors or warnings will then yield a UTF-16 compatible program. For details about UTF-16 see fname.c. */ #ifndef _FNAME_H_ #define _FNAME_H_ /* lint cannot handle the weird code Windows throws at it, so even under Windows we claim to have UTF8 */ #ifdef MSDOS #define IS_UTF_16 #endif #ifdef lint #undef IS_UTF_16 #endif #ifdef IS_UTF_16 /* file names in UTF-16 */ #define _UNICODE #include #include #include typedef _TCHAR Fchar; typedef _WDIR Dir_t; typedef struct _tdirent Dirent_t; #define Opendir _topendir #define Closedir _tclosedir #define Readdir _treaddir #define Fnamecpy wcscpy #define Fnamecat wcscat #define Fnamelen wcslen #define Fnamecmp wcscmp extern const char *Fname2str(const Fchar *fn); /* transient! */ extern const Fchar *str2Fname(const char *s); /* transient!
*/ extern int Stat(const Fchar *fn, struct stat *st); extern FILE *Fopen(const Fchar *fn, const char *rb);/* stream is still char* */ #define Fclose fclose #else /* not MSDOS */ /* file names are in UTF-8 */ #include #include /* life is simple */ typedef char Fchar; #define Fnamecpy strcpy #define Fnamecat strcat #define Fnamelen strlen #define Fnamecmp strcmp #define Fname2str(fn) (fn) #define str2Fname(s) (s) #define Stat(fn,st) stat(fn,st) typedef DIR Dir_t; typedef struct dirent Dirent_t; #define Opendir opendir #define Closedir closedir #define Readdir readdir #define Fopen fopen #define Fclose fclose #endif /* MSDOS */ #endif /* _FNAME_H_ */ similarity-tester-2.70.orig/stream.c0000644000000000000000000000322211764421215014422 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: stream.c,v 2.11 2012-06-08 16:04:29 Gebruiker Exp $ */ #include #include #include #include "system.par" #include "sim.h" #include "options.h" #include "token.h" #include "lang.h" #include "stream.h" static FILE *fopen_regular_file(const char *fname); int Open_Stream(const char *fname) { int ok; lex_nl_cnt = 1; lex_tk_cnt = 0; lex_non_ascii_cnt = 0; /* start the lex machine */ yyin = fopen_regular_file(fname); ok = (yyin != 0); if (!ok) { /* fake a stream, to simplify the rest of the program */ yyin = fopen(NULLFILE, "r"); } yystart(); return ok; } static FILE * fopen_regular_file(const char *fname) { struct stat buf; if (stat(fname, &buf) != 0) return 0; if ((buf.st_mode & S_IFMT) != S_IFREG) return 0; return fopen(fname, "r"); } int Next_Stream_Token_Obtained(void) { return yylex(); } void Close_Stream(void) { if (yyin) { fclose(yyin); yyin = 0; } } void Print_Stream(const char *fname) { fprintf(Output_File, "File %s:", fname); if (!Open_Stream(fname)) { fprintf(Output_File, " cannot open\n"); return; } if (!is_set_option('T')) { fprintf(Output_File, " showing token stream:\nnl_cnt, tk_cnt: %ss", token_name ); 
lex_token = End_Of_Line; do { if (Token_EQ(lex_token, End_Of_Line)) { fprintf(Output_File, "\n%u,%u:", lex_nl_cnt, lex_tk_cnt ); } else { fprintf(Output_File, " "); fprint_token(Output_File, lex_token); } } while (Next_Stream_Token_Obtained()); fprintf(Output_File, "\n"); } Close_Stream(); } similarity-tester-2.70.orig/sortlist.bdy0000644000000000000000000000245210103670304015341 0ustar /* Module: Sort Linked Lists Author: dick@cs.vu.nl (Dick Grune @ Vrije Universiteit, Amsterdam) Version: Tue Sep 17 17:32:33 1991 Description: This is the implementation part of a generic routine that sorts linked lists. Instantiation: See sortlist.spc */ #ifndef _SORT_EXTERN_DEFINED static #endif void SORT_NAME(struct SORT_STRUCT **lh) { /* I've never known that sorting a linked list was this complicated; what am I missing? */ register struct SORT_STRUCT **listhook = lh; while (*listhook) { /* 0. the list is not empty -> there must be a smallest one */ register struct SORT_STRUCT **hsmall; /* 1. find (the pointer to) the smallest element */ { register struct SORT_STRUCT **hook = listhook; /* assume initially that first element is smallest */ hsmall = hook; while (*hook) { if (SORT_BEFORE(*hook, *hsmall)) { /* revise opinion */ hsmall = hook; } hook = &(*hook)->SORT_NEXT; } } /* 2. move the smallest element to front */ { register struct SORT_STRUCT *smallest = *hsmall; /* remove it from the chain */ *hsmall = smallest->SORT_NEXT; /* and insert it before the first element */ smallest->SORT_NEXT = *listhook; *listhook = smallest; } /* 3. skip over smallest element */ listhook = &(*listhook)->SORT_NEXT; } } similarity-tester-2.70.orig/aiso.spc0000644000000000000000000000603410752111552014425 0ustar /* This file is part of the module Arbitrary-In Sorted-Out (AISO). Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: aiso.spc,v 1.2 2008/02/05 16:48:42 dick Exp $ */ /* Description: This is the specification of a module that builds an arbitrary-in sorted-out data structure, to be used as a heap, a priority queue, etc. Elements can be inserted, the first element extracted and the set scanned at any moment. The module is not generic, in that only one copy of it can be instantiated per program. Instantiation: The module is instantiated as follows. Create a file X.h, where X is arbitrary, which contains at least: - a definition of AISO_TYPE, the type of the object to be stored - a possible definition of AISO_EXTR; see below - a possible definition of AISO_ITER; see below - #include "aiso.spc" This file X.h is to be included in all files that use the aiso package. Create a file X.c which contains at least: - #include "X.h" - a definition of a routine int AISO_BEFORE(AISO_TYPE v, AISO_TYPE w) which yields non-zero if v is to be sorted before w - #include "aiso.bdy" This file X.c compiles into the module object. Specification: The module always supplies: int InsertAiso(AISO_TYPE value) inserts value in its proper place; fails if out of memory If AISO_EXTR is defined, the module will also supply: int ExtractAiso(AISO_TYPE *value) yields the first value in the aiso and removes it; fails if empty If AISO_ITER is defined, the module also supplies a type AisoIter which declares an iterator, i.e., a structure that records a position in the ordered set, plus routines for manipulating the iterator, thus enabling the user to scan the ordered set. The iterator should be declared as: AisoIter iter; and is manipulated by the following commands: OpenIter(AisoIter *iter) opens the iterator for scanning the existing set in order int GetAisoItem(AisoIter *iter, AISO_TYPE *value) yields the next value in the iterator; fails if exhausted CloseIter(AisoIter *iter) closes the iterator For the use of AISO_DEBUG see aiso.bdy. 
Implementation: The AISO implementation is based on a self-adjusting binary tree. Degenerate behaviour of the tree is avoided by shaking the tree every 'ln aiso_size' node accesses. This guarantees ln aiso_size behaviour in the long run, though it is possible for a single operation to take aiso_size node accesses. The iterator is implemented as an additional linear linked list through the tree. This is simpler than and at least as efficient as clever tree-wiring. */ struct aiso_node { struct aiso_node *an_left; struct aiso_node *an_right; #ifdef AISO_ITER struct aiso_node *an_next; #endif /* AISO_ITER */ AISO_TYPE an_value; }; extern int InsertAiso(AISO_TYPE value); #ifdef AISO_EXTR extern int ExtractAiso(AISO_TYPE *value); #endif /* AISO_EXTR */ #ifdef AISO_ITER typedef struct aiso_node *AisoIter; extern void OpenIter(AisoIter *iter); extern int GetAisoItem(AisoIter *iter, AISO_TYPE *value); extern void CloseIter(AisoIter *iter); #endif /* AISO_ITER */ similarity-tester-2.70.orig/ToDo0000644000000000000000000000247312032002142013541 0ustar - get rid of the nl_buff mechanism (2012!) - report runs as '... matching text only ...' (proper name for Retrieve_Runs()) ? plug memory leaks (and still report memory usage!) - unify idf2token() in *lang.l - db_ not protected by #ifdef but by compilation to void Done ================================================================ + register - removed + Run hashing OK: average chain length = 1.5, for sim-ing the sources of MCD2 + Idf hashing OK: smooth distribution when sim-ing the sources of MCD2 + use two-byte tokens to obtain better resolution for sim_text and on -F option and UTF-8 (Johnson, Benjamin (US - Chicago)) + different defaults per program + cleaning up sim.c & names + Microsoft comment (// ... 
unescaped \n) + emails 2009-2011 (A = I answered, R= they replied) +AR Marcus Brinkmann, separate letters +AR Scott Kuhl, percentages +AR Yaroslav Halchenko, identifying non-existent lines +A Rumen Stefanov, UTF-8 +A Jonathan Martin, UTF-8 +AR UTF-8 (Johnson, Benjamin (US - Chicago)) + better structure between X.h and X.c + clean-up language.h and its sub-class algollike.h + warning in README to correct for non-MSDOS Rejected ================================================================ X remove Miranda X Mon Apr 11 13:23:41 1994: sim_orca X Thu May 13 23:02:46 1993: sim ook voor C++ en Ada similarity-tester-2.70.orig/options.c0000644000000000000000000000503311753674555014643 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: options.c,v 1.10 2012-05-13 09:05:49 Gebruiker Exp $ */ #include #include #include "options.h" static char options[128]; static void bad_option( const char *progname, const struct option *optlist, char *msg, int c ); static int opt_value( const char *progname, const struct option *op, const char *arg, const char *argv[] ); static int do_arg( const char *progname, const struct option *optlist, const char *arg, const char *argv[] ); int do_options( const char *progname, const struct option *optlist, int argc, const char *argv[] ) { int skips = 0; while (argc > 0 && argv[0][0] == '-' && argv[0][1] != '\0') { int consumed = do_arg(progname, optlist, &argv[0][1], argv); argc -= consumed, argv += consumed, skips += consumed; } return skips; } void set_option(char ch) { options[(int)ch]++; } int is_set_option(int ch) { return options[ch]; } static int do_arg( const char *progname, const struct option *optlist, const char *arg, const char *argv[] ) { int consumed = 0; while (*arg) { /* treat argument character */ char opc = *arg++; const struct option *op; for (op = optlist; op->op_char; op++) { if (opc == op->op_char) { set_option(opc); if (op->op_indicator != ' ') { 
consumed = opt_value( progname, op, arg, argv ); } break; } } if (!op->op_char) { bad_option(progname, optlist, "*option -%c unknown", opc ); /*NOTREACHED*/ } if (consumed) break; } if (!consumed) { consumed = 1; } return consumed; } static int opt_value( const char *progname, const struct option *op, const char *arg, const char *argv[] ) { /* locate the option value */ if (*arg) { /* argument is continuation of option */ *op->op_stringp = arg; return 1; } else if (argv[1]) { /* argument follows option */ *op->op_stringp = argv[1]; return 2; } else { bad_option(progname, (struct option *)0, " option -%c requires another argument", op->op_char ); return 0; /*NOTREACHED*/ } } static void bad_option( const char *progname, const struct option *optlist, char *msg, int c ) { fprintf(stderr, "%s: ", progname); fprintf(stderr, &msg[1], c); fprintf(stderr, "\n"); if (msg[0] != ' ') { const struct option *op; fprintf(stderr, "Possible options are:\n"); for (op = optlist; op->op_char; op++) { fprintf(stderr, "\t-%c%c\t%s\n", op->op_char, op->op_indicator, op->op_text ); } } exit(1); } similarity-tester-2.70.orig/TechnReport0000644000000000000000000001725211754532131015152 0ustar CONCISE REPORT ON THE ALGORITHMS IN SIM 970623 INTRODUCTION The general outline of the similarity checker is as follows: 1. the files are read in (pass 1) 2. a forward-reference table is prepared 3. the set of interesting runs is determined 4. the line numbers of the runs are determined (pass 2) 5. the contents of the runs are printed in order (pass 3) To keep the memory requirements (relatively) small, the exact positions of the tokens are not recorded. This necessitates pass 2. See, however, the pertinent chapter. READING THE FILES Each file is tokenized using an lex-generated scanner appropriate for the input. Each token fits in one byte, possibly using all 8 bits. The tokens are stored in the array Token_Array[], which is extended by reallocation if it overflows. See tokenarray.c. 
Also, to optimize away pass 2, an attempt is made to remember the token positions of all beginnings of lines. The token-positions at BOL are stored in the array nl_buff[], which is also extended by reallocation, if needed. If the attempt fails due to lack of memory, nl_buff[] is abandoned, and pass2 will read the files instead.

PREPARING THE FORWARD-REFERENCE TABLE

Text is compared by comparing every substring to all substrings to the right of it; this process is in essence quadratic. However, only substrings of length at least 'MinRunSize' are of interest, which gives us the possibility to speed up this process by using a hash table.

Once the entire text has been read in, a forward-reference table forward_references[] is made (see hash.c). For every position in the text, we construct an index which gives the next position in the text where a run of MinRunSize tokens starts that has the same hash code. If there is no such run, the index is 0.

To fill in this array, we use a hash table last_index[], such that last_index[i] is the index of the latest token with hash_code i, or 0 if there is none. If at a given position p, we find that the text ahead of us has hash code i, last_index[i] tells us which position in forward_references[] will have to be updated to p. See Make_Forward_References().

For long text sequences (say hundreds of thousands of tokens), the hashing is not really efficient any more since too many spurious matches occur. Therefore, the forward reference table is scanned a second time, eliminating from any chain all references to runs that do not start with and end in the same token (actually this is a second hash code). For the UNIX manuals this reduced the number of matches from 91.9% to 1.9% (of which 0.06% was genuine).
DETERMINING THE SET OF INTERESTING RUNS

The overall structure of the routine Compare_Files() (see compare.c) is:

	for all new files
		for all texts it must be compared to
			for all positions in the new file
				for all positions in the text
					for ever increasing sizes
						try to match and keep the best

If for a given position in the new file a good run (i.e. one of at least minimum length) has been found, the run is registered using a call of add_run(), the run is skipped in the new file and searching continues at the position after it. This prevents duplicate reports of runs.

Add_run() allocates a struct run for the run (see sim.h) which contains two struct chunks and a quality description. It fills in the two chunks with the pertinent info, one for the first file and one for the second (which may be the same, if the run relates two chunks in the same file).

The run is then entered into the arbitrary-in-sorted-out store AISO (see aiso.spc and aiso.bdy, a genuine generic abstract data type in C!), in which it is inserted according to its quality. Both positions (struct position) in both chunks in the run (so four in total) are each entered in a linked list starting at the tx_pos field in the struct text of the appropriate file.

When this is finished, the forward reference table can be deleted. So the final results of this phase are visible both through the tx_pos fields and through the aiso interface.

DETERMINING THE EXACT POSITION OF EACH RUN (PASS 2)

The purpose of this pass is to find for each chunk, which up to now is known by token position only, its starting and ending line number (which cannot be easily derived from the token position).

For each file that has a non-zero tx_pos field, i.e. that has some interesting chunks, the positions in the tx_pos list are sorted on ascending line number (they have been found in essentially arbitrary order) by sort_pos() in pass2.c. Next we scan the pos list and the file in parallel, updating the info in a position when we meet it.
A position carries an indication whether it is a starting or an ending position, since slightly differing calculations have to be done in each case. Actually, if the nl_buff[] data structure still exists, the file is not accessed at all and the data from nl_buff[] is used instead. This is done transparently in buff.c. PRINTING THE CONTENTS OF THE RUNS (PASS 3) Since each struct run has now been completely filled in, this is simple; the hard work is calculating the page layout. Pass3() accesses the aiso store and retrieves from it the runs in descending order of importance. Show_run() opens both files, positions them using the line numbers and prints the runs. ================================================================ CODE EXCERPT OF THE SOFTWARE SIMILARITY TESTER SIM (980222) sim: get command line options check the options init language, to precompute tables pass1, read the files # there is an array Token_Array[] that holds all input tokens make forward reference table # there is an array forward_references[], with one entry for # each token in the input; forward_references[i] gives the # token number where a token sequence starts with the same # hash value as the one starting at i compare various files to find runs delete forward reference table pass2, find newline positions of found similarities pass3, print the similarities pass1, read the files: for each file divide the text into tokens store all tokens except newlines in Token_Array and try to keep a record of the newline positions make forward reference table: # there are two independent hash functions, hash1() and hash2(). 
# hash1(i) gives the hash value of the token sequence starting at i # likewise for hash2(i) set up the forward references using the last_index table: # there is an array last_index[], with one entry for each # possible hash value; last_index[i] gives the position in # forward_references[] at which i was most recently # encountered as a hash value for each file for all positions in file except the last MinRunSize set forward_references[] and update last_index[] use hash2() to clean out matches: for all tokens find first token in chain with same hash2 code short-circuit forward reference to it compare: for all new files for all texts it must be compared to for all positions in the new file for all positions in the text for ever increasing sizes try to match and keep the best try to match and keep the best: # using forward_references[], we find a list of positions in # which a matching token sequence will start; # scanning this list, we measure the maximum length of the # match and add the longest match to the run collection pass2, find positions of found runs: for all files: sort the positions in the runs # we scan the pos list and the file in parallel for all positions inside this file if it matches a token position in a run record line number pass3, print the similarities: for all runs # a run consists of two chunks open the files that hold the chunks and position them at the beginning of the chunk display the chunks similarity-tester-2.70.orig/language.h0000644000000000000000000000141011764602256014722 0ustar /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: language.h,v 1.7 2012-06-09 08:09:18 Gebruiker Exp $ */ /* The *lang.l files provide two interfaces: language.[ch] static data about the language lang.[ch] dynamic data about the input file's content This is language.[ch]. 
*/ /* The abstract class 'language' defines the routines Init_Language(), May_Be_Start_Of_Run() and Best_Run_Size(), which describe some properties of the language. These routines are provided by the *lang.l files. There is a dummy implementation language.c. */ extern void Init_Language(void); extern int May_Be_Start_Of_Run(Token ch); extern unsigned int Best_Run_Size(const Token *str, unsigned int size); similarity-tester-2.70.orig/m2lang.l0000644000000000000000000001542411764421215014327 0ustar %{ /* This file is part of the software similarity tester SIM. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: m2lang.l,v 2.18 2012-06-08 16:04:29 Gebruiker Exp $ */ /* Modula-2 language front end for the similarity tester. Author: Dick Grune */ #include "options.h" #include "token.h" #include "language.h" #include "algollike.h" #include "idf.h" #include "lex.h" #include "lang.h" /* General language front end data */ Token lex_token; unsigned int lex_nl_cnt; unsigned int lex_tk_cnt; unsigned int lex_non_ascii_cnt; /* Language-dependent data */ /* Most Modula-2 programs start with a number of IMPORTs that look very similar from program to program. These are skipped by ignoring the reserved words IMPLEMENTATION, DEFINITION, MODULE, IMPORT and FROM, having a flag skip_imports, and start reacting only at the first non-ignored reserved word. Also, the nesting comments require a state variable. 
*/ /* Additional state variables, set in yystart() */ static int skip_imports; static int comment_level; /* Data for module idf */ static const struct idf reserved[] = { {"AND", NORM('&')}, {"ARRAY", NORM('A')}, {"BEGIN", NORM('{')}, {"BY", NORM('B')}, {"CASE", NORM('c')}, {"CONST", NORM('C')}, {"DEFINITION", No_Token}, {"DIV", NORM('/')}, {"DO", NORM('D')}, {"ELSE", NORM('e')}, {"ELSIF", NORM('e')}, {"END", NORM('}')}, {"EXIT", NORM('E')}, {"EXPORT", CTRL('E')}, {"FOR", NORM('F')}, {"FROM", No_Token}, {"IF", NORM('i')}, {"IMPLEMENTATION", No_Token}, {"IMPORT", No_Token}, {"IN", NORM('I')}, {"LOOP", NORM('l')}, {"MOD", NORM('%')}, {"MODULE", No_Token}, {"NOT", NORM('~')}, {"OF", No_Token}, {"OR", NORM('O')}, {"POINTER", NORM('p')}, {"PROCEDURE", NORM('P')}, {"QUALIFIED", NORM('q')}, {"RECORD", NORM('r')}, {"REPEAT", NORM('R')}, {"RETURN", CTRL('r')}, {"SET", NORM('s')}, {"THEN", No_Token}, {"TO", NORM('t')}, {"TYPE", NORM('T')}, {"UNTIL", NORM('u')}, {"VAR", NORM('v')}, {"WHILE", NORM('w')}, {"WITH", NORM('W')}, }; static const struct idf standard[] = { {"ABS", META('a')}, {"ADDRESS", META('A')}, {"ALLOCATE", MTCT('A')}, {"BITSET", META('b')}, {"BOOLEAN", META('B')}, {"CAP", META('c')}, {"CARDINAL", META('C')}, {"CHAR", MTCT('C')}, {"CHR", META('x')}, {"DEALLOCATE", META('d')}, {"DEC", META('D')}, {"EXCL", META('e')}, {"FALSE", META('f')}, {"FLOAT", META('F')}, {"HALT", META('h')}, {"HIGH", META('H')}, {"INC", META('i')}, {"INCL", META('I')}, {"INTEGER", MTCT('I')}, {"LONGCARD", META('L')}, {"LONGINT", META('L')}, {"LONGREAL", META('L')}, {"MAX", META('m')}, {"MIN", META('M')}, {"NEWPROCESS", META('n')}, {"NIL", META('N')}, {"ODD", META('o')}, {"ORD", META('O')}, {"PROC", META('p')}, {"REAL", META('r')}, {"SIZE", META('s')}, {"SYSTEM", META('S')}, {"TRANSFER", META('t')}, {"TRUE", META('T')}, {"TRUNC", MTCT('T')}, {"VAL", META('v')}, {"WORD", META('w')} }; /* Special treatment of identifiers */ static Token idf2token(int hashing) { Token tk; /* the token can be on 
two lists, reserved and standard */ tk = idf_in_list(yytext, reserved, sizeof reserved, IDF); /* is it one of the keywords to be ignored? */ if (Token_EQ(tk, No_Token)) return tk; /* The statement below is a significant comment on the value of state variables. */ if (!Token_EQ(tk, IDF)) { /* reserved word, stop the skipping */ skip_imports = 0; } else { /* it is an identifier but not a reserved word */ if (skip_imports) { /* skip it */ tk = 0; } else { /* look further */ tk = idf_in_list(yytext, standard, sizeof standard, IDF); if (Token_EQ(tk, IDF) && hashing) { /* return a one-Token hash code */ tk = idf_hashed(yytext); } } } return tk; } /* Token sets for module algollike */ const Token Non_Finals[] = { IDF, /* identifier */ NORM('{'), /* also BEGIN */ NORM('('), NORM('['), NORM('A'), /* ARRAY */ NORM('c'), /* CASE */ NORM('C'), /* CONST */ NORM('E'), /* EXIT */ NORM('F'), /* FOR */ NORM('i'), /* IF */ NORM('l'), /* LOOP */ NORM('p'), /* POINTER */ NORM('P'), /* PROCEDURE */ NORM('r'), /* RECORD */ NORM('R'), /* REPEAT */ CTRL('R'), /* RETURN */ NORM('s'), /* SET */ NORM('T'), /* TYPE */ NORM('v'), /* VAR */ NORM('w'), /* WHILE */ NORM('W'), /* WITH */ No_Token }; const Token Non_Initials[] = { NORM('}'), NORM(')'), NORM(']'), NORM(';'), No_Token }; const Token Openers[] = { NORM('{'), NORM('('), NORM('['), No_Token }; const Token Closers[] = { NORM('}'), NORM(')'), NORM(']'), No_Token }; /* Language-dependent code */ void Init_Language(void) { Init_Algol_Language(Non_Finals, Non_Initials, Openers, Closers); } int May_Be_Start_Of_Run(Token ch) { return May_Be_Start_Of_Algol_Run(ch); } unsigned int Best_Run_Size(const Token *str, unsigned int size) { return Best_Algol_Run_Size(str, size); } %} %option noyywrap %Start Comment Layout ([ \t\r\f]) ASCII95 ([- !"#$%&'()*+,./0-9:;<=>?@A-Z\[\\\]^_`a-z{|}~]) AnyQuoted (\\.) 
QuStrChar ([^"\n\\]|{AnyQuoted}) ApoStrChar ([^'\n\\]|{AnyQuoted}) StartComment ("(*") EndComment ("*)") SafeComChar ([^*\n]) UnsafeComChar ("*") Digit ([0-9a-fA-F]) Idf ([A-Za-z][A-Za-z0-9_]*) %% {StartComment} { /* See clang.l */ /* Lex itself is incapable of handling Modula-2's nested comments. So let's help it a bit. */ if (comment_level == 0) { BEGIN Comment; } comment_level++; } {SafeComChar}+ { /* safe comment chunk */ } {UnsafeComChar} { /* unsafe char, read one by one */ } "\n" { /* to break up long comments */ return_eol(); } {EndComment} { /* end-of-comment */ comment_level--; if (comment_level == 0) { BEGIN INITIAL; } } \"{QuStrChar}*\" { /* quoted strings */ return_ch('"'); } \'{ApoStrChar}*\' { /* apostrophed strings */ return_ch('"'); } {Digit}+("B"|"C"|"H")? { /* numeral, passed as an identifier */ return_tk(IDF); } "END"{Layout}*{Idf} { /* ignore identifier after END */ Token tk = idf_in_list("END", reserved, sizeof reserved, No_Token); if (!Token_EQ(tk, No_Token)) return_tk(tk); } {Idf}/"(" { /* identifier in front of ( */ Token tk = idf2token(is_set_option('F')/* hashing option */); if (!Token_EQ(tk, No_Token)) return_tk(tk); } {Idf} { /* identifier */ Token tk = idf2token(0 /* no hashing */); if (!Token_EQ(tk, No_Token)) return_tk(tk); } "<>" { /* <>, special equivalence */ return_ch('#'); } \; { /* semicolon, conditionally ignored */ if (is_set_option('f')) return_ch(yytext[0]); } \n { /* count newlines */ return_eol(); } {Layout} { /* ignore layout */ } {ASCII95} { /* copy other text */ if (!skip_imports) return_ch(yytext[0]); } . { /* count non-ASCII chars */ lex_non_ascii_cnt++; } %% /* More language-dependent code */ void yystart(void) { skip_imports = 1; comment_level = 0; BEGIN INITIAL; } similarity-tester-2.70.orig/debug.c0000644000000000000000000000313511710073551014215 0ustar /* This file is part of the debugging module DEBUG. Written by Dick Grune, Vrije Universiteit, Amsterdam. 
$Id: debug.c,v 1.5 2012-01-25 21:43:05 Gebruiker Exp $ */ #include #include #include #include "debug.h" #ifdef DEBUG static void wr_char(char ch) { write(2, &ch, 1); } static void wr_num(int b,int v) { if (v >= b) { wr_num(b, v/b); } wr_char("0123456789ABCDEF"[v%b]); } static void wr_str(const char *s) { while (*s) { wr_char(*s++); } } void wr_info(const char *s, int b, int v) { /* print the string */ if (s) { int cnt = 0; while (*s) { int ch = *s++ &0377; /* cut short a possibly corrupted string */ if (cnt++ > 50) { wr_str("..."); break; } /* put not thy faith in chars, signed or unsigned */ if (isprint(ch)) { wr_char(ch); } else { switch (ch) { case '\n': wr_str("\\n"); break; case '\t': wr_str("\\t"); break; case '\r': wr_str("\\r"); break; case '\f': wr_str("\\f"); break; default: wr_char('\\'); wr_char(ch / 0100 % 010 + '0'); wr_char(ch / 010 % 010 + '0'); wr_char(ch / 01 % 010 + '0'); break; } } } } else { wr_str(""); } /* print the value */ if (b != 0) { wr_char(' '); if (v < 0) { wr_char('-'); v = -v; } switch (b) { case 8: wr_char('0'); wr_num(b, v); break; default: wr_num(10, v); break; case 16: wr_char('#'); wr_num(b, v); break; case 128: wr_char(v); break; } } wr_char('\n'); } #else /*ARGSUSED*/ void wr_info(const char *s, int b, int v) { } #endif similarity-tester-2.70.orig/Malloc.h0000644000000000000000000001002111766062230014336 0ustar /* This file is part of the memory management and leak detector MALLOC. Written by Dick Grune, Vrije Universiteit, Amsterdam. $Id: Malloc.h,v 1.4 2012-06-13 09:59:52 Gebruiker Exp $ */ #include /***** The files Malloc.[ch] provide several functionalities: - checking for "out of memory": to simplify programming - allocating memory using new(type) " " " " - detecting memory leaks: to obtain cleaner programs - clobbering freshly allocated memory: to obtain safer programs The module defines several sets of routines: 1. 
void *Malloc(size_t s) void *Calloc(int n, size_t s) void *Realloc(void *p, size_t s) void Free(void *p) 2. void *TryMalloc(size_t s) void *TryCalloc(int n, size_t s) void *TryRealloc(void *p, size_t s) 3. T *new(T) char *new_string(const char *s) 4. void ReportMemoryLeaks(FILE *f) * The members of the first set act like their Unix counterparts, except that they never return NULL; upon out-of-memory an error message is given on standard error, showing the file name and the line number of the call. Since in almost all cases there is nothing more intelligent to do, this is almost always adequate, and makes for simpler and safer programming. In those rare cases that the program *can* continue when out of memory, the routines in the second set can be used; they act exactly like their Unix counterparts. Note that automatic out-of-memory detection is active, regardless of the -DMEM... flags described below. * A call of new(T), with T any type, yields a pointer of type T* to a block of type T, allocated using Malloc(). A call of new_string(s), with s a string, yields a pointer to a copy of s, allocated using Malloc(); it is equivalent to strdup() except that it uses Malloc(). * Normally, a call of ReportMemoryLeaks() does nothing, but when Malloc.c is compiled with -DMEMLEAK, it produces a compacted list of allocated but not yet freed blocks on the stream f, with information about where they were allocated. This is useful to get insight into memory use and abuse. * When Malloc.c is compiled with -DMEMCLOBBER, it clobbers all newly allocated memory from Malloc() and Realloc() just after allocation, and all freed memory just before freeing it. An area is clobbered by overwriting it with a wacky bit pattern. This is done in the hope that improper use of memory will cause some evident error somewhere. The routine that performs the clobbering, MemClobber(void *p, size_t size), is available regardless of the -DMEMCLOBBER compilation option. 
It can be used to create comparison patterns.

* Compiled with any of the -DMEM... flags, Malloc will also produce run-time
  error messages for multiple Free()s of the same block, and Realloc()s on
  not-allocated blocks. It then allows the program to continue.

* The system consumes hardly any time and is fast enough to be kept active all
  the time.
*****/

/*	Set 1: each macro forwards to the corresponding _leak_* routine,
	adding the call site (__FILE__, __LINE__) and passing 1 as the 'chk'
	argument.
	NOTE(review): chk presumably selects the never-returns-NULL behaviour
	described for this set; the implementation is in Malloc.c -- confirm.
*/
#define	Malloc(s)	(_leak_malloc(1, (size_t)(s), __FILE__, __LINE__))
#define	Calloc(n,s)	(_leak_calloc(1, (n), (size_t)(s), __FILE__, __LINE__))
#define	Realloc(p,s)	(_leak_realloc(1, (void *)(p), (size_t)(s), __FILE__, __LINE__))

/*	Set 2: the same routines, called with 'chk' equal to 0. */
#define	TryMalloc(s)	(_leak_malloc(0, (size_t)(s), __FILE__, __LINE__))
#define	TryCalloc(n,s)	(_leak_calloc(0, (n), (size_t)(s), __FILE__, __LINE__))
#define	TryRealloc(p,s)	(_leak_realloc(0, (void *)(p), (size_t)(s), __FILE__, __LINE__))

/*	Free() forwards to _leak_free(), again recording the call site. */
#define	Free(p)		(_leak_free((void *)(p), __FILE__, __LINE__))

/*	Set 3: new(type) yields a 'type *' to a block of sizeof (type) bytes
	obtained through Malloc(); new_string(s) forwards to _new_string()
	with the call site.
*/
#define	new(type)	((type *)Malloc(sizeof (type)))
#define	new_string(s)	(_new_string((s), __FILE__, __LINE__))

/*	The routines behind the macros above; fname and l_nmb receive the
	__FILE__ and __LINE__ of the call. They are meant to be reached
	through the macros.
*/
extern void *_leak_malloc(int chk, size_t size, const char *fname, int l_nmb);
extern void *_leak_calloc(int chk, int n, size_t size, const char *fname, int l_nmb);
extern void *_leak_realloc(int chk, void *addr, size_t size, const char *fname, int l_nmb);
extern void _leak_free(void *addr, const char *fname, int l_nmb);

/*	Set 4 and the clobbering routine, both described in the comment
	above.
*/
extern void ReportMemoryLeaks(FILE *f);
extern void MemClobber(void *p, size_t size);

/*	The routine behind new_string(). */
extern char *_new_string(const char *s, const char *fname, int l_nmb);
similarity-tester-2.70.orig/option-i.inp0000644000000000000000000000004412055463314015230 0ustar pass1.c pass2.c / pass3.c ../teckel