snowdrop/0040755000000000000000000000000007757265777011470 5ustar rootrootsnowdrop/TODO0100644000000000000000000000077207537020010012120 0ustar rootroot1) Rework resync algorithm 2) Add multiword synonyms to eng module 3) Fix / rework C module 4) Add covert channels in C code: - ; after } 1 bit get orig: If notcomment nottext If current is } and next isn't ;, set a flag foo1 get water: If current is } and foo1, and next is ;, append ; to cur - ! versus == 0 1 bit - (sth) versus != 0 1 bit - adding / removing inlines, static 1 bit - adding signed to char, float, int, long w/o unsigned 1 bit - while -> for 1 bit snowdrop/snowdrop.c0100644000000000000000000006007407757262420013471 0ustar rootroot/* snowdrop - text watermarking and watermark recovery --------------------------------------------------- Copyright (C) 2002 by Michal Zalewski This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. As a special exception, this program may be linked with the OpenSSL library, despite that library's more restrictive license. This file implements a language-independent watermark injection and recovery. */ #include #include #include #include #include #include #include #include #include #include #ifdef USE_OPENSSL #include #else #include #include #define MD5_Init MD5Init #define MD5_Final MD5Final #define MD5_Update MD5Update #endif /* USE_OPENSSL */ #include #include "language.h" #define MAXATOMS 100000 // Max number of words in a document. #define EXPENSELIMIT 4 // Atoms with higher storage capacity are // too risky and expensive for resyncs. #define MINSYNCLEN 10 // Minimum number of matching atoms to // establish synchronization. 
/* void dump_binstream(const char* str,int blen) { int off=0; for (off=0;off>= 1; if (carry) x |= (((unsigned long long)1) << (bits-1)); } return x; } inline unsigned long long rol(unsigned long long x,int cnt,int bits) { return ror(x,bits-cnt,bits); } static char* argv0; int use_64bit; int nb=32; float nbf=32; static void usage(void) { debug("Usage: %s [ -6 ] -e origfile newfile\n" " %s [ -6 ] -l\n" " %s [ -6 ] -i origfile newfile \"Recipient\" [ \"Comment\" ] \n\n",argv0,argv0,argv0); debug("First method of calling the program (with -e option) enables watermark\n" "extraction mode. In this mode, file passed as a first parameter must be\n" "the original document used to generate second file (or its portions).\n\n" "Second method (-l) simply lists the contents of the watermark database\n" "for the module you're now running.\n\n" "Third method (-i) enables watermark injection mode. File 'origfile' is\n" "modified and saved as 'newfile'. Mandatory parameter is the recipient\n" "identifier. 
Optional comment can be added for your refernece.\n\n" "Additional parameter -6 enables strong, 64-bit watermarking that is suitable\n" "for providing public documentation of watermarked document abuse.\n\n"); module_help(); debug("\n"); exit(0); } unsigned int result[4]; unsigned int wmark[4]; MD5_CTX kuku; static unsigned int get_random(void) { int x,r,r2; x=open("/dev/urandom",O_RDONLY); if (x<0) fatal("cannot open /dev/urandom"); if (read(x,&r,4)!=4) fatal("cannot read from /dev/urandom"); close(x); x=open("/dev/urandom",O_RDONLY); if (x<0) fatal("cannot open /dev/urandom"); if (read(x,&r2,4)!=4) fatal("cannot read from /dev/urandom"); close(x); if (r == r2) fatal("/dev/urandom is deterministic"); return r2; } struct dbent { char* fn,*rcpt,*cmt; int f1,f2,m1,m2; unsigned long long wm; int tim; }; #define MAXDB 10240 struct dbent db[MAXDB]; int dbtop; static int gotothermd5; static void load_database(void) { int line=0; FILE* foo; char buf[MAXBUF]; sprintf(buf,"%s/.snowdrop/",getenv("HOME")); strcat(buf,"database"); foo=fopen(buf,"r"); if (!foo) fatal("cannot open database file %s",buf); debug("[*] Loading database for module %s, conf %08x...\n",TARGETLANG,md5_importantstuff()); while (fgets(buf,MAXBUF,foo)) { char modname[MAXBUF],cmt[MAXBUF],rcpt[MAXBUF],fn[MAXBUF]; int tim,dict,f1,f2,m1,m2; char type; unsigned long long wmark; if (sscanf(buf,"%c:[%[a-z]] %u\xad%x\xad%[ -~]\xad%x\xad%x\xad%[ -~]\xad%x\xad%x\xad%Lx" "\xad%[ -~]",&type,modname,&tim,&dict,fn,&f1,&f2,rcpt,&m1,&m2,&wmark,cmt)!=12) fatal("malformed database line %d",line+1); line++; if (type=='3' && use_64bit) continue; if (type=='6' && !use_64bit) continue; if (strcmp(TARGETLANG,modname)) continue; if (dict != md5_importantstuff()) { gotothermd5=1; continue; } db[dbtop].fn=strdup(fn); if (!db[dbtop].fn) fatal("not enough memory"); db[dbtop].rcpt=strdup(rcpt); if (!db[dbtop].rcpt) fatal("not enough memory"); db[dbtop].cmt=strdup(cmt); if (!db[dbtop].cmt) fatal("not enough memory"); db[dbtop].tim=tim; 
db[dbtop].f1=f1; db[dbtop].f2=f2; db[dbtop].m1=m1; db[dbtop].m2=m2; db[dbtop].wm=wmark; dbtop++; } if (dbtop) { debug("[+] Watermarks database: %d lines, loaded %d entries.\n",line,dbtop); if (gotothermd5) debug("[!] Skipped %d entries created from other conf files.\n",gotothermd5); } else { if (!gotothermd5) debug("[-] The database for this module is empty.\n"); else debug("[-] No entries matching current conf file. Try passing different\n" " configuration files as parameters to this module.\n"); fatal("no data loaded"); } } static void list_database(void) { int i; load_database(); debug("\n"); for (i=0;i0) set_value(foo,rand() % q,3-j); cap[3-j]+=q; tots+=q; } } debug("[+] Calculated input storage redundancy / size:\n"); debug(" Overall redundancy : %.02f (%d bits)\n",((float)tots)/nbf,tots); debug(" Whitespace channel : %.02f (%d bits)\n",((float)cap[0])/nbf,cap[0]); debug(" Grammar channel : %.02f (%d bits)\n",((float)cap[1])/nbf,cap[1]); debug(" Formatting channel : %.02f (%d bits)\n",((float)cap[2])/nbf,cap[2]); debug(" Substitution channel : %.02f (%d bits)\n",((float)cap[3])/nbf,cap[3]); // Create per-channel storage space. 
chan[0]=malloc(cap[0]/8+8); if (!chan[0]) fatal("not enough memory"); chan[1]=malloc(cap[1]/8+8); if (!chan[1]) fatal("not enough memory"); chan[2]=malloc(cap[2]/8+8); if (!chan[2]) fatal("not enough memory"); chan[3]=malloc(cap[3]/8+8); if (!chan[3]) fatal("not enough memory"); bigchan=malloc(tots/8+8); if (!bigchan) fatal("not enough memory"); off[0]=0; off[1]=cap[0]>=nb?(nb/4):cap[0]; off[2]=cap[1]>=nb?(nb/2):((off[1]+cap[1])%nb); off[3]=cap[2]>=nb?(3*nb/4):((off[2]+cap[2])%nb); debug(" Channel offsets are : %d, %d, %d, %d\n",off[0],off[1],off[2],off[3]); for (i=0;i MINSYNCLEN) { S=get_water_pos(); synced=1; if (atomcnt <= MINSYNCLEN+2) { if (good_cnt==MINSYNCLEN+1) debug("[+] Files synchronized from the beginning, we're lucky.\n"); oncesynced=1; } else { if (good_cnt==MINSYNCLEN+1) debug(" done!\n[+] Files synchronized near atom %d [%d:%d:%d:%d]...\n",atomcnt,chpos[0],chpos[1],chpos[2],chpos[3]); oncesynced=1; } } if (!zoo) { debug("[!] Watermarked file truncated.\n"); break; } z=get_value(foo,zoo,&si[0],&va[0],0); // debug("CAPABILITY %s : %d %d %d %d (%d %d %d %d)\n",foo,si[0],si[1],si[2],si[3],chpos[0],chpos[1],chpos[2],chpos[3]); // if (!z) debug("not synced: [%s] - [%s]\n",foo,zoo); // else debug("synced (%d %d %d %d): [%s] - [%s]\n",si[0],si[1],si[2],si[3],foo,zoo); if (z) good_cnt++; else { synced=0; if (good_cnt>MINSYNCLEN) debug("[!] Sync lost with the watermarked file near atom %d [%d:%d:%d:%d]: ",atomcnt,chpos[0],chpos[1],chpos[2],chpos[3]); else if (!oncesynced) { oncesynced=1; debug("[!] 
Files not identical, trying to synchronize...\n"); debug("[+] Processing: "); } good_cnt=0; while (foo) { int rskip; char* text=foo; while (isspace(*text)) text++; set_water_pos(S); rskip=0; while ((zoo=get_water_atom())) { // debug("Resyncing %s <-> %s\n",foo,zoo); if (rskip) continue; z=get_value(foo,zoo,&si[0],&va[0],1); if ((!z) && (si[0] + si[1] + si[2] + si[3] > EXPENSELIMIT)) { // debug("Item %s considered too expensive for resync, skipping...\n",foo); rskip=1; continue; } if ((!z) && si[3]) { // Resyncing at a synonym is way too expensive in most cases. // debug("Item %s considered too expensive for resync, skipping...\n",foo); rskip=1; continue; } if (z && strlen(text)>1) { z=get_value(foo,zoo,&si[0],&va[0],0); goto gotsync; } else if (z) break; } // Failed to find any matching water atom. Fake storing // some value (just skip some space), and proceed to the next // input atom. // Let's fake something... z=get_value(foo,".SKIPME-PLEASE.",&si[0],&va[0],0); for (z=0;z<4;z++) if (si[z]>0) chpos[z]+=si[z]; // debug("CAPABILITY[2] %s : %d %d %d %d (%d %d %d %d)\n",foo,si[0],si[1],si[2],si[3],chpos[0],chpos[1],chpos[2],chpos[3]); foo=get_orig_atom(); if (!foo || strchr(foo,'\n')) debug("."); // debug("<%s>\n",foo); atomcnt++; } // Whoopsie! We shouldn't be here that long. if (!foo) { debug(" EOF\n[!] Failed to resync before the end of the original file.\n"); goto bailout; } break; } gotsync: for (z=0;z<4;z++) if (si[z]>0) { append_binary(chan[z],chpos[z],(char*)&va[z],si[z]); chpos[z]+=si[z]; rd[z]+=si[z]; trd+=si[z]; } } bailout: if (get_water_atom()) debug("[!] 
Trailing garbage in the watermarked file.\n"); append_binary(bigchan,0,chan[0],chpos[0]); bigtop=bigpos=chpos[0]; while ((bigpos % nb) != off[1]) bigpos--; append_binary(bigchan,bigpos,chan[1],chpos[1]); bigpos+=chpos[1]; if (bigpos>bigtop) bigtop=bigpos; else bigpos=bigtop; while ((bigpos % nb) != off[2]) bigpos--; append_binary(bigchan,bigpos,chan[2],chpos[2]); bigpos+=chpos[2]; if (bigpos>bigtop) bigtop=bigpos; else bigpos=bigtop; while ((bigpos % nb) != off[3]) bigpos--; append_binary(bigchan,bigpos,chan[3],chpos[3]); bigpos+=chpos[3]; if (bigpos>bigtop) bigtop=bigpos; else bigpos=bigtop; debug("[+] Successfully read %d bits of data of %d expected.\n",trd,tots); debug("[+] Constructed an uniform stream of %d bits.\n",bigpos); if (trd) debug(" Overall retreived : %d bits (%0.02f%%)\n",trd,((float)(trd))*100.0/((float)tots)); else debug(" Overall retreived : 0 bits (n/a)\n"); if (cap[0]) debug(" Whitespace channel : %d bits (%0.02f%%)\n",rd[0],((float)(rd[0]))*100.0/((float)cap[0])); else debug(" Whitespace channel : 0 bits (n/a)\n"); if (cap[1]) debug(" Grammar channel : %d bits (%0.02f%%)\n",rd[1],((float)(rd[1]))*100.0/((float)cap[1])); else debug(" Grammar channel : 0 bits (n/a)\n"); if (cap[2]) debug(" Formatting channel : %d bits (%0.02f%%)\n",rd[2],((float)(rd[2]))*100.0/((float)cap[2])); else debug(" Formatting channel : 0 bits (n/a)\n"); if (cap[3]) debug(" Substitution channel : %d bits (%0.02f%%)\n",rd[3],((float)(rd[3]))*100.0/((float)cap[3])); else debug(" Substitution channel : 0 bits (n/a)\n"); if (trd> 8*f) & 0xff)) { gotpiece++; break; } i+=nb/8; } } // Got all pieces of the puzzle? 
if (gotpiece==nb/8) { char ctim[100]; strcpy(ctim,ctime((void*)&db[j].tim)); if (strchr(ctim,'\n')) *strchr(ctim,'\n')=0; debug("\n[+] This document possibly matches fragmented entry %d:\n" " Source file : %s\n" " Time : %s\n" " Recipient : %s\n" " Comment : %s\n" " Source MD5 : %08x-%08x\n" " Magic value : %08x-%08x\n" " Watermark : %016Lx\n", j,db[j].fn,ctim,db[j].rcpt,db[j].cmt, db[j].f1,db[j].f2,db[j].m1,db[j].m2,db[j].wm); gotsomething=1; } } if (gotsomething) { debug("\nPossibly matching document found, exiting.\n"); exit(0); } debug("[-] I am sorry. Unable to find any matching document.\n"); exit(1); } static void add_database(int dict,char* fn,int f1,int f2,char* rcpt,char* cmt,int m1,int m2,long long wmark) { FILE* foo; char *q; char buf[1024]; sprintf(buf,"%s/.snowdrop/",getenv("HOME")); mkdir(buf,0700); strcat(buf,"database"); foo=fopen(buf,"a"); if (!foo) fatal("cannot open database file %s",buf); if (!cmt) cmt=""; // Some chars are obviously not OK... while ((q=strchr(fn,0xad))) *q='-'; while ((q=strchr(rcpt,0xad))) *q='-'; while ((q=strchr(cmt,0xad))) *q='-'; while ((q=strchr(fn,'\n'))) *q=' '; while ((q=strchr(rcpt,'\n'))) *q=' '; while ((q=strchr(cmt,'\n'))) *q=' '; fprintf(foo,"%c:[%s] %u\xad%x\xad%s\xad%x\xad%x\xad%s\xad%x\xad%x\xad%Lx\xad%s\n",use_64bit?'6':'3',TARGETLANG,(int)time(0),dict,fn,f1,f2,rcpt,m1,m2,wmark,cmt); fclose(foo); } static char capseq[MAXATOMS][4]; static int vc; static void add_sig(char* orig,char* mod,char* to,char* comm) { char* x; char* foo; int cap[4]={0,0,0,0},tots=0; long long wm; unsigned int dict; unsigned int totstor=0; unsigned int totst[4]={0,0,0,0}; int dof[4]={0,0,0,0}; int writ=0; int i,siz,m1,m2; int off[4]; i=open(orig,O_RDONLY); if (i<0) fatal("cannot open input file %s",orig); siz=lseek(i,0,SEEK_END); lseek(i,0,SEEK_SET); if (siz<1) fatal("input file of zero length"); x=malloc(siz+1); if (!x) fatal("not enough memory to load input file"); read(i,x,siz); x[siz]=0; close(i); 
i=open(mod,O_RDWR|O_TRUNC|O_CREAT,0600); if (i<0) fatal("cannot open output file %s",mod); set_original(x); MD5_Init(&kuku); MD5_Update(&kuku,x,siz); MD5_Final((char*)result,&kuku); m1=get_random(); m2=get_random(); MD5_Init(&kuku); MD5_Update(&kuku,&m1,sizeof(int)); MD5_Update(&kuku,&m2,sizeof(int)); MD5_Update(&kuku,to,sizeof(strlen(to))); MD5_Final((char*)wmark,&kuku); if (!use_64bit) wm=wmark[0]^wmark[1]^wmark[2]^wmark[3]; else wm=(((long long)wmark[0]^wmark[1])) << 32 | (wmark[2]^wmark[3]); dict=md5_importantstuff(); debug("[+] Input file loaded successfully.\n"); debug(" Location : %s\n",orig); debug(" Size : %d bytes\n",siz); debug(" Signature : %08x-%08x\n",result[0] ^ result[1],result[2] ^ result[3]); debug(" Recipient : %s\n",to); debug(" Comment : %s\n",comm?comm:""); debug(" Magic : %08x-%08x\n",m1,m2); if (!use_64bit) debug(" Watermark : %08Lx\n",wm); else debug(" Watermark : %016Lx\n",wm); debug(" Conf MD5 : %08x\n",dict); add_database(dict,orig,result[0]^result[1],result[2]^result[3],to,comm,m1,m2,wm); vc=0; while ((foo=get_orig_atom())) { int j; vc++; if (vc>=MAXATOMS) fatal("file too big - MAXATOMS exceeded"); for (j=0;j<4;j++) { int q=get_storage(foo,3-j); cap[3-j]+=q; tots+=q; // Commit to our choices. 
if (q>0) set_value(foo,rand() % q,3-j); capseq[vc][3-j]=q; } } debug("[*] Computed capacity: %d bits overall.\n",tots); debug(" Target specification : %s [%s]\n",get_langdesc(),TARGETLANG); if (tots < nb) fatal("File capacity too low to carry the watermark (need %d bits)",nb); if (tots >= nb*2) debug(" Overall redundancy : %.02f (%d bits)\n",((float)tots)/nbf,tots); else debug(" WARNING: capacity too low, watermark will not be redundant!\n"); debug(" Whitespace channel : %.02f (%d bits)",((float)cap[0])/nbf,cap[0]); if (tots >= nb*2 && cap[0] < nb*2) debug(" - NOT redundant!"); debug("\n"); debug(" Grammar channel : %.02f (%d bits)",((float)cap[1])/nbf,cap[1]); if (tots >= nb*2 && cap[1] < nb*2) debug(" - NOT redundant!"); debug("\n"); debug(" Formatting channel : %.02f (%d bits)",((float)cap[2])/nbf,cap[2]); if (tots >= nb*2 && cap[2] < nb*2) debug(" - NOT redundant!"); debug("\n"); debug(" Substitution channel : %.02f (%d bits)",((float)cap[3])/nbf,cap[3]); if (tots >= nb*2 && cap[3] < nb*2) debug(" - NOT redundant!"); debug("\n"); off[0]=0; off[1]=cap[0]>=nb?(nb/4):cap[0]; off[2]=cap[1]>=nb?(nb/2):((off[1]+cap[1])%nb); off[3]=cap[2]>=nb?(3*nb/4):((off[2]+cap[2])%nb); debug(" Channel offsets are : %d, %d, %d, %d\n",off[0],off[1],off[2],off[3]); set_original(x); debug("[*] Embedding the watermark...\n"); vc=0; while ((foo=get_orig_atom())) { int j; vc++; if (vc>=MAXATOMS) fatal("strange things happening - MAXATOMS exceeded"); for (j=0;j<4;j++) { int q=get_storage(foo,3-j); int fox; if (q!=capseq[vc][3-j]) { debug("[!] 
Internal module bug: failed expectation in domain %d.\n" " Expected storage %d, got %d for atom %d [%s].\n",3-j, capseq[vc][3-j],q,vc,foo); fatal("internal bug"); } if (q>0) { totstor+=q; totst[3-j]+=q; fox=ror(wm,off[3-j],nb); fox=fox & ((1<=nb) off[3-j]-=nb; } } if (write(i,foo,strlen(foo))<0) fatal("cannot write to file"); writ+=strlen(foo); } close(i); if (totstor1 && !strcmp(argv[1],"-6")) { argc--; argv++; use_64bit=1; nbf=64; nb=64; debug("[*] Strong 64-bit watermarking mode enabled.\n"); } else debug("[*] Weak 32-bit watermarking used (use -6 to change it).\n"); if (argc==2 && !strcmp(argv[1],"-l")) list_database(); if (argc<4) usage(); if (!strcmp(argv[1],"-e")) { if (argc-4) usage(); extract_sig(argv[2],argv[3]); } else if (!strcmp(argv[1],"-i")) { if (argc<5 || argc>6) usage(); add_sig(argv[2],argv[3],argv[4],argv[5]); } else usage(); fatal("Broken Turing machines all over the place"); return 0; } static const char spell[] = "\n\n\n\n" "`How many Prolog programmers does it take to change a lightbulb?'\n" "`No.'\n\n\n\n"; snowdrop/Makefile0100644000000000000000000000536207757262440013113 0ustar rootroot# # snowdrop - text watermarking and watermark recovery # --------------------------------------------------- # # Copyright (C) 2002 by Michal Zalewski # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # As a special exception, this program may be linked with the # OpenSSL library, despite that library's more restrictive license. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # VER = 0.02b LANG = eng engf c BINROOT = /usr/bin/ CFLAGS = -ggdb -O9 -fomit-frame-pointer -funroll-loops -fexpensive-optimizations \ -ffast-math -Wall all: modules snowdrop toinstall modules: language.h @echo ; \ echo "[*] Compiling language modules:" ; \ test -d /usr/include/openssl && USEOPENSSL=1; \ test -d /usr/local/include/openssl && USEOPENSSL=1; \ test "$$USEOPENSSL" = "" || echo "[+] Using OpenSSL MD5 modules." ; \ test "$$USEOPENSSL" = "" && echo "[+] Trying to use RSA MD5 modules." ; \ for i in $(LANG); do \ echo "[+] Building language module for '$$i'..."; \ test "$$USEOPENSSL" = "" || ADDME="-DUSE_OPENSSL" ; \ $(CC) $$ADDME $(CFLAGS) -c lang-$$i.c -o lang-$$i.o || exit 1; \ done; \ echo "[*] Language modules compiled." snowdrop: snowdrop.c language.h @echo "[*] Compiling main code:"; \ test -d /usr/include/openssl && USEOPENSSL=1; \ test -d /usr/local/include/openssl && USEOPENSSL=1; \ for i in $(LANG); do \ echo "[+] Building 'sd-$$i'..." ; \ ADDME="-lmd5"; \ test "$$USEOPENSSL" = "" || ADDME="-DUSE_OPENSSL -lcrypto" ; \ $(CC) -DVER=\"$(VER)\" $(CFLAGS) -DTARGETLANG=\"$$i\" snowdrop.c lang-$$i.o -o sd-$$i $$ADDME || exit 1; \ done; \ echo "[*] Main code compiled." toinstall: @echo "Type 'make install' to install binaries in $(BINROOT)." @echo clean: rm -f sd-* *.o core core.* a.out install: modules snowdrop @echo "[*] Installing binaries in $(BINROOT)..." cp -f sd-* $(BINROOT) @echo "[*] Installing synonyms database..." @mkdir /usr/share/snowdrop || true cp synonyms /usr/share/snowdrop/ @echo "[*] Installation complete." 
publish: clean @ tar cfvz /snowdrop.tgz /snowdrop; \ scp -p /snowdrop.tgz lcamtuf@coredump.cx:/export/www/lcamtuf/snowdrop.tgz; \ rm -f /snowdrop.tgz snowdrop/language.h0100644000000000000000000000321307537771134013401 0ustar rootroot/* snowdrop - text watermarking and watermark recovery --------------------------------------------------- Copyright (C) 2002 by Michal Zalewski This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. As a special exception, this program may be linked with the OpenSSL library, despite that library's more restrictive license. Common language engine API. */ #ifndef HAVE_LANGUAGE_H #define HAVE_LANGUAGE_H #define MAXBUF 4096 #define DOMAIN_WHITE 0 // Whitespaces, comments #define DOMAIN_GRAMMAR 1 // Grammar changes (and typos) #define DOMAIN_FORMAT 2 // Formatting, notation #define DOMAIN_SYNONYMS 3 // Synonyms / name substitution void set_original(const char* buf); void set_watermarked(const char* buf); char* get_orig_atom(void); char* get_water_atom(void); int get_storage(const char* orig, const int domain); char* set_value(const char* orig,int value, const int domain); int get_value(const char* orig,const char* water,int* src,int* va,char test); int get_water_pos(void); void set_water_pos(int x); char* get_langdesc(void); // Helpers. unsigned int md5_importantstuff(void); void module_help(void); void md5_wrong(void); #define fatal(x...) exit( ( fputs("[-] FATAL: ",stderr) + \ fprintf(stderr,x) + \ fputs("\n\n",stderr) ) != 0 ) #define debug(x...) fprintf(stderr,x) #endif /* not HAVE_LANGUAGE_H */ snowdrop/README0100644000000000000000000006134407757262242012335 0ustar rootroot snowdrop - text watermarking and watermark recovery (ver. 
0.02b) ---------------------------------------------------------------- == http://lcamtuf.coredump.cx/snowdrop.tgz == *************************************** **** THIS IS A BETA STAGE MATERIAL **** *************************************** Copyright (C) 2002, 2003 by Michal Zalewski Submit bug reports, complaints, ideas, patches, ports and chocolate to Michal Zalewski at lcamtuf@coredump.cx. Contents: [1] Why would I possibly want to watermark a document? [2] What should I know about this approach? [3] Text watermarking: what to expect [4] C code watermarking: what to expect [5] General usage rules / bugs [6] Developing your own modules [1] Why would I possibly want to watermark a document? ------------------------------------------------------ Traditional watermarking relies on embedding some information in a binary file (such as a proprietary-format document - Adobe PDF, MS Word; or multimedia files) to identify the origin of a particular copy. Watermarking can be combined with steganography to hide this data from a casual viewer. Snowdrop is intended to bring (relatively) invisible and modification-proof watermarking to a new realm of "source material" - written word and computer source code. The information is not embedded in the least significant portions of some binary output, as it would be with traditional low-level steganography, but in the source itself. The idea, at least for English, isn't new - there was some serious work done by Mikhail Atallah from Purdue University. Snowdrop is merely an attempt to provide a reliable, useful tool to implement those source-level watermarking / steganography capabilities. Because of some tricks, such as using specially crafted MD5 shortcuts, it gives certain additional advantages to its potential users, such as integrity and privacy of embedded information, or an ability to demonstrate the origin of a document to the public (see section 2 for more details).
Separate logical channels are used to carry a highly redundant watermark to ensure it is extremely difficult to remove this information by accident, simple reformatting, etc. I am under the impression that both the computer community in general, and security researchers in particular, would benefit from having such a tool for many reasons. There are two main scenarios where watermarking capabilities provided by Snowdrop are particularly useful. One is protecting limited distribution work, such as advisories, exploits, licensed or closed source code, confidential research, internal corporate memos and other information that could eventually leak to the public. In such cases, embedding a unique watermark in every copy of the document would enable you to track down the leak - at the same time, only you would be able to decode or modify this information. This knowledge can be later demonstrated and documented. The second scenario is enforcing copyright. In case of plagiarism or copyright violation, it can be demonstrated that another person used a text or source code originating from you. This procedure does not prove who authored the document, merely demonstrating that party A published portions of a document received from party B, as opposed to publishing original work. Once again, the information can be recovered only by you, and cannot be altered in a meaningful way by third parties. While it is perfectly possible to intentionally remove a watermark from a document, the idea of using steganography makes it much more difficult to realize you are actually dealing with a watermarked document. In other words, unless you run every document routinely thru a "watermark remover" utility, there is very little chance you'd be aware of the watermark and thus attempt to remove it. At the same time, as mentioned above, it is difficult to remove the watermark by accident or simple modifications.
Snowdrop, in its current version, supports three operating modes: draft quality English language document, fine quality English language document, and C source code. It is relatively simple to implement new language modules (see section 6), and I encourage you to do so, both for programming and spoken languages. [2] What should I know about this approach? ------------------------------------------- More detailed information about how documents are being modified is provided in next two sections. First, I'd like to focus on some basic mechanisms implemented in the language-independent kernel. Embedded in the document are MD5 shortcuts of a specific 64-bit one-time magic value and the name of a recipient. This makes it practically impossible to guess who the recipient was, or to modify this information and mislead you. You can, however, verify MD5 watermark embedded in a file - you will need the original file without a watermark, and a database of magic values and recipient names. Having this information, you can demonstrate to others what MD5 value was actually embedded in the file (this cannot be determined without seeing the original), and how this MD5 value was created (by disclosing recipient's name and one-time magic). By doing that, you do not compromise the security of other signatures, because magic values are different for each copy. The advantages of using Snowdrop: - "Source-level" watermarks ensure medium-independent protection. You can store information into many files that otherwise couldn't be controlled at all. - Watermarks are relatively short and redundant, so even 5-10 lines of text can be sufficient to recover the watermark, and even severe modifications in medium-sized document will not affect the ability to recover the watermark. - Several separate channels are used to carry the information. This means that simple reformatting, edits, spell-checking or any other operation alone will most likely not destroy the watermark. 
- Watermarks can be only analyzed by you, unless you decide to publish the information, in which case, the watermark can be confirmed by third parties. Otherwise, no third party can read, swap or spoof watermarks in your name. - Watermark presence is not evident. In a typical text document or almost every C source, there is very little to indicate steganography on all levels supported. Note that Snowdrop signature does not prove that you actually authored the material - it only demonstrates that you had it, you passed it to another party, and the party did something with it. Snowdrop also does not prove you actually disclosed the code to a person you claimed, as "recipient" field can be set arbitrarily. Your intent can be questioned. NOTE: Snowdrop, by default, uses 32-bit watermarks. This alone has a relatively low value as a "public proof". Simply put, it is feasible for you to use brute-force to find such values that would give an identical watermark to what you've found in some file watermarked by somebody else, and claim it is your file. If you plan on using watermarks to document leaks to the public, please use -6 option (64-bit watermarks) when possible. 64 bit is reasonably strong in that, on a box that is around 50,000 times faster than an average PC, it would still take a year to crack it, approximately. This is far beyond the limit of computational feasibility for typical users, and I expect that it won't be disputed whether you actually watermarked the file in question, unless, of course, you happen to be an administrator of a cluster of supercomputers ;-) 96-bit signatures will be coming in future versions, but I don't think this is critical at this point. That said, 32-bit signatures are still suitable for watermarking any code or text for your own purposes - that is, just tracing leaks or other information. 
Another advantage of sticking with 32-bit is that such watermarks take less space, can be embedded more times, and thus, are more reliable and easier to recover from even a short chunk of text. [3] Text watermarking: what to expect ------------------------------------- Text watermarking component for technical English: sd-eng Higher quality output, lower capacity: sd-engf There are two versions of the same module available. The first of them, sd-eng, works fine to generate a readable equivalent of input text in technical English. Use it with e-mails and other information that does not have to look nice, can have typos, etc. It is NOT supposed to preserve ASCII layout, equations, to preserve the meaning of poetry, and so on. It is suitable for short drafts, memos and other documents that do not have to deliver highest possible quality, where the amount of information stored in each file is more important than the quality. This documentation wouldn't look pretty after being run thru sd-eng. The other version, sd-engf, generates higher quality output at the price of offering only 30% of the storage capacity of sd-eng. It should preserve tables, equations and other ASCII artifacts, preserve indentation, introduce far fewer typos, etc. This documentation should still look very good after being processed with this tool. Both modules use a synonym database to replace certain words with others. If you feel that your wording is being hurt by the synonym substitution used by the program, create a copy of /usr/share/snowdrop/synonyms file and make necessary edits to remove the offending rules. NEVER EDIT THE MASTER COPY. You will need the exact copy of 'synonyms' file you used to watermark files in order to recover watermarks later. If you want to use a new file, pass SD_SYNONYMS environment variable pointing to the new copy, and don't forget to keep the copy for further reference.
Unfortunately, the current version of sd-eng does not support watermark recovery using multiple 'synonyms' files at once, so please make a note of the file used and be sure to pass it in SD_SYNONYMS at run time. The minimum length of a document that can be successfully watermarked is around 5-10 lines of normal, reasonably formatted English text. This is also probably the shortest length from which the watermark can be successfully recovered. Redundancy in all channels is achieved at around 40-70 lines of text (multiply those figures by three to get an estimate for sd-engf utility). This amount, even if modified, should be sufficient to recover the watermark. The four channels used: - whitespaces - typos (rarely used but high capacity) - notation (various types of quotes and other punctuation marks) - synonymous words Unfortunately, unlike with C code, channel distribution is far from proportional, with the weakest channel (whitespaces) dominant in sd-eng. In sd-engf, distribution should be a bit more fair, but only because of whitespace channel capacity being severely reduced. Usage of the module is pretty straightforward - please run it with no parameters to get help. [4] C code watermarking: what to expect --------------------------------------- C code watermarking component: sd-c ********************************************************* * NOTE: This code still needs some work. Use with care. * ********************************************************* C support is a bit rough in that it generates ugly code and, at least for now, non-customizable variable names that can be a bit suspicious to a paranoid person. Also, some features are still missing, making the watermark easier to remove; your contributions, fixes, etc are welcome - for now, consider this code pretty close to being only a proof of concept. But you can expect very good results.
Even around three lines of code can carry enough information to store a watermark; ten to fifteen lines should provide very good redundancy in all channels. The code is optimized for standalone executables. Because of that, function names and variable names will be modified. If this file is supposed to provide exported symbols (e.g. is a library), you might consider restoring symbols later by hand. Future versions should include an option to limit this functionality to static variables and functions and local variable names. Currently, some specific constructions (for example, preprocessor macros that refer to global or local variables that are not yet defined and not passed as a parameter) can result in invalid code being generated. This and other minor glitches can be corrected by hand without losing the ability to retrieve data from the file. The four channels are: - whitespaces, line breaks and other non-essential data. - code logic (e.g. 'if A then B else C' becomes 'if not A then C else B'); this channel is not used in this version, but reserved for the future. - code notation (usage of ;, !sth versus sth==0, etc); not used in this version. - substituted variable names. Usage of the module is pretty straightforward - please run it with no parameters to get help. I took the GOBBLES Apache exploit, watermarked it, then removed all comments and copyrights, ran it through 'ident' and, well... [+] This document matches entry 65 (channel offset 456): Source file : /home/lcamtuf/apache-scalp.c Time : Fri Aug 2 19:38:28 2002 Recipient : Evil Hacker Comment : no comment Source MD5 : 871921b3-9b9239f5 Magic value : 21903813-164c4f42 Watermark : 000000002767bdc1 [5] General usage rules / bugs ------------------------------ This tool has been tested on Linux and FreeBSD. It should be fairly easy to port to any system; it might, however, depend on a little-endian architecture. That is to say, I have not tested it on big endian.
Feel free to do it, fix any problems you find, and mail me back. This tool requires either OpenSSL development libraries or RSA MD5 libraries installed in the system. Before starting, make sure to read the paragraph on 32- versus 64-bit watermarks in section [2], and other parts of this documentation in general. Make sure you use options suitable for your needs. There is no detailed installation and usage tutorial, because the author assumes that all potential users would have some basic knowledge of Unix and C. Similarly, generated output should be readable to everyone with a basic understanding of this write-up. There are some general considerations when using this program: First of all, NEVER DELETE INPUT FILES OR SYNONYM DATABASES. Those files are essential for watermark recovery later. This is what makes watermarks impossible for others to spoof or read - consider those two files a part of your private key. Snowdrop will keep MD5 checksums of both files for your reference and will refuse to run if you don't have the exact same files while trying to recover watermarks. If you have difficulties managing this information, use the "comment" field provided by the tool to store this data. Note that disclosing those files alone does not compromise your watermark integrity - it would make it possible for others to read the MD5 shortcut embedded in the file, but it would be pretty useless without the database you have. It would be possible for Snowdrop to archive those files for you in a safe location, but since Snowdrop can be theoretically run against, say, 10000 identical messages, it would be a waste of storage space to copy the configuration and input every time. Besides, in some cases, creating additional copies of the material would not be desirable. If you are forgetful, don't mind wasting some space or don't expect such massive fingerprinting, a very simple shell script would do. The Snowdrop database is located in ~/.snowdrop/database.
It stores MD5 sums of input files, unique per-file magic values, watermarks, recipient names, comments and other information. You should protect this file (default permissions are safe). If a portion of your code or text is used in another program, Snowdrop will try to automatically synchronize input and output files. If this mechanism fails, please let me know - it is pretty experimental. If you plan on sending a watermarked file via e-mail, it is best to do it as a text attachment. Otherwise, try to insert the file in the mail; do not copy-and-paste (as, in most clients, this would enable auto line wrapping). As to bugs... Quite frankly, this code should be written in a functional / declarative language. Simple as that, there are a lot of tasks that require some recursion, retracting from certain choices, language processing. Prolog would be better. But, for obvious reasons, I decided to go with C. The code wasn't carefully designed from the very beginning (or, more precisely, it was, but many of my assumptions turned out to be incorrect). As a result of that, this is pretty much a hack to achieve reasonable results and test the concept, but the next version should be pretty much reworked from scratch. That said... There is very little atom length checking while most of the time, static buffers are used. So run it only against normal text files you've looked over, with no words or lines over 4 kB, etc. You probably don't want to report this to BUGTRAQ ;-) Another problem is that the code is awfully slow on resynchronization (if two files have some sections grossly different). It typically takes up to a few seconds per line. Your options are being patient when you try to recover the watermark, or watermarking smaller portions of relevant text instead of a huge document.
There are three main issues that cause this problem: - the resynchronization algorithm is not very optimal because of the irreversible nature of get_orig_atom(); that is, to recover from five deleted words, the *whole* document is being processed five times. Because of that, recovery from added words is much faster. It is not a big deal to make get_orig_atom() reversible, just add a stack, store all flags and pointers. - string manipulation functions make way too many copies; neater string management and comparison (for example, the code I used in Catty 2) should be implemented at some point; the point is that it'd have to be used everywhere, so this requires code review. - get_value() essentially tries to brute force and compare instead of reversing set_value(). This causes some performance loss, but also saved me some serious debugging. get_value() has to be modified eventually to deliver better reliability anyway. As of today, the watermark recovery algorithm is not really perfect. If streams are non-continuous (frequent changes were introduced), long enough segments of the watermark may not be present in the reconstructed stream. The ideal reconstruction algorithm will keep track of 'gaps' in the stream and fill them with newly acquired data instead of always simply appending it. The current algorithm, however, requires that at least eight-bit segments are continuous in any of the domains and otherwise fails to find a matching key. Another watermark recovery problem is that domains don't have equal priorities. While it is possible to recover synonym data from an atom if there is a change in the number of whitespaces preceding it, it might be impossible to recover whitespace data if the synonym was changed. This is another weakness of the brute force get_value(). This shouldn't be a real problem, as it shouldn't happen too often that the document is completely reworded but whitespaces are preserved - but it is an issue.
[6] Developing own modules -------------------------- This is a draft section; expect slight differences between this description and the real behavior. I plan on making the API simpler in the future. The module API is pretty trivial. To add a new module, you have to include it in the LANG= line in the Makefile, and create a lang-nnn.c file, where nnn is your module name. The module should include the "language.h" file, and have the following exports: void set_original(const char* buf); This should set a new input buffer, a continuous memory region starting at 'buf' and NUL-terminated, as the input data. It should also reinitialize the module to its original state (which is critical, since every module is called twice during the watermarking process). void set_watermarked(const char* buf); This works like set_original, except for the reinitialization part, for watermark comparison. Essentially, the main code needs language atom extraction done on both the original and the compared file to compare elements. char* get_orig_atom(void); char* get_water_atom(void); Those two guys return a language atom from two different sources. Those functions should be identical, except that get_water_atom() should not modify returned data or internal state of the parser. It is guaranteed that get_orig_atom() will be called only once, when we really mean to work on this particular atom, while get_water_atom() can be called a zillion times. int get_storage(const char* orig, const int domain); This function, called on an atom from the original file (and only from this file), is supposed to return its storage capacity, in bits. Note that if you can store six values in an atom, you effectively can store only two bits (four values) in there. Unless you want to re-work the code, half-bits are not allowed ;-) Domain stands for modification domain. This is because every atom can be modified independently in four different ways.
Domains are as follows: #define DOMAIN_WHITE 0 // Whitespaces, comments #define DOMAIN_GRAMMAR 1 // Grammar / logic changes #define DOMAIN_FORMAT 2 // Formatting, notation #define DOMAIN_SYNONYMS 3 // Synonyms / name substitution You have to be sure there is no combination of modifications in every domain that would be ambiguous. Also, capacity of each domain should not depend on what happened in other domains. In other words, if for term "foo" in this particular location in the file, DOMAIN_GRAMMAR returns capacity 2, it should ALWAYS return this capacity, even if the word has been replaced by DOMAIN_SYNONYMS or modified in other way. If multiple modifications are possible in each domain, it has to return the best match (with most storage capacity; if same storage capacities are shared by several modifications, pick the one that is least vulnerable to edits or least intrusive). get_storage() should NOT change any parameters or internal state of the module. char* set_value(const char* orig,int value, const int domain); This call should return a modified copy of the original atom orig set to the value 'value' in the domain 'domain', using the modification previously agreed with get_storage(). This is called from domain 3 to domain 0, every subsequent call gets a copy of previous result as its input. This function should commit to all choices. In other words, if we decided to store something in indentation, this call should modify internal state so that indentation is copied to next lines. int get_value(const char* orig,const char* water,int* sc,int* va,int test); This function should retrieve a value stored in atom 'water' in comparison to atom 'orig' in all four domains, 3 to 0. Storage capacities in each domain should be stored in four subsequent integers pointed by 'sc', read values - in four subsequent integers of 'va'. Unless 'test' is non-zero, this function should also advance all internal state counters and such just the way set_value would do it. 
This function should try to retrieve as much information as possible from domains. If nothing can be retrieved, return 0. Otherwise, return 1. Helper functions: char* get_langdesc(void); Return up to approx. 25 character module language description unsigned int md5_importantstuff(void); Generate an MD5 sum of all files this module depends on. This is important for detecting and reporting changes in those files that could affect functionality. Return 0 if this module does not rely on files and other variables of this kind. void module_help(void); Write short module help. void md5_wrong(void); Write a warning message. This function is called if no matches were found for signatures with MD5 shortcut returned by module equal to what md5_importantstuff() returned, but some signatures were found with other MD5 values. This should essentially display "perhaps you used different input data to watermark this file?" Macros: fatal(x...) printf-alike macro that displays a message and terminates. debug(x...) printf-alike macro that should be used for all status output (all kinds of non-fatal messages). snowdrop/lang-c.c0100644000000000000000000003572407537771122012763 0ustar rootroot/* snowdrop - text watermarking and watermark recovery --------------------------------------------------- Copyright (C) 2002 by Michal Zalewski This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. As a special exception, this program may be linked with the OpenSSL library, despite that library's more restrictive license. C code language backend. 
*/

/* NOTE(review): the header names of the six system #include directives
   below are missing from this copy of the file - restore them from the
   upstream source before building. */
#include
#include
#include
#include
#include
#include
#include "language.h"

/* Input buffers installed by set_original() / set_watermarked(). The
   pointers are stored as passed in; the text itself is not copied. */
static char* orig_data;
static char* water_data;

/* Current read offsets into orig_data / water_data. */
static int orig_off;
static int water_off;

/* Parser state flags. Every one of them is cleared by set_original(). */
static char lfcmt,        // We're inside a comment that runs to \n
            rgcmt,        // We're inside a block comment
            instr,        // We're inside a string
            isaname,      // Last read thing is a name
            lastterm,     // Last possible terminator?
            ismacro,      // Is a macro?
            prevwasslash; // Got slashed?

/* Also cleared by set_original(). */
static char subst,didchange;

/* One entry of the change table.
   NOTE(review): presumably a name substitution (from -> to) recorded for
   MOD_CHGNAME - confirm against set_value(). */
struct changeT {
  char* from,*to;
  char implied;
};

/* The change table has MAXVAR+1 slots. */
#define MAXVAR 10240

struct changeT change[MAXVAR+1];

/* Reset to 0 by set_original().
   NOTE(review): presumably the number of change[] entries in use - confirm. */
static int chgtop=0;

/* NOTE(review): looks like a "probe only" switch raised by get_value() so
   that set_value() does not commit changes - confirm against those
   functions. */
static int just_testing;

/* Modification kinds. */
#define MOD_NONE    0
#define MOD_CHGNAME 1 // Change variable name
#define MOD_PSPACE  2 // Change the amount of spaces

/* Install 'buf' as the original input text and reset the module.
   Aborts through fatal() when buf is NULL. The pointer is kept (constness
   is cast away), both read offsets are rewound to 0, all parser flags are
   cleared and chgtop is set back to 0. */
void set_original(const char* buf) {
  if (!buf) fatal("set_original(NULL)");
  orig_data=(char*)buf;
  orig_off=0;
  water_off=0;
  instr=0;
  chgtop=0;
  isaname=0;
  didchange=0;
  lfcmt=0;
  ismacro=0;
  prevwasslash=0;
  rgcmt=0;
  lastterm=0;
  subst=0;
}

/* Install 'buf' as the watermarked text to compare against.
   Aborts through fatal() when buf is NULL or when no original buffer has
   been installed yet. Only water_data and water_off are touched; the
   parser flags and the change table are left as they are. */
void set_watermarked(const char* buf) {
  if (!buf) fatal("set_watermarked(NULL)");
  if (!orig_data) fatal("set_watermarked before set_original");
  water_data=(char*) buf;
  water_off=0;
}

/* Static buffer with room for MAXBUF characters plus a terminator. */
char orig_buf[MAXBUF+1];

// Something with _ in the middle is a single atom.
char* get_orig_atom(void) { int i,nspaces=0; char* now=orig_data+orig_off; char* misio; char tmp[2]={0,0}; int gotret=0; if (!orig_data) fatal("get_orig_atom before set_original"); orig_buf[0]=0; while (isspace(*now)) { if (*now==' ') nspaces++; else if (*now=='\t') nspaces+=8; else if (*now=='\n') { lfcmt=0; gotret++; } now++; } if (!*now) return 0; while (gotret--) strcat(orig_buf,"\n"); for (i=0;i [%s] (subst %d)\n",misio,change[i].to,change[i].implied); if (!change[i].implied) strcpy(misio,change[i].to); else subst=1; didchange=1; isaname=0; break; } } // debug("%d/%d ",isaname,subst); orig_off=now-orig_data; return orig_buf; } char water_buf[MAXBUF+1]; char* get_water_atom(void) { int i,nspaces=0; char* now=water_data+water_off; char* misio; char tmp[2]={0,0}; int gotret=0; if (!water_data) fatal("get_water_atom before set_watermarked"); water_buf[0]=0; while (isspace(*now)) { if (*now==' ') nspaces++; else if (*now=='\t') nspaces+=8; else if (*now=='\n') gotret++; now++; } if (!*now) return 0; while (gotret--) strcat(water_buf,"\n"); for (i=0;i= top_storage) { \ mod_type=(id); top_storage=(siz); } int get_storage(const char* orig, const int domain) { const char* foo=orig; int top_storage=0; switch (domain) { case DOMAIN_WHITE: while (isspace(*foo)) foo++; if (!strchr(orig,'\n')) if (!(instr || rgcmt || lfcmt || ismacro)) { switch (*foo) { case '*': case ';': case '!': case '/': case '^': case '%': case '[': case ']': case '(': case ')': case '}': case '{': case ':': case '?': CHECK_STOR(1,MOD_PSPACE); break; } } break; /* - indentation 3 bits (regular handling) */ case DOMAIN_SYNONYMS: if (isaname) { CHECK_STOR(5*2,MOD_CHGNAME); } break; } return top_storage; } static char setv[MAXBUF+1]; static char namechars[]="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; extern int synced; char* set_value(const char* orig,int value, const int domain) { int i; char sm[2]={0,0}; int unic=0; char *bt=(char*)orig,*foo=setv; int bc=0; while (isspace(*bt)) { bt++; 
bc++; } if (!(i=get_storage(orig,domain))) fatal("set_value on a fixed atom"); // debug("Asked to set %s to %d in domain %d (storage was %d)\n",orig,value,domain,i); setv[0]=0; switch (mod_type) { case MOD_PSPACE: for (i=0;i>5) & 31]; // debug("Asked to store %d (%d:%d) instead of %s.\n",value,value&31,(value >>5)&31,orig); *(foo+2)=0; // If it is not unique, try appending a number or char until it is. recheck: for (i=0;i= strlen(namechars)) fatal("too many variable names?"); goto recheck; } if (!strcmp(foo,"int") || !strcmp(foo,"for") || !strcmp(foo,"do") || !strcmp(foo,"if")) { if (!(*(foo+2))) *(foo+2)='p'; else (*(foo+2))++; *(foo+3)=0; } dontwantit: if (!just_testing) { for (i=0;i [%s] (imp %d syn %d)\n",chgtop,bt,foo,!synced,synced); chgtop++; } foome2: return setv; break; } fatal("set value with bogus MOD_*"); return 0; } int strspcmp(const char* a,const char* b) { while (isspace(*a)) a++; while (isspace(*b)) b++; if (!*a || !*b) return 31337; // Bleh. return strcmp(a,b); } static int warned; static char setval_copy[MAXBUF+1]; int get_value(const char* orig,const char* water,int* scr,int* va,char test) { int i3,i2,i1,i0; int sc[4]; int cnt=0; int retme=0; const char* shortwater=water; // char footest[1024]={0}; while (isspace(*shortwater)) shortwater++; //debug("in get_value (%s %s %d)\n",orig,water,test); if (!orig) fatal("get_value with orig==NULL"); if (!water) fatal("get_value with water==NULL"); if (!scr) fatal("get_value with sc==NULL"); if (!va) fatal("get_value with va==NULL"); redome: // Enter dummy mode, save buffer. just_testing=1; sc[3]=get_storage(orig,3); sc[2]=get_storage(orig,2); sc[1]=get_storage(orig,1); sc[0]=get_storage(orig,0); for (i3=0;i3<(1<\n",didchange,isaname); // debug(">> %s << \n",footest); return retme; } int get_water_pos(void) { return water_off; } void set_water_pos(int x) { water_off=x; } char* get_langdesc(void) { return "standalone C code (**** BROKEN BETA ****)"; } // There is no configuration. 
unsigned int md5_importantstuff(void) { return 0xf00; } // There is no help. void module_help(void) { return; } // There is no configuration MD5 mismatch. void md5_wrong(void) { return; } snowdrop/lang-eng.c0100644000000000000000000004715207537771126013314 0ustar rootroot/* snowdrop - text watermarking and watermark recovery --------------------------------------------------- Copyright (C) 2002 by Michal Zalewski This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. As a special exception, this program may be linked with the OpenSSL library, despite that library's more restrictive license. English language backend. */ #include #include #include #include #include #include #ifdef USE_OPENSSL #include #else #include #include #define MD5_Init MD5Init #define MD5_Final MD5Final #define MD5_Update MD5Update #endif /* USE_OPENSSL */ #include "language.h" // Max synonym cache entries #define MAXCACHE 4096 #define MAXSYN 16 // How often do you want typos (number of atoms)? #ifndef ENG_FINE #define TYPORATIO 150 #else #define TYPORATIO 500 #endif static int word_cnt; // Original term counter static char* use_quot; // Close quotes using this string static int indent_val=-1; // Indentation, if any static char prev_punct; // Previous atom was a punctuation mark static int cur_size; // Current input atom storage capacity static int cur_mod; // Current input atom modification static int just_testing; // Just teestiiing! #define MOD_NONE 0 // No modification #define MOD_SYNONYM 1 // Put a synonym #define MOD_TYPO 2 // Make a typo #define MOD_QUOTE 4 // Change quotes #define MOD_PSPACE 5 // Add spaces #define MOD_CAPS 7 // Capitalization #define MOD_PERIOD 8 // ; -> . 
#define MOD_DASH 9 // - -> 0xad #define MAXWORD 1024 struct syncache { char* from; char* to[MAXSYN+1]; char tcnt; }; struct syncache scache[MAXCACHE+1]; int sctop; struct sd_syns { char *from, *to, bid; }; static struct sd_syns syn[MAXWORD+1]; // Load synonym database, of course... static void load_synonyms(void) { int line=0,added=0,z,m; char buf[MAXBUF+1]; FILE* f; if (syn[0].from) return; if (getenv("SD_SYNONYMS")) { strcpy(buf,getenv("SD_SYNONYMS")); f=fopen(buf,"r"); } else { sprintf(buf,"%s/.snowdrop/synonyms",getenv("HOME")); f=fopen(buf,"r"); if (!f) f=fopen("/usr/share/snowdrop/synonyms","r"); if (!f) f=fopen("synonyms","r"); } if (!f) fatal("cannot find synonym dictionary (%s)",buf); while (fgets(buf,MAXBUF,f)) { char* bcop; char w1[MAXBUF], w2[MAXBUF],c; int i; line++; if (buf[strlen(buf)-1]=='\n') buf[strlen(buf)-1]=0; if (strchr(buf,'#')) *strchr(buf,'#')=0; bcop=buf; while (isspace(*bcop)) bcop++; if (!(*bcop)) continue; if (sscanf(buf,"%s %c %s",w1,&c,w2)!=3) fatal("malformed dictionary line %d [1]",line); if (!strcasecmp(w1,w2)) fatal("NOOP dictionary entry at line %d",line); i=0; while (syn[i].from) { if (!strcasecmp(syn[i].from,w1)) if (!strcasecmp(syn[i].to,w2)) fatal("duplicate dictionary entry for %s - %s at line %d",w1,w2,line); if (!strcasecmp(syn[i].from,w2)) if (!strcasecmp(syn[i].to,w1)) fatal("duplicate dictionary entry for %s - %s (reverse) at line %d",w1,w2,line); i++; } for (z=0;z') syn[i].bid=0; else if (c=='|') syn[i].bid=1; else fatal("malformed dictionary line %d [2]",line); added++; // Add a synonym for w1 -> w2 for (m=0;m=MAXCACHE) fatal("MAXCACHE exceeded"); scache[m].from=syn[i].from; scache[m].to[(int)scache[m].tcnt]=syn[i].to; scache[m].tcnt++; if (scache[m].tcnt>=MAXSYN) fatal("MAXSYN for %s exceeded",w1); // Add a synonym for w2 -> w1 if bid if (syn[i].bid) { for (m=0;m=MAXCACHE) fatal("MAXCACHE exceeded"); scache[m].from=syn[i].to; scache[m].to[(int)scache[m].tcnt]=syn[i].from; scache[m].tcnt++; if (scache[m].tcnt>=MAXSYN) 
fatal("MAXSYN for %s exceeded",w2); } } fclose(f); debug("[+] Loaded %d synonyms (%d lines parsed).\n",added,line); } static unsigned int got_md5; unsigned int md5_importantstuff(void) { int i=0; unsigned int result[4]; MD5_CTX kuku; load_synonyms(); if (got_md5) return got_md5; MD5_Init(&kuku); while (syn[i].from) { MD5_Update(&kuku,&i,sizeof(int)); MD5_Update(&kuku,syn[i].from,strlen(syn[i].from)+1); MD5_Update(&kuku,&syn[i].bid,1); MD5_Update(&kuku,syn[i].to,strlen(syn[i].to)+1); MD5_Update(&kuku,"-|-",3); i++; } MD5_Final((char*)result,&kuku); return got_md5=(result[0] ^ result[1] ^ result[2] ^ result[3]); } // Get the number of synonyms matching the term. static int lookup_syn_cnt(const char* term) { int q=0; while (q= top_storage) { \ mod_type=(id); top_storage=(siz); } static inline int storcap(int max) { int bits=0,pw=1; while (max >= pw) { pw<<=1; bits++; } return bits-1; } static int mod_type; int get_storage(const char* orig, const int domain) { const char* text=orig; int tsp=0; int top_storage=0; if (!orig) fatal("get_storage(NULL...)"); while (*text == ' ') {tsp++; text++; } switch (domain) { case DOMAIN_WHITE: #ifndef ENG_FINE if (prev_punct && orig[0]==' ') { CHECK_STOR(1,MOD_PSPACE); } #endif if (strchr(orig,'\n')) { #ifdef ENG_FINE if (linesofar>81) { CHECK_STOR(1,MOD_PSPACE); } else if (linesofar<76) CHECK_STOR(1,MOD_PSPACE); #else if (linesofar>81) { CHECK_STOR(4,MOD_PSPACE); } else if (linesofar<79) CHECK_STOR(storcap(79-linesofar),MOD_PSPACE); #endif } break; case DOMAIN_GRAMMAR: // FIXME: make typos a bit less predictable? if (!(word_cnt % TYPORATIO) && isalnum(*text)) { #ifdef ENG_FINE CHECK_STOR(4,MOD_TYPO); #else CHECK_STOR(5,MOD_TYPO); #endif } break; case DOMAIN_FORMAT: // If the word has at least two uppercase letters, // either make it first-only or all uppercase. This gives us // one bit and would not break synonym capitalization. 
{ int i,gu=0; for (i=0;i 1) CHECK_STOR(1,MOD_CAPS); } #ifndef ENG_FINE // We can ruin some ;s ;-) if (*text==';') CHECK_STOR(1,MOD_PERIOD); #endif // We can ruin some -s ;-) if (*text=='-' || *(unsigned char*)text==0xad) CHECK_STOR(1,MOD_DASH); // We can also mess with quotes. This is good. if (!use_quot) if (!strcmp(text,"''") || !strcmp(text,"`") || !strcmp(text,"'") || !strcmp(text,"``") || !strcmp(text,"\"") || !strcmp(text,",,")) CHECK_STOR(3,MOD_QUOTE); break; case DOMAIN_SYNONYMS: // Determine how many synonyms can be substituted for a word. // If any, add 1 to the number (as we can left the word unchanged, // as well). Now, determine largest power of two less or equal to // the number we got. This is our storage capacity. { int i; i=lookup_syn_cnt(text); if (i>0) { CHECK_STOR(storcap(i+1),MOD_SYNONYM); } } break; default: fatal("bogus domain in get_storage"); } return top_storage; } static char setv[MAXBUF+1]; static char typovals[]="abcdefghijklmnopqrstuvwxyz0123456789"; char* set_value(const char* orig,int value, const int domain) { int cap,i; const char* text=orig; char* foo; cap=get_storage(orig,domain); if (cap <= 0) fatal("set_value with fixed atom"); if (cap > 16) fatal("set_value with atom of excessive storage capacity"); if (value >= (1< This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. As a special exception, this program may be linked with the OpenSSL library, despite that library's more restrictive license. English language (better quality, lower capacity) backend. Done as a hack for lang-eng.c. 
*/ #define ENG_FINE 1 #include "lang-eng.c" snowdrop/synonyms0100644000000000000000000003045707537771150013277 0ustar rootroot# # snowdrop - text watermarking and watermark recovery # --------------------------------------------------- # # Copyright (C) 2002 by Michal Zalewski # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # As a special exception, this program may be linked with the # OpenSSL library, despite that library's more restrictive license. # # -- # # **************************************************************** # * NOTE: ALWAYS BACKUP THIS FILE. YOU WILL NEED THE OLD VERSION * # * TO RECOVER WATERMARKS EMBEDDED BEFORE THE MODIFICATION... * # **************************************************************** # # This is a database of 'synonyms' for technical publications. Intended # to preserve general meaning, but might cause minor differences, designed # to deliver high capacity even in short documents. Note that this database # may degrade the quality of non-technical writing. If you want to use # it for other purposes, please review it, so you avoid - for example - # replacing 'identical' with 'same' in a philosophical essay about the # nature of identity and self-consciousness ;-) Same way, 'moral' and # 'ethical' are defined as synonymous. Not to mention 'wished' and 'hoped'. # Be careful. # Format: word1 OP word2. OP is the operator, | means bidirectional # synonym (that is, word1 can be substituted for word2 and vice versa), # while > means unidirectional synonym (word2 for word1 only). There is no # redundant '<' operator. Note that you cannot use multi-word phrases # here. Sorry. Implement it in next versions ;-) # Do include all forms of the word and all variants. 
For example, if # 'foo' is synonymous to 'bar', and 'bar' to 'baz', please include # 'foo' - 'bar', 'bar' - 'baz', and 'foo' - 'baz', if suitable, and # include 'foos', 'bars', 'bazes', 'fooed', 'bared', 'bazed', 'fooing', # 'baring', 'bazing', 'fooation', 'baration', 'bazition'. Do NOT put # words that are synonyms only in some cases (e.g. only in noun form, # or only in specific cases or very specialized meaning). For example, # not discussing subtle linguistical differences, you do not want to put # 'might | may' there. Why? Because there is no such month as 'Might' ;-) # But unidirectional operator would do in this case. # # NOTE: No entries with uppercase characters. Period. # gb | gigabytes os > system abnormal | anomalous accomplish | achieve accomplished | achieved accomplishes | achieves accomplishment | achievement achieve > reach achieved > reached acknowledgement | acknowledgment adequate | sufficient adequately | sufficiently aesthetic | esthetic aesthetics | esthetics ageing | aging algorithm > approach algorithm > strategy algorithm > tactics algorithm | formula algorithm | method algorithms | methods alter > change altered > changed alternation > change alters > changes aluminium | aluminum analog | analogue analyse | analyze anomaly | disruption anybody | anyone apologise | apologize approach | strategy area | zone arguable | questionable arguably | doubtfully arguement > argument arrive > come arrived > came arrives > comes arriving > coming assume | assert assumed | asserted assumes | asserts assumption | assertion assumptions | assertions average > typical axe > ax basic | simple behavior | behaviour best | optimal big | great big | huge big | large broken | damaged broken | faulty build > make burnt > burned bytes > b calculate | compute calculated | computed calculates | computes calculating | computing calculation | computation can't | cannot canceled | cancelled catalogue > catalog caution > warning center | centre certain | specific 
character > letter characteristics | profile characteristics | specifics characters > letters cluster > array clusters > arrays collect > gather collected > gathered collecting > gathering collects > gather colour > color commonly | typically complexity > difficulty compliance > agreement complied > agreed complies > agrees comply > agree component | element components | elements comprehensive > complete computation > evaluation #computer > machine #computer > system #computer | hardware computing > evaluating computing > processing concept | idea concepts | ideas concern > issue connector > adapter consequence | implication console > display console > terminal consuming | expensive converter > adapter correctly | properly costly | expensive cosy | cozy counseling | counselling crash > failure crew > group crew > team criticise | criticize cycle | loop cycled | looped cycles | loops cycling | looping damaged > faulty datagram | packet datagrams | packets declaration | specification declare | specify declared | specified declares | specified default > standard defence | defense define > describe defined > described defines > described defining > describing definition > description delete > remove deleted > removed deletes > removes deletion > removal deliver | provide delivered | provided delivers | provides desirable | expected desired | desirable desired | expected detect > sense detected | sensed detecting | sensing detection | sensing detects | senses dialogue > dialog difficulties | obstacles difficulty | obstacle disk > storage display > terminal distant | far document > article done | completed done | finished each | every easy | simple easy | trivial eclaring | specifying effective | optimal effective | optimised efficiency | performance efficient | optimal efficient | optimized else | otherwise emphasise | emphasize empirical | experimental employees | staff emulate | mimick emulates | mimicks encrypted | crypted encrypting | crypting encylycopaedia | 
encylopedia enormously > highly enormously > very enormously | extremely enquire | inquire enquiry | inquiry enrollment | enrolment ensure | verify ensures | verifies equaling | equalling erroneous | faulty evaluate > test evaluate | verify evaluated | tested evaluated | verified evaluates > tests evaluates | verifies evaluating | testing evaluating | verification evaluation | testing evaluation | verification exactly | precisely excessive | redundand excessive | superfluous predict > expect expected | predicted predicts > expects extremely | highly fast | quick faster | quicker favor | favour favorite | favourite favorites | favourites finished | completed first | initial fits | matches following > next formula > approach formula > method frequent > common frequently | commonly fulfil | fulfill full > complete gateway | router generalised > general glamor | glamour goal > target gray | grey great | huge great | large halt | stop halted | stopped halting | stopping halts | stops happening | occouring happening | occuring harbor | harbour hardware > machine honor | honour honors | honours huge | large humans | people humor | humour hypothetical > potential identical | same illustrate > demonstrate illustrated > demonstrated illustrates > demonstrates illustrating > demonstrating illustration > demonstration important | relevant incorrect > broken incorrect > faulty incorrect | erroneous incorrectly | erroneously increased | maximized increasing | maximizing indicate > display indicate > show indicated > displayed indicated > showed indicates > displays indicates > shows indicating > displaying indicating > showing indication | indicator information > data inherit > receive inherited > received inheriting > receiving inherits > receives inhibit > prevent inhibited > prevented inhibiting > preventing inhibits > prevents insecure | unsafe insignificant | unimportant into > in intruder > attacker intrusion > compromise irrelevant | insignificant irrelevant | unimportant 
jewelry | jewellery job | task judgement | judgment kilobytes | kb kinds > sorts kinds > types labeled | labelled labor | labour labors | labours language | dialect languages | dialects licence | license likely | possibly likely | probably limitation | restriction limited | restricted limiting | restricting located | placed location > place locations > places looped > repeated major | important major | relevant major | significant malicious | harmful maneuver | manoeuvre matching | comparable matching | similar maximize > increase maximizes > increases mediaeval | medieval mediocre > medium megabytes > mb memorise | memorize men | people message > information method | tactics microsoft > msft middle > center midsize > medium might > may minor | insignificant minor | irrelevant minor | unimportant modeling | modelling modification > change modification | alternation modified > changed modified | altered modifies > changes modifies | alters modify > change modify | alter mold | mould moral | ethical multiple > many necessary | required nobody | noone normal | typical normally | typically obstacle | problem obstacles | problems often | commonly often | frequently optimization > improvement optimize > improve optimized > improved optimizes > improves over > above peak > maximum performance > speed personnel | staff pipeline | queue pixel > dot platform > system platforms > systems probable > possible possibly | probably pound | lb pounds | lbs precise > careful precisely > carefully predictable | deterministic previous > prior previously > already principle > rule principles > rules probability | likehood problem > issue processor | cpu program > application program > software programs > applications programs > software projected | expected projected | predicted proposal | suggestion propose | suggest proposed | suggested proposes | suggests proposing | suggesting proposition | suggestion provide > give provided > gave provides > gives providing > giving proxy > 
gateway publication > article publication > document publication > paper publication > writing quantity > amount quarreling | quarrelling rapid > fast rapid > quick rarely | seldom readily > currently realise | realize really | truly reason > cause receive | obtain received | obtained receives | obtains receiving | obtaining reconnaissance > intelligence redhat > rh reduce > decrease reduced | minimised reducing | decreasing reduction > decrease redundand | superfluous region > area relevant | significant relied | depended relies | depends rely | depend relying | depening renamed > moved requirements > needs requires > needs research | study resize > scale retain > keep retained > kept retaining > keeping retains > keeps rightful | legitimate routing > forwarding safe | secure said > mentioned said > stated sample > test satisfied > met satisfies > fits satisfies > meets satisfy > fit satisfy > meet say > mention say > state says > mentions says > states search > seek seconds > s seem > look seem | appear seemed > looked seemed | appeared seems > looks seems | appears segment > block send | transmit sensor | detector separate > dedicated separator | delimiter shared > common shipped > delivered shipped > provided should | shall signaled | signalled signaling | signalling similar | comparable simple | trivial simulation > model skilful | skillful small | tiny solution > answer somebody | someone specifics | details staff | workers subsequent > following subsequent > next superuser > administrator supicious > alarming surpress > prevent surpress | inhibit surpressed > prevented surpressed | inhibited surpresses > prevents surpresses | inhibits surpressing > preventing surpressing | inhibiting surpression > prevention surpression | inhibition technique > approach technique > methodology techniques > methods terminate > halt terminate > stop terminated > halted terminated > stopped terminates > halts terminates > stops terminating > halting terminating > stopping 
termination > halt termination > stop tested | verified testing | verifying theater | theatre therefore > so timer > clock topic | subject topology > structure total > complete total > overall towards > to transfer > flow traveled | travelled traveling | travelling trend | tendency tunneling | encapsulating utilities > programs utilities > software utilities > tools utility > program utility > software utility > tool utilization > usage utilize > use utilized > used utilizes > uses verifies > tests verify > test very | extremely very | highly warned | cautioned warning > notice wheter > if wished | hoped wishing | hoping would > will years | yrs 1st > first 2nd > second 3rd > third 4th > fourth 5th > fifth 6th > sixth 7th > seventh 8th > eighth 9th > ninth 10th > tenth an > a snowdrop/ChangeLog0100644000000000000000000000046307757265777013242 0ustar rootroot Version 0.02b: - Fixes an annoying bug that caused the code to segfault when there is no sync near the end of the watermarked file. Bit walk algorithm is still a bit broken, and the overall performance and storage capacity sucks, the code needs to be eventually rewritten. Any takers?