mairix-0.22/0000755001161100116110000000000011402542300012521 5ustar richardrichardmairix-0.22/Makefile.in0000644001161100116110000000667511402542166014616 0ustar richardrichard######################################################################### # # mairix - message index builder and finder for maildir folders. # # Copyright (C) Richard P. Curnow 2002-2004,2006 # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # ======================================================================= ######################################################################### # Edit the following variables as required CC=@cc@ CFLAGS=@cflags@ @defs@ CPPFLAGS=@CPPFLAGS@ LDFLAGS=@LDFLAGS@ LIBS=@LIBS@ ####################################################################### # If you're generating a package, you may want to use # make DESTDIR=temporary_dir install # to get the software installed to a directory where you can create # a tdl.tar.gz from it DESTDIR= ####################################################################### prefix=$(DESTDIR)@prefix@ bindir=$(DESTDIR)@bindir@ mandir=$(DESTDIR)@mandir@ man1dir=$(mandir)/man1 man5dir=$(mandir)/man5 infodir=$(DESTDIR)@infodir@ docdir=$(DESTDIR)@docdir@ ######################################################################### # Things below this point shouldn't need to be edited. 
OBJ = mairix.o db.o rfc822.o tok.o hash.o dirscan.o writer.o \ reader.o search.o stats.o dates.o datescan.o mbox.o md5.o \ fromcheck.o glob.o dumper.o expandstr.o dotlock.o \ nvp.o nvpscan.o all : mairix mairix : $(OBJ) $(CC) -o mairix $(CFLAGS) $(LDFLAGS) $(OBJ) $(LIBS) %.o : %.c memmac.h mairix.h reader.h Makefile $(CC) -c $(CFLAGS) $(CPPFLAGS) -o $@ $< datescan.c datescan.h : datescan.nfa ./dfasyn/dfasyn ./dfasyn/dfasyn -o datescan.c -ho datescan.h -r datescan.report -v -u datescan.nfa fromcheck.c fromcheck.h : fromcheck.nfa ./dfasyn/dfasyn ./dfasyn/dfasyn -o fromcheck.c -ho fromcheck.h -r fromcheck.report -v -u fromcheck.nfa nvpscan.c nvpscan.h : nvp.nfa ./dfasyn/dfasyn ./dfasyn/dfasyn -o nvpscan.c -ho nvpscan.h -r nvpscan.report -v -u nvp.nfa dates.o : datescan.h mbox.o : fromcheck.h nvp.o : nvpscan.h version.h: ./mkversion ./dfasyn/dfasyn: if [ -d dfasyn ]; then cd dfasyn ; make CC="$(CC)" CFLAGS="$(CFLAGS)" ; else echo "No dfasyn subdirectory?" ; exit 1 ; fi clean: -rm -f *~ *.o mairix *.s core -rm -f mairix.cp mairix.fn mairix.aux mairix.log mairix.ky mairix.pg mairix.toc mairix.tp mairix.vr -rm -f fromcheck.[ch] datescan.[ch] -rm -f nvpscan.[ch] if [ -d dfasyn ]; then cd dfasyn ; make clean ; fi distclean: clean -rm -f Makefile config.log install: [ -d $(prefix) ] || mkdir -p $(prefix) [ -d $(bindir) ] || mkdir -p $(bindir) [ -d $(mandir) ] || mkdir -p $(mandir) [ -d $(man1dir) ] || mkdir -p $(man1dir) [ -d $(man5dir) ] || mkdir -p $(man5dir) cp -f mairix $(bindir) chmod 555 $(bindir)/mairix cp -f mairix.1 $(man1dir) chmod 444 $(man1dir)/mairix.1 cp -f mairixrc.5 $(man5dir) chmod 444 $(man5dir)/mairixrc.5 .PHONY : all install clean distclean mairix.o : version.h mairix-0.22/dumper.c0000644001161100116110000000760611402542166014204 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. 
Curnow 2004, 2005 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ /* Database dumper */ #include #include #include #include #include #include #include #include #include #include #include "mairix.h" #include "reader.h" #include "memmac.h" static void dump_toktable(struct read_db *db, struct toktable_db *tbl, const char *title) { int i, n, j, incr; int on_line; unsigned char *foo; printf("Contents of <%s> table\n", title); n = tbl->n; printf("%d entries\n", n); for (i=0; i\n", i, db->data + tbl->tok_offsets[i]); foo = (unsigned char *) db->data + tbl->enc_offsets[i]; j = 0; on_line = 0; printf(" "); while (*foo != 0xff) { if (on_line > 15) { printf("\n"); on_line = 0; } incr = read_increment(&foo); j += incr; printf("%d ", j); on_line++; } printf("\n"); } } void dump_database(char *filename) { struct read_db *db; int i; db = open_db(filename); printf("Dump of %s\n", filename); printf("%d messages\n", db->n_msgs); for (i=0; in_msgs; i++) { printf("%6d: ", i); switch (rd_msg_type(db, i)) { case DB_MSG_DEAD: printf("DEAD"); break; case DB_MSG_FILE: printf("FILE %s, size=%d, tid=%d", db->data + db->path_offsets[i], db->size_table[i], db->tid_table[i]); break; case DB_MSG_MBOX: { unsigned int mbix, msgix; decode_mbox_indices(db->path_offsets[i], &mbix, &msgix); printf("MBOX %d, msg %d, offset=%d, size=%d, 
tid=%d", mbix, msgix, db->mtime_table[i], db->size_table[i], db->tid_table[i]); } break; } if (db->msg_type_and_flags[i] & FLAG_SEEN) printf(" seen"); if (db->msg_type_and_flags[i] & FLAG_REPLIED) printf(" replied"); if (db->msg_type_and_flags[i] & FLAG_FLAGGED) printf(" flagged"); printf("\n"); } printf("\n"); if (db->n_mboxen > 0) { printf("\nMBOX INFORMATION\n"); printf("%d mboxen\n", db->n_mboxen); for (i=0; in_mboxen; i++) { if (db->mbox_paths_table[i]) { printf("%4d: %d msgs in %s\n", i, db->mbox_entries_table[i], db->data + db->mbox_paths_table[i]); } else { printf("%4d: dead\n", i); } } printf("\n"); } printf("Hash key %08x\n\n", db->hash_key); printf("--------------------------------\n"); dump_toktable(db, &db->to, "To"); printf("--------------------------------\n"); dump_toktable(db, &db->cc, "Cc"); printf("--------------------------------\n"); dump_toktable(db, &db->from, "From"); printf("--------------------------------\n"); dump_toktable(db, &db->subject, "Subject"); printf("--------------------------------\n"); dump_toktable(db, &db->body, "Body"); printf("--------------------------------\n"); dump_toktable(db, &db->attachment_name, "Attachment names"); printf("--------------------------------\n"); close_db(db); return; } mairix-0.22/hash.c0000644001161100116110000001242111402542166013622 0ustar richardrichard/* Hash function */ #include "mairix.h" /* -------------------------------------------------------------------- lookup2.c, by Bob Jenkins, December 1996, Public Domain. hash(), hash2(), hash3, and mix() are externally useful functions. Routines to test the hash are included if SELF_TEST is defined. You can use this free for any purpose. It has no warranty. -------------------------------------------------------------------- */ #include #include #include #define hashsize(n) ((unsigned int)1<<(n)) #define hashmask(n) (hashsize(n)-1) /* -------------------------------------------------------------------- mix -- mix 3 32-bit values reversibly. 
For every delta with one or two bit set, and the deltas of all three
  high bits or all three low bits, whether the original value of a,b,c
  is almost all zero or is uniformly distributed,
* If mix() is run forward or backward, at least 32 bits in a,b,c
  have at least 1/4 probability of changing.
* If mix() is run forward, every bit of c will change between 1/3 and
  2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.)
mix() was built out of 36 single-cycle latency instructions in a
  structure that could support 2x parallelism, like so:
      a -= b;
      a -= c; x = (c>>13);
      b -= c; a ^= x;
      b -= a; x = (a<<8);
      c -= a; b ^= x;
      c -= b; x = (b>>13);
      ...
  Unfortunately, superscalar Pentiums and Sparcs can't take advantage
  of that parallelism. They've also turned some of those single-cycle
  latency instructions into multi-cycle latency instructions. Still,
  this is the fastest good hash I could find. There were about 2^^68
  to choose from. I only looked at a billion or so.
--------------------------------------------------------------------
*/
/* Statement macro: a, b and c must be modifiable lvalues; each is read and
 * written several times, so do not pass expressions with side effects. */
#define mix(a,b,c) \
{ \
  a -= b; a -= c; a ^= (c>>13); \
  b -= c; b -= a; b ^= (a<<8); \
  c -= a; c -= b; c ^= (b>>13); \
  a -= b; a -= c; a ^= (c>>12); \
  b -= c; b -= a; b ^= (a<<16); \
  c -= a; c -= b; c ^= (b>>5); \
  a -= b; a -= c; a ^= (c>>3); \
  b -= c; b -= a; b ^= (a<<10); \
  c -= a; c -= b; c ^= (b>>15); \
}

/* same, but slower, works on systems that might have 8 byte ub4's */
#define mix2(a,b,c) \
{ \
  a -= b; a -= c; a ^= (c>>13); \
  b -= c; b -= a; b ^= (a<< 8); \
  c -= a; c -= b; c ^= ((b&0xffffffff)>>13); \
  a -= b; a -= c; a ^= ((c&0xffffffff)>>12); \
  b -= c; b -= a; b = (b ^ (a<<16)) & 0xffffffff; \
  c -= a; c -= b; c = (c ^ (b>> 5)) & 0xffffffff; \
  a -= b; a -= c; a = (a ^ (c>> 3)) & 0xffffffff; \
  b -= c; b -= a; b = (b ^ (a<<10)) & 0xffffffff; \
  c -= a; c -= b; c = (c ^ (b>>15)) & 0xffffffff; \
}

/*
--------------------------------------------------------------------
hash() -- hash a variable-length key into a 32-bit value
  k     : the key (the unaligned variable-length array of bytes)
  len   : the length of the key, counting by bytes
  level : can be any 4-byte value
Returns a 32-bit value. Every bit of the key affects every bit of
the return value. Every 1-bit and 2-bit delta achieves avalanche.
About 36+6len instructions.

The best hash table sizes are powers of 2. There is no need to do
mod a prime (mod is sooo slow!). If you need less than 32 bits,
use a bitmask. For example, if you need only 10 bits, do
  h = (h & hashmask(10));
In which case, the hash table should have hashsize(10) elements.

NOTE(review): in this copy of the file the text is cut in the middle of the
usage example below.  The rest of this comment, the function signature, the
initialisation of a, b, c and len, and the head of the main loop are missing,
so the 12-bytes-per-round loop body now sits inside this comment.  Restore
from a pristine hash.c before compiling.

If you are hashing n strings (ub1 **)k, do it like this:
  for (i=0, h=0; i= 12)
   {
      a += (k[0] +((unsigned int)k[1]<<8) +((unsigned int)k[2]<<16) +((unsigned int)k[3]<<24));
      b += (k[4] +((unsigned int)k[5]<<8) +((unsigned int)k[6]<<16) +((unsigned int)k[7]<<24));
      c += (k[8] +((unsigned int)k[9]<<8) +((unsigned int)k[10]<<16)+((unsigned int)k[11]<<24));
      mix(a,b,c);
      k += 12; len -= 12;
   }

   /*------------------------------------- handle the last 11 bytes */
   /* Fewer than 12 bytes of the key remain in k; fold them in together with
    * the key length ('length' is added to c first). */
   c += length;
   switch(len) /* all the case statements fall through */
   {
   case 11: c+=((unsigned int)k[10]<<24);
   case 10: c+=((unsigned int)k[9]<<16);
   case 9 : c+=((unsigned int)k[8]<<8);
      /* the first byte of c is reserved for the length */
   case 8 : b+=((unsigned int)k[7]<<24);
   case 7 : b+=((unsigned int)k[6]<<16);
   case 6 : b+=((unsigned int)k[5]<<8);
   case 5 : b+=k[4];
   case 4 : a+=((unsigned int)k[3]<<24);
   case 3 : a+=((unsigned int)k[2]<<16);
   case 2 : a+=((unsigned int)k[1]<<8);
   case 1 : a+=k[0];
     /* case 0: nothing left to add */
   }
   mix(a,b,c);
   /*-------------------------------------------- report the result */
   return c;
}
mairix-0.22/mkversion0000755001161100116110000000050411402542166014475 0ustar richardrichard#!/bin/sh rm -f version.h echo "#ifndef VERSION_H" > version.h echo "#define VERSION_H 1" >> version.h if [ -f version.txt ]; then ver=`cat version.txt` echo "#define PROGRAM_VERSION \"$ver\"" >> version.h else echo "#define PROGRAM_VERSION \"DEVELOPMENT\"" >>
version.h fi echo "#endif /* VERSION_H */" >> version.h mairix-0.22/writer.c0000644001161100116110000004475711402542166014234 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. Curnow 2002,2003,2004,2005,2006 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ /* Write the database to disc. */ #include "mairix.h" #include "reader.h" #include #include #include #include #include #include struct write_map_toktable {/*{{{*/ /* Table of character offsets to null-terminated token texts */ int tok_offset; /* Table of character offsets to byte strings containing compressed * delta-encoding of file indices matching the token */ int enc_offset; };/*}}}*/ struct write_map_toktable2 {/*{{{*/ /* Table of character offsets to null-terminated token texts */ int tok_offset; /* Table of character offsets to byte strings containing compressed * delta-encoding of file indices matching the token */ int enc0_offset; int enc1_offset; };/*}}}*/ struct write_map {/*{{{*/ /* Contain offset information for the various tables. UI stuff in 4 byte units rel to base addr. Char stuff in byte units rel to base addr. 
*/ /* Path information */ int path_offset; int mtime_offset; /* Message file mtimes (maildir/mh), mbox number (mbox) */ int size_offset; /* Message sizes (maildir/mh), entry in respective mbox (mbox) */ int date_offset; /* Message dates (all folder types) */ int tid_offset; /* Thread group index table (all folder types) */ int mbox_paths_offset; int mbox_entries_offset; int mbox_mtime_offset; int mbox_size_offset; /* Character offset to checksum of first msg in the mbox. Positions of * subsequent messages computed by indexing - no explicit table entries * anywhere. */ int mbox_checksum_offset; struct write_map_toktable to; struct write_map_toktable cc; struct write_map_toktable from; struct write_map_toktable subject; struct write_map_toktable body; struct write_map_toktable attachment_name; struct write_map_toktable2 msg_ids; /* To get base address for character data */ int beyond_last_ui_offset; }; /*}}}*/ static void create_rw_mapping(char *filename, size_t len, int *out_fd, char **out_data)/*{{{*/ { int fd; char *data; struct stat sb; fd = open(filename, O_RDWR | O_CREAT, 0600); if (fd < 0) { report_error("open", filename); unlock_and_exit(2); } if (fstat(fd, &sb) < 0) { report_error("stat", filename); unlock_and_exit(2); } if (sb.st_size < len) { /* Extend */ if (lseek(fd, len - 1, SEEK_SET) < 0) { report_error("lseek", filename); unlock_and_exit(2); } if (write(fd, "\000", 1) < 0) { report_error("write", filename); unlock_and_exit(2); } } else if (sb.st_size > len) { /* Truncate */ if (ftruncate(fd, len) < 0) { report_error("ftruncate", filename); unlock_and_exit(2); } } else { /* Exactly the right length already - nothing to do! 
*/ } data = mmap(0, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (data == MAP_FAILED) { report_error("writer:mmap", filename); unlock_and_exit(2); } *out_data = data; *out_fd = fd; } /*}}}*/ static int toktable_char_length(struct toktable *tab)/*{{{*/ { int result = 0; int i; for (i=0; isize; i++) { if (tab->tokens[i]) { result += (1 + strlen(tab->tokens[i]->text)); result += (1 + tab->tokens[i]->match0.n); } } return result; } /*}}}*/ static int toktable2_char_length(struct toktable2 *tab)/*{{{*/ { int result = 0; int i; for (i=0; isize; i++) { if (tab->tokens[i]) { result += (1 + strlen(tab->tokens[i]->text)); result += (1 + tab->tokens[i]->match0.n); result += (1 + tab->tokens[i]->match1.n); } } return result; } /*}}}*/ static int char_length(struct database *db)/*{{{*/ { /* Return total length of character data to be written. */ int result; int i; result = 0; /* For type table. */ result += db->n_msgs; for (i=0; in_msgs; i++) { switch (db->type[i]) { case MTY_DEAD: break; case MTY_MBOX: break; case MTY_FILE: assert(db->msgs[i].src.mpf.path); result += (1 + strlen(db->msgs[i].src.mpf.path)); break; } } for (i=0; in_mboxen; i++) { struct mbox *mb = &db->mboxen[i]; result += mb->n_msgs * sizeof(checksum_t); if (mb->path) { result += (1 + strlen(mb->path)); } } result += toktable_char_length(db->to); result += toktable_char_length(db->cc); result += toktable_char_length(db->from); result += toktable_char_length(db->subject); result += toktable_char_length(db->body); result += toktable_char_length(db->attachment_name); result += toktable2_char_length(db->msg_ids); return result; } /*}}}*/ static void compute_mapping(struct database *db, struct write_map *map)/*{{{*/ { int total = UI_HEADER_LEN; map->path_offset = total, total += db->n_msgs; map->mtime_offset = total, total += db->n_msgs; map->date_offset = total, total += db->n_msgs; map->size_offset = total, total += db->n_msgs; map->tid_offset = total, total += db->n_msgs; map->mbox_paths_offset = total, 
total += db->n_mboxen; map->mbox_entries_offset = total, total += db->n_mboxen; map->mbox_mtime_offset = total, total += db->n_mboxen; map->mbox_size_offset = total, total += db->n_mboxen; map->mbox_checksum_offset = total, total += db->n_mboxen; map->to.tok_offset = total, total += db->to->n; map->to.enc_offset = total, total += db->to->n; map->cc.tok_offset = total, total += db->cc->n; map->cc.enc_offset = total, total += db->cc->n; map->from.tok_offset = total, total += db->from->n; map->from.enc_offset = total, total += db->from->n; map->subject.tok_offset = total, total += db->subject->n; map->subject.enc_offset = total, total += db->subject->n; map->body.tok_offset = total, total += db->body->n; map->body.enc_offset = total, total += db->body->n; map->attachment_name.tok_offset = total, total += db->attachment_name->n; map->attachment_name.enc_offset = total, total += db->attachment_name->n; map->msg_ids.tok_offset = total, total += db->msg_ids->n; map->msg_ids.enc0_offset = total, total += db->msg_ids->n; map->msg_ids.enc1_offset = total, total += db->msg_ids->n; map->beyond_last_ui_offset = total; } /*}}}*/ static void write_header(char *data, unsigned int *uidata, struct database *db, struct write_map *map)/*{{{*/ { /* Endianness-independent writes - at least the magic number will be * recognized if the database is read by this program on a machine of * opposite endianness. 
*/ unsigned char *ucdata = (unsigned char *) data; ucdata[0] = HEADER_MAGIC0; ucdata[1] = HEADER_MAGIC1; ucdata[2] = HEADER_MAGIC2; ucdata[3] = HEADER_MAGIC3; uidata[UI_ENDIAN] = 0x44332211; /* For checking reversed endianness on read */ uidata[UI_N_MSGS] = db->n_msgs; uidata[UI_MSG_CDATA] = map->path_offset; /* offset table of ptrs to filenames */ uidata[UI_MSG_MTIME] = map->mtime_offset; /* offset of mtime table */ uidata[UI_MSG_DATE] = map->date_offset; /* offset of table of message Date: header lines as time_t */ uidata[UI_MSG_SIZE] = map->size_offset; /* offset of table of message sizes in bytes */ uidata[UI_MSG_TID] = map->tid_offset; /* offset of table of thread group numbers */ uidata[UI_MBOX_N] = db->n_mboxen; uidata[UI_MBOX_PATHS] = map->mbox_paths_offset; uidata[UI_MBOX_ENTRIES] = map->mbox_entries_offset; uidata[UI_MBOX_MTIME] = map->mbox_mtime_offset; uidata[UI_MBOX_SIZE] = map->mbox_size_offset; uidata[UI_MBOX_CKSUM] = map->mbox_checksum_offset; uidata[UI_HASH_KEY] = db->hash_key; uidata[UI_TO_N] = db->to->n; uidata[UI_TO_TOK] = map->to.tok_offset; uidata[UI_TO_ENC] = map->to.enc_offset; uidata[UI_CC_N] = db->cc->n; uidata[UI_CC_TOK] = map->cc.tok_offset; uidata[UI_CC_ENC] = map->cc.enc_offset; uidata[UI_FROM_N] = db->from->n; uidata[UI_FROM_TOK] = map->from.tok_offset; uidata[UI_FROM_ENC] = map->from.enc_offset; uidata[UI_SUBJECT_N] = db->subject->n; uidata[UI_SUBJECT_TOK] = map->subject.tok_offset; uidata[UI_SUBJECT_ENC] = map->subject.enc_offset; uidata[UI_BODY_N] = db->body->n; uidata[UI_BODY_TOK] = map->body.tok_offset; uidata[UI_BODY_ENC] = map->body.enc_offset; uidata[UI_ATTACHMENT_NAME_N] = db->attachment_name->n; uidata[UI_ATTACHMENT_NAME_TOK] = map->attachment_name.tok_offset; uidata[UI_ATTACHMENT_NAME_ENC] = map->attachment_name.enc_offset; uidata[UI_MSGID_N] = db->msg_ids->n; uidata[UI_MSGID_TOK] = map->msg_ids.tok_offset; uidata[UI_MSGID_ENC0] = map->msg_ids.enc0_offset; uidata[UI_MSGID_ENC1] = map->msg_ids.enc1_offset; return; } /*}}}*/ 
static char *write_type_and_flag_table(struct database *db, unsigned int *uidata, char *data, char *cdata)/*{{{*/ { int i; for (i=0; in_msgs; i++) { struct msgpath *msgdata = db->msgs + i; switch (db->type[i]) { case MTY_FILE: cdata[i] = DB_MSG_FILE; break; case MTY_MBOX: cdata[i] = DB_MSG_MBOX; break; case MTY_DEAD: cdata[i] = DB_MSG_DEAD; break; } if (msgdata->seen) cdata[i] |= FLAG_SEEN; if (msgdata->replied) cdata[i] |= FLAG_REPLIED; if (msgdata->flagged) cdata[i] |= FLAG_FLAGGED; } uidata[UI_MSG_TYPE_AND_FLAGS] = cdata - data; return cdata + db->n_msgs; } /*}}}*/ static char *write_messages(struct database *db, struct write_map *map, unsigned int *uidata, char *data, char *cdata)/*{{{*/ { int i; char *start_cdata = cdata; for (i=0; in_msgs; i++) { int slen; switch (db->type[i]) { case MTY_FILE: slen = strlen(db->msgs[i].src.mpf.path); uidata[map->path_offset + i] = cdata - data; uidata[map->mtime_offset + i] = db->msgs[i].src.mpf.mtime; uidata[map->size_offset + i] = db->msgs[i].src.mpf.size; uidata[map->date_offset + i] = db->msgs[i].date; uidata[map->tid_offset + i] = db->msgs[i].tid; memcpy(cdata, db->msgs[i].src.mpf.path, 1 + slen); /* include trailing null */ cdata += (1 + slen); break; case MTY_MBOX: { int mbno = db->msgs[i].src.mbox.file_index; int msgno = db->msgs[i].src.mbox.msg_index; struct mbox *mb = &db->mboxen[mbno]; uidata[map->path_offset + i] = encode_mbox_indices(mbno, msgno); uidata[map->mtime_offset + i] = mb->start[msgno]; uidata[map->size_offset + i] = mb->len[msgno]; uidata[map->date_offset + i] = db->msgs[i].date; uidata[map->tid_offset + i] = db->msgs[i].tid; } break; case MTY_DEAD: uidata[map->path_offset + i] = 0; /* Can't ever happen for real */ uidata[map->mtime_offset + i] = 0; /* For cleanliness */ uidata[map->size_offset + i] = 0; /* For cleanliness */ /* The following line is necessary, otherwise 'random' tid * information is written to the database, which can crash the search * functions. 
*/ uidata[map->tid_offset + i] = db->msgs[i].tid; break; } } if (verbose) { printf("Wrote %d messages (%d bytes of tables, %d bytes of text)\n", db->n_msgs, 4*5*db->n_msgs, (int)(cdata - start_cdata)); } return cdata; /* new value */ } /*}}}*/ #if 0 static int compare_tokens(const void *a, const void *b)/*{{{*/ { const struct token **aa = (const struct token **) a; const struct token **bb = (const struct token **) b; return strcmp((*aa)->text, (*bb)->text); } /*}}}*/ #endif static char *write_mbox_headers(struct database *db, struct write_map *map, unsigned int *uidata, char *data, char *cdata)/*{{{*/ { int i, len; char *start_cdata = cdata; for (i=0; in_mboxen; i++) { struct mbox *mb = &db->mboxen[i]; uidata[map->mbox_entries_offset + i] = mb->n_msgs; uidata[map->mbox_mtime_offset + i] = mb->current_mtime; uidata[map->mbox_size_offset + i] = mb->current_size; if (mb->path) { uidata[map->mbox_paths_offset + i] = cdata - data; len = strlen(mb->path); memcpy(cdata, mb->path, 1+len); cdata += 1+len; } else { uidata[map->mbox_paths_offset + i] = 0; } } if (verbose) { printf("Wrote %d mbox headers (%d bytes of tables, %d bytes of paths)\n", db->n_mboxen, 4*4*db->n_mboxen, (int)(cdata - start_cdata)); } return cdata; } /*}}}*/ static char * write_mbox_checksums(struct database *db, struct write_map *map, unsigned int *uidata, char *data, char *cdata)/*{{{*/ { int i, j; char *start_cdata = cdata; for (i=0; in_mboxen; i++) { struct mbox *mb = &db->mboxen[i]; uidata[map->mbox_checksum_offset + i] = cdata - data; for (j=0; jn_msgs; j++) { memcpy(cdata, mb->check_all[j], sizeof(checksum_t)); cdata += sizeof(checksum_t); } } if (verbose) { printf("Wrote %d bytes of mbox message checksums\n", (int)(cdata - start_cdata)); } return cdata; } /*}}}*/ static char *write_toktable(struct toktable *tab, struct write_map_toktable *map, unsigned int *uidata, char *data, char *cdata, char *header_name)/*{{{*/ { int i, j, n, max; char *start_cdata, *mid_cdata; struct token **stok; stok = 
new_array(struct token *, tab->n); max = tab->size; n = tab->n; for (i=0, j=0; itokens[i]; if (tok) { stok[j++] = tok; } } assert(j == n); #if 0 /* The search functions don't rely on the tokens being sorted. So not * sorting here will save time. */ qsort(stok, n, sizeof(struct token *), compare_tokens); #endif start_cdata = cdata; /* FIXME : Eventually, the tokens have to be sorted - need to feed them from * a different data structure (array with no holes) */ for (i=0; itok_offset + i] = cdata - data; slen = strlen(stok[i]->text); memcpy(cdata, stok[i]->text, 1 + slen); cdata += (1 + slen); } mid_cdata = cdata; for (i=0; imatch0.n; uidata[map->enc_offset + i] = cdata - data; memcpy(cdata, stok[i]->match0.msginfo, dlen); cdata += dlen; *cdata++ = 0xff; /* termination character */ } if (verbose) { printf("%s: Wrote %d tokens (%d bytes of tables, %d bytes of text, %d bytes of hit encoding)\n", header_name, n, 2*4*n, (int)(mid_cdata - start_cdata), (int)(cdata - mid_cdata)); } free(stok); return cdata; } /*}}}*/ static char *write_toktable2(struct toktable2 *tab, struct write_map_toktable2 *map, unsigned int *uidata, char *data, char *cdata, char *header_name)/*{{{*/ { int i, j, n, max; char *start_cdata, *mid_cdata; struct token2 **stok; stok = new_array(struct token2 *, tab->n); max = tab->size; n = tab->n; for (i=0, j=0; itokens[i]; if (tok) { stok[j++] = tok; } } assert(j == n); #if 0 /* The search functions don't rely on the tokens being sorted. So not * sorting here will save time. 
*/ qsort(stok, n, sizeof(struct token *), compare_tokens); #endif start_cdata = cdata; /* FIXME : Eventually, the tokens have to be sorted - need to feed them from * a different data structure (array with no holes) */ for (i=0; itok_offset + i] = cdata - data; slen = strlen(stok[i]->text); memcpy(cdata, stok[i]->text, 1 + slen); cdata += (1 + slen); } mid_cdata = cdata; for (i=0; imatch0.n; uidata[map->enc0_offset + i] = cdata - data; memcpy(cdata, stok[i]->match0.msginfo, dlen); cdata += dlen; *cdata++ = 0xff; /* termination character */ } for (i=0; imatch1.n; uidata[map->enc1_offset + i] = cdata - data; memcpy(cdata, stok[i]->match1.msginfo, dlen); cdata += dlen; *cdata++ = 0xff; /* termination character */ } if (verbose) { printf("%s: Wrote %d tokens (%d bytes of tables, %d bytes of text, %d bytes of hit encoding)\n", header_name, n, 2*4*n, (int)(mid_cdata - start_cdata), (int)(cdata - mid_cdata)); } free(stok); return cdata; } /*}}}*/ void write_database(struct database *db, char *filename, int do_integrity_checks)/*{{{*/ { int file_len; int fd; char *data, *cdata; unsigned int *uidata; struct write_map map; if (do_integrity_checks) { check_database_integrity(db); } if (!verify_mbox_size_constraints(db)) { unlock_and_exit(1); } /* Work out mappings */ compute_mapping(db, &map); file_len = char_length(db) + (4 * map.beyond_last_ui_offset); create_rw_mapping(filename, file_len, &fd, &data); uidata = (unsigned int *) data; /* align(int) < align(page)! 
*/ cdata = data + (4 * map.beyond_last_ui_offset); write_header(data, uidata, db, &map); cdata = write_type_and_flag_table(db, uidata, data, cdata); cdata = write_messages(db, &map, uidata, data, cdata); cdata = write_mbox_headers(db, &map, uidata, data, cdata); cdata = write_mbox_checksums(db, &map, uidata, data, cdata); cdata = write_toktable(db->to, &map.to, uidata, data, cdata, "To"); cdata = write_toktable(db->cc, &map.cc, uidata, data, cdata, "Cc"); cdata = write_toktable(db->from, &map.from, uidata, data, cdata, "From"); cdata = write_toktable(db->subject, &map.subject, uidata, data, cdata, "Subject"); cdata = write_toktable(db->body, &map.body, uidata, data, cdata, "Body"); cdata = write_toktable(db->attachment_name, &map.attachment_name, uidata, data, cdata, "Attachment Name"); cdata = write_toktable2(db->msg_ids, &map.msg_ids, uidata, data, cdata, "(Threading)"); /* Write data */ /* Unmap / close file */ if (munmap(data, file_len) < 0) { report_error("munmap", filename); unlock_and_exit(2); } if (fsync(fd) < 0) { report_error("fsync", filename); unlock_and_exit(2); } if (close(fd) < 0) { report_error("close", filename); unlock_and_exit(2); } } /*}}}*/ mairix-0.22/reader.c0000644001161100116110000001375011402542166014147 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. Curnow 2002,2003,2004,2005 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ /* Database reader */ #include #include #include #include #include #include #include #include #include #include #include "reader.h" #include "memmac.h" #include "mairix.h" int read_increment(unsigned char **encpos) {/*{{{*/ unsigned char *j = *encpos; int result; unsigned char x0, x1, x2, x3; x0 = *j++; if ((x0 & 0xc0) == 0xc0) { /* 4 byte encoding */ x1 = *j++; x2 = *j++; x3 = *j++; result = ((x0 & 0x3f) << 24) + (x1 << 16) + (x2 << 8) + x3; } else if (x0 & 0x80) { /* 2 byte encoding */ x1 = *j++; result = ((x0 & 0x7f) << 8) + x1; } else { /* Single byte encoding */ result = x0; } *encpos = j; return result; } /*}}}*/ static void read_toktable_db(char *data, struct toktable_db *toktable, int start, unsigned int *uidata)/*{{{*/ { int n; n = toktable->n = uidata[start]; toktable->tok_offsets = uidata + uidata[start+1]; toktable->enc_offsets = uidata + uidata[start+2]; return; } /*}}}*/ static void read_toktable2_db(char *data, struct toktable2_db *toktable, int start, unsigned int *uidata)/*{{{*/ { int n; n = toktable->n = uidata[start]; toktable->tok_offsets = uidata + uidata[start+1]; toktable->enc0_offsets = uidata + uidata[start+2]; toktable->enc1_offsets = uidata + uidata[start+3]; return; } /*}}}*/ struct read_db *open_db(char *filename)/*{{{*/ { int fd, len; char *data; struct stat sb; struct read_db *result; unsigned int *uidata; unsigned char *ucdata; fd = open(filename, O_RDONLY); if (fd < 0) { report_error("open", filename); unlock_and_exit (2); } if (fstat(fd, &sb) < 0) { report_error("stat", filename); unlock_and_exit(2); } len = sb.st_size; data = (char *) mmap(0, len, PROT_READ, MAP_SHARED, fd, 0); if (data == MAP_FAILED) { report_error("reader:mmap", filename); 
unlock_and_exit(2); } if (!data) { /* Empty file opened => database corrupt for sure */ if (close(fd) < 0) { report_error("close", filename); unlock_and_exit(2); } return NULL; } if (close(fd) < 0) { report_error("close", filename); unlock_and_exit(2); } result = new(struct read_db); uidata = (unsigned int *) data; /* alignment is assured */ ucdata = (unsigned char *) data; result->len = len; result->data = data; /*{{{ Magic number check */ if (ucdata[0] == HEADER_MAGIC0 || ucdata[1] == HEADER_MAGIC1 || ucdata[2] == HEADER_MAGIC2) { if (ucdata[3] != HEADER_MAGIC3) { fprintf(stderr, "Another version of this program produced the existing database! Please rebuild.\n"); unlock_and_exit(2); } } else { fprintf(stderr, "The existing database wasn't produced by this program! Please rebuild.\n"); unlock_and_exit(2); } /*}}}*/ /* {{{ Endianness check */ if (uidata[UI_ENDIAN] == 0x11223344) { fprintf(stderr, "The endianness of the database is reversed for this machine\n"); unlock_and_exit(2); } else if (uidata[UI_ENDIAN] != 0x44332211) { fprintf(stderr, "The endianness of this machine is strange (or database is corrupt)\n"); unlock_and_exit(2); } /* }}} */ /* Now build tables of where things are in the file */ result->n_msgs = uidata[UI_N_MSGS]; result->msg_type_and_flags = ucdata + uidata[UI_MSG_TYPE_AND_FLAGS]; result->path_offsets = uidata + uidata[UI_MSG_CDATA]; result->mtime_table = uidata + uidata[UI_MSG_MTIME]; result->size_table = uidata + uidata[UI_MSG_SIZE]; result->date_table = uidata + uidata[UI_MSG_DATE]; result->tid_table = uidata + uidata[UI_MSG_TID]; result->n_mboxen = uidata[UI_MBOX_N]; result->mbox_paths_table = uidata + uidata[UI_MBOX_PATHS]; result->mbox_entries_table = uidata + uidata[UI_MBOX_ENTRIES]; result->mbox_mtime_table = uidata + uidata[UI_MBOX_MTIME]; result->mbox_size_table = uidata + uidata[UI_MBOX_SIZE]; result->mbox_checksum_table = uidata + uidata[UI_MBOX_CKSUM]; result->hash_key = uidata[UI_HASH_KEY]; read_toktable_db(data, &result->to, 
UI_TO_BASE, uidata); read_toktable_db(data, &result->cc, UI_CC_BASE, uidata); read_toktable_db(data, &result->from, UI_FROM_BASE, uidata); read_toktable_db(data, &result->subject, UI_SUBJECT_BASE, uidata); read_toktable_db(data, &result->body, UI_BODY_BASE, uidata); read_toktable_db(data, &result->attachment_name, UI_ATTACHMENT_NAME_BASE, uidata); read_toktable2_db(data, &result->msg_ids, UI_MSGID_BASE, uidata); return result; } /*}}}*/ static void free_toktable_db(struct toktable_db *x)/*{{{*/ { /* Nothing to do */ } /*}}}*/ static void free_toktable2_db(struct toktable2_db *x)/*{{{*/ { /* Nothing to do */ } /*}}}*/ void close_db(struct read_db *x)/*{{{*/ { free_toktable_db(&x->to); free_toktable_db(&x->cc); free_toktable_db(&x->from); free_toktable_db(&x->subject); free_toktable_db(&x->body); free_toktable_db(&x->attachment_name); free_toktable2_db(&x->msg_ids); if (munmap(x->data, x->len) < 0) { perror("munmap"); unlock_and_exit(2); } free(x); return; } /*}}}*/ mairix-0.22/search.c0000644001161100116110000011451211402542166014150 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. Curnow 2002,2003,2004,2005,2006 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
* ********************************************************************** */ #include #include #include #include #include #include #include #include #include #include #include #include /* Lame fix for systems where NAME_MAX isn't defined after including the above * set of .h files (Solaris, FreeBSD so far). Probably grossly oversized but * it'll do. */ #if !defined(NAME_MAX) #define NAME_MAX 4096 #endif #include "mairix.h" #include "reader.h" #include "memmac.h" static void mark_hits_in_table(struct read_db *db, struct toktable_db *tt, int hit_tok, char *hits)/*{{{*/ { /* mark files containing matched token */ int idx; unsigned char *j, *first_char; idx = 0; first_char = (unsigned char *) db->data + tt->enc_offsets[hit_tok]; for (j = first_char; *j != 0xff; ) { idx += read_increment(&j); assert(idx < db->n_msgs); hits[idx] = 1; } } /*}}}*/ static void mark_hits_in_table2(struct read_db *db, struct toktable2_db *tt, int hit_tok, char *hits)/*{{{*/ { /* mark files containing matched token */ int idx; unsigned char *j, *first_char; idx = 0; first_char = (unsigned char *) db->data + tt->enc1_offsets[hit_tok]; for (j = first_char; *j != 0xff; ) { idx += read_increment(&j); assert(idx < db->n_msgs); hits[idx] = 1; } } /*}}}*/ /* See "Fast text searching with errors, Sun Wu and Udi Manber, TR 91-11, University of Arizona. I have been informed that this algorithm is NOT patented. This implementation of it is entirely the work of Richard P. Curnow - I haven't looked at any related source (webglimpse, agrep etc) in writing this. 
*/ static void build_match_vector(char *substring, unsigned long *a, unsigned long *hit)/*{{{*/ { int len; char *p; int i; len = strlen(substring); if (len > 31 || len == 0) { fprintf(stderr, "Can't match patterns longer than 31 characters or empty\n"); unlock_and_exit(2); } memset(a, 0xff, 256 * sizeof(unsigned long)); for (p=substring, i=0; *p; p++, i++) { a[(unsigned int) *(unsigned char *)p] &= ~(1UL << i); } *hit = ~(1UL << (len-1)); return; } /*}}}*/ static int substring_match_0(unsigned long *a, unsigned long hit, int left_anchor, char *token)/*{{{*/ { int got_hit=0; char *p; unsigned long r0; unsigned long anchor, anchor1; r0 = ~0; got_hit = 0; anchor = 0; anchor1 = left_anchor ? 0x1 : 0x0; for(p=token; *p; p++) { int idx = (unsigned int) *(unsigned char *)p; r0 = (r0<<1) | anchor | a[idx]; if (~(r0 | hit)) { got_hit = 1; break; } anchor = anchor1; } return got_hit; } /*}}}*/ static int substring_match_1(unsigned long *a, unsigned long hit, int left_anchor, char *token)/*{{{*/ { int got_hit=0; char *p; unsigned long r0, r1, nr0; unsigned long anchor, anchor1; r0 = ~0; r1 = r0<<1; got_hit = 0; anchor = 0; anchor1 = left_anchor ? 0x1 : 0x0; for(p=token; *p; p++) { int idx = (unsigned int) *(unsigned char *)p; nr0 = (r0<<1) | anchor | a[idx]; r1 = ((r1<<1) | anchor | a[idx]) & ((r0 & nr0) << 1) & r0; r0 = nr0; if (~((r0 & r1) | hit)) { got_hit = 1; break; } anchor = anchor1; } return got_hit; } /*}}}*/ static int substring_match_2(unsigned long *a, unsigned long hit, int left_anchor, char *token)/*{{{*/ { int got_hit=0; char *p; unsigned long r0, r1, r2, nr0, nr1; unsigned long anchor, anchor1; r0 = ~0; r1 = r0<<1; r2 = r1<<1; got_hit = 0; anchor = 0; anchor1 = left_anchor ? 
0x1 : 0x0; for(p=token; *p; p++) { int idx = (unsigned int) *(unsigned char *)p; nr0 = (r0<<1) | anchor | a[idx]; nr1 = ((r1<<1) | anchor | a[idx]) & ((r0 & nr0) << 1) & r0; r2 = ((r2<<1) | anchor | a[idx]) & ((r1 & nr1) << 1) & r1; r0 = nr0; r1 = nr1; if (~((r0 & r1 & r2) | hit)) { got_hit = 1; break; } anchor = anchor1; } return got_hit; } /*}}}*/ static int substring_match_3(unsigned long *a, unsigned long hit, int left_anchor, char *token)/*{{{*/ { int got_hit=0; char *p; unsigned long r0, r1, r2, r3, nr0, nr1, nr2; unsigned long anchor, anchor1; r0 = ~0; r1 = r0<<1; r2 = r1<<1; r3 = r2<<1; got_hit = 0; anchor = 0; anchor1 = left_anchor ? 0x1 : 0x0; for(p=token; *p; p++) { int idx = (unsigned int) *(unsigned char *)p; nr0 = (r0<<1) | anchor | a[idx]; nr1 = ((r1<<1) | anchor | a[idx]) & ((r0 & nr0) << 1) & r0; nr2 = ((r2<<1) | anchor | a[idx]) & ((r1 & nr1) << 1) & r1; r3 = ((r3<<1) | anchor | a[idx]) & ((r2 & nr2) << 1) & r2; r0 = nr0; r1 = nr1; r2 = nr2; if (~((r0 & r1 & r2 & r3) | hit)) { got_hit = 1; break; } anchor = anchor1; } return got_hit; } /*}}}*/ static int substring_match_general(unsigned long *a, unsigned long hit, int left_anchor, char *token, int max_errors, unsigned long *r, unsigned long *nr)/*{{{*/ { int got_hit=0; char *p; int j; unsigned long anchor, anchor1; r[0] = ~0; anchor = 0; anchor1 = left_anchor ? 
0x1 : 0x0; for (j=1; j<=max_errors; j++) { r[j] = r[j-1] << 1; } got_hit = 0; for(p=token; *p; p++) { int idx = (unsigned int) *(unsigned char *)p; int d; unsigned int compo; compo = nr[0] = ((r[0]<<1) | anchor | a[idx]); for (d=1; d<=max_errors; d++) { nr[d] = ((r[d]<<1) | anchor | a[idx]) & ((r[d-1] & nr[d-1])<<1) & r[d-1]; compo &= nr[d]; } memcpy(r, nr, (1 + max_errors) * sizeof(unsigned long)); if (~(compo | hit)) { got_hit = 1; break; } anchor = anchor1; } return got_hit; } /*}}}*/ static void match_substring_in_table(struct read_db *db, struct toktable_db *tt, char *substring, int max_errors, int left_anchor, char *hits)/*{{{*/ { int i, got_hit; unsigned long a[256]; unsigned long *r=NULL, *nr=NULL; unsigned long hit; char *token; build_match_vector(substring, a, &hit); got_hit = 0; if (max_errors > 3) { r = new_array(unsigned long, 1 + max_errors); nr = new_array(unsigned long, 1 + max_errors); } for (i=0; in; i++) { token = db->data + tt->tok_offsets[i]; switch (max_errors) { /* Optimise common cases for few errors to allow optimizer to keep bitmaps * in registers */ case 0: got_hit = substring_match_0(a, hit, left_anchor, token); break; case 1: got_hit = substring_match_1(a, hit, left_anchor, token); break; case 2: got_hit = substring_match_2(a, hit, left_anchor, token); break; case 3: got_hit = substring_match_3(a, hit, left_anchor, token); break; default: got_hit = substring_match_general(a, hit, left_anchor, token, max_errors, r, nr); break; } if (got_hit) { mark_hits_in_table(db, tt, i, hits); } } if (r) free(r); if (nr) free(nr); } /*}}}*/ static void match_substring_in_table2(struct read_db *db, struct toktable2_db *tt, char *substring, int max_errors, int left_anchor, char *hits)/*{{{*/ { int i, got_hit; unsigned long a[256]; unsigned long *r=NULL, *nr=NULL; unsigned long hit; char *token; build_match_vector(substring, a, &hit); got_hit = 0; if (max_errors > 3) { r = new_array(unsigned long, 1 + max_errors); nr = new_array(unsigned long, 1 + 
max_errors); } for (i=0; in; i++) { token = db->data + tt->tok_offsets[i]; switch (max_errors) { /* Optimise common cases for few errors to allow optimizer to keep bitmaps * in registers */ case 0: got_hit = substring_match_0(a, hit, left_anchor, token); break; case 1: got_hit = substring_match_1(a, hit, left_anchor, token); break; case 2: got_hit = substring_match_2(a, hit, left_anchor, token); break; case 3: got_hit = substring_match_3(a, hit, left_anchor, token); break; default: got_hit = substring_match_general(a, hit, left_anchor, token, max_errors, r, nr); break; } if (got_hit) { mark_hits_in_table2(db, tt, i, hits); } } if (r) free(r); if (nr) free(nr); } /*}}}*/ static void match_substring_in_paths(struct read_db *db, char *substring, int max_errors, int left_anchor, char *hits)/*{{{*/ { int i; unsigned long a[256]; unsigned long *r=NULL, *nr=NULL; unsigned long hit; build_match_vector(substring, a, &hit); if (max_errors > 3) { r = new_array(unsigned long, 1 + max_errors); nr = new_array(unsigned long, 1 + max_errors); } for (i=0; in_msgs; i++) { char *token = NULL; unsigned int mbix, msgix; switch (rd_msg_type(db, i)) { case DB_MSG_FILE: token = db->data + db->path_offsets[i]; break; case DB_MSG_MBOX: decode_mbox_indices(db->path_offsets[i], &mbix, &msgix); token = db->data + db->mbox_paths_table[mbix]; break; case DB_MSG_DEAD: hits[i] = 0; /* never match on dead paths */ goto next_message; } assert(token); switch (max_errors) { /* Optimise common cases for few errors to allow optimizer to keep bitmaps * in registers */ case 0: hits[i] = substring_match_0(a, hit, left_anchor, token); break; case 1: hits[i] = substring_match_1(a, hit, left_anchor, token); break; case 2: hits[i] = substring_match_2(a, hit, left_anchor, token); break; case 3: hits[i] = substring_match_3(a, hit, left_anchor, token); break; default: hits[i] = substring_match_general(a, hit, left_anchor, token, max_errors, r, nr); break; } next_message: (void) 0; } if (r) free(r); if (nr) 
free(nr); } /*}}}*/ static void match_string_in_table(struct read_db *db, struct toktable_db *tt, char *key, char *hits)/*{{{*/ { /* TODO : replace with binary search? */ int i; for (i=0; in; i++) { if (!strcmp(key, db->data + tt->tok_offsets[i])) { /* get all matching files */ mark_hits_in_table(db, tt, i, hits); } } } /*}}}*/ static void match_string_in_table2(struct read_db *db, struct toktable2_db *tt, char *key, char *hits)/*{{{*/ { /* TODO : replace with binary search? */ int i; for (i=0; in; i++) { if (!strcmp(key, db->data + tt->tok_offsets[i])) { /* get all matching files */ mark_hits_in_table2(db, tt, i, hits); } } } /*}}}*/ static int parse_size_expr(char *x)/*{{{*/ { int result; int n; if (1 == sscanf(x, "%d%n", &result, &n)) { x += n; switch (*x) { case 'k': case 'K': result <<= 10; break; case 'm': case 'M': result <<= 20; break; default: break; } return result; } else { fprintf(stderr, "Could not parse message size expression <%s>\n", x); return -1; } } /*}}}*/ static void parse_size_range(char *size_expr, int *has_start, int *start, int *has_end, int *end)/*{{{*/ { char *x = size_expr; char *dash; int len; if (*x == ':') x++; len = strlen(x); dash = strchr(x, '-'); *has_start = *has_end = 0; if (dash) { char *p, *q; if (dash > x) { char *s; s = new_array(char, dash - x + 1); for (p=s, q=x; q end) { int temp = start; start = end; end = temp; } } for (i=0; in_msgs; i++) { start_cond = has_start ? (db->size_table[i] > start) : 1; end_cond = has_end ? 
(db->size_table[i] < end ) : 1; if (start_cond && end_cond) { hits[i] = 1; } } } /*}}}*/ static void find_date_matches_in_table(struct read_db *db, char *date_expr, char *hits)/*{{{*/ { time_t start, end; int has_start, has_end, start_cond, end_cond; int i; int status; status = scan_date_string(date_expr, &start, &has_start, &end, &has_end); if (status) { unlock_and_exit (2); } if (has_start && has_end) { /* Allow user to put the endpoints in backwards */ if (start > end) { time_t temp = start; start = end; end = temp; } } for (i=0; in_msgs; i++) { start_cond = has_start ? (db->date_table[i] > start) : 1; end_cond = has_end ? (db->date_table[i] < end ) : 1; if (start_cond && end_cond) { hits[i] = 1; } } } /*}}}*/ static void find_flag_matches_in_table(struct read_db *db, char *flag_expr, char *hits)/*{{{*/ { int pos_seen, neg_seen; int pos_replied, neg_replied; int pos_flagged, neg_flagged; int negate; char *p; int i; negate = 0; pos_seen = neg_seen = 0; pos_replied = neg_replied = 0; pos_flagged = neg_flagged = 0; for (p=flag_expr; *p; p++) { switch (*p) { case '-': negate = 1; break; case 's': case 'S': if (negate) neg_seen = 1; else pos_seen = 1; negate = 0; break; case 'r': case 'R': if (negate) neg_replied = 1; else pos_replied = 1; negate = 0; break; case 'f': case 'F': if (negate) neg_flagged = 1; else pos_flagged = 1; negate = 0; break; default: fprintf(stderr, "Did not understand the character '%c' (0x%02x) in the flags argument F:%s\n", isprint(*p) ? 
*p : '.', (int) *(unsigned char *) p, flag_expr); break; } } for (i=0; in_msgs; i++) { if ((!pos_seen || (db->msg_type_and_flags[i] & FLAG_SEEN)) && (!neg_seen || !(db->msg_type_and_flags[i] & FLAG_SEEN)) && (!pos_replied || (db->msg_type_and_flags[i] & FLAG_REPLIED)) && (!neg_replied || !(db->msg_type_and_flags[i] & FLAG_REPLIED)) && (!pos_flagged || (db->msg_type_and_flags[i] & FLAG_FLAGGED)) && (!neg_flagged || !(db->msg_type_and_flags[i] & FLAG_FLAGGED))) { hits[i] = 1; } } } /*}}}*/ static char *mk_maildir_path(int token, char *output_dir, int is_in_new, int is_seen, int is_replied, int is_flagged)/*{{{*/ { char *result; char uniq_buf[48]; int len; len = strlen(output_dir) + 64; /* oversize */ result = new_array(char, len + 1 + sizeof(":2,FRS")); strcpy(result, output_dir); strcat(result, is_in_new ? "/new/" : "/cur/"); sprintf(uniq_buf, "123456789.%d.mairix", token); strcat(result, uniq_buf); if (is_seen || is_replied || is_flagged) { strcat(result, ":2,"); } if (is_flagged) strcat(result, "F"); if (is_replied) strcat(result, "R"); if (is_seen) strcat(result, "S"); return result; } /*}}}*/ static char *mk_mh_path(int token, char *output_dir)/*{{{*/ { char *result; char uniq_buf[8]; int len; len = strlen(output_dir) + 10; /* oversize */ result = new_array(char, len); strcpy(result, output_dir); strcat(result, "/"); sprintf(uniq_buf, "%d", token+1); strcat(result, uniq_buf); return result; } /*}}}*/ static int looks_like_maildir_new_p(const char *p)/*{{{*/ { const char *s1, *s2; s2 = p; while (*s2) s2++; while ((s2 > p) && (*s2 != '/')) s2--; if (s2 <= p) return 0; s1 = s2 - 1; while ((s1 > p) && (*s1 != '/')) s1--; if (s1 <= p) return 0; if (!strncmp(s1, "/new/", 5)) { return 1; } else { return 0; } } /*}}}*/ static void create_symlink(char *link_target, char *new_link)/*{{{*/ { if (symlink(link_target, new_link) < 0) { if (verbose) { perror("symlink"); fprintf(stderr, "Failed path <%s> -> <%s>\n", link_target, new_link); } } } /*}}}*/ static void 
append_file_to_mbox(const char *path, FILE *out)/*{{{*/ { unsigned char *data; int len; create_ro_mapping(path, &data, &len); if (data) { fprintf(out, "From mairix@mairix Mon Jan 1 12:34:56 1970\n"); fprintf(out, "X-source-folder: %s\n", path); fwrite (data, sizeof(unsigned char), len, out); free_ro_mapping(data, len); } return; } /*}}}*/ static int had_failed_checksum; static void get_validated_mbox_msg(struct read_db *db, int msg_index,/*{{{*/ int *mbox_index, unsigned char **mbox_data, int *mbox_len, unsigned char **msg_data, int *msg_len) { /* msg_data==NULL if checksum mismatches */ unsigned char *start; checksum_t csum; unsigned int mbi, msgi; *msg_data = NULL; *msg_len = 0; decode_mbox_indices(db->path_offsets[msg_index], &mbi, &msgi); *mbox_index = mbi; create_ro_mapping(db->data + db->mbox_paths_table[mbi], mbox_data, mbox_len); if (!*mbox_data) return; start = *mbox_data + db->mtime_table[msg_index]; *msg_len = db->size_table[msg_index]; compute_checksum((char *)start, *msg_len, &csum); if (!memcmp((db->data + db->mbox_checksum_table[mbi] + (msgi * sizeof(checksum_t))), &csum, sizeof(checksum_t))) { *msg_data = start; } else { had_failed_checksum = 1; } return; } /*}}}*/ static void append_mboxmsg_to_mbox(struct read_db *db, int msg_index, FILE *out)/*{{{*/ { /* Need to common up code with try_copy_to_path */ unsigned char *mbox_start, *msg_start; int mbox_len, msg_len; int mbox_index; get_validated_mbox_msg(db, msg_index, &mbox_index, &mbox_start, &mbox_len, &msg_start, &msg_len); if (msg_start) { /* Artificial from line, we don't have the envelope sender so this is going to be artificial anyway. 
*/ fprintf(out, "From mairix@mairix Mon Jan 1 12:34:56 1970\n"); fprintf(out, "X-source-folder: %s\n", db->data + db->mbox_paths_table[mbox_index]); fwrite(msg_start, sizeof(unsigned char), msg_len, out); } if (mbox_start) { free_ro_mapping(mbox_start, mbox_len); } } /*}}}*/ static void try_copy_to_path(struct read_db *db, int msg_index, char *target_path)/*{{{*/ { unsigned char *data; int mbox_len, msg_len; int mbi; FILE *out; unsigned char *start; get_validated_mbox_msg(db, msg_index, &mbi, &data, &mbox_len, &start, &msg_len); if (start) { out = fopen(target_path, "wb"); if (out) { fprintf(out, "X-source-folder: %s\n", db->data + db->mbox_paths_table[mbi]); fwrite(start, sizeof(char), msg_len, out); fclose(out); } } if (data) { free_ro_mapping(data, mbox_len); } return; } /*}}}*/ static struct msg_src *setup_mbox_msg_src(char *filename, off_t start, size_t len)/*{{{*/ { static struct msg_src result; result.type = MS_MBOX; result.filename = filename; result.start = start; result.len = len; return &result; } /*}}}*/ static void get_flags_from_file(struct read_db *db, int idx, int *is_seen, int *is_replied, int *is_flagged) { *is_seen = (db->msg_type_and_flags[idx] & FLAG_SEEN) ? 1 : 0; *is_replied = (db->msg_type_and_flags[idx] & FLAG_REPLIED) ? 1 : 0; *is_flagged = (db->msg_type_and_flags[idx] & FLAG_FLAGGED) ? 
1 : 0; } static int do_search(struct read_db *db, char **args, char *output_path, int show_threads, enum folder_type ft, int verbose)/*{{{*/ { char *colon, *start_words; int do_body, do_subject, do_from, do_to, do_cc, do_date, do_size; int do_att_name; int do_flags; int do_path, do_msgid; char *key; char *hit0, *hit1, *hit2, *hit3; int i; int n_hits; int left_anchor; had_failed_checksum = 0; hit0 = new_array(char, db->n_msgs); hit1 = new_array(char, db->n_msgs); hit2 = new_array(char, db->n_msgs); hit3 = new_array(char, db->n_msgs); /* Argument structure is * x:tokena+tokenb,~tokenc,tokend+tokene * * + (and) binds more tightly than , * , (or) binds more tightly than separate args * * * hit1 gathers the tokens and'ed with + * hit2 gathers the tokens or'ed with , * hit3 gathers the separate args and'ed with * */ /* Everything matches until proven otherwise */ memset(hit3, 1, db->n_msgs); while (*args) { /* key is a single argument, separate args are and-ed together */ key = *args++; memset(hit2, 0, db->n_msgs); memset(hit1, 1, db->n_msgs); do_to = 0; do_cc = 0; do_from = 0; do_subject = 0; do_body = 0; do_date = 0; do_size = 0; do_path = 0; do_msgid = 0; do_att_name = 0; do_flags = 0; colon = strchr(key, ':'); if (colon) { char *p; for (p=key; p\n", *p); break; } } start_words = 1 + colon; } else { do_body = do_subject = do_to = do_cc = do_from = 1; start_words = key; } if (do_date || do_size || do_flags) { memset(hit0, 0, db->n_msgs); if (do_date) { find_date_matches_in_table(db, start_words, hit0); } else if (do_size) { find_size_matches_in_table(db, start_words, hit0); } else if (do_flags) { find_flag_matches_in_table(db, start_words, hit0); } /* AND-combine match vectors */ for (i=0; in_msgs; i++) { hit1[i] &= hit0[i]; } } else { /*{{{ Scan over separate words within this argument */ do { /* / = 'or' separator * , = 'and' separator */ char *orsep; char *andsep; char *word, *orig_word, *lower_word; char *equal; char *p; int negate; int had_orsep; int max_errors; 
orsep = strchr(start_words, '/'); andsep = strchr(start_words, ','); had_orsep = 0; if (andsep && (!orsep || (andsep < orsep))) { char *p, *q; word = new_array(char, 1 + (andsep - start_words)); /* maybe oversize */ for (p=word, q=start_words; q < andsep; q++) { if (!isspace(*(unsigned char *)q)) { *p++ = *q; } } *p = 0; start_words = andsep + 1; } else if (orsep) { /* comes before + if there's a + */ char *p, *q; word = new_array(char, 1 + (orsep - start_words)); /* maybe oversize */ for (p=word, q=start_words; q < orsep; q++) { if (!isspace(*(unsigned char *)q)) { *p++ = *q; } } *p = 0; start_words = orsep + 1; had_orsep = 1; } else { word = new_string(start_words); while (*start_words) ++start_words; } orig_word = word; if (word[0] == '~') { negate = 1; word++; } else { negate = 0; } if (word[0] == '^') { left_anchor = 1; word++; } else { left_anchor = 0; } equal = strchr(word, '='); if (equal) { *equal = 0; max_errors = atoi(equal + 1); /* Extend this to do anchoring etc */ } else { max_errors = 0; /* keep GCC quiet */ } /* Canonicalise search string to lowercase, since the database has all * tokens handled that way. But not for path search! 
*/ lower_word = new_string(word); for (p=lower_word; *p; p++) { *p = tolower(*(unsigned char *)p); } memset(hit0, 0, db->n_msgs); if (equal) { if (do_to) match_substring_in_table(db, &db->to, lower_word, max_errors, left_anchor, hit0); if (do_cc) match_substring_in_table(db, &db->cc, lower_word, max_errors, left_anchor, hit0); if (do_from) match_substring_in_table(db, &db->from, lower_word, max_errors, left_anchor, hit0); if (do_subject) match_substring_in_table(db, &db->subject, lower_word, max_errors, left_anchor, hit0); if (do_body) match_substring_in_table(db, &db->body, lower_word, max_errors, left_anchor, hit0); if (do_att_name) match_substring_in_table(db, &db->attachment_name, lower_word, max_errors, left_anchor, hit0); if (do_path) match_substring_in_paths(db, word, max_errors, left_anchor, hit0); if (do_msgid) match_substring_in_table2(db, &db->msg_ids, lower_word, max_errors, left_anchor, hit0); } else { if (do_to) match_string_in_table(db, &db->to, lower_word, hit0); if (do_cc) match_string_in_table(db, &db->cc, lower_word, hit0); if (do_from) match_string_in_table(db, &db->from, lower_word, hit0); if (do_subject) match_string_in_table(db, &db->subject, lower_word, hit0); if (do_body) match_string_in_table(db, &db->body, lower_word, hit0); if (do_att_name) match_string_in_table(db, &db->attachment_name, lower_word, hit0); /* FIXME */ if (do_path) match_substring_in_paths(db, word, 0, left_anchor, hit0); if (do_msgid) match_string_in_table2(db, &db->msg_ids, lower_word, hit0); } free(lower_word); /* AND-combine match vectors */ for (i=0; in_msgs; i++) { if (negate) { hit1[i] &= !hit0[i]; } else { hit1[i] &= hit0[i]; } } if (had_orsep) { /* OR-combine match vectors */ for (i=0; in_msgs; i++) { hit2[i] |= hit1[i]; } memset(hit1, 1, db->n_msgs); } free(orig_word); } while (*start_words); /*}}}*/ } /* OR-combine match vectors */ for (i=0; in_msgs; i++) { hit2[i] |= hit1[i]; } /* AND-combine match vectors */ for (i=0; in_msgs; i++) { hit3[i] &= hit2[i]; } } 
n_hits = 0; if (show_threads) {/*{{{*/ char *tids; tids = new_array(char, db->n_msgs); memset(tids, 0, db->n_msgs); for (i=0; in_msgs; i++) { if (hit3[i]) { tids[db->tid_table[i]] = 1; } } for (i=0; in_msgs; i++) { if (tids[db->tid_table[i]]) { hit3[i] = 1; } } free(tids); } /*}}}*/ switch (ft) { case FT_MAILDIR:/*{{{*/ for (i=0; in_msgs; i++) { if (hit3[i]) { int is_seen, is_replied, is_flagged; get_flags_from_file(db, i, &is_seen, &is_replied, &is_flagged); switch (rd_msg_type(db, i)) { case DB_MSG_FILE: { char *target_path; char *message_path; int is_in_new; message_path = db->data + db->path_offsets[i]; is_in_new = looks_like_maildir_new_p(message_path); target_path = mk_maildir_path(i, output_path, is_in_new, is_seen, is_replied, is_flagged); create_symlink(message_path, target_path); free(target_path); ++n_hits; } break; case DB_MSG_MBOX: { char *target_path = mk_maildir_path(i, output_path, !is_seen, is_seen, is_replied, is_flagged); try_copy_to_path(db, i, target_path); free(target_path); ++n_hits; } break; case DB_MSG_DEAD: break; } } } break; /*}}}*/ case FT_MH:/*{{{*/ for (i=0; in_msgs; i++) { if (hit3[i]) { switch (rd_msg_type(db, i)) { case DB_MSG_FILE: { char *target_path = mk_mh_path(i, output_path); create_symlink(db->data + db->path_offsets[i], target_path); free(target_path); ++n_hits; } break; case DB_MSG_MBOX: { char *target_path = mk_mh_path(i, output_path); try_copy_to_path(db, i, target_path); free(target_path); ++n_hits; } break; case DB_MSG_DEAD: break; } } } break; /*}}}*/ case FT_MBOX:/*{{{*/ { FILE *out; out = fopen(output_path, "ab"); if (!out) { fprintf(stderr, "Cannot open output folder %s\n", output_path); unlock_and_exit(1); } for (i=0; in_msgs; i++) { if (hit3[i]) { switch (rd_msg_type(db, i)) { case DB_MSG_FILE: { append_file_to_mbox(db->data + db->path_offsets[i], out); ++n_hits; } break; case DB_MSG_MBOX: { append_mboxmsg_to_mbox(db, i, out); ++n_hits; } break; case DB_MSG_DEAD: break; } } } fclose(out); } break; /*}}}*/ case 
FT_RAW:/*{{{*/ for (i=0; in_msgs; i++) { if (hit3[i]) { switch (rd_msg_type(db, i)) { case DB_MSG_FILE: { ++n_hits; printf("%s\n", db->data + db->path_offsets[i]); } break; case DB_MSG_MBOX: { unsigned int mbix, msgix; int start, len, after_end; start = db->mtime_table[i]; len = db->size_table[i]; after_end = start + len; ++n_hits; decode_mbox_indices(db->path_offsets[i], &mbix, &msgix); printf("mbox:%s [%d,%d)\n", db->data + db->mbox_paths_table[mbix], start, after_end); } break; case DB_MSG_DEAD: break; } } } break; /*}}}*/ case FT_EXCERPT:/*{{{*/ for (i=0; in_msgs; i++) { if (hit3[i]) { struct rfc822 *parsed = NULL; switch (rd_msg_type(db, i)) { case DB_MSG_FILE: { char *filename; ++n_hits; printf("---------------------------------\n"); filename = db->data + db->path_offsets[i]; printf("%s\n", filename); parsed = make_rfc822(filename); } break; case DB_MSG_MBOX: { unsigned int mbix, msgix; int start, len, after_end; unsigned char *mbox_start, *msg_start; int mbox_len, msg_len; int mbox_index; start = db->mtime_table[i]; len = db->size_table[i]; after_end = start + len; ++n_hits; printf("---------------------------------\n"); decode_mbox_indices(db->path_offsets[i], &mbix, &msgix); printf("mbox:%s [%d,%d)\n", db->data + db->mbox_paths_table[mbix], start, after_end); get_validated_mbox_msg(db, i, &mbox_index, &mbox_start, &mbox_len, &msg_start, &msg_len); if (msg_start) { enum data_to_rfc822_error error; struct msg_src *msg_src; msg_src = setup_mbox_msg_src(db->data + db->mbox_paths_table[mbix], start, msg_len); parsed = data_to_rfc822(msg_src, (char *) msg_start, msg_len, &error); } if (mbox_start) { free_ro_mapping(mbox_start, mbox_len); } } break; case DB_MSG_DEAD: break; } if (parsed) { char datebuf[64]; struct tm *thetm; if (parsed->hdrs.to) printf(" To: %s\n", parsed->hdrs.to); if (parsed->hdrs.cc) printf(" Cc: %s\n", parsed->hdrs.cc); if (parsed->hdrs.from) printf(" From: %s\n", parsed->hdrs.from); if (parsed->hdrs.subject) printf(" Subject: %s\n", 
parsed->hdrs.subject); if (parsed->hdrs.message_id) printf(" Message-ID: %s\n", parsed->hdrs.message_id); thetm = gmtime(&parsed->hdrs.date); strftime(datebuf, sizeof(datebuf), "%a, %d %b %Y", thetm); printf(" Date: %s\n", datebuf); free_rfc822(parsed); } } } break; /*}}}*/ default: assert(0); break; } free(hit0); free(hit1); free(hit2); free(hit3); if ((ft != FT_RAW) && (ft != FT_EXCERPT)) { printf("Matched %d messages\n", n_hits); } fflush(stdout); if (had_failed_checksum) { fprintf(stderr, "WARNING : \n" "Matches were found in mbox folders but the message checksums failed.\n" "You may need to run mairix in indexing mode then repeat your search.\n"); } /* Return error code 1 to the shell if no messages were matched. */ return (n_hits == 0) ? 1 : 0; } /*}}}*/ static int directory_exists_remove_other(char *name)/*{{{*/ { struct stat sb; if (stat(name, &sb) < 0) { return 0; } if (S_ISDIR(sb.st_mode)) { return 1; } else { /* Try to remove. */ unlink(name); return 0; } } /*}}}*/ static void create_dir(char *path)/*{{{*/ { if (mkdir(path, 0700) < 0) { fprintf(stderr, "Could not create directory %s\n", path); unlock_and_exit(2); } fprintf(stderr, "Created directory %s\n", path); return; } /*}}}*/ static void maybe_create_maildir(char *path)/*{{{*/ { char *subdir, *tailpos; int len; if (!directory_exists_remove_other(path)) { create_dir(path); } len = strlen(path); subdir = new_array(char, len + 5); strcpy(subdir, path); strcpy(subdir+len, "/"); tailpos = subdir + len + 1; strcpy(tailpos,"cur"); if (!directory_exists_remove_other(subdir)) { create_dir(subdir); } strcpy(tailpos,"new"); if (!directory_exists_remove_other(subdir)) { create_dir(subdir); } strcpy(tailpos,"tmp"); if (!directory_exists_remove_other(subdir)) { create_dir(subdir); } free(subdir); return; } /*}}}*/ static void clear_maildir_subfolder(char *path, char *subdir)/*{{{*/ { char *sdir; char *fpath; int len; DIR *d; struct dirent *de; struct stat sb; len = strlen(path) + strlen(subdir); sdir = 
new_array(char, len + 2); fpath = new_array(char, len + 3 + NAME_MAX); strcpy(sdir, path); strcat(sdir, "/"); strcat(sdir, subdir); d = opendir(sdir); if (d) { while ((de = readdir(d))) { strcpy(fpath, sdir); strcat(fpath, "/"); strcat(fpath, de->d_name); if (lstat(fpath, &sb) >= 0) { /* Deal with both symlinks to maildir/MH messages as well as real files * where mbox messages have been written. */ if (S_ISLNK(sb.st_mode) || S_ISREG(sb.st_mode)) { /* FIXME : Can you unlink from a directory while doing a readdir loop over it? */ if (unlink(fpath) < 0) { fprintf(stderr, "Unlinking %s failed\n", fpath); } } } } closedir(d); } free(fpath); free(sdir); } /*}}}*/ static void clear_mh_folder(char *path)/*{{{*/ { char *fpath; int len; DIR *d; struct dirent *de; struct stat sb; len = strlen(path); fpath = new_array(char, len + 3 + NAME_MAX); d = opendir(path); if (d) { while ((de = readdir(d))) { if (valid_mh_filename_p(de->d_name)) { strcpy(fpath, path); strcat(fpath, "/"); strcat(fpath, de->d_name); if (lstat(fpath, &sb) >= 0) { /* See under maildir above for explanation */ if (S_ISLNK(sb.st_mode) || S_ISREG(sb.st_mode)) { /* FIXME : Can you unlink from a directory while doing a readdir loop over it? 
*/ if (unlink(fpath) < 0) { fprintf(stderr, "Unlinking %s failed\n", fpath); } } } } } closedir(d); } free(fpath); } /*}}}*/ static void clear_mbox_folder(char *path)/*{{{*/ { unlink(path); } /*}}}*/ int search_top(int do_threads, int do_augment, char *database_path, char *complete_mfolder, char **argv, enum folder_type ft, int verbose)/*{{{*/ { struct read_db *db; int result; db = open_db(database_path); switch (ft) { case FT_MAILDIR: maybe_create_maildir(complete_mfolder); break; case FT_MH: if (!directory_exists_remove_other(complete_mfolder)) { create_dir(complete_mfolder); } break; case FT_MBOX: /* Nothing to do */ break; case FT_RAW: case FT_EXCERPT: break; default: assert(0); } if (!do_augment) { switch (ft) { case FT_MAILDIR: clear_maildir_subfolder(complete_mfolder, "new"); clear_maildir_subfolder(complete_mfolder, "cur"); break; case FT_MH: clear_mh_folder(complete_mfolder); break; case FT_MBOX: clear_mbox_folder(complete_mfolder); break; case FT_RAW: case FT_EXCERPT: break; default: assert(0); } } result = do_search(db, argv, complete_mfolder, do_threads, ft, verbose); free(complete_mfolder); close_db(db); return result; } /*}}}*/ mairix-0.22/stats.c0000644001161100116110000000765311402542166014050 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. Curnow 2002-2004 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ #include "mairix.h" #include "memmac.h" #include "reader.h" static void do_toktable(struct toktable *x, int *lc, int *elc, int *ec, int size, int *ml, int *mel, int *me) { int i; for (i=0; isize; i++) { struct token *tok = x->tokens[i]; unsigned char *j, *last_char; int incr; if (tok) { int len = strlen(tok->text); if (len > size) { fprintf(stderr, "Token length %d exceeds size\n", len); } else { lc[len]++; if (len > *ml) *ml = len; } /* Deal with encoding length */ if (tok->match0.n > size) { fprintf(stderr, "Token encoding length %d exceeds size\n", tok->match0.n); } else { elc[tok->match0.n]++; if (tok->match0.n > *mel) *mel = tok->match0.n; } /* Deal with encoding */ j = tok->match0.msginfo; last_char = j + tok->match0.n; while (j < last_char) { incr = read_increment(&j); if (incr > size) { fprintf(stderr, "Encoding increment %d exceeds size\n", incr); } else { ec[incr]++; if (incr > *me) *me = incr; } } } } } void print_table(int *x, int max) { int total, sum; int i; int kk, kk1; total = 0; for (i = 0; i<=max; i++) { total += x[i]; } sum = 0; kk1 = 0; for (i = 0; i<=max; i++) { sum += x[i]; kk = (int)((double)sum*256.0/(double)total); printf("%5d : %5d %3d %3d\n", i, x[i], kk-kk1, kk); kk1 = kk; } } void get_db_stats(struct database *db) { /* Deal with paths later - problem is, they will be biased by length of folder_base at the moment. 
*/ int size = 4096; int *len_counts, *enc_len_counts, *enc_counts; int max_len, max_enc_len, max_enc; max_len = 0; max_enc_len = 0; max_enc = 0; len_counts = new_array(int, size); memset(len_counts, 0, size * sizeof(int)); enc_len_counts = new_array(int, size); memset(enc_len_counts, 0, size * sizeof(int)); enc_counts = new_array(int, size); memset(enc_counts, 0, size * sizeof(int)); do_toktable(db->to, len_counts, enc_len_counts, enc_counts, size, &max_len, &max_enc_len, &max_enc); do_toktable(db->cc, len_counts, enc_len_counts, enc_counts, size, &max_len, &max_enc_len, &max_enc); do_toktable(db->from, len_counts, enc_len_counts, enc_counts, size, &max_len, &max_enc_len, &max_enc); do_toktable(db->subject, len_counts, enc_len_counts, enc_counts, size, &max_len, &max_enc_len, &max_enc); do_toktable(db->body, len_counts, enc_len_counts, enc_counts, size, &max_len, &max_enc_len, &max_enc); #if 0 /* no longer works now that the msg_ids table has 2 encoding chains. fix * this when required. */ do_toktable(db->msg_ids, len_counts, enc_len_counts, enc_counts, size, &max_len, &max_enc_len, &max_enc); #endif printf("Max token length : %d\n", max_len); print_table(len_counts, max_len); printf("Max encoding vector length : %d\n", max_enc_len); print_table(enc_len_counts, max_enc_len); printf("Max encoding increment : %d\n", max_enc); print_table(enc_counts, max_enc); return; } mairix-0.22/memmac.h0000644001161100116110000000422011402542166014141 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. Curnow 2002-2004 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. 
* * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ #ifndef MEMMAC_H #define MEMMAC_H /*{{{ Safe alloc helpers (GCC extensions) */ extern void out_of_mem(char *file, int line, size_t size); #undef TEST_OOM #ifdef TEST_OOM extern int total_bytes; #endif static __inline__ void* safe_malloc(char *file, int line, size_t s)/*{{{*/ { void *x = malloc(s); #ifdef TEST_OOM total_bytes += s; if (total_bytes > 131072) x = NULL; #endif if (!x) out_of_mem(file, line, s); return x; } /*}}}*/ static __inline__ void* safe_realloc(char *file, int line, void *old_ptr, size_t s)/*{{{*/ { void *x = realloc(old_ptr, s); if (!x) out_of_mem(file, line, s); return x; } /*}}}*/ #ifndef TEST #define Malloc(s) safe_malloc(__FILE__, __LINE__, s) #define Realloc(xx,s) safe_realloc(__FILE__, __LINE__,xx,s) #else #define Malloc(s) malloc(s) #define Realloc(xx,s) realloc(xx,s) #endif /*}}}*/ /*{{{ Memory macros*/ #define new_string(s) strcpy((char *) Malloc(1+strlen(s)), (s)) #define extend_string(x,s) (strcat(Realloc(x, (strlen(x)+strlen(s)+1)), s)) #define new(T) (T *) Malloc(sizeof(T)) #define new_array(T, n) (T *) Malloc(sizeof(T) * (n)) #define grow_array(T, n, oldX) (T *) ((oldX) ? Realloc(oldX, (sizeof(T) * (n))) : Malloc(sizeof(T) * (n))) #define EMPTY(x) {&(x), &(x)} /*}}}*/ #endif /* MEMMAC_H */ mairix-0.22/glob.c0000644001161100116110000002207311402542166013626 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. 
********************************************************************** * Copyright (C) Richard P. Curnow 2003,2004,2005 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ #include #include #include #include #include "mairix.h" struct globber { unsigned int pat[256]; unsigned int starpat; unsigned int twostarpat; unsigned int hit; }; struct globber_array { int n; struct globber **globs; }; static const char *parse_charclass(const char *in, struct globber *result, unsigned int mask)/*{{{*/ { int first = 1; int prev = -1; in++; /* Advance over '[' */ while (*in) { if (*in == ']') { if (first) { result->pat[(int)']'] |= mask; } else { return in; } } else if (*in == '-') { /* Maybe range */ if ((prev < 0) || !in[1] || (in[1]==']')) { /* - at either end of string (or right after an earlier range) means * normal - */ result->pat['-'] |= mask; } else { int next = in[1]; int hi, lo; int i; /* Cope with range being inverted */ if (prev < next) { lo = prev, hi = next; } else { lo = next, hi = prev; } for (i=lo; i<=hi; i++) { int index = 0xff & i; result->pat[index] |= mask; } /* require 1 extra increment */ in++; prev = -1; /* Avoid junk like [a-e-z] */ } } else { int index = 0xff & (int)*in; result->pat[index] |= mask; } prev = *in; first = 0; in++; } return in; } /*}}}*/ struct globber *make_globber(const char 
*wildstring)/*{{{*/ { struct globber *result; int n, i; const char *p; char c; int index; unsigned int mask; result = new(struct globber); memset(&result->pat, 0x00, 256*sizeof(unsigned int)); memset(&result->starpat, 0x00, sizeof(unsigned int)); memset(&result->twostarpat, 0x00, sizeof(unsigned int)); mask = 0x1; n = 0; for (p=wildstring; *p; p++) { mask = 1<twostarpat |= mask; p++; } else { /* Match zero or more of anything */ result->starpat |= mask; } break; /*}}}*/ case '[':/*{{{*/ p = parse_charclass(p, result, mask); n++; break; /*}}}*/ case '?':/*{{{*/ for (i=0; i<256; i++) { result->pat[i] |= mask; } n++; break; /*}}}*/ default:/*{{{*/ index = 0xff & (int)c; result->pat[index] |= mask; n++; break; /*}}}*/ } } result->hit = (1<pat[index]); #endif stars = (reg & g->starpat); twostars = (reg & g->twostarpat); if (index != '/') { stars2 = stars | twostars; } else { stars2 = twostars; } reg &= g->pat[index]; reg <<= 1; reg |= stars2; #if DODEBUG printf(" new_reg=%08lx ", reg); printf("starpat=%08lx stars=%08lx stars2=%08lx\n", g->starpat, stars, stars2); #endif s++; } #if DODEBUG printf("reg=%08lx hit=%08lx\n", reg, g->hit); #endif reg &= g->hit; if (reg) { return 1; } else { return 0; } } /*}}}*/ struct globber_array *colon_sep_string_to_globber_array(const char *in)/*{{{*/ { char **strings; int n_strings; int i; struct globber_array *result; split_on_colons(in, &n_strings, &strings); result = new(struct globber_array); result->n = n_strings; result->globs = new_array(struct globber *, n_strings); for (i=0; iglobs[i] = make_globber(strings[i]); free(strings[i]); } free(strings); return result; } /*}}}*/ int is_globber_array_match(struct globber_array *ga, const char *s)/*{{{*/ { int i; if (!ga) return 0; for (i=0; in; i++) { if (is_glob_match(ga->globs[i], s)) return 1; } return 0; } /*}}}*/ void free_globber_array(struct globber_array *in)/*{{{*/ { int i; for (i=0; in; i++) { free_globber(in->globs[i]); } free(in); } /*}}}*/ static char 
*copy_folder_name(const char *start, const char *end)/*{{{*/ { /* 'start' points to start of string to copy. Any '\:' sequence is replaced by ':' . Otherwise \ is treated normally. 'end' can be 1 beyond the end of the string to copy. Otherwise it can be null, meaning treat 'start' as the start of a normal null-terminated string. */ char *p; const char *q; int len; char *result; if (end) { len = end - start; } else { len = strlen(start); } result = new_array(char, len + 1); for (p=result, q=start; end ? (q < end) : *q; q++) { if ((q[0] == '\\') && (q[1] == ':')) { /* Escaped colon : drop the backslash */ } else { *p++ = *q; } } *p = '\0'; return result; } /*}}}*/ void string_list_to_array(struct string_list *list, int *n, char ***arr)/*{{{*/ { int N, i; struct string_list *a, *next_a; char **result; for (N=0, a=list->next; a!=list; a=a->next, N++) ; result = new_array(char *, N); for (i=0, a=list->next; idata; next_a = a->next; free(a); } *n = N; *arr = result; } /*}}}*/ void split_on_colons(const char *str, int *n, char ***arr)/*{{{*/ { struct string_list list, *new_cell; const char *left_to_do; list.next = list.prev = &list; left_to_do = str; do { char *colon; char *xx; colon = strchr(left_to_do, ':'); /* Allow backslash-escaped colons in filenames */ if (colon && (colon > left_to_do) && (colon[-1]=='\\')) { int is_escaped; do { colon = strchr(colon + 1, ':'); is_escaped = (colon && (colon[-1] == '\\')); } while (colon && is_escaped); } /* 'colon' now points to the first non-escaped colon or is null if there were no more such colons in the rest of the line. 
*/ xx = copy_folder_name(left_to_do, colon); if (colon) { left_to_do = colon + 1; } else { while (*left_to_do) ++left_to_do; } new_cell = new(struct string_list); new_cell->data = xx; new_cell->next = &list; new_cell->prev = list.prev; list.prev->next = new_cell; list.prev = new_cell; } while (*left_to_do); string_list_to_array(&list, n, arr); } /*}}}*/ #if defined (TEST) void run1(char *ref, char *s, int expected)/*{{{*/ { struct globber *g; int result; g = make_globber(ref); result = is_glob_match(g, s); printf("ref=%s, str=%s, %s %s\n", ref, s, result ? "MATCHED" : "not matched", (expected==result) ? "" : "??????"); free_globber(g); } /*}}}*/ int main (int argc, char **argv)/*{{{*/ { run1("ab?de", "abdde", 1); run1("ab?de", "abcde", 1); run1("ab?de", "Abcde", 0); run1("ab?de", "abcd", 0); run1("ab?de", "abc", 0); run1("ab[cd]de", "abdde", 1); run1("ab[cd]de", "abbde", 0); run1("ab[cd]de", "abcde", 1); run1("ab*de", "ade", 0); run1("ab*de", "abde", 1); run1("ab*de", "abcde", 1); run1("ab*de", "abccde", 1); run1("ab*de", "abccdfde", 1); run1("ab*de", "abccdedf", 0); run1("ab[b-d]de", "abade",0); run1("ab[b-d]de", "abcDe",0); run1("ab[b-d]de", "abcde",1); run1("ab[b-d]de", "abdde",1); run1("ab[b-d]de", "abEde", 0); run1("[a-z][0-9A-F][]a-f-]", "yE]", 1); run1("[a-z][0-9A-F][]a-f-]", "uE[", 0); run1("[a-z][0-9A-F][]a-f-]", "vG-", 0); run1("[a-z][0-9A-F][]a-f-]", "w8-", 1); run1("*", "a", 1); run1("*", "", 1); run1("a*", "a", 1); run1("a*", "aa", 1); run1("a*", "aaA", 1); run1("*a", "aaa", 1); run1("*a", "a", 1); run1("x*abc", "xabdxabc", 1); run1("*", "", 1); run1("a*", "", 0); run1("*a", "", 0); run1("a", "", 0); run1("*abc*", "x/abc/y", 0); run1("**abc**", "x/abc/y", 1); run1("x/*/abc**", "x/z/abc/y", 1); run1("x/*/abc**", "x/z/w/abc/y", 0); run1("x/*/abc**", "x/zz/w/abc/y", 0); run1("x/*/abc**", "x/z/ww/abc/y", 0); run1("x/**/abc**", "x/z/w/abc/y", 1); run1("x/**/abc**", "x/zz/w/abc/y", 1); return 0; } /*}}}*/ #endif 
mairix-0.22/db.c0000644001161100116110000011110011402542166013256 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. Curnow 2002,2003,2004,2005,2006,2007,2009 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ /* Handle complete database */ #include "mairix.h" #include "reader.h" #include #include #include #include struct sortable_token {/*{{{*/ char *text; int index; }; /*}}}*/ static int compare_sortable_tokens(const void *a, const void *b)/*{{{*/ { const struct sortable_token *aa = (const struct sortable_token *) a; const struct sortable_token *bb = (const struct sortable_token *) b; int foo; foo = strcmp(aa->text, bb->text); if (foo) { return foo; } else { if (aa->index < bb->index) return -1; else if (aa->index > bb->index) return +1; else return 0; } } /*}}}*/ static void check_toktable_enc_integrity(int n_msgs, struct toktable *table)/*{{{*/ { /* FIXME : Check reachability of tokens that are displaced from their natural * hash bucket (if deletions have occurred during purge). 
*/ int idx, incr; int i, k; unsigned char *j, *last_char; int broken_chains = 0; struct sortable_token *sort_list; int any_duplicates; for (i=0; isize; i++) { struct token *tok = table->tokens[i]; if (tok) { idx = 0; incr = 0; last_char = tok->match0.msginfo + tok->match0.n; for (j = tok->match0.msginfo; j < last_char; ) { incr = read_increment(&j); idx += incr; } if (idx != tok->match0.highest) { fprintf(stderr, "broken encoding chain for token <%s>, highest=%ld\n", tok->text, tok->match0.highest); fflush(stderr); broken_chains = 1; } if (idx >= n_msgs) { fprintf(stderr, "end of chain higher than number of message paths (%d) for token <%s>\n", n_msgs, tok->text); fflush(stderr); broken_chains = 1; } } } assert(!broken_chains); /* Check there are no duplicated tokens in the table. */ sort_list = new_array(struct sortable_token, table->n); k = 0; for (i=0; isize; i++) { struct token *tok = table->tokens[i]; if (tok) { sort_list[k].text = new_string(tok->text); sort_list[k].index = i; k++; } } assert(k == table->n); qsort(sort_list, table->n, sizeof(struct sortable_token), compare_sortable_tokens); /* Check for uniqueness of neighbouring token texts */ any_duplicates = 0; for (i=0; i<(table->n - 1); i++) { if (!strcmp(sort_list[i].text, sort_list[i+1].text)) { fprintf(stderr, "Token table contains duplicated token %s at indices %d and %d\n", sort_list[i].text, sort_list[i].index, sort_list[i+1].index); any_duplicates = 1; } } /* release */ for (i=0; in; i++) { free(sort_list[i].text); } free(sort_list); if (any_duplicates) { fprintf(stderr, "Token table contained duplicate entries, aborting\n"); assert(0); } } /*}}}*/ static int compare_strings(const void *a, const void *b)/*{{{*/ { const char **aa = (const char **) a; const char **bb = (const char **) b; return strcmp(*aa, *bb); } /*}}}*/ static void check_message_path_integrity(struct database *db)/*{{{*/ { /* TODO : for now only checks integrity of non-mbox paths. 
*/ /* Check there are no duplicates */ int i; int n; int has_duplicate = 0; char **paths; paths = new_array(char *, db->n_msgs); for (i=0, n=0; in_msgs; i++) { switch (db->type[i]) { case MTY_DEAD: case MTY_MBOX: break; case MTY_FILE: paths[n++] = db->msgs[i].src.mpf.path; break; } } qsort(paths, n, sizeof(char *), compare_strings); for (i=1; i repeated\n", paths[i]); has_duplicate = 1; } } fflush(stderr); assert(!has_duplicate); free(paths); return; } /*}}}*/ void check_database_integrity(struct database *db)/*{{{*/ { if (verbose) fprintf(stderr, "Checking message path integrity\n"); check_message_path_integrity(db); /* Just check encoding chains for now */ if (verbose) fprintf(stderr, "Checking to\n"); check_toktable_enc_integrity(db->n_msgs, db->to); if (verbose) fprintf(stderr, "Checking cc\n"); check_toktable_enc_integrity(db->n_msgs, db->cc); if (verbose) fprintf(stderr, "Checking from\n"); check_toktable_enc_integrity(db->n_msgs, db->from); if (verbose) fprintf(stderr, "Checking subject\n"); check_toktable_enc_integrity(db->n_msgs, db->subject); if (verbose) fprintf(stderr, "Checking body\n"); check_toktable_enc_integrity(db->n_msgs, db->body); if (verbose) fprintf(stderr, "Checking attachment_name\n"); check_toktable_enc_integrity(db->n_msgs, db->attachment_name); } /*}}}*/ struct database *new_database(void)/*{{{*/ { struct database *result = new(struct database); struct timeval tv; pid_t pid; result->to = new_toktable(); result->cc = new_toktable(); result->from = new_toktable(); result->subject = new_toktable(); result->body = new_toktable(); result->attachment_name = new_toktable(); result->msg_ids = new_toktable2(); gettimeofday(&tv, NULL); pid = getpid(); result->hash_key = tv.tv_sec ^ (pid ^ (tv.tv_usec << 15)); result->msgs = NULL; result->type = NULL; result->n_msgs = 0; result->max_msgs = 0; result->mboxen = NULL; result->n_mboxen = 0; result->max_mboxen = 0; return result; } /*}}}*/ void free_database(struct database *db)/*{{{*/ { int i; 
free_toktable(db->to); free_toktable(db->cc); free_toktable(db->from); free_toktable(db->subject); free_toktable(db->body); free_toktable(db->attachment_name); free_toktable2(db->msg_ids); if (db->msgs) { for (i=0; in_msgs; i++) { switch (db->type[i]) { case MTY_DEAD: break; case MTY_MBOX: break; case MTY_FILE: assert(db->msgs[i].src.mpf.path); free(db->msgs[i].src.mpf.path); break; } } free(db->msgs); free(db->type); } free(db); } /*}}}*/ static int get_max (int a, int b) {/*{{{*/ return (a > b) ? a : b; } /*}}}*/ static void import_toktable(char *data, unsigned int hash_key, int n_msgs, struct toktable_db *in, struct toktable *out)/*{{{*/ { int n, size, i; n = in->n; size = 1; while (size < n) size <<= 1; size <<= 1; /* safe hash table size */ out->size = size; out->mask = size - 1; out->n = n; out->tokens = new_array(struct token *, size); memset(out->tokens, 0, size * sizeof(struct token *)); out->hwm = (n + size) >> 1; for (i=0; ienc_offsets[i]; idx = 0; for (j = enc; *j != 0xff; ) { incr = read_increment(&j); idx += incr; } enc_len = j - enc; enc_hi = idx; text = data + in->tok_offsets[i]; hash = hashfn((unsigned char *) text, strlen(text), hash_key); nt = new(struct token); nt->hashval = hash; nt->text = new_string(text); /* Allow a bit of headroom for adding more entries later */ nt->match0.max = get_max(16, enc_len + (enc_len >> 1)); nt->match0.n = enc_len; nt->match0.highest = enc_hi; assert(nt->match0.highest < n_msgs); nt->match0.msginfo = new_array(unsigned char, nt->match0.max); memcpy(nt->match0.msginfo, enc, nt->match0.n); index = hash & out->mask; while (out->tokens[index]) { /* Audit to look for corrupt database with multiple entries for the same * string. */ if (!strcmp(nt->text, out->tokens[index]->text)) { fprintf(stderr, "\n!!! 
Corrupt token table found in database, token <%s> duplicated, aborting\n", nt->text); fprintf(stderr, " Delete the database file and rebuild from scratch as a workaround\n"); /* No point going on - need to find out why the database got corrupted * in the 1st place. Workaround for user - rebuild database from * scratch by deleting it then rerunning. */ unlock_and_exit(1); } ++index; index &= out->mask; } out->tokens[index] = nt; } } /*}}}*/ static void import_toktable2(char *data, unsigned int hash_key, int n_msgs, struct toktable2_db *in, struct toktable2 *out)/*{{{*/ { int n, size, i; n = in->n; size = 1; while (size < n) size <<= 1; size <<= 1; /* safe hash table size */ out->size = size; out->mask = size - 1; out->n = n; out->tokens = new_array(struct token2 *, size); memset(out->tokens, 0, size * sizeof(struct token *)); out->hwm = (n + size) >> 1; for (i=0; ienc0_offsets[i]; idx = 0; for (j = enc0; *j != 0xff; ) { incr = read_increment(&j); idx += incr; } enc0_len = j - enc0; enc0_hi = idx; /*}}}*/ /*{{{ do enc1*/ enc1 = (unsigned char *) data + in->enc1_offsets[i]; idx = 0; for (j = enc1; *j != 0xff; ) { incr = read_increment(&j); idx += incr; } enc1_len = j - enc1; enc1_hi = idx; /*}}}*/ text = data + in->tok_offsets[i]; hash = hashfn((unsigned char *) text, strlen(text), hash_key); nt = new(struct token2); nt->hashval = hash; nt->text = new_string(text); /* Allow a bit of headroom for adding more entries later */ /*{{{ set up match0 chain */ nt->match0.max = get_max(16, enc0_len + (enc0_len >> 1)); nt->match0.n = enc0_len; nt->match0.highest = enc0_hi; assert(nt->match0.highest < n_msgs); nt->match0.msginfo = new_array(unsigned char, nt->match0.max); memcpy(nt->match0.msginfo, enc0, nt->match0.n); /*}}}*/ /*{{{ set up match1 chain */ nt->match1.max = get_max(16, enc1_len + (enc1_len >> 1)); nt->match1.n = enc1_len; nt->match1.highest = enc1_hi; assert(nt->match1.highest < n_msgs); nt->match1.msginfo = new_array(unsigned char, nt->match1.max); 
memcpy(nt->match1.msginfo, enc1, nt->match1.n); /*}}}*/ index = hash & out->mask; while (out->tokens[index]) { ++index; index &= out->mask; } out->tokens[index] = nt; } } /*}}}*/ struct database *new_database_from_file(char *db_filename, int do_integrity_checks)/*{{{*/ { /* Read existing database from file for doing incremental update */ struct database *result; struct read_db *input; int i, n, N; result = new_database(); input = open_db(db_filename); if (!input) { /* Nothing to initialise */ if (verbose) printf("Database file was empty, creating a new database\n"); return result; } /* Build pathname information */ n = result->n_msgs = input->n_msgs; result->max_msgs = input->n_msgs; /* let it be extended as-and-when */ result->msgs = new_array(struct msgpath, n); result->type = new_array(enum message_type, n); result->hash_key = input->hash_key; /* Set up mbox structures */ N = result->n_mboxen = result->max_mboxen = input->n_mboxen; result->mboxen = N ? (new_array(struct mbox, N)) : NULL; for (i=0; imbox_paths_table[i]) { result->mboxen[i].path = new_string(input->data + input->mbox_paths_table[i]); } else { /* mbox is dead. */ result->mboxen[i].path = NULL; } result->mboxen[i].file_mtime = input->mbox_mtime_table[i]; result->mboxen[i].file_size = input->mbox_size_table[i]; nn = result->mboxen[i].n_msgs = input->mbox_entries_table[i]; result->mboxen[i].max_msgs = nn; result->mboxen[i].start = new_array(off_t, nn); result->mboxen[i].len = new_array(size_t, nn); result->mboxen[i].check_all = new_array(checksum_t, nn); /* Copy the entire checksum table in one go. 
*/ memcpy(result->mboxen[i].check_all, input->data + input->mbox_checksum_table[i], nn * sizeof(checksum_t)); result->mboxen[i].n_so_far = 0; } for (i=0; itype[i] = MTY_DEAD; break; case DB_MSG_FILE: result->type[i] = MTY_FILE; result->msgs[i].src.mpf.path = new_string(input->data + input->path_offsets[i]); result->msgs[i].src.mpf.mtime = input->mtime_table[i]; result->msgs[i].src.mpf.size = input->size_table[i]; break; case DB_MSG_MBOX: { unsigned int mbi, msgi; int n; struct mbox *mb; result->type[i] = MTY_MBOX; decode_mbox_indices(input->path_offsets[i], &mbi, &msgi); result->msgs[i].src.mbox.file_index = mbi; mb = &result->mboxen[mbi]; assert(mb->n_so_far == msgi); n = mb->n_so_far; result->msgs[i].src.mbox.msg_index = n; mb->start[n] = input->mtime_table[i]; mb->len[n] = input->size_table[i]; ++mb->n_so_far; } break; } result->msgs[i].seen = (input->msg_type_and_flags[i] & FLAG_SEEN) ? 1:0; result->msgs[i].replied = (input->msg_type_and_flags[i] & FLAG_REPLIED) ? 1:0; result->msgs[i].flagged = (input->msg_type_and_flags[i] & FLAG_FLAGGED) ? 
1:0; result->msgs[i].date = input->date_table[i]; result->msgs[i].tid = input->tid_table[i]; } import_toktable(input->data, input->hash_key, result->n_msgs, &input->to, result->to); import_toktable(input->data, input->hash_key, result->n_msgs, &input->cc, result->cc); import_toktable(input->data, input->hash_key, result->n_msgs, &input->from, result->from); import_toktable(input->data, input->hash_key, result->n_msgs, &input->subject, result->subject); import_toktable(input->data, input->hash_key, result->n_msgs, &input->body, result->body); import_toktable(input->data, input->hash_key, result->n_msgs, &input->attachment_name, result->attachment_name); import_toktable2(input->data, input->hash_key, result->n_msgs, &input->msg_ids, result->msg_ids); close_db(input); if (do_integrity_checks) { check_database_integrity(result); } return result; } /*}}}*/ static void add_angled_terms(int file_index, unsigned int hash_key, struct toktable2 *table, int add_to_chain1, char *s)/*{{{*/ { char *left, *right; if (s) { left = strchr(s, '<'); while (left) { right = strchr(left, '>'); if (right) { *right = '\0'; add_token2_in_file(file_index, hash_key, left+1, table, add_to_chain1); *right = '>'; /* restore */ } else { break; } left = strchr(right, '<'); } } } /*}}}*/ /* Macro for what characters can make up token strings. The following characters have special meanings: 0x2b + 0x2d - 0x2e . 0x40 @ 0x5f _ since they can occur within email addresses and message IDs when considered as a whole rather than as individual words. Underscore (0x5f) is considered a word-character always too. 
*/ static unsigned char special_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00-0f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10-1f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, /* 20-2f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30-3f */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40-4f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, /* 50-5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60-6f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70-7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0-af */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0-bf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c0-cf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* d0-df */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* e0-ef */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* f0-ff */ }; #if 0 #define CHAR_VALID(x,mask) (isalnum((unsigned char) x) || (special_table[(unsigned int)(unsigned char) x] & mask)) #endif static inline int char_valid_p(char x, unsigned int mask)/*{{{*/ { unsigned char xx = (unsigned char) x; if (isalnum(xx)) return 1; else if (special_table[(unsigned int) xx] & mask) return 1; else return 0; } /*}}}*/ static void tokenise_string(int file_index, unsigned int hash_key, struct toktable *table, char *data, int match_mask)/*{{{*/ { char *ss, *es, old_es; ss = data; for (;;) { while (*ss && !char_valid_p(*ss,match_mask)) ss++; if (!*ss) break; es = ss + 1; while (*es && char_valid_p(*es,match_mask)) es++; /* deal with token [ss,es) */ old_es = *es; *es = '\0'; /* FIXME: Ought to do this by passing start and length - clean up later */ add_token_in_file(file_index, hash_key, ss, table); *es = old_es; if (!*es) break; ss = es; } } /*}}}*/ static void tokenise_html_string(int file_index, unsigned int hash_key, struct toktable *table, char 
*data)/*{{{*/ { char *ss, *es, old_es; /* FIXME : Probably want to rewrite this as an explicit FSM */ ss = data; for (;;) { /* Assume < and > are never valid token characters ! */ while (*ss && !char_valid_p(*ss, 1)) { if (*ss++ == '<') { /* Skip over HTML tag */ while (*ss && (*ss != '>')) ss++; } } if (!*ss) break; es = ss + 1; while (*es && char_valid_p(*es, 1)) es++; /* deal with token [ss,es) */ old_es = *es; *es = '\0'; /* FIXME: Ought to do this by passing start and length - clean up later */ add_token_in_file(file_index, hash_key, ss, table); *es = old_es; if (!*es) break; ss = es; } } /*}}}*/ void tokenise_message(int file_index, struct database *db, struct rfc822 *msg)/*{{{*/ { struct attachment *a; /* Match on whole addresses in these headers as well as the individual words */ if (msg->hdrs.to) { tokenise_string(file_index, db->hash_key, db->to, msg->hdrs.to, 1); tokenise_string(file_index, db->hash_key, db->to, msg->hdrs.to, 2); } if (msg->hdrs.cc) { tokenise_string(file_index, db->hash_key, db->cc, msg->hdrs.cc, 1); tokenise_string(file_index, db->hash_key, db->cc, msg->hdrs.cc, 2); } if (msg->hdrs.from) { tokenise_string(file_index, db->hash_key, db->from, msg->hdrs.from, 1); tokenise_string(file_index, db->hash_key, db->from, msg->hdrs.from, 2); } if (msg->hdrs.subject) tokenise_string(file_index, db->hash_key, db->subject, msg->hdrs.subject, 1); for (a=msg->atts.next; a!=&msg->atts; a=a->next) { switch (a->ct) { case CT_TEXT_PLAIN: tokenise_string(file_index, db->hash_key, db->body, a->data.normal.bytes, 1); break; case CT_TEXT_HTML: tokenise_html_string(file_index, db->hash_key, db->body, a->data.normal.bytes); break; case CT_MESSAGE_RFC822: /* Just recurse for now - maybe we should have separate token tables * for tokens occurring in embedded messages? */ if (a->data.rfc822) { tokenise_message(file_index, db, a->data.rfc822); } break; default: /* Don't do anything - unknown text format or some nasty binary stuff. 
* In future, we could have all kinds of 'plug-ins' here, e.g. * something that can parse PDF to get the basic text strings out of * the pages? */ break; } if (a->filename) { add_token_in_file(file_index, db->hash_key, a->filename, db->attachment_name); } } /* Deal with threading information */ add_angled_terms(file_index, db->hash_key, db->msg_ids, 1, msg->hdrs.message_id); add_angled_terms(file_index, db->hash_key, db->msg_ids, 0, msg->hdrs.in_reply_to); add_angled_terms(file_index, db->hash_key, db->msg_ids, 0, msg->hdrs.references); } /*}}}*/ static void scan_maildir_flags(struct msgpath *m)/*{{{*/ { const char *p, *start; start = m->src.mpf.path; m->seen = 0; m->replied = 0; m->flagged = 0; for (p=start; *p; p++) {} for (p--; (p >= start) && ((*p) != ':'); p--) {} if (p >= start) { if (!strncmp(p, ":2,", 3)) { p += 3; while (*p) { switch (*p) { case 'F': m->flagged = 1; break; case 'R': m->replied = 1; break; case 'S': m->seen = 1; break; default: break; } p++; } } } } /*}}}*/ static void scan_new_messages(struct database *db, int start_at)/*{{{*/ { int i; for (i=start_at; in_msgs; i++) { struct rfc822 *msg = NULL; switch (db->type[i]) { case MTY_DEAD: assert(0); break; case MTY_MBOX: assert(0); /* Should never get here - mbox messages are scanned elsewhere. */ break; case MTY_FILE: if (verbose) fprintf(stderr, "Scanning <%s>\n", db->msgs[i].src.mpf.path); msg = make_rfc822(db->msgs[i].src.mpf.path); break; } if(msg) { db->msgs[i].date = msg->hdrs.date; scan_maildir_flags(&db->msgs[i]); tokenise_message(i, db, msg); free_rfc822(msg); } else fprintf(stderr, "Skipping %s (could not parse message)\n", db->msgs[i].src.mpf.path); } } /*}}}*/ static inline void set_bit(unsigned long *x, int n)/*{{{*/ { int set; unsigned long mask; set = (n >> 5); mask = (1UL << (n & 31)); x[set] |= mask; } /*}}}*/ static inline int isset_bit(unsigned long *x, int n)/*{{{*/ { int set; unsigned long mask; set = (n >> 5); mask = (1UL << (n & 31)); return (x[set] & mask) ? 
1 : 0; } /*}}}*/ static int find_base(int *table, int index) {/*{{{*/ int a = index; /* TODO : make this compress the path lengths down to the base entry */ while (table[a] != a) { a = table[a]; } return a; } /*}}}*/ static void find_threading(struct database *db)/*{{{*/ { /* ix is a table mapping path array index to the lowest path array index that * is known to share at least one message ID in its hdrs somewhere (i.e. they * must be in the same thread) */ int *ix; int i, m, np, nm, sm; int next_tid; np = db->n_msgs; nm = db->msg_ids->n; sm = db->msg_ids->size; ix = new_array(int, np); for (i=0; imsg_ids->tokens[m]; if (tok) { unsigned char *j = tok->match0.msginfo; unsigned char *last_char = j + tok->match0.n; int cur = 0, incr, first=1; int new_base=-1, old_base; while (j < last_char) { incr = read_increment(&j); cur += incr; if (first) { new_base = find_base(ix, cur); first = 0; } else { old_base = find_base(ix, cur); if (old_base < new_base) { ix[new_base] = old_base; new_base = old_base; } else if (old_base > new_base) { assert(new_base != -1); ix[old_base] = new_base; } } } } } /* Now make each entry point directly to its base */ for (i=0; imsgs[i].tid = next_tid++; } else { db->msgs[i].tid = db->msgs[ix[i]].tid; } } free(ix); return; } /*}}}*/ static int lookup_msgpath(struct msgpath *sorted_paths, int n_msgs, char *key)/*{{{*/ { /* Implement bisection search */ int l, h, m, r; l = 0, h = n_msgs; m = -1; while (h > l) { m = (h + l) >> 1; /* Should only get called on 'file' type messages - TBC */ r = strcmp(sorted_paths[m].src.mpf.path, key); if (r == 0) break; if (l == m) return -1; if (r > 0) h = m; else l = m; } return m; } /*}}}*/ void maybe_grow_message_arrays(struct database *db)/*{{{*/ { if (db->n_msgs == db->max_msgs) { if (db->max_msgs <= 128) { db->max_msgs = 256; } else { db->max_msgs += (db->max_msgs >> 1); } db->msgs = grow_array(struct msgpath, db->max_msgs, db->msgs); db->type = grow_array(enum message_type, db->max_msgs, db->type); } } 
/*}}}*/ static void add_msg_path(struct database *db, char *path, time_t mtime, size_t message_size)/*{{{*/ { maybe_grow_message_arrays(db); db->type[db->n_msgs] = MTY_FILE; db->msgs[db->n_msgs].src.mpf.path = new_string(path); db->msgs[db->n_msgs].src.mpf.mtime = mtime; db->msgs[db->n_msgs].src.mpf.size = message_size; ++db->n_msgs; } /*}}}*/ static int do_stat(struct msgpath *mp)/*{{{*/ { struct stat sb; int status; status = stat(mp->src.mpf.path, &sb); if ((status < 0) || !S_ISREG(sb.st_mode)) { return 0; } else { mp->src.mpf.mtime = sb.st_mtime; mp->src.mpf.size = sb.st_size; return 1; } } /*}}}*/ int update_database(struct database *db, struct msgpath *sorted_paths, int n_msgs, int do_fast_index)/*{{{*/ { /* The incoming list must be sorted into order, to make binary searching * possible. We search for each existing path in the incoming sorted array. * If the date differs, or the file no longer exist, the existing database * entry for that file is nulled. (These are only recovered if the database * is actively compressed.) If the date differed, a new entry for the file * is put at the end of the list. Similarly, any new file goes at the end. * These new entries are all rescanned to find tokens and add them to the * database. */ char *file_in_db, *file_in_new_list; int matched_index; int i, new_entries_start_at; int any_new, n_newly_pruned, n_already_dead; int status; file_in_db = new_array(char, n_msgs); file_in_new_list = new_array(char, db->n_msgs); bzero(file_in_db, n_msgs); bzero(file_in_new_list, db->n_msgs); n_already_dead = 0; n_newly_pruned = 0; for (i=0; in_msgs; i++) { switch (db->type[i]) { case MTY_FILE: matched_index = lookup_msgpath(sorted_paths, n_msgs, db->msgs[i].src.mpf.path); if (matched_index >= 0) { if (do_fast_index) { /* Assume the presence of a matching path is good enough without * even bothering to stat the file that's there now. 
*/ file_in_db[matched_index] = 1; file_in_new_list[i] = 1; } else { status = do_stat(sorted_paths + matched_index); if (status) { if (sorted_paths[matched_index].src.mpf.mtime == db->msgs[i].src.mpf.mtime) { /* Treat stale files as though the path has changed. */ file_in_db[matched_index] = 1; file_in_new_list[i] = 1; } } else { /* This path will get treated as dead, and be re-stated below. * When that stat fails, the path won't get added to the db. */ } } } break; case MTY_MBOX: /* Nothing to do on this pass. */ break; case MTY_DEAD: break; } } /* Add new entries to database */ new_entries_start_at = db->n_msgs; for (i=0; in_msgs; i++) { /* Weed dead entries */ switch (db->type[i]) { case MTY_FILE: if (!file_in_new_list[i]) { free(db->msgs[i].src.mpf.path); db->msgs[i].src.mpf.path = NULL; db->type[i] = MTY_DEAD; ++n_newly_pruned; } break; case MTY_MBOX: { int msg_index, file_index, number_valid; int mbox_valid; msg_index = db->msgs[i].src.mbox.msg_index; file_index = db->msgs[i].src.mbox.file_index; assert (file_index < db->n_mboxen); mbox_valid = (db->mboxen[file_index].path) ? 1 : 0; number_valid = db->mboxen[file_index].n_old_msgs_valid; if (!mbox_valid || (msg_index >= number_valid)) { db->type[i] = MTY_DEAD; ++n_newly_pruned; } } break; case MTY_DEAD: /* already dead */ ++n_already_dead; break; } } if (verbose) { fprintf(stderr, "%d newly dead messages, %d messages now dead in total\n", n_newly_pruned, n_newly_pruned+n_already_dead); } any_new = 0; for (i=0; i 0); } /*}}}*/ static void recode_encoding(struct matches *m, int *new_idx)/*{{{*/ { unsigned char *new_enc, *old_enc; unsigned char *j, *last_char; int incr, idx, n_idx; old_enc = m->msginfo; j = old_enc; last_char = old_enc + m->n; new_enc = new_array(unsigned char, m->max); /* Probably not bigger than this. 
*/ m->n = 0; m->highest = 0; m->msginfo = new_enc; idx = 0; while (j < last_char) { incr = read_increment(&j); idx += incr; n_idx = new_idx[idx]; if (n_idx >= 0) { check_and_enlarge_encoding(m); insert_index_on_encoding(m, n_idx); } } free(old_enc); } /*}}}*/ static void recode_toktable(struct toktable *tbl, int *new_idx)/*{{{*/ { /* Re-encode the vectors according to the new path indices */ int i; int any_dead = 0; int any_moved, pass; for (i=0; isize; i++) { struct token *tok = tbl->tokens[i]; if (tok) { recode_encoding(&tok->match0, new_idx); if (tok->match0.n == 0) { /* Delete this token. Gotcha - there may be tokens further on in the * array that didn't get their natural hash bucket due to collisions. * Need to shuffle such tokens up to guarantee that the buckets between * the natural one and the one where they are now are all occupied, to * prevent their lookups failing. */ #if 0 fprintf(stderr, "Token <%s> (bucket %d) no longer has files containing it, deleting\n", tok->text, i); #endif free_token(tok); tbl->tokens[i] = NULL; --tbl->n; /* Maintain number in use counter */ any_dead = 1; } } } if (any_dead) { /* Now close gaps. This has to be done in a second pass, otherwise we get a * problem with moving entries that need deleting back before the current scan point. 
*/ pass = 1; for (;;) { int i; if (verbose) { fprintf(stderr, "Pass %d\n", pass); } any_moved = 0; for (i=0; isize; i++) { if (tbl->tokens[i]) { int nat_bucket_i; nat_bucket_i = tbl->tokens[i]->hashval & tbl->mask; if (nat_bucket_i != i) { /* Find earliest bucket that we could move i to */ int j = nat_bucket_i; while (j != i) { if (!tbl->tokens[j]) { /* put it here */ #if 0 fprintf(stderr, "Moved <%s> from bucket %d to %d (natural bucket %d)\n", tbl->tokens[i]->text, i, j, nat_bucket_i); #endif tbl->tokens[j] = tbl->tokens[i]; tbl->tokens[i] = NULL; any_moved = 1; break; } else { j++; j &= tbl->mask; } } if (tbl->tokens[i]) { #if 0 fprintf(stderr, "NOT moved <%s> from bucket %d (natural bucket %d)\n", tbl->tokens[i]->text, i, nat_bucket_i); #endif } } } } if (!any_moved) break; pass++; } } } /*}}}*/ static void recode_toktable2(struct toktable2 *tbl, int *new_idx)/*{{{*/ { /* Re-encode the vectors according to the new path indices */ int i; int any_dead = 0; int any_moved, pass; for (i=0; isize; i++) { struct token2 *tok = tbl->tokens[i]; if (tok) { recode_encoding(&tok->match0, new_idx); recode_encoding(&tok->match1, new_idx); if ((tok->match0.n == 0) && (tok->match1.n == 0)) { /* Delete this token. Gotcha - there may be tokens further on in the * array that didn't get their natural hash bucket due to collisions. * Need to shuffle such tokens up to guarantee that the buckets between * the natural one and the one where they are now are all occupied, to * prevent their lookups failing. */ #if 0 fprintf(stderr, "Token <%s> (bucket %d) no longer has files containing it, deleting\n", tok->text, i); #endif free_token2(tok); tbl->tokens[i] = NULL; --tbl->n; /* Maintain number in use counter */ any_dead = 1; } } } if (any_dead) { /* Now close gaps. This has to be done in a second pass, otherwise we get a * problem with moving entries that need deleting back before the current scan point. 
*/ pass = 1; for (;;) { int i; if (verbose) { fprintf(stderr, "Pass %d\n", pass); } any_moved = 0; for (i=0; isize; i++) { if (tbl->tokens[i]) { int nat_bucket_i; nat_bucket_i = tbl->tokens[i]->hashval & tbl->mask; if (nat_bucket_i != i) { /* Find earliest bucket that we could move i to */ int j = nat_bucket_i; while (j != i) { if (!tbl->tokens[j]) { /* put it here */ #if 0 fprintf(stderr, "Moved <%s> from bucket %d to %d (natural bucket %d)\n", tbl->tokens[i]->text, i, j, nat_bucket_i); #endif tbl->tokens[j] = tbl->tokens[i]; tbl->tokens[i] = NULL; any_moved = 1; break; } else { j++; j &= tbl->mask; } } if (tbl->tokens[i]) { #if 0 fprintf(stderr, "NOT moved <%s> from bucket %d (natural bucket %d)\n", tbl->tokens[i]->text, i, nat_bucket_i); #endif } } } } if (!any_moved) break; pass++; } } } /*}}}*/ int cull_dead_messages(struct database *db, int do_integrity_checks)/*{{{*/ { /* Return true if any culled */ int *new_idx, i, j, n_old; int any_culled = 0; /* Check db is OK before we start on this. (Check afterwards is done in the * writer.c code.) */ if (do_integrity_checks) { check_database_integrity(db); } if (verbose) { fprintf(stderr, "Culling dead messages\n"); } n_old = db->n_msgs; new_idx = new_array(int, n_old); for (i=0, j=0; itype[i]) { case MTY_FILE: case MTY_MBOX: new_idx[i] = j++; break; case MTY_DEAD: new_idx[i] = -1; any_culled = 1; break; } } recode_toktable(db->to, new_idx); recode_toktable(db->cc, new_idx); recode_toktable(db->from, new_idx); recode_toktable(db->subject, new_idx); recode_toktable(db->body, new_idx); recode_toktable(db->attachment_name, new_idx); recode_toktable2(db->msg_ids, new_idx); /* And crunch down the filename table */ for (i=0, j=0; itype[i]) { case MTY_DEAD: break; case MTY_FILE: case MTY_MBOX: if (i > j) { db->msgs[j] = db->msgs[i]; db->type[j] = db->type[i]; } j++; break; } } db->n_msgs = j; free(new_idx); /* .. 
and cull dead mboxen */ cull_dead_mboxen(db); return any_culled; } /*}}}*/ mairix-0.22/expandstr.c0000644001161100116110000001032611402542166014711 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. Curnow 2004 * Copyright (C) Andreas Amann 2010 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ #include "mairix.h" #include #include #include #include #include static int isenv(unsigned char x)/*{{{*/ { /* Return true if x is valid as part of an environment variable name. 
*/ if (isalnum(x)) return 1; else if (x == '_') return 1; else return 0; } /*}}}*/ static int home_dir_len(void)/*{{{*/ { struct passwd *foo; char *lookup; lookup = getenv("HOME"); if (lookup) { return strlen(lookup); } foo = getpwuid(getuid()); return strlen(foo->pw_dir); } /*}}}*/ static char *env_lookup(const char *p, const char *q)/*{{{*/ { char *var; char *lookup, *result; char *s; var = new_array(char, (q-p)+1); for (s=var; ppw_dir); strcpy(to, foo->pw_dir); } return to + len; } /*}}}*/ static char *append_env(char *to, const char *p, const char *q)/*{{{*/ { char *foo; int len; foo = env_lookup(p, q); if (foo) { len = strlen(foo); strcpy(to, foo); free(foo); } else { len = 0; } return (to + len); } /*}}}*/ static void do_expand(const char *p, char *result)/*{{{*/ { const char *q; int first; first = 1; while (*p) { if (first && (*p == '~') && (p[1] == '/')) { result = append_home_dir(result); p++; } else if ((*p == '$') && (p[1] == '{')) { p += 2; q = p; while (*q && (*q != '}')) q++; result = append_env(result, p, q); p = *q ? (q + 1) : q; } else if (*p == '$') { p++; q = p; while (*q && isenv(*(unsigned char*)q)) q++; result = append_env(result, p, q); p = q; } else { *result++ = *p++; } first = 0; } *result = 0; } /*}}}*/ char *expand_string(const char *p)/*{{{*/ { /* Return a copy of p, but with ~ expanded to the user's home directory $env expanded to the value of that environment variable */ int len; char *result; len = compute_length(p); result = new_array(char, len+1); do_expand(p, result); return result; } /*}}}*/ mairix-0.22/dates.h0000644001161100116110000000232711402542166014010 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. 
Curnow 2002-2004 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ #ifndef DATES_H #define DATES_H enum DATESCAN_TYPE { DS_FAILURE, DS_D, DS_Y, DS_YYMMDD, DS_SCALED, DS_M, DS_DM, DS_MD, DS_YM, DS_MY, DS_YMD, DS_DMY, }; extern int datescan_next_state(int current_state, int next_token); extern enum DATESCAN_TYPE datescan_exitval[]; #endif /* DATES_H */ mairix-0.22/configure0000755001161100116110000001747011402542166014453 0ustar richardrichard#!/bin/sh ######################################################################### # # mairix - message index builder and finder for maildir folders. # # Copyright (C) Richard P. Curnow 2003,2004,2005 # Copyright (C) Paramjit Oberoi 2005 # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
# # ======================================================================= if [ -f config.log ]; then rm -f config.log ; fi exec 5>config.log MYCC=${CC:-gcc} MYCFLAGS=${CFLAGS:--O2 -Wall} MYCPPFLAGS=${CPPFLAGS:-} MYLDFLAGS=${LDFLAGS:-} # ======================================================================= # Functions #{{{ cleanup cleanup () { if [ -f docheck.c ]; then rm -f docheck.c ; fi if [ -f docheck.o ]; then rm -f docheck.o ; fi if [ -f docheck ]; then rm -f docheck ; fi rm -rf docheck.c docheck.o docheck } #}}} #{{{ test_cc : basic compiler sanity check test_cc () { printf "Testing whether your compiler \"$MYCC $MYCFLAGS\" works : " cat >docheck.c < int main (int argc, char **argv) { return 0; } EOF ${MYCC} ${MYCFLAGS} -o docheck docheck.c 1>&5 2>&5 if [ $? -eq 0 ] then printf "it works\n" else printf "it doesn't work\n" printf "Failed program was\n" 1>&5 cat docheck.c 1>&5 rm -f docheck.c docheck exit 1 fi cleanup } #}}} #{{{ test_for_stdint_h test_for_stdint_h () { cat >docheck.c < int main(int argc, char **argv) { return 0; } EOF ${MYCC} ${MYCFLAGS} -c -o docheck.o docheck.c >/dev/null 2>&1 if [ $? -eq 0 ] then result=0 else result=1 fi rm -f docheck.c docheck.o echo $result } #}}} #{{{ test_for_inttypes_h test_for_inttypes_h () { cat >docheck.c < int main(int argc, char **argv) { return 0; } EOF ${MYCC} ${MYCFLAGS} -c -o docheck.o docheck.c >/dev/null 2>&1 if [ $? -eq 0 ] then result=0 else result=1 fi rm -f docheck.c docheck.o echo $result } #}}} #{{{ test_for_zlib test_for_zlib () { cat > docheck.c < int main () { const char *foo; foo = zlibVersion(); return 0; } EOF echo "Test program is" 1>&5 cat docheck.c 1>&5 ${MYCC} ${MYCPPFLAGS} ${MYCFLAGS} ${MYLDFLAGS} -o docheck docheck.c -lz 1>&5 2>&1 if [ $? 
-eq 0 ] then result=0 else result=1 fi rm -f docheck.c docheck echo $result } #}}} #{{{ test_for_bzlib test_for_bzlib () { cat > docheck.c < int main () { const char *foo; foo = BZ2_bzlibVersion(); return 0; } EOF echo "Test program is" 1>&5 cat docheck.c 1>&5 ${MYCC} ${MYCPPFLAGS} ${MYCFLAGS} ${MYLDFLAGS} -o docheck docheck.c -lbz2 1>&5 2>&1 if [ $? -eq 0 ] then result=0 else result=1 fi rm -f docheck.c docheck echo $result } #}}} #{{{ usage usage () { cat < if you have header files in a nonstandard directory LDFLAGS linker flags, e.g. -L if you have libraries in a nonstandard directory Use these variables to override the choices made by \`configure' or to help it to find libraries and programs with nonstandard names/locations. Report bugs to . EOF } #}}} # ======================================================================= # Defaults for variables PREFIX=/usr/local use_readline=yes bad_options=no use_gzip_mbox=yes use_bzip_mbox=yes # Parse options to configure for option do case "$option" in --prefix=* | --install-prefix=* ) PREFIX=`echo $option | sed -e 's/[^=]*=//;'` ;; --bindir=* ) BINDIR=`echo $option | sed -e 's/[^=]*=//;'` ;; --mandir=* ) MANDIR=`echo $option | sed -e 's/[^=]*=//;'` ;; --infodir=* ) INFODIR=`echo $option | sed -e 's/[^=]*=//;'` ;; --docdir=* ) DOCDIR=`echo $option | sed -e 's/[^=]*=//;'` ;; --enable-gzip-mbox ) use_gzip_mbox=yes ;; --disable-gzip-mbox ) use_gzip_mbox=no ;; --enable-bzip-mbox ) use_bzip_mbox=yes ;; --disable-bzip-mbox ) use_bzip_mbox=no ;; -h | --help ) usage exit 1 ;; * ) printf "Unrecognized option : $option\n" bad_options=yes ;; esac done if [ ${bad_options} = yes ]; then exit 1 fi DEFS="" test_cc printf "Checking for : " if [ `test_for_stdint_h` -eq 0 ]; then printf "Yes\n" DEFS="${DEFS} -DHAS_STDINT_H" else printf "No\n" fi printf "Checking for : " if [ `test_for_inttypes_h` -eq 0 ]; then printf "Yes\n" DEFS="${DEFS} -DHAS_INTTYPES_H" else printf "No\n" fi if [ $use_gzip_mbox = "yes" ]; then printf "Checking for 
zlib : " if [ `test_for_zlib` -eq 0 ]; then printf "Yes\n"; DEFS="${DEFS} -DUSE_GZIP_MBOX" LIBS="-lz" else printf "No (disabled gzipped mbox support)\n"; fi fi if [ $use_bzip_mbox = "yes" ]; then printf "Checking for bzlib : " if [ `test_for_bzlib` -eq 0 ]; then printf "Yes\n"; DEFS="${DEFS} -DUSE_BZIP_MBOX" LIBS="${LIBS} -lbz2" else printf "No (disabled bzip2ed mbox support)\n"; fi fi #{{{ Determine version number of the program. if [ -f version.txt ]; then revision=`cat version.txt` else revision="DEVELOPMENT" fi #}}} if [ "x" = "x${BINDIR}" ]; then BINDIR=${PREFIX}/bin ; fi if [ "x" = "x${MANDIR}" ]; then MANDIR=${PREFIX}/man ; fi if [ "x" = "x${INFODIR}" ]; then INFODIR=${PREFIX}/info ; fi if [ "x" = "x${DOCDIR}" ]; then DOCDIR=${PREFIX}/doc/mairix-${revision} ; fi echo "Generating Makefile" rm -f Makefile sed -e "s%@cc@%${MYCC}%; \ s%@defs@%${DEFS}%; \ s%@cflags@%${MYCFLAGS}%; \ s%@prefix@%${PREFIX}%; \ s%@bindir@%${BINDIR}%; \ s%@mandir@%${MANDIR}%; \ s%@infodir@%${INFODIR}%; \ s%@docdir@%${DOCDIR}%; \ s%@LIBS@%${LIBS}%; \ s%@CPPFLAGS@%${MYCPPFLAGS}%; \ s%@LDFLAGS@%${MYLDFLAGS}%; \ " < Makefile.in > Makefile # Avoid editing Makefile instead of Makefile.in chmod ugo-w Makefile # ======================================================================= # vim:et:sw=2:ht=2:sts=2:fdm=marker:cms=#%s mairix-0.22/rfc822.c0000644001161100116110000012053511402542166013713 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. 
Curnow 2002,2003,2004,2005,2006,2007,2010 * rfc2047 decode: * Copyright (C) Mikael Ylikoski 2002 * gzip mbox support: * Copyright (C) Ico Doornekamp 2005 * Copyright (C) Felipe Gustavo de Almeida 2005 * bzip2 mbox support: * Copyright (C) Paramjit Oberoi 2005 * caching uncompressed mbox data: * Copyright (C) Chris Mason 2006 * memory leak fixes: * Copyright (C) Samuel Tardieu 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ #include "mairix.h" #include "nvp.h" #include #include #include #include #include #include #include #ifdef USE_GZIP_MBOX # include #endif #ifdef USE_BZIP_MBOX # include #endif struct DLL {/*{{{*/ struct DLL *next; struct DLL *prev; }; /*}}}*/ static void enqueue(void *head, void *x)/*{{{*/ { /* Declare this way so it can be used with any kind of double linked list * having next & prev pointers in its first two words. */ struct DLL *h = (struct DLL *) head; struct DLL *xx = (struct DLL *) x; xx->next = h; xx->prev = h->prev; h->prev->next = xx; h->prev = xx; return; } /*}}}*/ enum encoding_type {/*{{{*/ ENC_UNKNOWN, ENC_NONE, ENC_BINARY, ENC_7BIT, ENC_8BIT, ENC_QUOTED_PRINTABLE, ENC_BASE64 }; /*}}}*/ struct content_type_header {/*{{{*/ const char *major; /* e.g. text */ const char *minor; /* e.g. plain */ const char *boundary; /* for multipart */ /* charset? 
*/ }; /*}}}*/ struct line {/*{{{*/ struct line *next; struct line *prev; char *text; }; /*}}}*/ static void init_headers(struct headers *hdrs)/*{{{*/ { hdrs->to = NULL; hdrs->cc = NULL; hdrs->from = NULL; hdrs->subject = NULL; hdrs->message_id = NULL; hdrs->in_reply_to = NULL; hdrs->references = NULL; hdrs->date = 0; hdrs->flags.seen = 0; hdrs->flags.replied = 0; hdrs->flags.flagged = 0; }; /*}}}*/ static void splice_header_lines(struct line *header)/*{{{*/ { /* Deal with newline then tab in header */ struct line *x, *next; for (x=header->next; x!=header; x=next) { #if 0 printf("next header, x->text=%08lx\n", x->text); printf("header=<%s>\n", x->text); #endif next = x->next; if (isspace(x->text[0] & 0xff)) { /* Glue to previous line */ char *p, *newbuf, *oldbuf; struct line *y; for (p=x->text; *p; p++) { if (!isspace(*(unsigned char *)p)) break; } p--; /* point to final space */ y = x->prev; #if 0 printf("y=%08lx p=%08lx\n", y->text, p); #endif newbuf = new_array(char, strlen(y->text) + strlen(p) + 1); strcpy(newbuf, y->text); strcat(newbuf, p); oldbuf = y->text; y->text = newbuf; free(oldbuf); y->next = x->next; x->next->prev = y; free(x->text); free(x); } } return; } /*}}}*/ static int audit_header(struct line *header)/*{{{*/ { /* Check for obvious broken-ness * 1st line has no leading spaces, single word then colon * following lines have leading spaces or single word followed by colon * */ struct line *x; int first=1; int count=1; for (x=header->next; x!=header; x=x->next) { int has_leading_space=0; int is_blank; int has_word_colon=0; if (1 || first) { /* Ignore any UUCP or mbox style From line at the start */ if (!strncmp("From ", x->text, 5)) { continue; } /* Ignore escaped From line at the start */ if (!strncmp(">From ", x->text, 6)) { continue; } } is_blank = !(x->text[0]); if (!is_blank) { char *p; int saw_char = 0; has_leading_space = isspace(x->text[0] & 0xff); has_word_colon = 0; /* default */ p = x->text; while(*p) { if(*p == ':') { has_word_colon = 
saw_char; break; } else if (isspace(*(unsigned char *) p)) { has_word_colon = 0; break; } else { saw_char = 1; } p++; } } if (( first && (is_blank || has_leading_space || !has_word_colon)) || (!first && (is_blank || !(has_leading_space || has_word_colon)))) { #if 0 fprintf(stderr, "Header line %d <%s> fails because:", count, x->text); if (first && is_blank) { fprintf(stderr, " [first && is_blank]"); } if (first && has_leading_space) { fprintf(stderr, " [first && has_leading_space]"); } if (first && !has_word_colon) { fprintf(stderr, " [first && !has_word_colon]"); } if (!first && is_blank) { fprintf(stderr, " [!first && is_blank]"); } if (!first && !(has_leading_space||has_word_colon)) { fprintf(stderr, " [!first && !has_leading_space||has_word_colon]"); } fprintf(stderr, "\n"); #endif /* Header fails the audit */ return 0; } first = 0; count++; } /* If we get here the header must have been OK */ return 1; }/*}}}*/ static int match_string(const char *ref, const char *candidate)/*{{{*/ { int len = strlen(ref); return !strncasecmp(ref, candidate, len); } /*}}}*/ static char equal_table[] = {/*{{{*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00-0f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10-1f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20-2f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, /* 30-3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40-4f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 50-5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60-6f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70-7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0-af */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0-bf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c0-cf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* d0-df */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, /* e0-ef */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* f0-ff */ }; /*}}}*/ static int base64_table[] = {/*{{{*/ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 00-0f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 10-1f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, /* 20-2f */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, 0, -1, -1, /* 30-3f */ -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* 40-4f */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, /* 50-5f */ -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, /* 60-6f */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, /* 70-7f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 80-8f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 90-9f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* a0-af */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* b0-bf */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* c0-cf */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* d0-df */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* e0-ef */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* f0-ff */ }; /*}}}*/ static int hex_to_val(char x) {/*{{{*/ switch (x) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return (x - '0'); break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': return 10 + (x - 'a'); break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': return 10 + (x - 'A'); break; default: return 0; } } /*}}}*/ static void decode_header_value(char *text){/*{{{*/ /* rfc2047 decode, written by Mikael Ylikoski */ char *s, *a, *b, *e, *p, *q; for (p = q = s = text; (s = strstr(s, "=?")); s = e + 2) { if (p == q) p = q = s; else while (q != s) *p++ = *q++; s += 2; a = strchr(s, '?'); 
if (!a) break; a++; b = strchr(a, '?'); if (!b) break; b++; e = strstr(b, "?="); if (!e) break; /* have found an encoded-word */ if (b - a != 2) continue; /* unknown encoding */ if (*a == 'q' || *a == 'Q') { int val; q = b; while (q < e) { if (*q == '_') { *p++ = 0x20; q++; } else if (*q == '=') { q++; val = hex_to_val(*q++) << 4; val += hex_to_val(*q++); *p++ = val; } else *p++ = *q++; } } else if (*a == 'b' || *a == 'B') { int reg, nc, eq; /* register, #characters in reg, #equals */ int dc; /* decoded character */ eq = reg = nc = 0; for (q = b; q < e; q++) { unsigned char cq = *(unsigned char *)q; dc = base64_table[cq]; eq += equal_table[cq]; if (dc >= 0) { reg <<= 6; reg += dc; nc++; if (nc == 4) { *p++ = ((reg >> 16) & 0xff); if (eq < 2) *p++ = ((reg >> 8) & 0xff); if (eq < 1) *p++ = reg & 0xff; nc = reg = 0; if (eq) break; } } } } else { continue; /* unknown encoding */ } q = e + 2; } if (p == q) return; while (*q != '\0') *p++ = *q++; *p = '\0'; } /*}}}*/ static char *copy_header_value(char *text){/*{{{*/ char *p; for (p = text; *p && (*p != ':'); p++) ; if (!*p) return NULL; p++; p = new_string(p); decode_header_value(p); return p; } /*}}}*/ static void copy_or_concat_header_value(char **previous, char *text){/*{{{*/ char *p = copy_header_value(text); if (*previous) { *previous = extend_string(*previous, ", "); *previous = extend_string(*previous, p); free(p); } else *previous = p; } /*}}}*/ static enum encoding_type decode_encoding_type(const char *e)/*{{{*/ { enum encoding_type result; const char *p; if (!e) { result = ENC_NONE; } else { for (p=e; *p && isspace(*(unsigned char *)p); p++) ; if ( match_string("7bit", p) || match_string("7-bit", p) || match_string("7 bit", p)) { result = ENC_7BIT; } else if (match_string("8bit", p) || match_string("8-bit", p) || match_string("8 bit", p)) { result = ENC_8BIT; } else if (match_string("quoted-printable", p)) { result = ENC_QUOTED_PRINTABLE; } else if (match_string("base64", p)) { result = ENC_BASE64; } else if 
(match_string("binary", p)) { result = ENC_BINARY; } else { fprintf(stderr, "Warning: unknown encoding type: '%s'\n", e); result = ENC_UNKNOWN; } } return result; } /*}}}*/ static void parse_content_type(struct nvp *ct_nvp, struct content_type_header *result)/*{{{*/ { result->major = NULL; result->minor = NULL; result->boundary = NULL; result->major = nvp_major(ct_nvp); if (result->major) { result->minor = nvp_minor(ct_nvp); } else { result->minor = NULL; result->major = nvp_first(ct_nvp); } result->boundary = nvp_lookupcase(ct_nvp, "boundary"); } /*}}}*/ static char *looking_at_ws_then_newline(char *start)/*{{{*/ { char *result; result = start; do { if (*result == '\n') return result; else if (!isspace(*(unsigned char *) result)) return NULL; else result++; } while (1); /* Can't get here */ assert(0); } /*}}}*/ static char *unencode_data(struct msg_src *src, char *input, int input_len, const char *enc, int *output_len)/*{{{*/ { enum encoding_type encoding; char *result, *end_result; char *end_input; encoding = decode_encoding_type(enc); end_input = input + input_len; /* All mime encodings result in expanded data, so this is guaranteed to * safely oversize the output array */ result = new_array(char, input_len + 1); /* Now decode */ switch (encoding) { case ENC_7BIT:/*{{{*/ case ENC_8BIT: case ENC_BINARY: case ENC_NONE: { memcpy(result, input, input_len); end_result = result + input_len; } break; /*}}}*/ case ENC_QUOTED_PRINTABLE:/*{{{*/ { char *p, *q; p = result; for (p=result, q=input; qtype) { case MS_FILE: result = src->filename; break; case MS_MBOX: len = strlen(src->filename); len += 32; if (!buffer || (len > buffer_len)) { free(buffer); buffer = new_array(char, len); buffer_len = len; } sprintf(buffer, "%s[%d,%d)", src->filename, (int) src->start, (int) (src->start + src->len)); result = buffer; break; default: result = NULL; break; } return result; } /*}}}*/ static int split_and_splice_header(struct msg_src *src, char *data, struct line *header, char 
**body_start)/*{{{*/ { char *sol, *eol; int blank_line; header->next = header->prev = header; sol = data; do { if (!*sol) break; blank_line = 1; /* until proven otherwise */ eol = sol; while (*eol && (*eol != '\n')) { if (!isspace(*(unsigned char *) eol)) blank_line = 0; eol++; } if (*eol == '\n') { if (!blank_line) { int line_length = eol - sol; char *line_text = new_array(char, 1 + line_length); struct line *new_header; strncpy(line_text, sol, line_length); line_text[line_length] = '\0'; new_header = new(struct line); new_header->text = line_text; enqueue(header, new_header); } sol = eol + 1; /* Start of next line */ } else { /* must be null char */ fprintf(stderr, "Got null character whilst processing header of %s\n", format_msg_src(src)); return -1; /* & leak memory */ } } while (!blank_line); *body_start = sol; if (audit_header(header)) { splice_header_lines(header); return 0; } else { #if 0 /* Caller generates message */ fprintf(stderr, "Message had bad rfc822 headers, ignoring\n"); #endif return -1; } } /*}}}*/ /* Forward prototypes */ static void do_multipart(struct msg_src *src, char *input, int input_len, const char *boundary, struct attachment *atts, enum data_to_rfc822_error *error); /*{{{ do_body() */ static void do_body(struct msg_src *src, char *body_start, int body_len, struct nvp *ct_nvp, struct nvp *cte_nvp, struct nvp *cd_nvp, struct attachment *atts, enum data_to_rfc822_error *error) { char *decoded_body; int decoded_body_len; const char *content_transfer_encoding; content_transfer_encoding = NULL; if (cte_nvp) { content_transfer_encoding = nvp_first(cte_nvp); if (!content_transfer_encoding) { fprintf(stderr, "Giving up on %s, content_transfer_encoding header not parseable\n", format_msg_src(src)); return; } } decoded_body = unencode_data(src, body_start, body_len, content_transfer_encoding, &decoded_body_len); if (ct_nvp) { struct content_type_header ct; parse_content_type(ct_nvp, &ct); if (ct.major && !strcasecmp(ct.major, "multipart")) { 
do_multipart(src, decoded_body, decoded_body_len, ct.boundary, atts, error); /* Don't need decoded body any longer - copies have been taken if * required when handling multipart attachments. */ free(decoded_body); if (error && (*error == DTR8_MISSING_END)) return; } else { /* unipart */ struct attachment *new_att; const char *disposition; new_att = new(struct attachment); disposition = cd_nvp ? nvp_first(cd_nvp) : NULL; if (disposition && !strcasecmp(disposition, "attachment")) { const char *lookup; lookup = nvp_lookupcase(cd_nvp, "filename"); if (lookup) { new_att->filename = new_string(lookup); } else { /* Some messages have name=... in content-type: instead of * filename=... in content-disposition. */ lookup = nvp_lookup(ct_nvp, "name"); if (lookup) { new_att->filename = new_string(lookup); } else { new_att->filename = NULL; } } } else { new_att->filename = NULL; } if (ct.major && !strcasecmp(ct.major, "text")) { if (ct.minor && !strcasecmp(ct.minor, "plain")) { new_att->ct = CT_TEXT_PLAIN; } else if (ct.minor && !strcasecmp(ct.minor, "html")) { new_att->ct = CT_TEXT_HTML; } else { new_att->ct = CT_TEXT_OTHER; } } else if (ct.major && !strcasecmp(ct.major, "message") && ct.minor && !strcasecmp(ct.minor, "rfc822")) { new_att->ct = CT_MESSAGE_RFC822; } else { new_att->ct = CT_OTHER; } if (new_att->ct == CT_MESSAGE_RFC822) { new_att->data.rfc822 = data_to_rfc822(src, decoded_body, decoded_body_len, error); free(decoded_body); /* data no longer needed */ } else { new_att->data.normal.len = decoded_body_len; new_att->data.normal.bytes = decoded_body; } enqueue(atts, new_att); } } else { /* Treat as text/plain {{{*/ struct attachment *new_att; new_att = new(struct attachment); new_att->filename = NULL; new_att->ct = CT_TEXT_PLAIN; new_att->data.normal.len = decoded_body_len; /* Add null termination on the end */ new_att->data.normal.bytes = new_array(char, decoded_body_len + 1); memcpy(new_att->data.normal.bytes, decoded_body, decoded_body_len + 1); 
free(decoded_body); enqueue(atts, new_att);/*}}}*/ } } /*}}}*/ /*{{{ do_attachment() */ static void do_attachment(struct msg_src *src, char *start, char *after_end, struct attachment *atts) { /* decode attachment and add to attachment list */ struct line header, *x, *nx; char *body_start; int body_len; struct nvp *ct_nvp, *cte_nvp, *cd_nvp; if (split_and_splice_header(src, start, &header, &body_start) < 0) { fprintf(stderr, "Giving up on attachment with bad header in %s\n", format_msg_src(src)); return; } /* Extract key headers */ ct_nvp = cte_nvp = cd_nvp = NULL; for (x=header.next; x!=&header; x=x->next) { if (match_string("content-type:", x->text)) { ct_nvp = make_nvp(src, x->text + sizeof("content-type:") - 1); } else if (match_string("content-transfer-encoding:", x->text)) { cte_nvp = make_nvp(src, x->text + sizeof("content-transfer-encoding:") - 1); } else if (match_string("content-disposition:", x->text)) { cd_nvp = make_nvp(src, x->text + sizeof("content-disposition:") - 1); } } #if 0 if (ct_nvp) { fprintf(stderr, "======\n"); fprintf(stderr, "Dump of content-type hdr\n"); nvp_dump(ct_nvp, stderr); free(ct_nvp); } if (cte_nvp) { fprintf(stderr, "======\n"); fprintf(stderr, "Dump of content-transfer-encoding hdr\n"); nvp_dump(cte_nvp, stderr); free(cte_nvp); } #endif if (body_start > after_end) { /* This is a (maliciously?) b0rken attachment, e.g. maybe empty */ if (verbose) { fprintf(stderr, "Message %s contains an invalid attachment, length=%d bytes\n", format_msg_src(src), (int)(after_end - start)); } } else { body_len = after_end - body_start; /* Ignore errors in nested body parts. 
*/ do_body(src, body_start, body_len, ct_nvp, cte_nvp, cd_nvp, atts, NULL); } /* Free header memory */ for (x=header.next; x!=&header; x=nx) { nx = x->next; free(x->text); free(x); } if (ct_nvp) free_nvp(ct_nvp); if (cte_nvp) free_nvp(cte_nvp); if (cd_nvp) free_nvp(cd_nvp); } /*}}}*/ /*{{{ do_multipart() */ static void do_multipart(struct msg_src *src, char *input, int input_len, const char *boundary, struct attachment *atts, enum data_to_rfc822_error *error) { char *normal_boundary, *end_boundary; char *b0, *b1, *be; char *line_after_b0, *start_b1_search_from; int boundary_len; int looking_at_end_boundary; if (!boundary) { fprintf(stderr, "Can't process multipart message %s with no boundary string\n", format_msg_src(src)); if (error) *error = DTR8_MULTIPART_SANS_BOUNDARY; return; } boundary_len = strlen(boundary); normal_boundary = new_array(char, boundary_len + 3); end_boundary = new_array(char, boundary_len + 5); strcpy(normal_boundary, "--"); strcat(normal_boundary, boundary); strcpy(end_boundary, "--"); strcat(end_boundary, boundary); strcat(end_boundary, "--"); b0 = NULL; /* Scan input to look for boundary markers */ be = strstr(input, end_boundary); if (!be) { if (error) { *error = DTR8_MISSING_END; return; } else { /* soldier on as best we can */ be = strchr(input, 0); } } line_after_b0 = input; do { int boundary_ok; start_b1_search_from = line_after_b0; do { /* reject boundaries that aren't a whole line */ b1 = strstr(start_b1_search_from, normal_boundary); if (!b1) { if (*be) { fprintf(stderr, "Oops, didn't find another normal boundary in %s\n", format_msg_src(src)); goto cleanup; } else { b1 = be; /* tolerate missing end boundary */ break; } } looking_at_end_boundary = (b1 == be); boundary_ok = 1; if ((b1 > input) && (*(b1-1) != '\n')) boundary_ok = 0; if (!looking_at_end_boundary && (b1 + boundary_len + 2 < input + input_len) && (*(b1 + boundary_len + 2) != '\n')) boundary_ok = 0; if (!boundary_ok) { char *eol = strchr(b1, '\n'); if (!eol) { 
fprintf(stderr, "Oops, didn't find another normal boundary in %s\n", format_msg_src(src)); goto cleanup; } start_b1_search_from = 1 + eol; } } while (!boundary_ok); /* b1 is now looking at a good boundary, which might be the final one */ if (b0) { /* don't treat preamble as an attachment */ do_attachment(src, line_after_b0, b1, atts); } b0 = b1; line_after_b0 = strchr(b0, '\n'); if (line_after_b0 == 0) line_after_b0 = b0 + strlen(b0); else ++line_after_b0; } while (b1 != be); cleanup: free(normal_boundary); free(end_boundary); } /*}}}*/ static time_t parse_rfc822_date(char *date_string)/*{{{*/ { struct tm tm; char *s, *z; /* Format [weekday ,] day-of-month month year hour:minute:second timezone. Some of the ideas, sanity checks etc taken from parse.c in the mutt sources, credit to Michael R. Elkins et al */ s = date_string; z = strchr(s, ','); if (z) s = z + 1; while (*s && isspace(*s)) s++; /* Should now be looking at day number */ if (!isdigit(*s)) goto tough_cheese; tm.tm_mday = atoi(s); if (tm.tm_mday > 31) goto tough_cheese; while (isdigit(*s)) s++; while (*s && isspace(*s)) s++; if (!*s) goto tough_cheese; if (!strncasecmp(s, "jan", 3)) tm.tm_mon = 0; else if (!strncasecmp(s, "feb", 3)) tm.tm_mon = 1; else if (!strncasecmp(s, "mar", 3)) tm.tm_mon = 2; else if (!strncasecmp(s, "apr", 3)) tm.tm_mon = 3; else if (!strncasecmp(s, "may", 3)) tm.tm_mon = 4; else if (!strncasecmp(s, "jun", 3)) tm.tm_mon = 5; else if (!strncasecmp(s, "jul", 3)) tm.tm_mon = 6; else if (!strncasecmp(s, "aug", 3)) tm.tm_mon = 7; else if (!strncasecmp(s, "sep", 3)) tm.tm_mon = 8; else if (!strncasecmp(s, "oct", 3)) tm.tm_mon = 9; else if (!strncasecmp(s, "nov", 3)) tm.tm_mon = 10; else if (!strncasecmp(s, "dec", 3)) tm.tm_mon = 11; else goto tough_cheese; while (!isspace(*s)) s++; while (*s && isspace(*s)) s++; if (!isdigit(*s)) goto tough_cheese; tm.tm_year = atoi(s); if (tm.tm_year < 70) { tm.tm_year += 100; } else if (tm.tm_year >= 1900) { tm.tm_year -= 1900; } while (isdigit(*s)) 
s++; while (*s && isspace(*s)) s++; if (!*s) goto tough_cheese; /* Now looking at hms */ /* For now, forget this. The searching will be vague enough that nearest day is good enough. */ tm.tm_hour = 0; tm.tm_min = 0; tm.tm_sec = 0; tm.tm_isdst = 0; return mktime(&tm); tough_cheese: return (time_t) -1; /* default value */ } /*}}}*/ static void scan_status_flags(const char *s, struct headers *hdrs)/*{{{*/ { const char *p; for (p=s; *p; p++) { switch (*p) { case 'R': hdrs->flags.seen = 1; break; case 'A': hdrs->flags.replied = 1; break; case 'F': hdrs->flags.flagged = 1; break; default: break; } } } /*}}}*/ /*{{{ data_to_rfc822() */ struct rfc822 *data_to_rfc822(struct msg_src *src, char *data, int length, enum data_to_rfc822_error *error) { struct rfc822 *result; char *body_start; struct line header; struct line *x, *nx; struct nvp *ct_nvp, *cte_nvp, *cd_nvp; int body_len; if (error) *error = DTR8_OK; /* default */ result = new(struct rfc822); init_headers(&result->hdrs); result->atts.next = result->atts.prev = &result->atts; if (split_and_splice_header(src, data, &header, &body_start) < 0) { if (verbose) { fprintf(stderr, "Giving up on message %s with bad header\n", format_msg_src(src)); } if (error) *error = DTR8_BAD_HEADERS; return NULL; } /* Extract key headers {{{*/ ct_nvp = cte_nvp = cd_nvp = NULL; for (x=header.next; x!=&header; x=x->next) { if (match_string("to", x->text)) copy_or_concat_header_value(&result->hdrs.to, x->text); else if (match_string("cc", x->text)) copy_or_concat_header_value(&result->hdrs.cc, x->text); else if (!result->hdrs.from && match_string("from", x->text)) result->hdrs.from = copy_header_value(x->text); else if (!result->hdrs.subject && match_string("subject", x->text)) result->hdrs.subject = copy_header_value(x->text); else if (!ct_nvp && match_string("content-type", x->text)) ct_nvp = make_nvp(src, x->text + sizeof("content-type:") - 1); else if (!cte_nvp && match_string("content-transfer-encoding", x->text)) cte_nvp = make_nvp(src, 
x->text + sizeof("content-transfer-encoding:") - 1); else if (!cd_nvp && match_string("content-disposition", x->text)) cd_nvp = make_nvp(src, x->text + sizeof("content-disposition:") - 1); else if (!result->hdrs.date && match_string("date", x->text)) { char *date_string = copy_header_value(x->text); result->hdrs.date = parse_rfc822_date(date_string); free(date_string); } else if (!result->hdrs.message_id && match_string("message-id", x->text)) result->hdrs.message_id = copy_header_value(x->text); else if (!result->hdrs.in_reply_to && match_string("in-reply-to", x->text)) result->hdrs.in_reply_to = copy_header_value(x->text); else if (!result->hdrs.references && match_string("references", x->text)) result->hdrs.references = copy_header_value(x->text); else if (match_string("status", x->text)) scan_status_flags(x->text + sizeof("status:"), &result->hdrs); else if (match_string("x-status", x->text)) scan_status_flags(x->text + sizeof("x-status:"), &result->hdrs); } /*}}}*/ /* Process body */ body_len = length - (body_start - data); do_body(src, body_start, body_len, ct_nvp, cte_nvp, cd_nvp, &result->atts, error); /* Free header memory */ for (x=header.next; x!=&header; x=nx) { nx = x->next; free(x->text); free(x); } if (ct_nvp) free_nvp(ct_nvp); if (cte_nvp) free_nvp(cte_nvp); if (cd_nvp) free_nvp(cd_nvp); return result; } /*}}}*/ #define ALLOC_NONE 1 #define ALLOC_MMAP 2 #define ALLOC_MALLOC 3 int data_alloc_type; #if USE_GZIP_MBOX || USE_BZIP_MBOX #define SIZE_STEP (8 * 1024 * 1024) #define COMPRESSION_NONE 0 #define COMPRESSION_GZIP 1 #define COMPRESSION_BZIP 2 static int get_compression_type(const char *filename) {/*{{{*/ size_t len = strlen(filename); int ptr; #ifdef USE_GZIP_MBOX ptr = len - 3; if (len > 3 && strncasecmp(filename + ptr, ".gz", 3) == 0) { return COMPRESSION_GZIP; } #endif #ifdef USE_BZIP_MBOX ptr = len - 4; if (len > 3 && strncasecmp(filename + ptr, ".bz2", 4) == 0) { return COMPRESSION_BZIP; } #endif return COMPRESSION_NONE; } /*}}}*/ static int 
is_compressed(const char *filename) {/*{{{*/ return (get_compression_type(filename) != COMPRESSION_NONE); } /*}}}*/ struct zFile {/*{{{*/ union { /* Both gzFile and BZFILE* are defined as void pointers * in their respective header files. */ #ifdef USE_GZIP_MBOX gzFile gzf; #endif #ifdef USE_BZIP_MBOX BZFILE *bzf; #endif void *zptr; } foo; int type; }; /*}}}*/ static struct zFile * xx_zopen(const char *filename, const char *mode) {/*{{{*/ struct zFile *zf = new(struct zFile); zf->type = get_compression_type(filename); switch (zf->type) { #ifdef USE_GZIP_MBOX case COMPRESSION_GZIP: zf->foo.gzf = gzopen(filename, "rb"); break; #endif #ifdef USE_BZIP_MBOX case COMPRESSION_BZIP: zf->foo.bzf = BZ2_bzopen(filename, "rb"); break; #endif default: zf->foo.zptr = NULL; break; } if (!zf->foo.zptr) { free(zf); return 0; } return zf; } /*}}}*/ static void xx_zclose(struct zFile *zf) {/*{{{*/ switch (zf->type) { #ifdef USE_GZIP_MBOX case COMPRESSION_GZIP: gzclose(zf->foo.gzf); break; #endif #ifdef USE_BZIP_MBOX case COMPRESSION_BZIP: BZ2_bzclose(zf->foo.bzf); break; #endif default: zf->foo.zptr = NULL; break; } free(zf); } /*}}}*/ static int xx_zread(struct zFile *zf, void *buf, int len) {/*{{{*/ switch (zf->type) { #ifdef USE_GZIP_MBOX case COMPRESSION_GZIP: return gzread(zf->foo.gzf, buf, len); break; #endif #ifdef USE_BZIP_MBOX case COMPRESSION_BZIP: return BZ2_bzread(zf->foo.bzf, buf, len); break; #endif default: return 0; break; } } /*}}}*/ #endif #if USE_GZIP_MBOX || USE_BZIP_MBOX /* do we need ROCACHE_SIZE > 1? the code supports any number here */ #define ROCACHE_SIZE 1 struct ro_mapping { char *filename; unsigned char *map; size_t len; }; static int ro_cache_init = 0; static struct ro_mapping ro_mapping_cache[ROCACHE_SIZE]; /* find a temp file in the mapping cache. If nothing is found lasti is * set to the next slot to use for insertion. 
You have to check that slot * to see if it is currently in use */ static struct ro_mapping *find_ro_cache(const char *filename, int *lasti) { int i = 0; struct ro_mapping *ro = NULL; if (lasti) *lasti = 0; if (!ro_cache_init) return NULL; for (i = 0 ; i < ROCACHE_SIZE ; i++) { ro = ro_mapping_cache + i; if (!ro->map) { if (lasti) *lasti = i; return NULL; } if (strcmp(filename, ro->filename) == 0) return ro; } /* if we're here, the map is full. They will reuse slot 0 */ return NULL; } /* * put a new tempfile into the cache. It is mmaped as part of this function * so you can safely close the file handle after calling this. */ static struct ro_mapping *add_ro_cache(const char *filename, int fd, size_t len) { int i = 0; struct ro_mapping *ro = NULL; if (!ro_cache_init) { memset(&ro_mapping_cache, 0, sizeof(ro_mapping_cache)); ro_cache_init = 1; } ro = find_ro_cache(filename, &i); if (ro) { fprintf(stderr, "%s already in ro cache\n", filename); return NULL; } ro = ro_mapping_cache + i; if (ro->map) { munmap(ro->map, ro->len); ro->map = NULL; free(ro->filename); } ro->map = (unsigned char *)mmap(0, len, PROT_READ, MAP_SHARED, fd, 0); if (ro->map == MAP_FAILED) { ro->map = NULL; perror("rfc822:mmap"); return NULL; } ro->len = len; ro->filename = new_string(filename); return ro; } #endif /* USE_GZIP_MBOX || USE_BZIP_MBOX */ void create_ro_mapping(const char *filename, unsigned char **data, int *len)/*{{{*/ { struct stat sb; int fd; #if USE_GZIP_MBOX || USE_BZIP_MBOX struct zFile *zf; #endif if (stat(filename, &sb) < 0) { report_error("stat", filename); *data = NULL; return; } #if USE_GZIP_MBOX || USE_BZIP_MBOX if(is_compressed(filename)) { unsigned char *p; size_t cur_read; struct ro_mapping *ro; FILE *tmpf; /* this branch never returns things that are freeable */ data_alloc_type = ALLOC_NONE; ro = find_ro_cache(filename, NULL); if (ro) { *data = ro->map; *len = ro->len; return; } if(verbose) { fprintf(stderr, "Decompressing %s...\n", filename); } tmpf = tmpfile(); if 
(!tmpf) { perror("tmpfile"); goto comp_error; } zf = xx_zopen(filename, "rb"); if (!zf) { fprintf(stderr, "Could not open %s\n", filename); goto comp_error; } p = new_array(unsigned char, SIZE_STEP); cur_read = xx_zread(zf, p, SIZE_STEP); if (fwrite(p, cur_read, 1, tmpf) != 1) { fprintf(stderr, "failed writing to temp file for %s\n", filename); goto comp_error; } *len = cur_read; if (cur_read >= SIZE_STEP) { while(1) { int ret; cur_read = xx_zread(zf, p, SIZE_STEP); if (cur_read <= 0) break; *len += cur_read; ret = fwrite(p, cur_read, 1, tmpf); if (ret != 1) { fprintf(stderr, "failed writing to temp file for %s\n", filename); goto comp_error; } } } free(p); xx_zclose(zf); if(*len > 0) { ro = add_ro_cache(filename, fileno(tmpf), *len); if (!ro) goto comp_error; *data = ro->map; *len = ro->len; } else { *data = NULL; } fclose(tmpf); return; comp_error: *data = NULL; *len = 0; if (tmpf) fclose(tmpf); return; } #endif /* USE_GZIP_MBOX || USE_BZIP_MBOX */ *len = sb.st_size; if (*len == 0) { *data = NULL; return; } if (!S_ISREG(sb.st_mode)) { *data = NULL; return; } fd = open(filename, O_RDONLY); if (fd < 0) { report_error("open", filename); *data = NULL; return; } *data = (unsigned char *) mmap(0, *len, PROT_READ, MAP_SHARED, fd, 0); if (close(fd) < 0) report_error("close", filename); if (*data == MAP_FAILED) { report_error("rfc822:mmap", filename); *data = NULL; return; } data_alloc_type = ALLOC_MMAP; } /*}}}*/ void free_ro_mapping(unsigned char *data, int len)/*{{{*/ { int r; if(data_alloc_type == ALLOC_MALLOC) { free(data); } if(data_alloc_type == ALLOC_MMAP) { r = munmap(data, len); if(r < 0) { fprintf(stderr, "munmap() errord\n"); exit(1); } } } /*}}}*/ static struct msg_src *setup_msg_src(char *filename)/*{{{*/ { static struct msg_src result; result.type = MS_FILE; result.filename = filename; return &result; } /*}}}*/ struct rfc822 *make_rfc822(char *filename)/*{{{*/ { int len; unsigned char *data; struct rfc822 *result; create_ro_mapping(filename, &data, &len); 
/* Don't process empty files */ result = NULL; if (data) { struct msg_src *src; /* Now process the data */ src = setup_msg_src(filename); /* For one message per file, ignore missing end boundary condition. */ result = data_to_rfc822(src, (char *) data, len, NULL); free_ro_mapping(data, len); } return result; } /*}}}*/ void free_rfc822(struct rfc822 *msg)/*{{{*/ { struct attachment *a, *na; if (!msg) return; if (msg->hdrs.to) free(msg->hdrs.to); if (msg->hdrs.cc) free(msg->hdrs.cc); if (msg->hdrs.from) free(msg->hdrs.from); if (msg->hdrs.subject) free(msg->hdrs.subject); if (msg->hdrs.message_id) free(msg->hdrs.message_id); if (msg->hdrs.in_reply_to) free(msg->hdrs.in_reply_to); if (msg->hdrs.references) free(msg->hdrs.references); for (a = msg->atts.next; a != &msg->atts; a = na) { na = a->next; if (a->filename) free(a->filename); if (a->ct == CT_MESSAGE_RFC822) { free_rfc822(a->data.rfc822); } else { free(a->data.normal.bytes); } free(a); } free(msg); } /*}}}*/ #ifdef TEST static void do_indent(int indent)/*{{{*/ { int i; for (i=indent; i>0; i--) { putchar(' '); } } /*}}}*/ static void show_header(char *tag, char *x, int indent)/*{{{*/ { if (x) { do_indent(indent); printf("%s: %s\n", tag, x); } } /*}}}*/ static void show_rfc822(struct rfc822 *msg, int indent)/*{{{*/ { struct attachment *a; show_header("From", msg->hdrs.from, indent); show_header("To", msg->hdrs.to, indent); show_header("Cc", msg->hdrs.cc, indent); show_header("Date", msg->hdrs.date, indent); show_header("Subject", msg->hdrs.subject, indent); for (a = msg->atts.next; a != &msg->atts; a=a->next) { printf("========================\n"); switch (a->ct) { case CT_TEXT_PLAIN: printf("Attachment type text/plain\n"); break; case CT_TEXT_HTML: printf("Attachment type text/html\n"); break; case CT_TEXT_OTHER: printf("Attachment type text/non-plain\n"); break; case CT_MESSAGE_RFC822: printf("Attachment type message/rfc822\n"); break; case CT_OTHER: printf("Attachment type other\n"); break; } if (a->ct != 
CT_MESSAGE_RFC822) { printf("%d bytes\n", a->data.normal.len); } if ((a->ct == CT_TEXT_PLAIN) || (a->ct == CT_TEXT_HTML) || (a->ct == CT_TEXT_OTHER)) { printf("----------\n"); printf("%s\n", a->data.normal.bytes); } if (a->ct == CT_MESSAGE_RFC822) { show_rfc822(a->data.rfc822, indent + 4); } } } /*}}}*/ int main (int argc, char **argv)/*{{{*/ { struct rfc822 *msg; if (argc < 2) { fprintf(stderr, "Need a path\n"); unlock_and_exit(2); } msg = make_rfc822(argv[1]); show_rfc822(msg, 0); free_rfc822(msg); /* Print out some stuff */ return 0; } /*}}}*/ #endif /* TEST */ mairix-0.22/mbox.c0000644001161100116110000007321111402542166013650 0ustar richardrichard/* mairix - message index builder and finder for maildir folders. ********************************************************************** * Copyright (C) Richard P. Curnow 2003,2004,2005,2006,2007 * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * ********************************************************************** */ #include #include #include #include #include #include #include #include #include #include "mairix.h" #include "from.h" #include "fromcheck.h" #include "md5.h" struct extant_mbox {/*{{{*/ char *full_path; time_t mtime; size_t size; int db_index; /* + stuff to store positions etc of individual messages. 
*/ }; /*}}}*/ static int compare_extant_mboxen(const void *a, const void *b)/*{{{*/ { const struct extant_mbox *aa = (const struct extant_mbox *) a; const struct extant_mbox *bb = (const struct extant_mbox *) b; return strcmp(aa->full_path, bb->full_path); } /*}}}*/ static int lookup_extant_mbox(struct extant_mbox *sorted_mboxen, int n_extant, char *key)/*{{{*/ { /* Implement bisection search */ int l, h, m, r; l = 0, h = n_extant; m = -1; while (h > l) { m = (h + l) >> 1; /* Should only get called on 'file' type messages - TBC */ r = strcmp(sorted_mboxen[m].full_path, key); if (r == 0) break; if (l == m) return -1; if (r > 0) h = m; else l = m; } return m; } /*}}}*/ static void append_new_mboxen_to_db(struct database *db, struct extant_mbox *extant_mboxen, int n_extant)/*{{{*/ { int N, n_reqd; int i, j; for (i=N=0; in_mboxen + N; if (n_reqd > db->max_mboxen) { db->max_mboxen = n_reqd; db->mboxen = grow_array(struct mbox, n_reqd, db->mboxen); } /* Init new entries. */ for (j=0, i=db->n_mboxen; jmboxen[i].path = new_string(extant_mboxen[j].full_path); db->mboxen[i].current_mtime = extant_mboxen[j].mtime; db->mboxen[i].current_size = extant_mboxen[j].size; db->mboxen[i].file_mtime = 0; db->mboxen[i].file_size = 0; db->mboxen[i].n_msgs = 0; db->mboxen[i].n_old_msgs_valid = 0; db->mboxen[i].max_msgs = 0; db->mboxen[i].start = NULL; db->mboxen[i].len = NULL; db->mboxen[i].check_all = NULL; i++; } } db->n_mboxen = n_reqd; } /*}}}*/ void compute_checksum(const char *data, size_t len, checksum_t *csum)/*{{{*/ { MD5_CTX md5; MD5Init(&md5); MD5Update(&md5, (unsigned char *) data, len); MD5Final(&md5); memcpy(csum, md5.digest, sizeof(md5.digest)); return; } /*}}}*/ static int message_is_intact(struct mbox *mb, int idx, char *va, size_t len)/*{{{*/ { /* TODO : later, look at whether to optimise this in some way, e.g. by doing an initial check on just the first 1k of a message, this will detect failures much faster at the cost of extra storage. 
*/ if (mb->start[idx] + mb->len[idx] > len) { /* Message overruns the end of the file - can't possibly be intact. */ return 0; } else { checksum_t csum; compute_checksum(va + mb->start[idx], mb->len[idx], &csum); if (!memcmp(mb->check_all[idx], &csum, sizeof(checksum_t))) { return 1; } else { return 0; } } return 0; } /*}}}*/ static int find_number_intact(struct mbox *mb, char *va, size_t len)/*{{{*/ { /* Pick up the common obvious case first - where new messages have been appended to the end of the mbox */ if (mb->n_msgs == 0) { return 0; } else if (message_is_intact(mb, mb->n_msgs - 1, va, len)) { return mb->n_msgs; /* The lot */ } else if (!message_is_intact(mb, 0, va, len)) { return 0; /* None of them. */ } else { /* Looks like a deletion has occurred earlier in the file => binary chop search to find the last message that's still valid. Assume that everything below a valid message is still valid itself (possibly dangerous assumption, time will tell.) */ int l, m, h; l = 0; h = mb->n_msgs; /* Loop invariant : always, mesasage[l] is intact, message[h] isn't. */ while (l < h) { m = (h + l) >> 1; if (m==l) break; if (message_is_intact(mb, m, va, len)) { l = m; } else { h = m; } } /* By loop invariant, message[l] is the highest valid one. */ return (l + 1); } } /*}}}*/ static int fromtab_inited = 0; static signed char fromtab[256]; static void init_fromtab(void)/*{{{*/ { memset(fromtab, 0xff, 256); fromtab[(int)(unsigned char)'\n'] = ~(1<<0); fromtab[(int)(unsigned char)'F'] = ~(1<<1); fromtab[(int)(unsigned char)'r'] = ~(1<<2); fromtab[(int)(unsigned char)'o'] = ~(1<<3); fromtab[(int)(unsigned char)'m'] = ~(1<<4); fromtab[(int)(unsigned char)' '] = ~(1<<5); } /*}}}*/ /* REAL CHECKING : need to see if the line looks like this: * From [ ]