swish++-6.1.5/0000755000076500000000000000000010746421524011456 5ustar pjlwheelswish++-6.1.5/AssociateMeta.h0000644000076500000000000000310310166044112014334 0ustar pjlwheel/* ** SWISH++ ** AssociateMeta.h ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef AssociateMeta_H #define AssociateMeta_H // local #include "conf_bool.h" //***************************************************************************** // // SYNOPSIS // class AssociateMeta : public conf // // DESCRIPTION // // An AssociateMeta is-a conf containing the Boolean value // indicating whether to associate words to meta names during indexing. // // This is the same as index's -A command-line option. // //***************************************************************************** { public: AssociateMeta() : conf( "AssociateMeta", true ) { } CONF_BOOL_ASSIGN_OPS( AssociateMeta ) }; extern AssociateMeta associate_meta; #endif /* AssociateMeta_H */ /* vim:set et sw=4 ts=4: */ swish++-6.1.5/auto_vec.h0000644000076500000000000000450010427760220013426 0ustar pjlwheel/* ** PJL C++ Library ** auto_vec.h ** ** Copyright (C) 1998 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef auto_vec_H #define auto_vec_H namespace PJL { //***************************************************************************** // // SYNOPSIS // template< typename T > class auto_vec // // DESCRIPTION // // A auto_vec will delete the vector of objects it points to upon // destruction. This is very similar to STL's auto_ptr, but for // vectors. // // SEE ALSO // // Bjarne Stroustrup. "The C++ Programming Language, 3rd ed." // Addison-Wesley, Reading, MA, 1997. pp. 367-368. 
// //***************************************************************************** { public: explicit auto_vec( T *p = 0 ) : m_p( p ) { } ~auto_vec() { delete[] m_p; } auto_vec& operator=( T *p ) { if ( p != m_p ) { delete[] m_p; m_p = p; } return *this; } T& operator*() { return *m_p; } T const& operator*() const { return *m_p; } T& operator[]( int i ) { return m_p[ i ]; } T const& operator[]( int i ) const { return m_p[ i ]; } operator T*() { return m_p; } operator T const*() const { return m_p; } private: T *m_p; auto_vec( auto_vec const& ); // forbid copy auto_vec& operator=( auto_vec< T >& ); // forbid assignment }; } // namespace PJL #endif /* auto_vec_H */ /* vim:set et sw=4 ts=4: */ swish++-6.1.5/BUGS0000644000076500000000000000237107602510402012133 0ustar pjlwheel=============================================================================== What to do if you find a bug =============================================================================== In the hopefully rare circumstance that you find a bug, please report it to me by e-mail: . Please include the following information: 1. A description of the problem. 2. Operating system and version. 3. The exact command line used. 4. If the command dumps core, a stack trace. To get a stack trace: 1. Go to the "config" directory and edit the "config.mk" file uncommenting the line: DEBUG:= true 2. Go back to the top-level directory and type: make distclean make 3. Re-execute the same command to make it dump core again. A "core" file should be generated (if the filesystem is writable and you have write permission to it). For Mac OS X, core files are placed in the directory /cores and are of the form "core.XXXXX" where XXXXX is the process ID. If you can write to the filesystem yet no core file was generated, type: ulimit -c unlimited and try again. 4. Execute gdb by typing: gdb /path/to/swish++-executable /path/to/core 5. Then in gdb, type "where" and copy and paste the output into an e-mail message to me. 
swish++-6.1.5/Changes0000644000076500000000000075216410746420617012772 0ustar pjlwheelSWISH++ Changes =============== ******************************************************************************* 6.1.5 ******************************************************************************* BUG FIXES --------- * mod/html/elements.c 1. s/int const/long const/ for 64-bit systems. * version.h 1. Upped version to 6.1.5. ******************************************************************************* 6.1.4 ******************************************************************************* BUG FIXES --------- * The definition of MAKEDEPEND was wrong. This broke dependency-file generation when not using g++. (This bug fix shall be known as bug fix MD1.) * Indexing of some ID3 tags was broken. (This bug fix shall be known as bug fix ID31.) CHANGES, file-by-file --------------------- * config/config.mk 1. For MAKEDEPEND, s/:=/=/ for bug fix MD1. * mod/id3/id3v2.c 1. In parse_int(), added cast to unsigned char for bug fix ID31. * version.h 1. Upped version to 6.1.4. ******************************************************************************* 6.1.3 ******************************************************************************* BUG FIXES --------- * The search(1) -d option didn't work. (This bug fix shall be known as bug fix SEARCHd.) * Fixed a mistake in the httpindex.1 manual page that showed incorrect use of multiple -e options. (This bug fix shall be known as bug fix HTTP1.) CHANGES, file-by-file --------------------- * man/man1/httpindex.1 1. Fixed the aforementioned mistake. * classic_formatter.c * classic_formatter.h * encoded_char.c * encoded_char.h * indexer.c * indexer.h * results_formatter.c * results_formatter.h * xml_formatter.c * xml_formatter.h 1. Added virtual destructors. * search.c 1. In main(), added check of opt.dump_word_index_opt for bug fix SEARCHd. * version.h 1. Upped version to 6.1.3. 
******************************************************************************* 6.1.2 ******************************************************************************* BUG FIXES --------- * A LaTeX file ending in a '%' (with no newline) would result in a core dump. (This bug fix shall be known as bug fix LATEX1.) CHANGES, file-by-file --------------------- * mod/latex/mod_latex.c 1. Reworked handling of '%' in index_words() for bug fix LATEX1. * version.h 1. Upped version to 6.1.2. ******************************************************************************* 6.1.1 ******************************************************************************* BUG FIXES --------- * Fixed compilation on non-MacOSX systems. (This bug fix shall be known as bug fix MAC1.) CHANGES, file-by-file --------------------- * search.c 1. Moved ';' in usage message for bug fix MAC1. * version.h 1. Upped version to 6.1.1. ******************************************************************************* 6.1 ******************************************************************************* NEW FEATURES ------------ * Made search(1) cooperate with Mac OS X's launchd(8). (This feature shall be known as feature LAUNCHD.) CHANGES, file-by-file --------------------- * conf_var.h 1. Added "launchdcooperation" for feature LAUNCHD. * FollowLinks.h 1. Added missing: #ifndef PJL_NO_SYMBOLIC_LINKS * INSTALL.unix 1. Added separate daemon section. * man/man1/search.1 1. Added mention of new -l/--launchd options for feature LAUNCHD. * man/man4/swish++.conf.4 1. Added mention of new LaunchdCooperation configuration variable for feature LAUNCHD. * Group.h * SearchBackground.h 1. Added missing: #ifdef SEARCH_DAEMON * LaunchdCooperation.h 1. New file for feature LAUNCHD. * search.c 1. Added launchd_cooperation global variable. 2. Made specification of launchd cooperation force not going into the background and not set resource limits. 3. Added -l case. 4. Added -l/--launchd to usage message. * search.h 1. 
Added launchd_opt. * search_daemon.c 1. Added checking of launchd_cooperation variable. * search_options.c 1. Added "launchd". * swish++.conf 1. Added new LaunchdCooperation configuration variable for feature LAUNCHD. * User.h 1. Added missing: #ifdef SEARCH_DAEMON 2. Added comment about -U. * version.h 1. Upped version to 6.1. ******************************************************************************* 6.0.6 ******************************************************************************* BUG FIXES --------- * The "fix" in 6.0.4 for queries containing meta-names broke another form of using meta-names, e.g.: some_meta=(word1 word2) that *should* be equivalent to: some_meta=word1 some_meta=word2 but wasn't. (This bug fix shall be known as bug fix MN2.) * The installation and removal of SYSV start/stop script symlinks was totally wrong. (This bug fix shall be known as bug fix SYSV.) * Some words with hyphens in manual pages weren't indexed correctly. (This bug fix shall be known as bug fix MHY.) CHANGES, file-by-file --------------------- * config/src/explicit.c * config/src/namespaces.c * fake_ansi.h 1. Removed since they're no longer needed. * GNUmakefile 1. Reworked creation of SYSV start/stop symlinks for bug fix SYSV. * mod/mail/mod_mail.c * mod/mail/multipart.c * mod/mail/vcard.c 1. Split handling of multipart and vCard mail into its own files. * mod/man/mod_man.c 1. Changed to using "normal" way of using iso8859_1_to_ascii() for bug fix MHY. * query.c 1. Completely reworked the way query parameters are passed for bug fix MN2. * version.h 1. Upped version to 6.0.6. ******************************************************************************* 6.0.5 ******************************************************************************* BUG FIXES --------- * Decoding of mail attachments didn't work right if the attachments contained bytes whose value > 127. (This bug fix shall be known as bug fix ATB.) 
CHANGES, file-by-file --------------------- * conf_var.c * do_file.c * mmap_file.[ch] 1. s/normal/bt_normal/ 2. s/random/bt_random/ 3. s/sequential/bt_sequential/ * charsets/utf7.c * encodings/base64.c 1. Removed call to iso8859_1_to_ascii() for bug fix ATB. * indexer.c 1. s/No_Meta_ID/Meta_ID_None/ 2. Removed new_file(). 3. Added call to iso8859_1_to_ascii() for bug fix ATB. * indexer.h 1. s/No_Meta_ID/Meta_ID_None/ 2. Removed new_file(). * index_segment.h * search.c 1. s/word_index/isi_word/ 2. s/stop_word_index/isi_stop_word/ 3. s/dir_index/isi_dir/ 4. s/file_index/isi_file/ 5. s/meta_name_index/isi_meta_name/ * mod/html/elements.h 1. s/forbidden/et_forbidden/ 2. s/optional/et_optional/ 3. s/required/et_required/ * mod/html/mod_html.c 1. s/No_Meta_ID/Meta_ID_None/ 2. Moved element_stack_ here. 3. Reworked new_file(). * mod/html/mod_html.h 1. s/No_Meta_ID/Meta_ID_None/ 2. Moved element_stack_ to .c file. * mod/id3/id3v2.h 1. s/Failure/hr_failure/ 2. s/Success/hr_success/ 3. s/End_of_Frames/hr_end_of_frames/ * mod/id3/id3v2.c * mod/id3/mod_id3.c * mod/id3/mod_id3.h 1. s/No_Meta_ID/Meta_ID_None/ * mod/mail/mod_mail.c 1. s/No_Meta_ID/Meta_ID_None/ 2. Moved stack_type, boundary_stack, and did_last_header here. 3. Renamed content_type enum values. 4. Reworked new_file(). * mod/mail/mod_mail.h 1. s/No_Meta_ID/Meta_ID_None/ 2. Moved stack_type, boundary_stack, and did_last_header to .c file. 3. Renamed content_type enum values. * mod/latex/mod_latex.c * mod/man/mod_man.c * mod/rtf/mod_rtf.c 1. Added call to iso8859_1_to_ascii() for bug fix ATB. 2. s/No_Meta_ID/Meta_ID_None/ * query.c 1. s/no_token/tt_none/ 2. s/and_token/tt_and/ 3. s/equal_token/tt_equal/ 4. s/lparen_token/tt_lparen/ 5. s/near_token/tt_near/ 6. s/not_near_token/tt_not_near/ 7. s/not_token/tt_not/ 8. s/or_token/tt_or/ 9. s/rparen_token/tt_rparen/ A. s/word_star_token/tt_word_star/ B. s/word_token/tt_word/ C. s/No_Meta_ID/Meta_ID_None/ * query_node.c * mod/man/mod_man.h * mod/rtf/mod_rtf.h 1. 
s/No_Meta_ID/Meta_ID_None/ * stem_word.c 1. s/initial/st_initial/ 2. s/vowel/st_vowel/ 3. s/consonant/st_consonant/ * stop_words.c 1. s/stop_word_index/isi_stop_word/ * token.[ch] 1. s/no_token/tt_none/ 2. s/and_token/tt_and/ 3. s/equal_token/tt_equal/ 4. s/lparen_token/tt_lparen/ 5. s/near_token/tt_near/ 6. s/not_near_token/tt_not_near/ 7. s/not_token/tt_not/ 8. s/or_token/tt_or/ 9. s/rparen_token/tt_rparen/ A. s/word_star_token/tt_word_star/ B. s/word_token/tt_word/ * word_info.h 1. s/No_Meta_ID/Meta_ID_None/ * version.h 1. Upped version. ******************************************************************************* 6.0.4 ******************************************************************************* BUG FIXES --------- * Queries containing meta-names didn't work right if the meta-name was given in any position other than last. (This bug fix shall be known as bug fix MN1.) CHANGES, file-by-file --------------------- * query.c 1. In parse_meta(), added: "args.meta_id = No_Meta_ID" for bug fix MN1. * version.h 1. Changed to "6.0.4". ******************************************************************************* 6.0.3 ******************************************************************************* BUG FIXES --------- * The calculation of word deltas was wrong. (This bug fix shall be known as bug fix CWD.) CHANGES, file-by-file --------------------- * index.c 1. Removed no-class and dump-html options. (They should have been removed a long time ago because module-specific options were moved to the modules themselves.) * indexer.h 1. In resume_indexing(), added "if" statement. * INSTALL.unix 1. Changed minimum supported g++ compiler to the 3.x series, i.e., 2.95.x and earlier are no longer supported. * mod/html/mod_html.c 1. In parse_html_tag(), now additionally skipping XML processing instructions. * word_info.c * word_info.h 1. Added last_absolute_word_pos_ for bug fix CWD. * version.h 1. Changed to "6.0.3". 
******************************************************************************* 6.0.2 ******************************************************************************* BUG FIXES --------- * error_string() in util.h failed to compile using g++ 3.4.1. (This bug fix shall be known as bug fix G341.) CHANGES, file-by-file --------------------- * Group.c * SocketAddress.c * WordThreshold.c 1. Removed config variable name from error message. * conf_var.c * search.c * stop_words.c * thread_pool.c 1. Switched to using error_string. * index.c 1. Switched to using error_string. 2. Added: max_out_limit( RLIMIT_FSIZE ); * man/man4/swish++.conf.4 1. Added missing ID3 and LaTeX modules. * mod/html/mod_html.c 1. Made entity_to_ascii() and find_attribute() static. * util.h 1. Reworked error_string() for bug fix G341. * version.h 1. Changed to "6.0.2". ******************************************************************************* 6.0.1 ******************************************************************************* BUG FIXES --------- * Changes to make it compile with g++ 3.4.0 which purports to be much more standards-conforming. (This bug fix shall be known as bug fix G34.) CHANGES, file-by-file --------------------- * my_set.h 1. s/end()/this->end()/ for bug fix G34 * pattern_map.h 1. s/begin()/this->begin()/, s/end()/this->end()/ for bug fix G34 2. Made __SUNPRO_CC section the only one for bug fix G34. * version.h 1. Changed to "6.0.1". ******************************************************************************* 6.0 ******************************************************************************* NEW FEATURES ------------ * Added the ability to search using "near." The downside is that word-position data must be stored for every word. This approximately doubles the size of the generated indicies. (This feature shall be known as feature NS.) BUG FIXES --------- * file_list::const_iterator didn't work right for copy construction or assignment. 
(This bug fix shall be known as bug fix FLCI.) OTHER CHANGES ------------- * Reworked the thread_pool/thread code to use thread-local data. It makes the code much simpler. * Moved scripts (*.in files) to new scripts directory. * Added a makedepend.pl script to make dependencies when g++ is not being used, e.g. CC on Solaris. * Made lots of other changes to get SWISH++ to compile using Sun's CC. CHANGES, file-by-file --------------------- * charsets/GNUmakefile 1. s/$(AR) rv/$(AR)/ for Sun's CC. 2. Added: $(TEMPLATE_REPOSITORY) for Sun's CC. * config/config.mk 1. Added FEATURE_LIST, FEATURE_DEFS, and word_pos for feature NS. 2. Now no longer doing -fno-rtti when the word_pos feature is being compiled in for feature NS. 3. Added MAKEDEPEND variable. 4. Reworked OS definition; eliminated $(OS). 5. Added AR definition. 6. Added TEMPLATE_REPOSITORY for Sun's CC. * config/makedepend.pl 1. New file. * config/mod.mk * charsets/GNUmakefile * encodings/GNUmakefile 1. s/include/-include/ to silence "no such file or directory" messages. * config.h 1. Made comments for ShellFilenameDelimChars and ShellFilenameEscapeChars clear that they are for FILE (not path) names. 2. Added WordsNear_Default for feature NS. 3. s/WIN32/__CYGWIN__/ * conf_var.c 1. Removed \n from internal_error. 2. In conf_var::map_ref(), added storewordpositions and wordsnear for feature NS. * directory.c 1. s/WIN32/__CYGWIN__/ * do_file.c 1. Added code to reset word_pos for feature NS. * encoded_char.h 1. On line 106: s/value_type const/encoded_char_range::value_type const/ to make Sun's CC happy. * encodings/GNUmakefile 1. s/$(AR) rv/$(AR)/ for Sun's CC. 2. Added: $(TEMPLATE_REPOSITORY) for Sun's CC. * exit_codes.h 1. Added Exit_No_Create_Thread_Key. 2. Added Exit_No_Word_Pos_Data for feature NS. * extract.c 1. s/ctime/time.h/ for Sun's CC. * file_list.c 1. Added definition of file_list::const_iterator::end_value for bug fix FLCI. 2. In operator++(), now using end_value for bug fix FLCI. 3. 
Added code to accumulate word position data for feature NS. * file_list.h 1. Made temp object in operator++(int) const. 2. Added static end_value for bug fix FLCI. * filter.c 1. Got rid of newlines in error messages. * GNUmakefile 1. Added: DEBUG_eval_query 2. Added query_node.c to S_SOURCES for feature NS. 3. s/include/-include/ to silence "no such file or directory" messages. 4. Added: $(TEMPLATE_REPOSITORY) for Sun's CC. * IncludeMeta.c 1. Added const_cast() around strchr() for Sun's broken CC strchr() declaration. 2. s/char const *const m/char *const m/ for Sun's CC compiler. * index.c 1. Added #include "StoreWordPositions.h" for feature NS. 2. s/word_file_max/word_files_max/ 3. Added store_word_positions and word_pos for feature NS. 4. Added -P option for feature NS. 5. In merge_indicies() and write_word_index(), added call to write_word_pos() for feature NS. 6. s/ctime/time.h/ for Sun's CC. * indexer.c 1. Added #include "StoreWordPositions.h" for feature NS. 2. Added "extern int word_pos" for feature NS. 3. Removed \n from internal_error. 4. In indexer::index_word(), added "++word_pos" for feature NS. 5. In indexer::index_word(), added code to store word position data for feature NS. * INSTALL.unix * INSTALL.win32 1. Removed note about "no such file or directory" warnings. * man/man1/index.1 1. In the description of the Mail module, item #7, added mention of -A option. 2. For the -A option, elaborated description. 3. Added description of the -P option for feature NS. 4. Added mention of StoreWordPositions variable for feature NS. 5. Added mention of WordsNear variable for feature NS. * man/man1/search.1 1. Added description of "near" for feature NS. 2. Reworked EXAMPLES section. 3. Added: Could not create thread key. 4. Added: Attempted "near" search without word-position data. * man/man4/swish++.conf.4 1. Added StoreWordPositions for feature NS. 2. Added WordsNear for feature NS. * man/man4/swish++.index.4 1. Added Word-position list for feature NS. 
* mmap_file.c 1. s/MacOSX/__APPLE__/ 2. s/ctime/time.h/ for Sun's CC. * mmap_file.h 1. Removed trailing ',' from behavior_type. 2. Added "#ifndef __SUNPRO_CC" for problem with Sun's CC compiler. * mod/mod_id3/mod_id3.h 1. Removed trailing ',' from enums. * mod/mod_mail/mod_mail.h 1. s/Multipart,/Multipart/ 2. Added: struct message_type; friend struct message_type; * pattern_map.h 1. s/WIN32/__CYGWIN__/ 2. Added #ifdef for Sun's CC compiler. * query.c 1. s/find_result/word_range/ 2. Added #include "query_node.h" for feature NS. 3. Added parse_args struct for feature NS. 4. Reworked parse functions to take parse_args for feature NS. 5. Moved is_too_frequent() to query.h. 6. Reworked parse functions to build query_nodes. 7. Moved the code for perform_and() to query_node.c. 8. Added assert_index_has_word_pos_data() for feature NS. * query.h 1. s/find_result/word_range/ 2. Moved is_too_frequent() here. * query_node.[ch] 1. New files for feature NS. * search.c 1. Added #include "WordsNear.h" for feature NS. 2. s/word_file_max_arg/word_files_max_arg/ 3. Added -n option for feature NS. 4. s/ctime/time.h/ for Sun's CC. 5. Added #include "vector_adapter.h" for Sun's CC. 6. s/search_result_type/search_result/ 7. In search(), added #ifdef __SUNPRO_CC since Sun's CC compiler and/or their STL implementation seems pretty broken. * search.h 1. s/word_file_max_arg/word_files_max_arg/ 2. Added words_near_arg for feature NS. * search_daemon.c 1. s/ctime/time.h/ for Sun's CC. * search_options.c 1. Added "-n" for feature NS. * search_thread.c 1. s/ctime/time.h/ for Sun's CC. * simple_pool.h * StoreWordPositions.h 1. New file for feature NS. * swish++.conf 1. Removed -f and -p options from search(1). 2. Added StoreWordPositions for feature NS. 3. Added WordsNear for feature NS. * thread_pool.c 1. Added thread_pool::thread::thread_obj_key_. 2. s/thread_pool_thread_cleanup/thread_pool_thread_data_cleanup/ 3. Added code to deal with thread-specific data. 4. Added thread_pool_thread_once(). 
5. s/destructing_/in_cleanup_/ 6. s/ctime/time.h/ for Sun's CC. 7. s/start_function_type/thread_start_function_type/ for Sun's CC. * thread_pool.h 1. Removed thread_pool_thread_cleanup(). 2. Added thread_pool_thread_data_cleanup(). 3. Added thread_pool_thread_once(). 4. Removed thread_pool::thread::operator delete(). 5. s/destructing_/in_cleanup_/ 6. Removed thread_pool::thread::thread_. 7. Added thread_pool::thread::thread_obj_key_. 8. s/start_function_type/thread_start_function_type/ for Sun's CC. * token.c 1. Added "near" for feature NS. 2. Got rid of #include for Sun's CC. 3. s/transform/to_lower/ for Sun's CC. * token.h 1. Added near_token and not_near_token for feature NS. * util.c 1. Added: to_lower( char*, char const* ) * util.h 1. Added: pjl_abs() for feature NS. 2. Removed \n from internal_error. 3. Added FOR_EACH_IN_PAIR. 4. Added: to_lower( char*, char const* ) 5. s/ctime/time.h/ for Sun's CC. * vector_adapter.h 1. New file for Sun's CC. * version.h 1. Changed to "6.0". * word_info.c 1. Moved word_info::file constructors here. 2. Added write_word_pos() for feature NS. * word_info.h 1. Added file::has_meta_id(). 2. Added word position data for feature NS. * word_markers.h 1. Added Word_Pos_List_Marker for feature NS. * WordsNear.h 1. New file for feature NS. ******************************************************************************* 5.15.4 ******************************************************************************* BUG FIXES --------- * extract(1) using stdin was broken. (This bug fix shall be known as bug fix ESI2.) CHANGES, file-by-file --------------------- * extract.c 1. In main() in the code for processing stdin, s/*argv/file_name/ for bug fix ESI2. * version.h 1. Changed to "5.15.4". ******************************************************************************* 5.15.3 ******************************************************************************* BUG FIXES --------- * Fixed a bug in the code that merges partial indicies. 
(Hopefully, this was the last bug introduced as a result of the new index file format.) (This bug fix shall be known as bug fix NMF.) CHANGES, file-by-file --------------------- * index.c 1. In merge_indicies(), moves declaration of "continues" out of the loop for bug fix NMF. * version.h 1. Changed to "5.15.3". ******************************************************************************* 5.15.2 ******************************************************************************* BUG FIXES --------- * Search results didn't include the last one, i.e., if there are N results, only N-1 were returned. (This bug fix shall be known as bug fix RM1.) CHANGES, file-by-file --------------------- * file_list.h 1. Added: typedef unsigned char byte; 2. s/unsigned char/byte/ * file_list.c 1. s/unsigned char/byte/ 2. In calc_size(), fixed list-skipping code. 3. In operator++(), added "sentinel" code for bug fix RM1. * SearchResults.xsd 1. s/homepage.mac.com/www.pauljlucas.org/ * version.h 1. Changed to "5.15.2". * xml_formatter.c 1. Changed xmlns location. ******************************************************************************* 5.15.1 ******************************************************************************* BUG FIXES --------- * Fixed a bug in new file-format decoder. (This bug fix shall be known as bug fix FFD1.) CHANGES, file-by-file --------------------- * file_list.c 1. In calc_size(), removed incorrect ++p for bug fix FFD1. * version.h 1. Changed to "5.15.1". ******************************************************************************* 5.15 ******************************************************************************* NEW FEATURES ------------ * Numbers stored in the generated index file are now more highly compressed resulting in an overall average savings of approximately 6% in index size. (This feature shall be known as feature HCI.) BUG FIXES --------- * The call to change the behavior of mmap(2) was in the wrong place. 
(This bug fix shall be known as bug fix MMB2.) CHANGES, file-by-file --------------------- * bcd.h * bcd.c 1. Replaced by enc_int.[hc], respectively, for feature HCI. * conf_var.c 1. Moved call to change behavior of mmap(2) for bug fix MMB2. * file_info.c 1. s/bcd.h/enc_int.h/ for feature HCI. 2. s/parse_bcd/dec_int/ for feature HCI. * file_list.c 1. s/bcd.c/enc_int.c/ for feature HCI. 2. Reworked calc_size() and operator++() for new index file format for feature HCI. * GNUmakefile 1. s/bcd.c/enc_int.c/ for feature HCI. * index.c 1. s/bcd.c/enc_int.c/ for feature HCI. 2. Added: #include "word_markers.h" for feature HCI. 3. s/parse_bcd/dec_int/ for feature HCI. 4. Reworked merge_indicies() and write_word_index() for new file format for feature HCI. * query.c 1. s/bcd.h/enc_int.h/ for feature HCI. 2. s/parse_bcd/dec_int/ for feature HCI. * search.c 1. Removed unneeded #include "bcd.h" * word_info.c 1. s/bcd.h/enc_int.h/ for feature HCI. 2. Added: #include "word_markers.h" for feature HCI. 3. Now using Meta_Name_List_Marker and Stop_Marker for feature HCI. * word_markers.h 1. New file for feature HCI. * version.h 1. Changed to "5.15". ******************************************************************************* 5.14.2 ******************************************************************************* BUG FIXES --------- * When filenames containing shell meta-characters were passed to the shell for filtering, they weren't escaped. (This bug fix shall be known as bug fix SMC.) * For extract(1), added check to ensure that the extracted file name's length does not exceed PATH_MAX. (This bug fix shall be known as bug fix EEL.) * The call to change the behavior of mmap(2) was in the wrong place. (This bug fix shall be known as bug fix MMB.) CHANGES, file-by-file --------------------- * config.h 1. Updated Word_Max_Consec_Consonants to 7. 2. Updated Word_Max_Consec_Vowels to 5 3. Added ShellFilenameDelimChars and ShellFilenameEscapeChars for bug fix SMC. * conf_string.h 1. 
Added length() and size(). * do_file.c 1. Added check against PATH_MAX for bug fix EEL. 2. Moved call to change behavior of mmap(2) for bug fix MMB. * filter.c 1. Added escape_filename() and unescape_filename() for bug fix SMC. 2. Changed substitute() to use escaped filename for bug fix SMC. * man/man1/index.1 1. s/subdiretories/subdirectories/ * man/man1/search.1 1. Added missing mention of exit codes 68 and 69. * searchc.in 1. Added -F option. * version.h 1. Changed to "5.14.2". ******************************************************************************* 5.14.1 ******************************************************************************* BUG FIXES --------- * Fixed a small error when compiling under Solaris. (This bug fix shall be known as bug fix MADV1.) * The version string in 5.14 wasn't updated. CHANGES, file-by-file --------------------- * mmap_file.c 1. Added cast to caddr_t in madvise() call for bug fix MADV1. * version.h 1. Changed to "5.14.1". ******************************************************************************* 5.14 ******************************************************************************* NEW FEATURES ------------ * The searchd script now supports chkconfig. (This feature shall be known as feature CHK.) BUG FIXES --------- * Use of an iterator in the rank_full_index() function was improper. Apparently it doesn't matter under GCC/HP implementations of STL, but does under the .NET implementation. (This bug fix shall be known as bug fix INVIT.) * The is_xxxxx() functions in util.h were doing sign-extension during char-to-int conversion (apparently only) under the .NET compiler. This has been fixed by using a proper cast. (This bug fix shall be known as bug fix CHCAST.) CLARIFICATIONS -------------- * The undocumented behavior of index(1) skipping all files that started with '.' has been changed to skip only the directory entries '.' and ".."; this has also been documented. (This clarification shall be known as clarification DOTS.) 
CHANGES, file-by-file --------------------- * directory.c 1. In do_directory(), changed code for DOTS. * index.c 1. In rank_full_index(), fixed the handling of the 'w' iterator for bug fix INVIT. * INSTALL.unix 1. Updated gcc information. * man/man1/index.1 1. For the ID3 module description, fixed references to "header": they should be "field." * searchd.in 1. Added chkconfig information for feature CHK. 2. s!.echotmp!/tmp/.echotmp! * util.h 1. Added static_cast<>()s to the is_xxxxx() functions for bug fix CHCAST. * version.h 1. Changed to "5.14". ******************************************************************************* 5.13.5 ******************************************************************************* BUG FIXES --------- * The top-level GNUmakefile didn't make the "etc" directory (I_ETC) if it didn't exist. (This was supposedly fixed in 5.8, but apparently not.) (This bug fix shall be known as bug fix I_ETC2.) CHANGES, file-by-file --------------------- * GNUmakefile 1. Added $(I_ETC) as a target of the $(MKDIR) $@ line for bug fix I_ETC2. * version.h 1. Changed to "5.13.5." ******************************************************************************* 5.13.4 ******************************************************************************* BUG FIXES --------- * For search(1), the --max-results option was missing the "max" part. (This bug fix shall be known as bug fix MRO.) CHANGES, file-by-file --------------------- * search_options.c 1. s/results/max-results/ for bug fix MRO. * version.h 1. Changed to "5.13.4." ******************************************************************************* 5.13.3 ******************************************************************************* BUG FIXES --------- * Fixed occasional segmentation fault in the Manual page indexing module. (This bug fix shall be known as bug fix MMC1.) CHANGES, file-by-file --------------------- * BUGS 1. New file containing bug-reporting instructions. * INSTALL.win32 1. 
s!sources.redhat.com/cygwin!cygwin.com! 2. Removed mention of Windows 95 -- it's dead. 3. Added mention of Windows XP. * man/man4/swish++.index.4 1. Added (missing) mention of MP3 file titles. * mod/man/mod_man.c 1. In index_words(), s/while ( true )/while ( c != e.end_pos() )/ for bug fix MMC1. * version.h 1. Changed to "5.13.3." ******************************************************************************* 5.13.2 ******************************************************************************* BUG FIXES --------- * Fixed threads linking problem. (This bug fix shall be known as bug fix TL1.) CHANGES, file-by-file --------------------- * GNUmakefile 1. s/PTHREAD_LIB/PTHREAD_LINK/ for bug fix TL1. 2. Added $(PTHREAD_LINK) to E_LINK for bug fix TL1. * version.h 1. Changed to "5.13.2." ******************************************************************************* 5.13.1 ******************************************************************************* BUG FIXES --------- * Yet more BSD compilation fixes. (Why the hell can't BSD headers #include everything they need themselves?) (This bug fix shall be known as bug fix BSD6.) CHANGES, file-by-file --------------------- * util.h 1. Added: #include for bug fix BSD6. * version.h 1. Changed to "5.13.1." ******************************************************************************* 5.13 ******************************************************************************* NEW FEATURES ------------ * Added an indexing module for ID3 tags (typically found inside MP3 files). ID3v1.x and ID3v2.x through 2.4.0 are supported (with the exception of encrypted frames). (This feature shall be known as feature ID3.) * Since it was needed by feature ID3, decoding of UTF-16 text (both big- and little-endian) was added. (This feature shall be known as feature UTF16.) BUG FIXES --------- * If the Mail module was compiled without Base64 encoding compiled in, it was indexed as plain text (which is wrong). 
It should be treated as Binary and not indexed at all. (This bug fix shall be known as bug fix MN64.) * If the Mail module was compiled without the UTF-7 charset compiled in, it was indexed as plain text (which is wrong). It should be treated as Binary and not indexed at all. (This bug fix shall be known as bug fix MNU7.) * httpindex didn't accept some index(1) options that it should have. (This bug fix shall be known as bug fix HIO.) * httpindex could block if index(1) was generating partial indicies. (This bug fix shall be known as bug fix HPI.) CHANGES, file-by-file --------------------- * charsets/charsets.h 1. Added UNKNOWN_CHARSET for feature ID3. 2. Added charset_utf16be() and charset_utf16le() for feature UTF16. * charsets/utf16.c 1. New file for feature UTF16. * config/config-sh 1. Added a target.mk argument. 2. Now generating a target.mk file. 3. Added DATE. 4. s/TARGET/TARGET_H/ 5. Changed define() to emit for both targets. * config/config.mk 1. Added "id3" to MOD_LIST for feature ID3. 2. Added "utf17" to CHARSET_LIST for feature ID3. 3. Removed CHARSET_LIST and ENCODING_LIST from inside MOD_mail. 4. Added "DECODING:= -DIMPLEMENT_DECODING" to simplify encoded_char.[ch]. 5. s/PTHREAD_LIB/PTHREAD_LINK/ 6. s/SOCKET_LIB/SOCKET_LINK/ 7. Added ZLIB_LINK for feature ID3. 8. Added "$(DECODING)" to CCFLAGS. 9. Added platform.mk fo dependency line. * config/GNUmakefile 1. Added platform.mk to $(TARGET). 2. s/$@/$(TARGET)/ 3. Removed .*.d * config/src/zlib.c 1. New file for feature ID3. * do_file.c 1. s/#ifdef MOD_mail/#ifdef IMPLEMENT_DECODING/ for feature ID3. 2. Switched to using indexer::text_indexer(). * encoded_char.c * encoded_char.h 1. s/#ifdef MOD_mail/#ifdef IMPLEMENT_DECODING/ for feature ID3. * encodings/encodings.h 1. s/-1/~0/ * GNUmakefile 1. Added include of platform.mk 2. Added -DDEBUG_id3v2 for feature ID3. 3. Changed the way CFLAGS was assigned. 4. Added config to SUBDIRS. 5. Added ifndef HAVE_ZLIB 6. Added $(BLIB_LINK) to I_LINK for feature ID3. 
7. s/PTHREAD_LIB/PTHREAD_LINK/ 8. s/SOCKET_LIB/SOCKET_LINK/ 9. Removed platform.h from distclean. * httpindex.in 1. Added missing index(1) options for bug fix HIO. 2. Added code to read extra lines from index(1) for bug fix HPI. * indexer.c 1. Added text_indexer_. 2. In map_ref(), added assignment to text_indexer_. * indexer.h 1. Made find_meta() public. 2. Made index_word() public. 3. Added text_indexer(). 4. Added text_indexer_. * man/man1/index.1 1. Added description of ID3 module for feature ID3. * mod/id3/GNUmakefile * mod/id3/id3v1.c * mod/id3/id3v1.h * mod/id3/id3v2.c * mod/id3/id3v2.h * mod/id3/mod_id3.c * mod/id3/mod_id3.h 1. New files for feature ID3. * mod/html/mod_html.c * mod/latex/mod_latex.c * mod/mail/mod_mail.c * mod/man/mod_man.c 1. Switched to using move_if_match(). * mod/mail/mod_mail.c 1. Made indexing treat Base64 as Binary if its encoding wasn't compiled in for bug fix MN64. 2. Made indexing skip UTF-7 and UTF-8 if their respective character set code wasn't compiled in for bug fix MNU7. 3. In index_headers(), Removed module #ifdef's since they weren't really needed. * mod/mail/mod_mail.h 1. Removed module #ifdef's since they weren't really needed. * README 1. Added blurb about ID3 for feature ID3. * swish++.conf 1. Added IncludeMeta's for ID3v2 tag fields for feature ID3. 2. Added "IncludeFile ID3 *.mp3" for feature ID3. * util.h 1. Added NUM_ELEMENTS(). * word_util.c * word_util.h 1. Added move_if_match(). * www_example 1. Added "--" at end of search options to close security hole. * version.h 1. Changed to "5.13." ******************************************************************************* 5.12.1 ******************************************************************************* BUG FIXES --------- * Some ranks returned were negative. (This became broken in 5.11.) (This bug fix shall be known as bug fix FNR.) * The version number for 5.12 wasn't updated in the code from 5.11.1. CHANGES, file-by-file --------------------- * word_info.h 1. 
Changed occurrences_ and rank_ from short to int for bug fix FNR. * version.h 1. Changed to "5.12.1." ******************************************************************************* 5.12 ******************************************************************************* NEW FEATURES ------------ * WordThreshold may now be set either in a config. file or on the command line (as opposed to being only a compiled-in constant). However, only the super- user may specify a value larger than the default. (This feature shall be known as feature SWT.) CHANGES, file-by-file --------------------- * config.h 1. s/Word_Threshold/WordThreshold_Default/ for feature SWT. * conf_var.c 1. Added wordthreshold for feature SWT. * do_file.c 1. s/Word_Threshold/word_threshold/ for feature SWT. * exit_codes.h 1. Added Exit_Not_Root for feature SWT. * GNUmakefile 1. Added WordThreshold.c to I_SOURCES for feature SWT. * index.c 1. Added: #include "WordThreshold.h" for feature SWT. 2. Added word_threshold for feature SWT. 3. In main(), added "word-threshold" and -W options for feature SWT. 4. In usage(), added -W and --word-threshold options for feature SWT. * man/man1/index.1 1. Added -W, --word-threshold, and WordThreshold for feature SWT. 2. Added exit code 13 for feature SWT. * man/man4/swish++.conf.4 1. Added WordThreshold for feature SWT. * mod/html/html.c 1. Moved over line of ':' to be in-line with those in index.c for feature SWT. * swish++.conf 1. Added Word_Threshold for feature SWT. * WordThreshold.c * WordThreshold.h 1. New file for feature SWT. * version.h 1. Upped version to "5.12". ******************************************************************************* 5.11.1 ******************************************************************************* BUG FIXES --------- * Invalid UTF-8 could send the indexer into an infinite loop. (This bug fix shall be known as bug fix IU8.) CHANGES, file-by-file --------------------- * charsets/utf8.c 1. 
Changed syncing to skip forward rather than back for bug fix IU8. 2. Added check for FE and FF bytes. * version.h 1. Upped version to "5.11.1". ******************************************************************************* 5.11 ******************************************************************************* NEW FEATURES ------------ * The ranking result numbers (1-100) have been significantly improved: they are much less striated now. (Simply increasing the scaling factor by 3 orders of magnitude did the trick.) (This feature shall be known as feature IRF.) CHANGES, file-by-file --------------------- * index.c 1. Added Rank_Factor constant. 2. Changed rank factor from 10000 to 10000000 for feature IRF. * man/man1/extract.1 1. Changed wording for -e to agree with index(1). * man/man1/index.1 1. Added (missing) documentation that -e and --pattern options can take multiple patterns separated by commas. * swish++.conf 1. Added *.png to ExcludeFile 2. Changed Word filter to wvText and added URL for www.wvware.com. 3. Added (missing) LaTeX and Man IncludeFile lines. * version.h 1. Upped version to "5.11". ******************************************************************************* 5.10 ******************************************************************************* NEW FEATURES ------------ * Some XHTML 2.0 elements have been added to the HTML module. (This feature shall be known as feature XHTML2.) * Theoretically improved indexing performance by adding calls to madvise(2). (This feature shall be known as feature MADV.) BUG FIXES --------- * Specifying filename patterns for extract(1) was broken. (This bug fix shall be known as bug fix EFP.) * extract(1) was broken altogether since 6/16/2000. Apparently, very few people use it since nobody pointed it out. (This bug fix shall be known as bug fix ETB.) CHANGES, file-by-file --------------------- * config/config.mk 1. Added -D_BSD_SOURCE to LINUX definition. 2. Added MAC_OS_X as a separate OS. * config/src/madvise.c 1. 
New file for feature MADV. * conf_var.c 1. In parse_file(), added call to mmap_file::behavior(). * do_file.c 1. In do_file(), added call to mmap_file::behavior() for feature MADV. 2. In do_file(), added "out = &extracted_file;" for bug fix ETB. * extract.c 1. In main(), s/.insert( pat )/.insert( pat, 0 )/ for bug fix EFP. * ExtractFile.c 1. s/insert( new_strdup( s ) )/insert( new_strdup( s ), 0 )/ for bug fix EFP. * ExtractFile.h 1. Made derived from conf_var and pattern_map for bug fix EFP. * mmap_file.c * mmap_file.h 1. Added behavior() for feature MADV. * mod/html/elements.c 1. Added the h, line, name, nl, quote, and section elements for feature XHTML2. * search.c 1. In main(), added call to mmap_file::behavior(). * stop_words.c 1. Added call to mmap_file::behavior(). * version.h 1. Upped version to "5.10". ******************************************************************************* 5.9.6 ******************************************************************************* BUG FIXES --------- * Non-space whitespace characters were mistakenly turned into spaces. This has been this way since version 5.8 and, in theory, broke meta-names in e-mail files (although the old code seems to have worked, but I don't know how). (This bug fix shall be known as bug fix NSW.) * Recognition of LaTeX commands seems to have been completely broken. (This bug fix shall be known as bug fix LAC.) * Unknown LaTeX commands are now ignored (as they should have been) rather than indexed. (This bug fix shall be known as bug fix LAS.) CHANGES, file-by-file --------------------- * iso8859-1.c 1. Changed ' ' to '\t', '\n', '\v', '\f', '\r' for bug fix NSW. * mod/latex/mod_latex.c 1. Added find_left(). 2. In index_words(), added check for '\r'. 3. In parse_latex_command(), removed local scope. 4. In parse_latex_command(), s/!is_alnum/is_alnum/ for bug fix LAC. 5. In parse_latex_command(), added call to find_left(). 6. 
In parse_latex_command(), added code such that if a command is not found, skip it (for bug fix LAS). * version.h 1. Upped version to "5.9.6". ******************************************************************************* 5.9.5 ******************************************************************************* BUG FIXES --------- * For document sets that contain a lot of words (more than 2^31), the number of total words reported was negative due to signed integer overflow. The fix was to make the counter unsigned. (This bug fix shall be known as bug fix ULN.) CHANGES, file-by-file --------------------- * index.c * indexer.c 1. Made num_indexed_words, num_total_words, and num_unique_words unsigned long rather than just long for bug fix ULN. * version.h 1. Upped version to "5.9.5". ******************************************************************************* 5.9.4 ******************************************************************************* BUG FIXES --------- * There was a problem whereby search(1) would core-dump under FreeBSD. This problem surfaced a while ago, then disappeared, and now it's back again. This problem has finally been fixed (apparantly). (This bug fix shall be known as bug fix END.) CHANGES, file-by-file --------------------- * GNUmakefile 1. Added ifdefs for CHARSET_LIST and ENCODING_LIST. * extract.c * index.c * mod/html/mod_html.c * mod/mail/mod_mail.c * mod/man/mod_man.c * query.c * search.c * search_daemon.c * search_thread.c 1. Made whatever can be declared static actually static. * stem_word.c 1. Made "end" static for bug fix END. 2. Made whatever else can be declared static actually static. * version.h 1. Upped version to "5.9.4". ******************************************************************************* 5.9.3 ******************************************************************************* BUG FIXES --------- * Under Windows, the printing of file names was slightly messed up. (This bug fix shall be known as bug fix WDS.) 
CHANGES, file-by-file --------------------- * directory.c 1. In do_check_add_file(), s!'/'!Dir_Sep_Char! for bug fix WDS. * version.h 1. Upped version to "5.9.3". ******************************************************************************* 5.9.2 ******************************************************************************* BUG FIXES --------- * If the "mail" module wasn't selected for compilation, then overall compilation failed due to a missing #ifdef. (This bug fix shall be known as bug fix MMI.) CHANGES, file-by-file --------------------- * encoded_char.h 1. In encoded_char_range::const_iterator::const_iterator(), added missing "#ifdef MOD_mail" for bug fix MMI. * version.h 1. Upped version to "5.9.2". ******************************************************************************* 5.9.1 ******************************************************************************* BUG FIXES --------- * The feature of being able to do "not foo = bar" introduced in 5.8 was broken: right intent, wrong line of code. D'oh! (This bug fix shall be known as bug fix NMN1.) CHANGES, file-by-file --------------------- * query.c 1. In parse_primary() in lparen_token case, s/parse_meta/parse_query2/ for bug fix NMN1. 2. In parse_primary() in not_token case, s/parse_primary/parse_meta/ for bug fix NMN1. * version.h 1. Upped version to "5.9.1". ******************************************************************************* 5.9 ******************************************************************************* NEW FEATURES ------------ * Added XML schema information in search results XML output. (This feature shall be known as feature XMLS.) CHANGES, file-by-file --------------------- * man/man1/search.1 1. Fixed formatting for grammar. 2. Added LaTeX to set of files that can have titles. 3. Added XML schema information for feature XMLS. * man/man4/swish++.index.4 1. Added missing mention of LaTeX titles. * version.h 1. Upped version to "5.9". * SearchResults.xsd 1. New file for feature XMLS. 
* xml_formatter.c 1. Added SWISHPP_URI, SEARCH_RESULTS_DTD, SEARCH_RESULTS_NS_URI, SEARCH_RESULTS_XSD, and XML_SCHEMA_INSTANCE_URI for feature XMLS. 2. In pre(), put " 2. Added: #include "iso8859-1.h" 3. s/decoder_type/encoding_type/ 4. Added: encoded_char_range::charset_type for feature UTF. 5. Added charset_type to encoded_char_range and encoded_char_range::const_iterator constructors. 6. s/decoder_/encoding_/ 7. Added encoded_char_range::charset_ for feature UTF. 8. Added encoded_char_range::decoder class. 9. In encoded_char_range::const_iterator::decode(), added code to call charset decoder for feature UTF. A. In encoded_char_range::const_iterator::operator*(), added call to iso8859_1_to_ascii(). * encodings/GNUmakefile * encodings/base64.c * encodings/encodings.h * encodings/quoted_printable.c * encodings/README 1. New files. * extract.c 1. In extract_words(), removed called to iso8859_to_ascii(). * fdbuf.h 1. Added #ifdef PJL_GCC_2xx for streambuf vs. streambuf.h for bug fix GCC31. * GNUmakefile 1. Added $(I_ETC) as a target of the $(MKDIR) $@ line for bug fix I_ETC. 2. Added CHARSET_* and ENCODING_* variables for feature UTF. 3. Added CHARSET_LIB and ENCODING_LIB targets for feature UTF. 4. Added "charsets" and "encodings" to MAKE_SUBDIRS for feature UTF. 5. Added iso8859-1.c to I_SOURCES, S_SOURCES, and E_SOURCES. 6. Added $(CHARSET_LINK) and $(ENCODING_LINK) to I_LINK for feature UTF. 7. Added $(PTHREAD_LIB) to I_LINK for bug fix SOL_THR. 8. Added $(CHARSET_LIB) and $(ENCODING_LIB) to index for feature UTF. * indexer.c 1. In index_words(), removed called to iso8859_to_ascii(). * init_modules-sh * init_mod_vars-sh 1. Added -e 's/mod_/MOD_/' to sed lines. * INSTALL.unix 1. Added mention of minimum GNU make version for bug fix MAKE_VER. * iso8859-1.c * iso8859-1.h 1. Moved from word_util.[ch] * man/man1/index.1 1. For the Mail module, added mention of UTF-7 and UTF-8 for feature UTF. 2. 
Added a caveat about an e-mail message having a simultaneous encoding and character set. 3. Added Unicode references. * man/man1/search.1 1. Changed query grammar for feature NMN. * mmap_file.h 1. Replaced deprecated declarations for reverse_iterator and const_reverse_iterator for bug fix GCC31. * mod/mail/mod_html.c 1. Added: #include "charsets/unicode.h" 2. In entity_to_ascii(), now using unicode_to_ascii(). 3. In index_words(), removed call to iso8859_to_ascii(). * mod/mail/mod_mail.c 1. Added: #include "encoded_char.h" for feature UTF. 2. Removed: #include "word_util.h" 3. Replaced decoder_ with charset_ and encoding_ for feature UTF. 4. In index_headers(), added code to extract the charset for feature UTF. 5. In index_multipart(), added call to encoded_char_range::decoder::reset_all(). 6. s/while ( 1 )/while ( true )/ * mod/mail/mod_mail.h 1. Added: #include "charsets/charsets.h" 2. Added: #include "encodings/encodings.h" 3. Replaced decoder_ with charset_ and encoding_ * mod/rtc/mod_man.c * mod/rtc/mod_rtf.c 1. In index_words(), removed call to iso8859_to_ascii(). * option_stream.c 1. Added #include for bug fix GCC31. * query.c 1. In parse_query2(), s/parse_query2/parse_meta/ for feature NMN. * README 1. Added missing item mentioning ability to index LaTeX and RTF documents. 2. Moved "Index non-text files such as Microsoft Office documents" up one. 3. Added mention of UTF-7 and UTF-8 character sets for feature UTF. * stop_words.c * token.c 1. Added: #include "iso8859-1.h" 2. s/iso8859_to_ascii/iso8859_1_to_ascii/ * token.h 1. Added #ifdef PJL_GCC_2xx for strstream vs. sstream for bug fix GCC31. 2. Added #ifdef PJL_GCC_2xx for istrstream vs. istringstream for bug fix GCC31. * version.h 1. Upped version to "5.8". * word_util.c * word_util.h 1. 
Moved iso8859 stuff to iso8859-1.[ch] ******************************************************************************* 5.7.1 ******************************************************************************* BUG FIXES --------- * Some "and" query results were slightly messed up due to an iterator being invalidated. (This bug fix shall be known as bug fix PAI.) * Ranks could be printed as zero (wrong!). (This bug fix shall be known as bug fix RANK0.) CHANGES, file-by-file --------------------- * auto_vec.h 1. Made const/non-const versions of accessors. * conf_enum.c * IncludeMeta.c 1. In parse_value(), made "lower" non-const to work with updated auto_vec. * config/GNUmakefile * GNUmakefile 1. Removed "dist" target. * query.c 1. In perform_and(), added a temporary iterator for bug fix PAI. * search.c 1. In search(), added code to ensure rank is not zero for bug fix RANK0. 2. In search(), added const to highest_rank. * thread_pool.h 1. Made various thread_pool data members "volatile" because this should be done for variables that are accessed by multiple threads. * version.h 1. Upped version to to "5.7.1". ******************************************************************************* 5.7 ******************************************************************************* NEW FEATURES ------------ * LaTeX files can now be indexed directly. (This feature shall be known as feature LATEX.) * Document titles now have multiple whitespace characters squeezed into single whitespace characters. (This feature shall be known as feature STWS.) * index(1) will now use the value of the environment variable TMPDIR if it's set as the default temporary directory. However, the value is still superseded by one of -T, --temp-dir, or TempDirectory if given. (This feature shall be known as feature TMPDIR.) BUG FIXES --------- * HTML comment parsing was broken in that it allowed "->" in addition to "-->" to terminate a comment. (This bug fix shall be known as bug fix HTC.) 
* Yet more bugs in the thread-pooling code. (This bug fix shall be known as bug fix TPB.) CHANGES, file-by-file --------------------- * config/config.mk 1. Added "latex" to MOD_LIST for feature LATEX. 2. Added I_ETC. * config.h 1. Changed value for SocketQueueSize_Default to 511. * GNUmakefile 1. Added TempDirectory.c for feature TMPDIR. 2. Changed installation of swish++.conf to $(I_ETC) * indexer.c 1. In tidy_title(), added code to squeeze multiple whitespace characters for feature STWS. * man/man1/index.1 1. Added TMPDIR for feature TMPDIR. 2. Added LaTeX section for feature LATEX. 3. Added Leslie Lamport reference for feature LATEX. 4. s/SCCS/CVS/ since nobody uses SCCS any more. * man/man1/search.1 1. Changed default value for -q to 511. 2. Added missing "encoding" to XML example. * man/man4/swish++.conf.4 1. Added missing TempDirectory. * mod/html/mod_html.c 1. In is_html_comment(), reworked code for bug fix HTC. 2. In entity_to_ascii(), added static reference to char_entity_map::instance(). 3. In parse_html_tag() and post_options(), made "elements" reference static. * mod/latex/GNUmakefile * mod/latex/commands.c * mod/latex/commands.h * mod/latex/latex_config.h * mod/latex/mod_latex.c * mod/latex/mod_latex.h 1. New files for feature LATEX. * README.Solaris 1. Removed "ephemeral ports" since that wasn't right. * search_daemon.c 1. Removed: accept_failed(). 2. Added: handle_accept() and reset_socket(). 3. Moved thread_pool object inside of handle_accept(). * search_thread.c 1. Factored out reset_socket(). * swish++.conf 1. Changed value for SocketQueueSize to 511. * TempDirectory.c 1. New file for feature TMPDIR. * TempDirectory.h 1. Added default_value() for feature TMPDIR. 2. Moved #include "config.h" to new .c file for feature TMPDIR. * thread_pool.c 1. s/thread_pool_thread_destroy/thread_pool_thread_cleanup/ 2. Added: thread_pool_decrement_busy(). 3. In thread_pool_thread_main(), changed code so that pool_.t_idle_ is always signalled when idle. 4. 
In thread_pool_thread_main(), added: pthread_cleanup_push( thread_pool_decrement_busy, t ) to ensure that t->pool_.t_busy_ gets decremented even if the thread is killed. 5. In thread_pool_thread_main(), added DEFER_CANCEL/RESTORE_CANCEL around code that removes a task from the queue. 6. In ~thread(), added DEFER_CANCEL/RESTORE_CANCEL. 7. In thread_pool::thread_pool(), added t_busy_( 0 ) for bug fix TPB. 8. In thread_pool_thread_main(), made signaling of idle independent of the size of the thread pool. 9. Made new_task() take and return a bool argument and queue the task only if it will queue it. * thread_pool.h 1. s/thread_pool_thread_destroy/thread_pool_thread_cleanup/ 2. Added: thread_pool_decrement_busy(). 3. Made new_task() take and return a bool argument. * version.h 1. Upped version to "5.7". ******************************************************************************* 5.6 ******************************************************************************* NEW FEATURES ------------ * The text/enriched attachment indexer that was part of the Mail module was split off into its own RTF (Rich Text Format) module so stand-alone RTF files can be indexed. (This feature shall be known as feature RTF.) * For search(1) running as a daemon, added code to reset the TCP connection for bad requests. The reason for doing this is so we don't potentially have a socket lingering in TIME-WAIT from a client that was too dumb to give us a valid request in the first place. This helps alleviate denial-of-service attacks (if that's what's going on). This came about due to the way Solaris handles TIME-WAIT. Read the new README.Solaris file for details. This change has no effect in Linux 2.2.x kernels since sending a reset on close by setting SO_LINGER wasn't implemented. (This feature shall be known as feature RST.) BUG FIXES --------- * The files Group.c and SocketAddress.c didn't compile under FreeBSD. (This bug fix shall be known as bug fix BSD5.) 
CHANGES, file-by-file --------------------- * config/config.mk 1. Added "rtf" to MOD_LIST. 2. Added explanation about module dependencies. * Group.c * SocketAddress.c 1. Added #include for bug fix BSD5. * INSTALL.unix 1. Added mention of README.Solaris file for feature RST. 2. s/www.objectspace.com/www.stlport.org/ 3. Moved module wording in step 2 of building to config/config.mk. * man/man1/index.1 1. Added description of RTF module. * man/man4/swish++.conf.4 1. Added mention of RTF module. 2. Added mention that text/enriched attachments can be indexed only if the RTF module is compiled into index(1). 3. Added mention that text/html attachments can be indexed only if the HTML module is compiled into index(1). 4. Fixed RFC 1563 attribution. * mod/mail/mod_mail.c 1. Removed: #include "platform.h" 2. Added: #include "mod/rtf/mod_rtf.h" 3. Removed index_enriched(). 4. In index_headers(), made "text/enriched" #ifdef'd on mod_rtf. 5. In index_words(), switched to using RTF module. * mod/mail/mod_mail.h 1. Fixed RFC attributions. 2. Made Text_Enriched #ifdef'd on mod_rtf. 3. Removed index_enriched(). * mod/rtf/GNUmakefile * mod/rtf/mod_rtf.c * mod/rtf/mod_rtf.h 1. New files for feature RTF. * README.Solaris 1. New file for feature RST. * search.c 1. Made return-type of search() and service_request() bool for feature RST. * search.h 1. Made return-type of service_request() bool for feature RST. * searchc.in 1. Added call to shutdown() after sending query. * search_thread.c 1. In search_thread::main(), added: out << flush; 2. In search_thread::main(), removed EINTR guard (not needed). * swish++.conf 1. Added: IncludeFile RTF *.rtf * thread_pool.c 1. s/thread_main/thread_pool_thread_main/ 2. s/thread_destroy/thread_pool_thread_destroy/ 3. Replaced q_lock class by simple mutex again. 4. Changed state_ back to destructing_. 5. Added DEFER_CANCEL, RESTORE_CANCEL, MUTEX_LOCK, MUTEX_UNLOCK macros. 6. In thread_pool_thread_destroy(), removed unlocking of q_lock. 7. 
In thread_pool_thread_main(), moved unlock of run_lock_ here. 8. In thread_pool_thread_main(), reworked mutex locking. 9. In ~thread(), removed mutex_lock of t_lock_. A. In thread_pool(), removed ERRORCHECK attribute. B. In ~thread_pool(), added DEFER_CANCEL(). C. In new_task(), reworked mutex locking. D. In new_task(), Added DEFER_CANCEL(). * thread_pool.h 1. s/thread_main/thread_pool_thread_main/ 2. s/thread_destroy/thread_pool_thread_destroy/ 3. Replaced q_lock class by simple mutex again. 4. Added private default constructor to argument_type. 5. Changed state_ back to destructing_. * version.h 1. Upped version to "5.6". ******************************************************************************* 5.5.3 ******************************************************************************* NEW FEATURES ------------ * A sample Procmail recipe has been included that can be used to split incoming mail messages into individual files for indexing. (This feature shall be known as feature SIM.) * The indexing word-determination rules have been relaxed somewhat; the following rules have been eliminated: 1. Starts with a capital letter, is of mixed case, and contains more than a third capital letters. This enables words like FedEx to be indexed. 2. Contains a capital letter other than the first. This enables words like iMac to be indexed. (This feature shall be known as feature RWD.) BUG FIXES --------- * When running as a server, search(1) had a memory leak. (This bug fix shall be known as bug fix SML.) * When running as a server, search(1) didn't make the sockets reusable. (This bug fix shall be known as bug fix RSA.) CHANGES, file-by-file --------------------- * GNUmakefile 1. For INITD_DIR and LEVEL_DIR, redirected error output to /dev/null. * INSTALL.unix 1. Added mention of Procmail for feature SIM. * man/man1/index.1 1. Removed mention of removed word-determination rules for feature RWD. * procmailrc 1. New file for feature SIM. * searchd.in 1. 
Added: KILL=`which kill` 2. Added "|| exit 1" in a few places. 3. Added "sleep 3" in restart case. * search.c 1. In search(), added "delete format" for bug fix SML. * search_daemon.c 1. Added BIND_SOCKET() for bug fix RSA. * searchmonitor.in 1. Added: KILL=`which kill` * version.h 1. Upped version to 5.5.3. * word_util.c 2. In is_ok_word(), removed rules for feature RWD. * www_example/sample.html 1. Converted to XMTML. ******************************************************************************* 5.5.2 ******************************************************************************* BUG FIXES --------- * Indexing attachments has been broken since version 5.2. Major d'oh. (This bug fix shall be known as bug fix IAB.) CHANGES, file-by-file --------------------- * mod/mail/mod_mail.c 1. In index_headers(), put a missing "else" back for bug fix IAB. * version.h 1. Upped version to 5.5.2. ******************************************************************************* 5.5.1 ******************************************************************************* BUG FIXES --------- * Automatic thread-pool size reduction had a race condition where too many threads could be destroyed. (This bug fix shall be known as bug fix TCD.) CHANGES, file-by-file --------------------- * thread_pool.cpp 1. Changed thread::destructing_ to thread::state_ for bug fix TCD. 2. In thread_main(), set thread state to expired before calling delete on it for bug fix TCD. * thread_pool.h 1. Changed thread::destructing_ to thread::state_ for bug fix TCD. * version.h 1. Upped version to 5.5.1. ******************************************************************************* 5.5 ******************************************************************************* NEW FEATURES ------------ * search(1) can now be run as a daemon without it automatically putting itself into the background. This is useful in order to wrap a start script around it and automatically restart it if it dies for any reason. 
Correspondingly, there are 2 new utility scripts: searchmonitor (a process monitor for search) and searchd (a start/stop script for SysV-like systems). (This feature shall be known as feature NOB.) * search(1), when run as a daemon, can give away its root privileges if it started with them. There are now new command-line options of -U, --user, -G, and --group as well as new configuration variables User and Group. (This feature shall be known as feature GAR.) BUG FIXES --------- * When search(1) was running as a daemon, it ignored -F and --format options specified via the socket. (This bug fix shall be known as bug fix SDF.) * For very large document sets when many partial indices were generated, if the number of partial indices exceeded the maximum number of file descriptors a process could have open, merging would fail. (This bug fix shall be known as bug fix MFD.) CHANGES, file-by-file --------------------- * conf_enum.c * conf_enum.h 1. Added the is_legal() function for bug fix SDF. * config.h 1. Added Group_Default and UserDefault for feature GAR. * conf_var.c 1. In map_ref(), added "user" and "group" for feature GAR. 2. In map_ref(), added "searchbackground" for feature NOB. * GNUmakefile 1. Added Group.c and User.c to S_SOURCES for feature GAR. 2. Removed WIN32 PERL_TARGET conditional since WIN32 isn't set at that point. 3. s/PERL_TARGET/OTHER_TARGET/ 4. Added searchmonitor to OTHER_TARGET for feature NOB. 5. Added INITD_TARGET for feature NOB. 6. Added BIN_TARGET since other targets get installed places other than in a bin directory. 7. Added INITD_DIR and LEVEL_DIR to figure out a SysV system's run level directories for feature NOB. 8. Added installation of /etc/swish++.conf for feature NOB. 9. Added install_sysv target for feature NOB. A. Added uninstallation of start/stop scripts to uninstall target for feature NOB. * Group.c * Group.h 1. New files for feature GAR. * exit_codes.h 1. Added Exit_No_User and Exit_No_Group for feature GAR. 2. 
Changed Exit_Internal_Error from 255 to 127. * index.c 1. In main(), added maxing out of number of file descriptors to enable more partial indicies to be generated for bug fix MFD. * INSTALL.unix 1. Added step 5 regarding installing the searchd start/stop script for feature NOB. * man/man1/index.1 1. Added missing error codes 40 and 127. * man/man1/search.1 1. Added description of -B and --no-background options and the SearchBackground variable for feature NOB. 2. Added description of -U, --user, -G, and --group options and the User and Group variables for feature GAR. 3. Added subsections to Daemon section. 4. Added mention of giving away root privileges for feature GAR. 5. Added mention of searchmonitor(8) for feature NOB. * man/man4/swish++.conf.4 1. Added mention of SearchBackground variable for feature NOB. 2. Added mention of Group and User variables for feature GAR. * man/man8/GNUmakefile * man/man8/searchd.8 * man/man8/searchmonitor.8 * SearchBackground.h 1. New files for feature NOB. * search.c 1. Added #include "SearchBackground.h" for feature NOB. 2. Added global search_background variable for feature NOB. 3. In main(), added check of search_background_opt for feature NOB. 4. In search_options::search_options(), added initialization of search_background_opt for feature NOB. 5. In search_options::search_options(), added case for 'B' for feature NOB. 6. In usage(), added usage for -B and --no-background for feature NOB. 7. In search(), added results_format parameter for bug fix SDF. 8. In search_options::search_options(), added code to check legality of argument to -F option for bug fix SDF. 9. In service_request(), added opt.results_format_arg to call to search() for bug fix SDF. A. Added #include "User.h" and "Group.h" for feature GAR. B. Added global user and group variables for feature GAR. C. In main(), added check of group_arg and user_arg for feature GAR. D. In search(), added static_cast to get rid of float->int conversion warning. E. 
In search_options::search_options(), added initialization of user_arg and group_arg for feature GAR. F. In search_options::search_options(), added cases for 'G' and 'U' for feature GAR. G. In Usage(), added description of -G and -U for feature GAR. * search.h 1. Added search_background_opt for feature NOB. 2. Added user_arg and group_arg for feature GAR. * searchd.in * searchmonitor.in 1. New files for feature NOB. * search_daemon.c 1. Added #include "SearchBackground.h" for feature NOB. 2. In become_daemon(), added tests of search_background for feature NOB. 3. Added #include "User.h" and "Group.h for feature GAR. 4. In become_daemon(), added code to change UID/GID for feature GAR. * search_options.c 1. Added no-background option for feature NOB. 2. Added user and group options for feature GAR. * swish++.conf 1. Added SearchBackground for feature NOB. 2. s!/tmp/search.pid!/var/run/search.pid! * User.c * User.h 1. New files for feature GAR. * util.h 1. In max_out_limit(), set limit to infinity if running as root for bug fix MFD. * version.h 1. Updated version to "5.5". ******************************************************************************* 5.4.6 ******************************************************************************* BUG FIXES --------- * On systems (such as Solaris) where /bin/sh is still really Bourne shell (as opposed to bash in disguise), -e tests don't work. (This bug fix shall be known as bug fix DEF.) CHANGES, file-by-file --------------------- * GNUmakefile * init_mod_vars-sh 1. s/-e/-f/ for bug fix DEF. * version.h 1. Updated version to "5.4.6". ******************************************************************************* 5.4.5 ******************************************************************************* BUG FIXES --------- * If AssociateMeta, IncludeFile, IncludeMeta, ExcludeFile, or ExcludeMeta were not given in a configuration file, values given via the command line were discarded. (This bug fix shall be known as bug fix CRA.) 
* On some systems, the auto-building of dependencies got into an infinite loop since the "dep" directory's timestamp was updated for every dependency file and thus everything that depended on it was always out of date. Why this doesn't happen on all systems isn't clear. (This bug fix shall be known as bug fix DTS.) CHANGES, file-by-file --------------------- * config/config.mk 1. Removed "dep" for bug fix DTS. * config/GNUmakefile 1. s/dep/.*.d/ for bug fix DTS. * conf_var.c 1. In parse_file(), removed call to reset_all() for bug fix CRA. * conf_var.h 1. Made reset_all() public. * GNUmakefile * mod.mk 1. Changed "dep/%.d" (back) to ".%.d" for bug fix DTS. 2. In distclean rule, s/dep/.*.d/ for bug fix DTS. * INSTALL.win32 * INSTALL.unix 1. s/dep/.*.d/ for bug fix DTS. * version.h 1. Updated version to "5.4.5". ******************************************************************************* 5.4.4 ******************************************************************************* BUG FIXES --------- * In index(1), the config-file option wasn't recognized because it was spelled as just "config" in the source code. D'oh! (This bug fix shall be known as bug fix LCO.) * Configuration file variables in modules were somehow being corrupted so some weren't being recognized any longer. I really don't know what was going on. But, module-specific variables weren't recognized at all in search(1). Oops. (This bug fix shall be known as bug fix XCV.) CHANGES, file-by-file --------------------- * conf_var.c 1. In map_ref(), added call to init_mod_vars() for bug fix XCV. * conf_var.h 1. Added init_mod_vars() for bug fix XCV. * GNUmakefile 1. Added init_mod_vars.c to I_SOURCES, S_SOURCES, and E_SOURCES for bug fix XCV. 2. Added rule to make init_mod_vars.c for bug fix XCV. * index.c 1. In main(), s/config/config-file/ for bug fix LCO. * init_mod_vars-sh 1. New file for bug fix XCV. * mod/html/mod_html.c * mod/html/mod_html.h * mod/mail/mod_mail.c * mod/mail/mod_mail.h 1. 
Moved constructor to .h file and removed register_var() for bug fix XCV. * mod/html/vars * mod/mail/vars 1. New files for bug fix XCV. * version.h 1. Updated version to "5.4.4". ******************************************************************************* 5.4.3 ******************************************************************************* BUG FIXES --------- * When compiling without the search daemon, search(1) wouldn't link because it needs conf_enum.o and it wasn't compiled. (This bug fix shall be known as bug fix CEO.) * The file thread_pool.c didn't compile under FreeBSD. (This bug fix shall be known as bug fix BSD4.) CHANGES, file-by-file --------------------- * config/config.mk 1. In "OS selection" section, added comment for Mac OS X. * GNUmakefile 1. Moved conf_enum.c so that it's always compiled for bug fix CEO. * INSTALL.unix 1. Added fact that g++ 2.95.2 works. 2. Added note about g++ 2.96. * thread_pool.c 1. Added "#ifndef FreeBSD" around use of PTHREAD_MUTEX_ERRORCHECK for bug fix BSD4. * version.h 1. Updated version to "5.4.3". ******************************************************************************* 5.4.2 ******************************************************************************* BUG FIXES --------- * The "classic" results formatting was broken in that the result separator wasn't output in all the places it should be. How I didn't catch this isn't clear. (This bug fix shall be known as bug fix CFS.) CHANGES, file-by-file --------------------- * classic_formatter.c 1. In result(), added missing "results_separator" for bug fix CFS. * version.h 1. Updated version to "5.4.2". ******************************************************************************* 5.4.1 ******************************************************************************* BUG FIXES --------- * The command-line option spec. building introduced in version 5.4 was broken. (This bug fix shall be known as bug fix COS.) CHANGES, file-by-file --------------------- * indexer.c 1. 
In indexer::all_mods_options(), s/++option_count/*c++ = *s/ for bug fix COS. * version.h 1. Updated version to "5.4.1". ******************************************************************************* 5.4 ******************************************************************************* NEW FEATURES ------------ * Search results can now optionally be output in XML. (This feature shall be known as feature XML.) * The modular indexing rearchitecture is now complete. CHANGES, file-by-file --------------------- * classic_formatter.c * classic_formatter.h 1. New files for feature XML. * conf_var.c 1. In map_ref(), removed ExcludeClass and FilterAttachment. 2. Added: register_var() 3. In map_ref(), added ResultsFormat for feature XML. * conf_var.h 1. Added: register_var() * file_info.c 1. Reordered mem-initializers to match new order in declaration for feature XML. 2. Added file_info( unsigned char const* ) for feature XML. * file_info.h 1. Added file_info( unsigned char const* ) for feature XML. 2. Reordered data members to facilitate new constructor for feature XML. * GNUmakefile 1. Added file_info.c, classic_formatter.c, ResultsFormat.c, results_formatter.c, and xml_formatter.c to S_SOURCES for feature XML. * index.c 1. Removed #include of mod_html .h files. 2. Removed mod_html command-line options. 3. In main(), added code to gather all module options. 4. In main(), moved code to dump HTML elements into mod_html. 5. In usage(), removed mod_html usage. 6. In usage(), added call to: indexer::all_mods_usage(). * indexer.c * indexer.h 1. Added any_mod_claims_option(), all_mods_options(), all_mods_post_options(), all_mods_usage(), claims_option(), option_spec(), post_options(), and usage(). * INSTALL.unix 1. Updated Unix prerequisites. * man/man1/search.1 1. Added XML results description for feature XML. 2. Added -F, --format, and ResultsFormat for feature XML. 3. Corrected wording regarding titles. 4. For -P and --pid-file, added mention of default being none. 5. 
For -u and --socket-file, added mention of default being /tmp/search.socket. 6. In meta data query examples, removed mention of "HTML or XHTML" since other document types can have meta information. 7. Added XML output caveat for feature XML. 8. Added reference to XML specification for feature XML. * man/man4/swish++.conf.4 1. Added ResultsFormat for feature XML. * mod/html/mod_html.c 1. Moved constructor definition here. 2. In constructor, added call to register_var( "excludeclass" ); 3. Moved global dump_html_elements_opt definition here. 4. Added claims_option(), option_spec(), post_options(), and usage(). * mod/html/mod_html.h 1. Moved constructor definition to mod_html.c. 2. Added claims_option(), option_spec(), post_options(), and usage(). * mod/mail/mod_mail.c 1. Moved constructor definition here. 2. Added call to register_var( "filterattachment" ). * mod/mail/mod_mail.h 1. Moved constructor definition to mod_mail.c. * README 1. Added "XML search results" for feature XML. * ResultsFormat.c * ResultsFormat.h * results_formatter.c * results_formatter.h 1. New files for feature XML. * search.c 1. Added #include of classic_formatter.h, file_info.h, ResultsFormat.h, results_formatter.h, ResultsMax.h, and xml_formatter.h for feature XML. 2. Added global results_format for feature XML. 3. In main(), added test of opt.results_format_arg for feature XML. 4. In search(), replaced result output with new result formatter classes for feature XML. 5. In search_options::search_options(), added initialization of results_format_arg for feature XML. 6. In search_options::search_options(), added case 'F' for feature XML. 7. Rewrote write_file_info() using a file_info. 8. In usage(), added usage message for -F option for feature XML. * search.h 1. Added results_format_arg for feature XML. * search_options.c 1. Added "format" for feature XML. * SearchResults.dtd 1. New file for feature XML. * swish++.conf 1. Added ResultsFormat for feature XML. * version.h 1. 
Upped version to "5.4." * xml_formatter.c * xml_formatter.h 1. New files for feature XML. ******************************************************************************* 5.3.6 ******************************************************************************* NEW FEATURES ------------ * When compiling using g++, added additional compiler options to reduce code size and slightly improve performance. (This feature shall be known as feature GPPO.) BUG FIXES --------- * Indexing files via standard input where the order of the directories wasn't "monotonically increasing," didn't work: files ended up in the wrong directory. As a beneficial consequence, the -D and -G options and the DirectoriesGrow and DirectoriesReserve variables are no longer needed. (This bug fix shall be known as bug fix ISI2.) * Destroying a thread_pool's threads didn't work properly in that the clean-up function for all threads didn't get called. (This didn't matter for SWISH++ since search(1) never destroys its thread_pool.) (This bug fix shall be known as bug fix TPD.) CHANGES, file-by-file --------------------- * config/config.mk 1. Added -D_XOPEN_SOURCE=500 for compiling search daemon under Linux for bug fix TPD. 2. Added DEBUG variable. 3. Added -fno-rtti to CCFLAGS for feature GPPO. 4. Added -fomit-frame-pointer to OPTIM for feature GPPO. * config/mod.mk 1. s/DEBUG/DEBUGFLAGS/ * config.h 1. Removed DirectoriesGrow_Default and DirectoriesReserve_Default as part of bug fix ISI2. * conf_var.c 1. In parse_file(), added reset_all(). 2. In reset_all(), added check for null pointer. 3. Removed DirectoriesGrow and DirectoriesReserve as part of bug fix ISI2. * DirectoriesGrow.h * DirectoriesReserve.h 1. Removed these files as part of bug fix ISI2. * directory.c 1. Changed do_file() to take a second dir_index argument for bug fix ISI2. 2. Changed return type of check_add_directory() to return a directory index for bug fix ISI2. 3. 
In check_add_directory(), changed from using a set to a map where the value is the directory index for bug fix ISI2. 4. In do_check_add_file() and do_directory(), made it get and pass dir_index to do_file() for bug fix ISI2. 5. Removed directories_reserve as part of bug fix ISI2. * directory.h 1. s/dir_list/dir_set/ * do_file.c 1. When compiled for index(1), made do_file() take a second argument of dir_index for bug fix ISI2. 2. Added dir_index to call to file_info() constructor for bug fix ISI2. * exit_codes.h 1. s/Exit_No_Init_Condition/Exit_No_Init_Thread_Condition/ 2. s/Exit_No_Init_Mutex/Exit_No_Init_Thread_Mutex/ * file_info.c 1. Made first constructor take dir_index argument for bug fix ISI2. 2. Removed second constructor for bug fix ISI2. 3. Integrated construct() into lone constructor. * file_info.h 1. Removed constructor not taking dir_index for bug fix ISI2. 2. Removed construct(). * GNUmakefile 1. Added DEBUGFLAGS. 2. For $(MOD_LIBS) rule, s/DEBUG/DEBUGFLAGS/ * index.c 1. In load_old_index(), removed new_strdup()'s since file_info is now doing them for bug fix ISI2. 2. Removed DirectoriesGrow and DirectoriesReserve as part of bug fix ISI2. 3. Removed -D and -G options as part of bug fix ISI2. 4. In write_dir_index(), added code to order the directories for bug fix ISI2. 5. Moved definition of exlude_class_names to mod_html.c. * index_header.c 1. s/dir_list/dir_set/ for bug fix ISI2. * man/man1/httpindex.1 1. Added -e's to example. * man/man1/index.1 1. Removed -D, --dirs-reserve, -G, --dirs-grow, DirectoriesGrow, and DirectoriesReserve as part of bug fix ISI2. * man/man4/swish++.conf.4 1. Removed DirectoriesGrow and DirectoriesReserve as part of bug fix ISI2. * mod/html/mod_html.c 1. Moved definition of exlude_class_names here. * search_daemon.c 1. In set_signal_handlers(), removed SA_RESTART. * swish++.conf 1. Removed DirectoriesGrow and DirectoriesReserve as part of bug fix ISI2. * thread_pool.c 1. 
In thread_destroy(), added code to unlock q_lock for bug fix TPD. 2. In thread_destroy(), added code to decrement q_lock's reference count for bug fix TPD. 3. In thread_destroy(), added code to deallocate thread object storage for bug fix TPD. 4. In thread_main(), added code to increment q_lock's reference count for bug fix TPD. 5. s/Exit_No_Init_Condition/Exit_No_Init_Thread_Condition/ 6. s/Exit_No_Init_Mutex/Exit_No_Init_Thread_Mutex/ 7. In ~thread(), removed destructing_lock_ since I don't think it's needed. 8. In ~thread(), added optimization for a thread committing suicide. 9. Added q_lock::dec_ref() and q_lock::inc_ref() functions for bug fix TPD. A. In thread_pool::thread_pool(), created q_lock with the PTHREAD_MUTEX_ERRORCHECK attribute for bug fix TPD. B. In ~thread_pool(), removed destructing_lock_ since I don't think it's needed. C. In ~thread_pool(), added code to decrement q_lock's reference count for bug fix TPD. * thread_pool.h 1. Overrode thread::operator delete() to do nothing for bug fix TPD. 2. Changed q_lock_ to a reference-counted object for bug fix TPD. 3. Made max/min threads and thread timeout settable after thread_pool creation. * version.h 1. Upped version to "5.3.6." ******************************************************************************* 5.3.5 ******************************************************************************* NEW FEATURES ------------ * The code for modules has been reorganized into subdirectories that build libraries with the goal of having a completely modular indexing architecture similar to the way Apache has modules. This is a work-in-progress. BUG FIXES --------- * Version 5.1 broke indexing file names via standard input. D'oh! (This bug fix shall be known as bug fix FSI.) * Version 5.1 also added unnecessary work for extract(1). (This bug fix shall be known as bug fix ESI.) * The thread::~thread() destructor mistakenly killed the calling thread rather than itself. Oops. 
This didn't actually matter for SWISH++ since it's never called. (This bug fix shall be known as bug fix TPTD.) CHANGES, file-by-file --------------------- * config.h 1. Moved MOD_HTML parameters to mod/html/html_config.h. * config/config.mk 1. Changed format of MOD_LIST. 2. Added RANLIB. 3. Moved auto-dependency generation here. * config/GNUmakefile 1. Moved TARGET definition before include. 2. Added removal of accidental dep subdirectory. * config/mod.mk 1. New file for modularization. * conf_var.c 1. s/MOD_HTML/mod_html/ 2. s/MOD_MAIL/mod_mail/ * directory.c 1. Made this file #include'd by index.c and extract.c for bug fix FSI. 2. Added: #include "fake_ansi.h" 3. Added "#ifdef INDEX" in various places for bug fix ESI. 4. Added do_check_add_file() for bug fix FSI. * directory.h 1. Removed local #include's and follow_symbolic_links and function declarations for bug fix FSI. * elements.c * elements.h * entities.c * entities.h * ExcludeClass.h * mod_html.c * mod_html.h 1. Moved to mod/html subdirectory. * encoded_char.c * encoded_char.h 1. s/MOD_MAIL/mod_mail/ * extract.c 1. Moved #include of platform.h first so PJL_NO_SYMBOLIC_LINKS would be defined at the right time. 2. Added: #include "FollowLinks.h" for bug fix ESI. 3. Added: #include "directory.c" for bug fix ESI. 4. s/::strdup()/new_strdup()/ * ExcludeFile.c * ExtractFile.c 1. s/::strdup()/new_strdup()/ * file_info.c 1. s/::strdup()/new_strdup()/ * FollowLinks.h 1. s/follow_links/follow_symbolic_links/ * FilterAttachment.h * mod_mail.c * mod_mail.h 1. Moved to mod/mail subdirectory. * GNUmakefile 1. Moved target definition before include. 2. s/C_TARGET/CPP_TARGET/ 3. Added MOD_LIBS, MOD_LIB_PATHS, MOD_LINK. 4. s/I_SRCS/I_SOURCES/, s/I_OBJS/I_OBJECTS/, s/S_SRCS/S_SOURCES/, s/S_OBJS/S_OBJECTS/, s/E_SRCS/E_SOURCES/, s/E_OBJS/E_OBJECTS/ 5. Removed module-specific .c files. 6. Removed $(CCLINK) -- not used. 7. Made use of $(MOD_LINK) 8. Removed entities.c from E_SOURCES -- not used. 9. 
Added $(MOD_LIBS) to index dependencies. A. Added rule to build init_modules.c B. Added rule to build module libraries. C. Moved auto-dependency generation to config/config.mk. D. Added MAKE_SUBDIRS function and made use of it in clean, distclean. E. Removed directory.c from I_SOURCES AND E_SOURCES for bug fix ESI. * IncludeFile.c * IncludeMeta.c 1. s/::strdup()/new_strdup()/ * index.c 1. Changed includes to use mod/html form. 2. s/MOD_HTML/mod_html/ 3. Moved #include of platform.h first so PJL_NO_SYMBOLIC_LINKS would be defined at the right time. 4. Added: #include "FollowLinks.h" for bug fix FSI. 5. Added: #include "directory.c" for bug fix FSI. 6. In main(), s/do_file()/do_check_add_file()/ 7. s/::strdup()/new_strdup()/ * indexer.c 1. s/::strdup()/new_strdup()/ * init_modules.c 1. Removed since it's now automatically generated. * init_modules-sh 1. Added to generate init_modules.c automatically. * mod_man.c * mod_man.h 1. Moved to mod/man subdirectory. * mod/html/html_config.h 1. Moved MOD_HTML-specific configuration parameters here. * mod/html/ExcludeClass.h * mod/html/GNUmakefile * mod/html/elements.c * mod/html/elements.h * mod/html/entities.c * mod/html/entities.h * mod/html/mod_html.c * mod/html/mod_html.h * mod/mail/FilterAttachment.h * mod/mail/GNUmakefile * mod/mail/encoded_char.c * mod/mail/mod_mail.c * mod/mail/mod_mail.h * mod/man/GNUmakefile * mod/man/mod_man.c * mod/man/mod_man.h 1. Moved from top-level directory. * stem_word.c * stop_words.c 1. s/::strdup()/new_strdup()/ * thread_pool.c 1. Fixed thread::~thread() for bug fix TPTD. * util.c * util.h 1. Removed unneeded #include's. * version.h 1. Changed version to "5.3.5". ******************************************************************************* 5.3.4 ******************************************************************************* BUG FIXES --------- * File titles turned to garbage when indexing files incrementally. (This bug fix shall be known as bug fix IIT.) 
* option_stream's test main() was incorrectly defined inside the PJL namespace. (This bug fix shall be known as bug fix OSM.) * option_stream didn't report an error for an option that required an argument when no argument was given when said option was the last thing on the command line. (This bug fix shall be known as bug fix OSN.) CHANGES, file-by-file --------------------- * index.c 1. Merged parse_file_info() function into load_old_index(). 2. In (what is now) load_old_index(), added a strdup() for the file's title for bug fix IIT. 3. In write_dir_index(), switched to using FOR_EACH(). * option_stream.c 1. Got rid of option_stream::option::copy(). 2. Moved the test main() outside of the PJL namespace for bug fix OSM. 3. s/c_/short_name_/, s/index_/argi_/. 4. Added was_short_option_ variable for bug fix OSN. 5. Replaced some duplicated option argument code with a goto. 6. Reworked argument processing for bug fix OSN. * option_stream.h 1. Got rid of option_stream::option::copy() and destructor. 2. Made copy constructor and assignment operator private. 3. Renamed the following: s/c/short_name/, s/index_/argi_/ * version.h 1. Changed version to "5.3.4". ******************************************************************************* 5.3.3 ******************************************************************************* BUG FIXES --------- * SIGPIPE wasn't handled at all so a search client that disconnected unexpectedly could crash the server. Now that this has been fixed, the server also needs to check the state of the outgoing stream during writes for an error: if an error occurs, assume the client disconnected from the socket and stop sending output. (This bug fix shall be known as bug fix PIPE.) * On Linux systems, multiple reads from the search daemon timed out sooner than requested because Linux modifies the timeval struct passed to select() to reflect the amount of time not slept. (This bug fix shall be known as bug fix LMR.) 
CHANGES, file-by-file --------------------- * search.c 1. In main(), s/search_options opt/search_options const opt/ 2. In dump_single_word(), dump_word_window(), several places in search() and service_request, added a check for the state of the "out" stream for bug fix PIPE. * search_daemon.c 1. Added set_signal_handlers() for bug fix PIPE. * search_thread.c 1. In search_thread::main(): s/search_options opt/search_options const opt/ 2. In timed_read_line(), reworked the timeout such that the timeval struct is always initialized properly for every loop iteration for bug fix LMR. * version.h 1. Changed version to "5.3.3". ******************************************************************************* 5.3.2 ******************************************************************************* BUG FIXES --------- * On some platforms, index(1) would index "0 words" for every file. (This bug fix shall be known as bug fix IZW.) * There was a race condition in the search daemon thread pool code whereby the prototype thread could begin executing before its owning thread_pool was fully constructed. (This bug fix shall be known as bug fix TPR.) CHANGES, file-by-file --------------------- * encoded_char.h 1. Made encoded_char_range::const_iterator's ch_ and decode() compile in only when MOD_MAIL is defined for bug fix IZW. 2. In encoded_char_range::const_iterator::operator*(), made it return *pos_ when MOD_MAIL wasn't defined for bug fix IZW. * exit_codes.h 1. Added Exit_No_Init_Condition and Exit_No_Init_Mutex. * man/man1/search.1 1. Added exit coded 66 and 67. * thread_pool.c 1. In thread_main(), added "::pthread_mutex_lock( &t->run_lock_ );" for bug fix TPR. 2. In thread_pool::thread::thread(), added initialization and locking of run_lock_ for bug fix TPR. 3. In thread_pool::thread::~thread(), added destruction of run_lock_ for bug fix TPR. 4. Changed thread_pool::thread_pool() to take a pointer to non-const thread for bug fix TPR. 5. 
In thread_pool::thread_pool(), made use of Exit_No_Init_Condition and Exit_No_Init_Mutex. 6. In thread_pool::thread_pool(), added prototype thread to pool and now creating thread_min - 1 additional threads. 7. In thread_pool::thread_pool(), s/create/create_and_run()/ for bug fix TPR. 8. In thread_pool::new_task(), s/create/create_and_run()/ for bug fix TPR. * thread_pool.h 1. Added thread_pool::thread::run_lock_ for bug fix TPR. 2. Added thread_pool::thread::run() and create_and_run() for bug fix TPR. 3. Changed thread_pool::thread_pool() to take a pointer to non-const thread for bug fix TPR. * version.h 1. Changed version to "5.3.2". ******************************************************************************* 5.3.1 ******************************************************************************* BUG FIXES --------- * Searching with more than two "and" terms caused a core dump. This bug was a result of the "enhancement" to doing multiple "and" terms in 5.3. (This bug fix shall be known as bug fix MAT.) * Compiling with all but the text module produced a syntax error. (This bug fix shall be known as bug fix NMS.) CHANGES, file-by-file --------------------- * conf_bool.h * conf_int.h 1. Removed extraneous backslash. * init_modules.c 1. Added #include "indexer" for bug fix NMS. * query.c 1. In perform_and(), added needed "break" for bug fix MAT. * version.h 1. Changed version to "5.3.1". ******************************************************************************* 5.3 ******************************************************************************* BUG FIXES --------- * The weighting of multiple "and" terms has been fixed. Previously, the query: mouse and computer and keyboard was parsed and treated as: (mouse and computer) and keyboard 25% 25% 50% The problem was that the last term always got 50% of the weighting and the rest got 50% divided by the number of terms minus 1. 
In order to weight all the terms equally, the "and" results for each term are now saved in a list and then and'ed together at the end. (This bug fix shall be known as bug fix MAW.) * The index(1) manual page didn't explicitly state that words are converted to lower case prior to indexing. CHANGES, file-by-file --------------------- * conf_var.c 1. Changed from abort() to internal_error. * exit_codes.h 1. Added: Exit_Internal_Error * filter.c * indexer.c * option_stream.c * thread_pool.c 1. Changed from abort() to internal_error. * man/man1/index.1 1. Added paragraph at the end of the "Word Determination" subsection addressing conversion to lower case prior to indexing. * query.c 1. Changed from abort() to internal_error. 2. Moved declarations for get_meta_id(), parse_meta(), parse_primary(), and parse_optional_relop() here from query.h. 3. In parse_meta() and parse_primary(), got rid of unused default value. 4. Changed what was parse_query() to parse_query2(). 5. Added a new parse_query(). 6. Added and_results_type argument to parsing functions for bug fix MAW. 7. In parse_query2(), deferred and'ing of results for bug fix MAW. 8. Added perform_and() function for bug fix MAW. * query.h 1. Moved declarations for get_meta_id(), parse_meta(), parse_primary(), and parse_optional_relop() to query.c. 2. Added: stop_word_set 3. s/set< string >/stop_word_set/ 4. For parse_query(), got rid of unneeded bool& and int arguments. 5. s/search_results_type/search_results/ 6. s/find_results_type/find_results/ * search.c 1. s/set< string >/stop_word_set/ 2. In search(), got rid of unused "ignore" variable. 3. s/search_results_type/search_results/ 4. s/find_results_type/find_results/ * util.h 1. Added internal_error and report_error(). * version.h 1. Upped version. 
******************************************************************************* 5.2 ******************************************************************************* NEW FEATURES ------------ * E-mail attachments can now be filtered by external programs. (This feature shall be known as feature AFP.) CHANGES, file-by-file --------------------- * conf_filter.c * conf_filter.h 1. Replaced FilterFile.c and made generic for feature AFP. * conf_var.c 1. Added filterattachment for feature AFP. * do_file.c * extract.c 1. s/filters/file_filters/ for feature AFP. * filter.h 1. Added: substitute( std::string const &file_name ); * FilterAttachment.h 1. Added this file for feature AFP. * FilterFile.c 1. Replaced by conf_filter.c * FilterFile.h 1. Made FilterFile derived from conf_filter for feature AFP. 2. s/filters/file_filters/ for feature AFP. * GNUmakefile 1. Added conf_filter.c for feature AFP. 2. Removed FilterFile.c for feature AFP. * index.c 1. s/filters/file_filters/ for feature AFP. * man/man1/extract.1 1. Added FilterAttachment for feature AFP. * man/man1/index.1 1. Added mention of FilterAttachment for feature AFP. 2. s/-D/-G/ * man/man4/swish++.conf.4 1. Added "Filter variables" section. 2. Added information on filtering attachments for feature AFP. 3. Added more references. * mod_mail.c 1. Added #include's for , , "FilterAttachment.h", and "Verbosity.h" for feature AFP. 2. Added "attachment_filters" declaration for feature AFP. 3. Added index_via_filter() for feature AFP. 4. In index_headers(), added code for filters for feature AFP. 5. In index_words(), added case for External_Filter for feature AFP. * mod_mail.h 1. Added "External_Filter" for feature AFP. 2. Changed message_type from a pair<> to a struct for feature AFP. * README 1. Added mention of filtering attachments for feature AFP. * swish++.conf 1. Added FilterAttachment section for feature AFP. 2. Added: FilterFile *.ps pstotext %f > @%F.txt 3. Added: FilterFile *.bz2 bunzip2 -c %f > @%F * version.h 1. 
Upped version. ******************************************************************************* 5.1 ******************************************************************************* NEW FEATURES ------------ * Reduced index storage size by recording directory names once. Note that the old -G option for index(1) has changed to -g and that there is a new -G option. (This feature shall be known as feature DIR1.) BUG FIXES --------- * The swish++.conf(4) manual page was missing FilesReserve and ResultsMax. (This bug fix shall be known as bug fix MFR.) CHANGES, file-by-file --------------------- * bcd.h 1. Added: #include "fake_ansi.h" * config.h 1. Added DirectoriesGrow_Default and DirectoriesReserve_Default for feature DIR1. * config/config.mk 1. Added g++ 3.0-specific warnings to CCFLAGS for development purposes. * conf_bool.h * conf_enum.h * conf_int.h * conf_set.h * conf_string.h 1. Added: #include "fake_ansi.h" * conf_percent.c * conf_percent.h * DirectoriesGrow.h * DirectoriesReserve.h 1. Added these files for feature DIR1. * conf_var.c 1. Added DirectoriesGrow and DirectoriesReserve configuration variables for feature DIR1. * directory.c 1. Added dir_list for feature DIR1. 2. Added directories_reserve. 3. Added check_add_directory() for feature DIR1. 4. s/queue< string >/queue< char const* >/ 5. Switched from using std::string to create the current path to using a simpler char buffer. 6. Made sure the directory that is passed to do_directory() recursively has been strdup()'d. * directory.h 1. Added dir_list for feature DIR1. 2. Added check_add_directory() for feature DIR1. 3. Added: #include "fake_ansi.h" * elements.h * entities.h 1. Added: #include "fake_ansi.h" * ExcludeFile.h 1. Added extern declaration. * extract.c 1. s/do_directory( file_name )/do_directory( ::strdup( file_name ) )/ * ExtractFile.h * ExtractFilter.h 1. Added extern declaration. * ExtractExtension.h 1. Added extern declaration. 2. Added: #include "fake_ansi.h" * fake_ansi.h 1. 
Removed __STL_NO_NAMESPACES and __STL_USE_NAMESPACES. This stopped working and I can't figure out why. * FilesGrow.c 1. This functionality was replaced by conf_percent.c for feature DIR1. * FilesGrow.h 1. Changed to be derived from conf_percent. 2. Added extern declaration. * file_info.c 1. Added result_separator. 2. Redid the constructor mem-initializers. 3. Moved common constructor code to construct(). 4. Added a second constructor used for reconstituting instances during incremental indexing. 5. Moved code for parse() to index.c. * file_info.h 1. Removed: #include 2. Added: #include "fake_ansi.h" 3. Added a second constructor used for reconstituting instances during incremental indexing. 4. Added dir_index() and dir_index_ for feature DIR1. 5. Added: construct() 6. Made all data members private and added accessor functions. 7. s/struct/class/ 8. Added: const_iterator, begin(), end(), ith_info(), and num_files(). * FilterFile.h 1. Added extern declaration. 2. Removed: #include * fnmatch.h 1. Removed unused #ifndef's. 2. Added #undef's. 3. Removed FNM_ERROR since it's not used. * FollowLinks.h 1. Added extern declaration. * GNUmakefile 1. Added conf_percent.c for feature DIR1. 2. Removed FilesGrow.c for feature DIR1. 3. Removed file_info.c from S_SRCS since file_info::out() has been moved to write_file_info() in search.c for feature DIR1. 4. Added query.c to S_SRCS. * IncludeFile.h 1. Added extern declaration. * IncludeMeta.h 1. Added: #include "fake_ansi.h" * Incremental.h 1. Added extern declaration. * index.c 1. Added DirectoriesGrow and DirectoriesReserve for feature DIR1. 2. Added my_write() since ostream::write() now apparently requires a char* rather than a void* and I'm lazy about having to cast the pointers. 3. Added dirs-reserve and dirs-grow command-line options for feature DIR1. 4. Added #ifdef PJL_GCC_295. 5. In load_old_index(), added loading of directory index for feature DIR1. 6. Moved index-file header-writing code to index_header.c. 7. 
Added write_dir_index() for feature DIR1. 8. Added new options to usage message for feature DIR1. 9. In usage(), s//title/. A. In main(), added: check_add_directory( "." ); B. s/file_info::parse/parse_file_info/ C. Added parse_file_info(). D. s/do_directory( file_name )/do_directory( ::strdup( file_name ) )/ * IndexFile.h 1. Added: #include "fake_ansi.h" * indexer.h 1. Added: #include "fake_ansi.h" * index_header.c 1. Added this file to have index-file header-writing code only once. * index_segment.h 1. Added dir_index for feature DIR1. * man/man1/index.1 1. Added -D, --dirs-reserve options for feature DIR1. 2. Changed old -G option to -g for feature DIR1. 3. Added new -G, --dirs-grow options for feature DIR1. 4. Added missing FilesGrow variable for bug fix MFR. 5. Added DirectoriesGrow and DirectoriesReserve variables for feature DIR1. * man/man4/swish++.conf.4 1. Added missing FilesReserve variable for bug fix MFR. 2. Added DirectoriesGrow and DirectoriesReserve variables for feature DIR1. 3. Added "Percentage variables" section. * man/man4/swish++.index.4 1. Added directory index description. 2. Added other module cases describing a file's title. 3. Made separate BCD subsection. * meta_map.h * mmap_file.h * mod_html.h * mod_mail.h 1. Added: #include "fake_ansi.h" * mod_man.c 1. In index_words(), s/register char const* c/char const* c/ since its address is taken. * my_set.h 1. Moved declaration of #include "fake_ansi.h". * omanip.h 1. Added: #define PJL /* nothing */ 2. Added: #include "fake_ansi.h" * option_stream.h * pattern_map.h * PidFile.h 1. Added: #include "fake_ansi.h" * query.c * query.h 1. Split out the query-parsing code from search.c to here. * ResultsMax.h 1. Added extern declaration. * ResultSeparator.h 1. Added extern declaration. 2. Added: #include "fake_ansi.h" * search.c 1. Added "directories" index_segment global variable for feature DIR1. 2. Moved file_info::out() to write_file_info() for feature DIR1. 3. 
Moved result_separator definition here for feature DIR1. 4. Moved query-parsing code to query.c. * search.h * SocketAddress.h * SocketFile.h 1. Added: #include "fake_ansi.h" * StemWords.h 1. Added extern declaration. * StopWordFile.h 1. Added: #include "fake_ansi.h" * swish++.conf 1. Added DirectoriesGrow and DirectoriesReserve for feature DIR1. * TempDirectory.h 1. Added: #include "fake_ansi.h" * thread_pool.c 1. Added start_function_type to thread() constructor. * thread_pool.h 1. Added start_function_type to thread() constructor. 2. Added: #define PJL /* nothing */ 3. Added: #include "fake_ansi.h" * util.h 1. Added: #include "fake_ansi.h" * version.h 1. Updated version to "5.1". * WordFilesMax.h * WordPercentMax.h 1. Added extern declaration. ******************************************************************************* 5.0.1 ******************************************************************************* BUG FIXES --------- * This release fixes a lot of compile issues (mostly namespaces) with g++ 3.0. (This bug fix shall be known as bug fix GCC3.) * The changes to fix the above have apparently caused bugs in (at least) g++ 2.95.3 to manifest themselves: 1. In some cases, the compiler "forgets" that operator<<( ostream&, string const& ) has been defined. The hack workaround is to use operator<<( ostream&, char const* ) and use string::c_str(). 2. The compiler "forgets" that stream manipulators have been defined. The workaround is not to use them. :-( (This fix shall be known as fix OOS.) CHANGES, file-by-file --------------------- * bcd.h 1. Switched to using local omanip since depending on the underlying C++ implementation is not portable. This was done for GCC3. * config/config-sh 1. Added PJL_GCC_295 since it's used in multiple places. This was done for OOS. * config/config.mk 1. Made OPTIM = -O2 for g++ also since the optimizer under 3.0 takes ridiculously long and uses most of the CPU and memory. 2. 
s/($(CC),g++)/($(findstring g++,$(CC)),g++)/ * conf_var.h 1. s/cerr/std::cerr/ for OOS. * do_file.c 1. s/basename/pjl_basename/ due to name collision. * fake_ansi.h 1. Replaced __GNUC__, et al, with PJL_GCC_295 for OOS. * fdbuf.c * fdbuf.h 1. Added these files since the ability to attach an fstream to a Unix file descriptor has been removed from ANSI C++. This was done for OOS. * filter.c 1. s/basename/pjl_basename/ due to name collision. * filter.h 1. s/std::unlink/::unlink/ for OOS. * GNUmakefile 1. Added fdbuf.c to S_SRCS for OOS. * index.c 1. Added my_write() since ostream::write() now apparently requires a char* rather than a void* and I'm lazy about having to cast the pointers. This was done for OOS. 2. s/o.write( /my_write( o, / for OOS. 3. Added #ifdef PJL_GCC_295 for fix OOS. * index_segment.h 1. s/random_access_iterator_tag/std::random_access_iterator_tag/ for OOS. * less.h 1. Added needed "namespace std { ... }" for OOS. * mmap_file.c 1. s/ios::open_mode/ios::openmode/ for OOS. * mmap_file.h 1. Added missing #include <fstream> for OOS. 2. s/ios::open_mode/std::ios::openmode/ for OOS. 3. s/reverse_bidirectional_iterator/std::reverse_bidirectional_iterator/ for OOS. * omanip.h 1. Added this file to roll own ostream manipulator since depending on the underlying C++ implementation is not portable. This was done for OOS. * option_stream.h 1. s/cerr/std::cerr/ for OOS. * pattern_map.h 1. Removed PJL_LOCAL_FNMATCH since it's not needed. 2. s/unary_function/std::unary_function/ for OOS. 3. s/std::fnmatch/::fnmatch/ for OOS. * search.c 1. s/#include <iomanip>/#include "omanip"/ for OOS. 2. Added #ifdef PJL_GCC_295 for fix OOS. * search.h 1. s/cerr/std::cerr/ for OOS. 2. s/cout/std::cout/ for OOS. * search_thread.c 1. Removed #include <fstream> for OOS. 2. Added #include "fdbuf.h" for OOS. 3. Switched to using fdbuf since the ability to attach an fstream to a Unix file descriptor has been removed from ANSI C++. * stem_word.h 1. s/less/std::less/ * util.h 1. 
Added missing #include <iostream> 2. s/basename/pjl_basename/ due to name collision. 3. s/std::stat/::stat/ for OOS. 4. s/std::lstat/::lstat/ for OOS. 5. s/cerr/std::cerr/ for OOS. 6. s/endl/std::endl/ for OOS. * version.h 1. Updated version to "5.0.1". * word_info.c 1. Added missing "using namespace std;" for OOS. ******************************************************************************* 5.0 ******************************************************************************* NEW FEATURES ------------ * The indexing code has been rearchitected to be modular allowing for new file formats to be indexed directly (without filters). Consequently, the indexing of HTML files has been turned into a module. The -e option and IncludeFile variable are now INCOMPATIBLE with previous releases. Read the updated documentation. (This feature shall be known as feature MOD.) * A filter module for mail (and news) files has been added. (This feature shall be known as feature MAIL.) * A filter module for manual page files has been added. (This feature shall be known as feature MAN.) * For index, a new -A or --no-assoc-meta option and AssociateMeta configuration variable have been added. (This feature shall be known as feature AMN.) * There is a new %E (second-to-last filename extension) substitution. (This feature shall be known as feature 22L.) * FilterFile configuration lines are now different and INCOMPATIBLE with previous releases. The @ character no longer does substitutions but merely marks the target filename. This was done to enable filtering to files having a fixed name to be able to handle filenames with spaces better. (This feature shall be known as feature SM2.) * The search daemon can now answer queries via TCP sockets in addition to Unix domain sockets. (This feature shall be known as feature TCP.) * You can now specify the separator character in search results. (This feature shall be known as feature SRS.) * Added parsing of XHTML 1.1 ruby elements. 
(This feature shall be known as feature RUBY.) BUG FIXES --------- * The index(1) -T option was ignored. (This bug fix shall be known as bug fix ITO.) * A configuration file that did not end in a newline would cause a segfault. (I think: I never tried it, but it looked like a bug to me.) (This bug fix shall be known as bug fix CNL.) * Configuration error messages output "(null)" (or seg-faulted) for the variable name. I don't see how the compiler didn't catch this since the name_ data member is const and therefore must be initialized in the constructor. (This bug fix shall be known as bug fix NVR.) * Setting the SearchDaemon config. variable to Y didn't allow no command-line arguments to be given. (This bug fix shall be known as bug fix DCL.) * Filter substitution incorrectly rescanned substituted text. (This bug fix shall be known as bug fix FSR.) * Several tweaks were made to make SWISH++ compile under FreeBSD. (This bug fix shall be known as bug fix BSD3.) * Added -lnsl for compiling the search daemon under Solaris. (This bug fix shall be known as bug fix SOL2.) * Removed some more buffer-overflow bugs. (This bug fix shall be known as bug fix BOB.) * Filename patterns didn't match if the wildcard wasn't first, e.g., foo* (This bug fix shall be known as bug fix WWF.) CHANGES, file-by-file --------------------- * AssociateMeta.h 1. Added this file for feature AMN. * auto_vec.h 1. Removed #ifdef SEARCH_DAEMON since it's now used by code not in the search daemon for bug fix BOB. 2. Added "explicit" to constructor. 3. Added: auto_vec<T>& operator=( T *p ) 4. s/T *const p_/T *p_/ 5. Added PJL namespace. * bcd.c 1. s/fake_ansi.h/platform.h/ 2. s/STATIC_CAST(...)/static_cast<...>/ * config/config.mk 1. Added MOD_* definitions for feature MOD. 2. Added MOD_LIST to CCFLAGS for feature MOD. 3. Removed definition of MAKE. 4. Added separate "OS selection" section since there's now FreeBSD and Solaris also. 5. Added "PTHREAD_LIB= -pthread" for bug fix BSD3. 6. 
Added "SOCKET_LIB+= -lnsl" for bug fix SOL2. 7. Wrapped thread and socket stuff inside "ifdef SEARCH_DAEMON". 8. Added OS variable. 9. Added OPTIM variable since -O3 in the cygwin environment causes a segfault due to an optimizer bug, presumably. A. Added: -DMOD_MAN for feature MAN. B. If g++, added: -fno-exceptions to reduce code size. * config/src/mutable.c 1. Removed this file since all C++ compilers should now support "mutable". * config/src/new_casts.c 1. Removed this file since compilers should be implementing new casts by now. * config/src/socklen_1_socklen_t.c * config/src/socklen_2_int.c * config/src/socklen_2_unsigned.c 1. Added "#include <sys/types.h>" for bug fix BSD3. * config.h 1. Added "#ifdef MOD_HTML" around HTML and XHTML options for feature MOD. 2. Moved Title_Max_Size and TitleLines_Default down to Miscellaneous section for feature MAIL. 3. Added SocketPort_Default for feature TCP. * conf_bool.h * conf_int.c * conf_int.h * conf_set.c * conf_set.h * conf_string.c * conf_string.h * ExcludeFile.h * FilesGrow.c * FilesGrow.h * FilterFile.c * FilterFile.h 1. Removed "var_name" parameter from parse_value(). * conf_bool.c 1. Removed "var_name" parameter from parse_value(). 2. Added "using namespace std;" since it should have been there all along. 3. Switched to using auto_vec<char> and to_lower_r() for bug fix BOB. 4. Added PJL namespace. * conf_enum.c * conf_enum.h 1. Added for feature TCP. * conf_int.c 1. Switched to using auto_vec<char> and to_lower_r() for bug fix BOB. 2. s/cerr << error/error()/ 3. Added PJL namespace. * conf_set.h 1. Added PJL namespace. * conf_string.c 1. Added (missing) include of platform.h and namespace stuff. 2. Added code to strip leading/trailing quotes for feature SRS. 3. s/cerr << error/error()/ * conf_string.h 1. Added == and != operators. * conf_var.c 1. Removed HTMLFile for feature MOD. 2. Added "#ifdef MOD_HTML" around ExcludeClass for feature MOD. 3. Removed "var_name" parameter from parse_value(). 4. 
Replaced alias_name() by constructor. 5. Added ExtractFile for feature MOD. 6. In parse_file(), redid the finding of a newline for bug fix CNL. 7. In parse_file(), made use of find_newline(). 8. In map_ref(), added "SocketAddress" for feature TCP. 9. In conf_var::conf_var(), added initialization of name_ for bug fix NVR. A. In conf_var::conf_var(): s/map_ref()[ name_ ]/map_ref()[ to_lower( name_ ) ]/ so the case for variable names is irrelevant. B. In conf_var::map_ref(), made all variable names lower case. C. In conf_var::parse_line(), added "to_lower( line )" so the case for variable names is irrelevant. D. In conf_var::parse_line(): s/ in config. file// E. In conf_var::map_ref(), changed to doing initialization via a table. (This had the side-effect of making "search" work under FreeBSD.) F. Added ResultSeparator variable for feature SRS. G. s/cerr << warning/warning()/ H. Added PJL namespace. I. s/isspace/is_space/ J. Added "associatemeta" for feature AMN. * conf_var.h 1. Added default argument of "cerr" to error() and warning(). * directory.c 1. Moved configuration variable extern declarations to .h files. * do_file.c 1. Added "#ifdef INDEX" around declaration of orig_file_size and orig_file_name. (It should have been there all along.) 2. Reworked calling of the indexer for feature MOD. 3. Added ExtractFile::const_iterator for feature MOD. 4. s/name_set_.contains()/seen_file()/ 5. s/file_info::current_file().num_words_/fi->num_words()/ 6. Removed "filter_list.reserve( 5 )" so as not to waste time and thereby penalize the performance for files that are not filtered. 7. Recalculated basename for bug fix WWF. * elements.c 1. Added "#ifdef MOD_HTML" for feature MOD. 2. s/REINTERPRET_CAST(...)/reinterpret_cast<...>/ 3. Added ruby elements for feature RUBY. * elements.h 1. Added "#ifdef MOD_HTML" for feature MOD. 2. Added PJL namespace. * entities.h * entities.c 1. Added "#ifdef MOD_HTML" for feature MOD. * encoded_char.c * encoded_char.h 1. Added for feature MAIL. 
* ExcludeClass.h 1. Added "extern ExcludeClass exclude_class_names;". * ExcludeFile.c 1. Added "using namespace std;". 2. Removed "var_name" parameter from parse_value(). * ExcludeMeta.h 1. Added "extern ExcludeMeta exclude_meta_names;". * exit_codes.h 1. Created TCP and Unix versions of the search daemon exit codes. * extract.c 1. s/IncludeFile/ExtractFile/ for feature MOD since extraction doesn't use modules. 2. Reworked -e and -E options to allow multiple, comma-separated patterns just like for index(1). 3. Added PJL namespace. 4. In extract_words(), removed "buf" and now using "word" exclusively. * ExtractFile.c * ExtractFile.h 1. Added for feature MOD. * fake_ansi.h 1. Got rid of faking "mutable" since all C++ compilers should now support this. 2. Removed new casts section since compilers should be implementing them by now. 3. Added hack to fix g++/STL/iterator bug. * file_info.c 1. s/fake_ansi.h/platform.h/ 2. s/REINTERPRET_CAST(...)/reinterpret_cast<...>/ 3. Added: #ifndef PJL_NO_NAMESPACES (it should have been there all along). 4. Added definition of result_separator variable for feature SRS. 5. s/' '/result_separator for feature SRS. * file_info.h 1. s/ostream/std::ostream/ 2. Made all but list_ data members private. 3. Added public accessor functions for now-private data members. 4. Added inc_words() and seen_file(). 5. Added PJL namespace. * file_list.c 1. Removed: #include "fake_ansi.h" 2. Removed PJL_NO_MUTABLE section. 3. s/THIS->// * file_list.h 1. Removed: #include "fake_ansi.h". 2. s/REINTERPRET_CAST(...)/reinterpret_cast<...>/ 3. Removed pointer and reference type. 4. Made const_iterator derived from std::iterator. * file_vector.c * file_vector.h 1. Replaced by mmap_file.[ch] * FilesReserve.h 1. Added "extern FilesReserve files_reserve;". * filter.c 1. Removed all WIN32 special cases. 2. Added (missing) #include "platform.h" 3. s/find()/rfind()/ 4. Added code to increment pos past substituted text for bug fix FSR. 5. 
Added code for %E for feature 22L. 6. Changed handling of @ for feature SM2. 7. Made use of basename() added to util.h. * filter.h 1. s/::unlink/std::unlink/ * FilterFile.c 1. Changed handling of @ for feature SM2. 2. Consequently, now require only 1 substitution. 3. Added %E as a valid substitution for feature 22L. * fnmatch.c 1. Added: #include "platform.h" 2. Added: #ifndef PJL_NO_NAMESPACES * GNUmakefile 1. Reorganized HTML sources for feature MOD. 2. Added MOD_MAIL sources for feature MAIL. 3. Added conf_enum.c, SearchDaemon.c, and SocketAddress.c for feature TCP. 4. Added IncludeMeta.c for feature MAIL. 5. Added splitmail target for feature MAIL. 6. s/ifndef WIN32/ifdef SEARCH_DAEMON/ 7. Added fnmatch.c conditionally for WIN32 to E_SRCS. 8. Removed WIN32 special case for platform.h. 9. s/=/:=/ A. Reworded C++ compiler section. B. Added MOD_MAN sources for feature MAN. * html.c * html.h 1. Replaced by mod_html.c and mod_html.h, respectively, for feature MOD. * IncludeFile.h 1. Removed "var_name" parameter from parse_value(). 2. Removed the alias for HTML_File for feature MOD. 3. s/pattern_map< bool >/pattern_map< indexer* >/ for feature MOD. * IncludeFile.c 1. Removed "var_name" parameter from parse_value(). 2. Changed form of line to include indexer for feature MOD. 3. Added "using namespace std;". * IncludeMeta.c 1. Added this file for feature MAIL. 2. Added PJL namespace. * IncludeMeta.h 1. Added "extern IncludeMeta include_meta_names;". 2. Changed base class from conf_set to conf_var and map for feature MAIL. * index.c 1. Added "#ifdef MOD_HTML" for feature MOD. 2. Performed following substitutions for feature MOD. s/html.h/mod_html.h/ s/index.h/indexer.h/ 3. Changed the syntax for -e for feature MOD. 4. Removed the -h option for feature MOD. 5. Moved index_word() to indexer.c for feature MOD. 6. Allowed multiple patterns to be specified via -E option. 7. In main(), performed following substitution: s/TempDirectory_Default/0/ for bug fix ITO. 8. 
Updated the usage message for feature MOD. 9. Moved configuration variable extern declarations to .h files. A. In main() for case 'm', performed following substitution: s/include_meta_names.insert( to_lower( opt.arg() ) ) /include_meta_names.parse_value( opt.arg() )/ B. Added "#include <sys/time.h>" for bug fix BSD3. C. s/REINTERPRET_CAST(...)/reinterpret_cast<...>/ D. s/remove_temp_files()/remove_temp_files( void )/ for picky HP-UX compiler. E. In rank(), s/num_words_/num_words()/ F. In write_file_index(), made use of new file_info member functions. G. Removed all WIN32 special cases. H. Added PJL namespace. I. Added associate_meta global variable for feature AMN. J. Added "no-assoc-meta" and 'A' command-line options for feature AMN. * index.h 1. Replaced by indexer.h for feature MOD. * indexer.c * indexer.h 1. Added for feature MOD. 2. Added PJL namespace. * index_segment.c 1. Removed: #include "fake_ansi.h" 2. s/REINTERPRET_CAST(...)/reinterpret_cast<...>/ 3. Added PJL namespace. * index_segment.h 1. Made index_segment::const_iterator derived from std::iterator. 2. Added PJL namespace. * init_modules.c 1. Added for features MOD and MAIL. * INSTALL.win32 1. Changed from mingw to cygwin. 2. Removed note about extract(1). 3. Changed build instructions to match Unix version. * itoa.c 1. s/fake_ansi.h/platform.h/ 2. Added PJL namespace. * itoa.h 1. Added PJL namespace. * less.h 1. s/binary_function/std::binary_function/ * man/man1/extract.1 1. Added description for multiple patterns for -e, --pattern, -E, and --no-pattern options. 2. s/pjl@best.com/pauljlucas@mac.com/ * man/man1/httpindex.1 * man/man4/swish++.index.4 1. s/pjl@best.com/pauljlucas@mac.com/ * man/man1/index.1 1. s/pjl@best.com/pauljlucas@mac.com/ 2. Added description of modules and mod_mail for feature MAIL. 3. Removed -h, --html-pattern, and HTMLFile. 4. Reworked description of -m and --meta. 5. Added references for feature MAIL. 6. Added references for feature MAN. 7. 
Added -A, --no-assoc-meta, and AssociateMeta for feature AMN. 8. Added mention of and reference for Ruby elements for feature RUBY. 9. Made -T option no longer refer to <TITLE> element. * man/man1/search.1 1. Added -R, --separator, ResultSeparator for feature SRS. 2. Made "select" in daemon example more concise. * man/man1/splitmail.1 1. Added for feature MAIL. * man/man3/WWW.3 1. s/pjl@best.com/pauljlucas@mac.com/ 2. Redid formatting of references. 3. Removed trim_whitespace(), url_decode(), and url_encode() since they are no longer used now that the search.cgi example uses CGI.pm * man/man4/swish++.conf.4 1. Added section for enumeration variables and SearchDaemon for feature TCP. 2. Changed IncludeFile from a set variable to an other variable for feature MOD. 3. Added SocketAddress for feature TCP. 4. Added section for IncludeMeta for feature MAIL. 5. s/pjl@best.com/pauljlucas@mac.com/ 6. Added: "For variables_names, case is irrelevant." 7. Added note about preserving whitespace in string values. 8. Added ResultSeparator for feature SRS. 9. Added more IP address detail for SocketAddress. A. Added "# WRONG!" comment to filter example. B. Added "AssociateMeta" for feature AMN. C. Added missing FollowLinks. D. Added "Man" module for feature MAN. * mmap_file.c 1. Added "#include <sys/time.h>" for bug fix BSD3. 2. s/REINTERPRET_CAST( caddr_t )( -1 )/MAP_FAILED/ 3. Removed all WIN32 code. 4. s/fake_ansi.h/platform.h/ 5. Added PJL namespace. * mmap_file.h 1. Removed all WIN32 code. 2. Added PJL namespace. * mod_html.c 1. Replaced html.c for feature MOD. 2. Reworked everything to use encoded_char_ranges. 3. Moved configuration variable extern declarations to .h files. 4. In parse_html_tag(), s/tag/name/ 5. Added PJL namespace. 6. Removed "buf" and now using "word" by itself. 7. s/isxdigit/is_xdigit/ 8. s/isdigit/is_digit/ 9. s/isalpha/is_alpha/ A. s/isspace/is_space/ B. Reworked how meta names are handled for feature AMN. C. 
In tag_cmp(), "fixed" increment and end-of-string test. * mod_html.h 1. Replaced html.h for feature MOD. 2. Made find_title(), index_words(), and new_file() public so they could be accessed from mod_mail.c. 3. Made index_words() and parse_html_tag() take an encoded_char_range or encoded_char_range::const_iterator argument so they could parse HTML that is encoded. 4. Moved configuration variable extern declarations to .h files. 5. Added PJL namespace. * mod_mail.c * mod_mail.h 1. Added for feature MAIL. * mod_man.c * mod_man.h 1. Added for feature MAN. * my_set.h 1. Added PJL namespace. * option_stream.c * option_stream.h 1. Added PJL namespace. * pattern_map.h 1. s/::find_if/std::find_if/ 2. Added: #ifdef PJL_LOCAL_FNMATCH 3. Added "typename" to declaration of map_type. 4. s/value_type const&/argument_type/ * PidFile.h 1. Added "extern PidFile pid_file_name;". * platform.h.win32 1. Removed since no longer needed under cygwin. * postscript.h 1. Added PJL namespace. * README 1. Added new feature descriptions. 2. s!www.best.com/~pjl!homepage.mac.com/pauljlucas! 3. Added mention of Christoph Conrad. * RecurseSubdirs.h 1. Added "extern RecurseSubdirs recurse_subdirs;". * ResultSeparator.h 1. Added this file for feature SRS. * search.c 1. s/html.h/indexer.h/ for feature MOD. 2. Added #include "SocketAddress.h" for feature TCP. 3. s/am_daemon/daemon_type/ and s/daemon_opt/type_type_arg/ for feature TCP. 4. Made the daemon configuration variables global and become_daemon() take no arguments because the argument list was getting way too long. 5. In search_options::search_options(), added socket_address_arg for feature TCP. 6. In search_options::search_options(), added -a option for feature TCP. 7. In usage(), updated message for feature TCP. 8. Added "#include <sys/time.h>" for bug fix BSD3. 9. In main(), moved check of number of command-line arguments after conf_var::parse_file() and command-line override code for bug fix DCL. A. 
Switched to using auto_vec<char> and to_lower_r() all the time for bug fix BOB. B. Removed all WIN32 special cases. C. Added: #include "ResultsSeparator.h" for feature SRS. D. In main(), added code for result_separator for feature SRS. E. In dump_single_word(), search(), and service_request(): s/' '/result_separator/ for feature SRS. F. In search_options::search_options(), added 'R' case for feature SRS. G. In usage(), added line for -R for feature SRS. H. Added PJL namespace. * search.h 1. s/bool daemon_opt/char const* daemon_opt_arg/ for feature TCP. 2. Added socket_address_arg for feature TCP. 3. s/ostream/std::ostream/ (it should have been that way all along). 4. Added result_separator_arg for feature SRS. 5. Added PJL namespace. * searchc.in 1. Added stuff to connect via a TCP socket to the search daemon for feature TCP. 2. Updated Perl book references for 3rd ed. * search_daemon.c 1. Moved configuration variable extern declarations to .h files. 2. Added "#include <sys/time.h>" for bug fix BSD3. 3. In accept_failed(), added "#ifdef EPROTO" for bug fix BSD3. 4. Partitioned the code into smaller functions. 5. Added PJL namespace. * SearchDaemon.c 1. Added a bunch of #include's and extern declarations since become_daemon() now uses globals rather than parameters. This was done for feature TCP. 2. Added accept_failed() function for feature TCP. 3. In become_daemon(), added code for TCP sockets for feature TCP. * SearchDaemon.h 1. Changed to be derived from conf_enum for feature TCP. * SearchDaemon.c 1. Added this file for feature TCP. * search_options.c 1. Added "separator", 'R' option for feature SRS. * search_thread.c 1. s/fake_ansi.h/platform.h/ 2. Added PJL namespace. 3. s/isspace/is_space/ * search_thread.h 1. Added PJL namespace. * SocketAddress.h * SocketAddress.c 1. Added these files for feature TCP. * SocketFile.h 1. Added "extern SocketFile socket_file_name;". * socket_options.c 1. Added "socket-address" for feature TCP. 2. 
s/daemon/daemon-type/ and made it take an argument for feature TCP. * SocketQueueSize.h 1. Added "extern SocketQueueSize socket_queue_size;". * SocketTimeout.h 1. Added "extern SocketTimeout socket_timeout;". * splitmail.in 1. Added this utility for feature MAIL. * stem_word.c 1. Performed following substitution: s/replace_suffix( char *word, rule_list* ) /replace_suffix( char *word, rule_list const* )/ It should have been that way all along. * stop_words.c 1. Added "shall", "you'll", "you're". 2. Added PJL namespace. 3. s/word_buf/word/ 4. s/word_len/len/ * stop_words.c 1. Added PJL namespace. * swish++.conf 1. Added ExtractFile for feature MOD. 2. Removed HTMLFile for feature MOD. 3. Added module name to IncludeFile for feature MOD. 4. Changed SearchDaemon for feature TCP. 5. Added SocketAddress for feature TCP. 6. Added missing variables and sorted alphabetically properly. 7. Added IncludeMeta values for mail/news. 8. Added ResultSeparator variable for feature SRS. 9. Changed @ in FilterFile lines for feature SM2. A. Added "AssociateMeta" for feature AMN. * swish++.conf.4 1. Added note about preserving whitespace in string values. 2. Added ResultSeparator for feature SRS. 3. Added more IP address detail for SocketAddress. 4. Added "# WRONG!" comment to filter example. 5. Added "For variables_names, case is irrelevant." 6. Sorted Other variables. 7. Added section for IncludeMeta. 8. Removed HTMLFile. 9. Added ExtractFile. A. Added section for enumeration variables and SearchDaemon. B. Changed IncludeFile from a set variable to an other variable. C. Added SocketAddress. D. s/pjl@best.com/pauljlucas@mac.com/ E. Added %E substitution for feature 22L. F. Added @ changes for feature SM2. G. Added code to increment pos for bug fix FSR. * thread_pool.c 1. s/fake_ansi.h/platform.h/ 2. Added: #ifndef PJL_NO_NAMESPACES (it should have been there all along). 3. s/STATIC_CAST(...)/static_cast<...>/ 4. Added PJL namespace. * thread_pool.h 1. Added PJL namespace. 2. 
s/queue/std::queue/ 3. s/set/std::set/ 4. Made ~thread() public. * ThreadsMax.h 1. Added "extern ThreadsMax max_threads;". * ThreadsMin.h 1. Added "extern ThreadsMin min_threads;". * ThreadTimeout.h 1. Added "extern ThreadTimeout thread_timeout;". * TitleLines.h 1. Changed comments to reflect that TitleLines isn't used exclusively for HTML or XHTML files any more for feature MAIL. 2. Added "extern TitleLines num_title_lines;" * token.c 1. s/(cfc)/static_cast<char (*)(char)>/ * util.c 1. Moved "char_buffer_pool<128,5> buf" to file scope. 2. s/fake_ansi.h/platform.h/ 3. Added: to_lower_r(char const*, char const*) for bug fix BOB. * util.h 1. Added find_newline() and skip_newline(). 2. Added "#include <sys/time.h>" for bug fix BSD3. 3. Removed #ifdef SEARCH_DAEMON around to_lower_r(char const*) since it's now used by code not in the search daemon for bug fix BOB. 4. Added: to_lower_r(char const*, char const*) for bug fix BOB. 5. s/file_vector::const_iterator/char const */ 6. Added: is_alnum(), is_alpha(), is_digit(), is_punct(), is_space(), is_upper(), and is_xdigit(). * Verbosity.h 1. Added "extern Verbosity verbosity;". * version.h 1. Updated version to "5.0". * word_info.h 1. s/html.h/indexer.h/ for feature MOD. 2. Added PJL namespace. * man/man3/WWW.3 1. Removed trim_whitespace(), url_decode(), and url_encode() since they are no longer used now that the search.cgi example uses CGI.pm * option_stream.h 1. s/ostream/std::ostream/ * util.h 1. Removed all WIN32 special cases. 2. Added: basename(). * Win32-Makefile-index.v * Win32-Makefile-search.v 1. Removed these since they are not needed under cygwin. * word_info.h 1. s/ostream/std::ostream/ * word_util.c 1. s/isdigit/is_digit/ 2. s/ispunct/is_punct/ 3. s/isupper/is_upper/ * word_util.h 1. s/STATIC_CAST(...)/static_cast<...>/ 2. s/isalnum/is_alnum/ * www_example/search.cgi 1. Updated Perl book references for 3rd ed. 2. s/the.index/swish++.index/ 3. Added $SOCKET_ADDRESS for feature TCP. 4. 
Rewrote to use standard CGI.pm module. 5. Added code to do TCP sockets for feature TCP. 6. Fixed printing of file size in results. ******************************************************************************* 4.8 ******************************************************************************* NEW FEATURES ------------ * The filename pattern matching (FNP) feature introduced in 4.5 has finally been ported to Windows. (This feature shall be known as feature WFNP.) BUG FIXES --------- * The GNUmakefile didn't build dependencies properly for files that are conditionally #include'd. (This bug fix shall be known as bug fix IDB.) * The directory separator character ('/' for Unix) is apparently transformed into '\' for Windows by the intermediate Windows port of POSIX functions. However, in the case where '/' is inserted into a string and that string is printed, the mere printing won't do the transformation. (This bug fix shall be known as bug fix WDSC.) CHANGES, file-by-file --------------------- * auto_vec.h 1. Renamed and cleaned-up from managed_ptr.h. * config.h 1. Added: TempDirectory_Default[] = "/temp"; when compiling for Windows. * conf_int.c 1. s/managed_vec/auto_vec/ * copying.dj * fnmatch.c * fnmatch.h 1. Added these files for feature WFNP. * directory.c 1. Added Dir_Sep_Char for bug fix WDSC. * GNUmakefile 1. Added: I_SRCS+= fnmatch.c for feature WFNP. 2. Performed the following substitution: s/CPPFLAGS/CFLAGS/ for bug fix IDB. 3. Added some comments for the clean, distclean, and dist targets. * index_segment.c 1. Performed the following substitution: s/long/size_type/ It should have been that way all along. * index_segment.h 1. Performed the following substitution: s/long size_type/unsigned long size_type/ ...no need for it to be signed. 2. Performed the following substitution: s/char* value_type/char const* value_type/ It should have been that way all along. 3. Eliminated "pointer" and "reference" types since they weren't used. 4. 
Performed the following substitution: s/long/size_type/ It should have been that way all along. * managed_ptr.h 1. Renamed to auto_vec.h. * man/man1/index.1 1. Added mention of "/temp" for Windows. 2. Added section on differences for the Windows command line. * pattern_map.h 1. Added: #include "fnmatch.h" for feature WFNP. * search.c 1. Performed the following substitution: s/managed_vec/auto_vec/ * version.h 1. Updated version to "4.8". ******************************************************************************* 4.7 ******************************************************************************* NEW FEATURES ------------ * Added 'b' and 'B' substitutions for filters that are the base name and base name minus the extension of a file name, respectively. This is useful when you need the temporary files created in a location other than where the originals are, for example when the originals are on a filesystem that you don't have write access to. Note that, for consistency, the 'E' substitution has been renamed to 'F'. This is therefore an incompatible change with previous versions of SWISH++. (This feature shall be known as feature BBS.) CHANGES, file-by-file --------------------- * filter.c 1. Added code to determine the base name of a file for feature BBS. 2. Added 'b' and 'B' cases for feature BBS. 3. Renamed 'E' case to 'F' case for consistency. * FilterFile.c 1. Added 'b' and 'B' substitutions as legal for feature BBS. 2. Changed 'E' substitution to 'F' for consistency. 3. Edited corresponding error message for feature BBS. * GNUmakefile 1. Undid the split of the dist and distclean targets done in version 4.5 since that change started to bug me too much. * man/man4/swish++.conf.4 1. Changed description of filter substitutions to match feature BBS. * swish++.conf 1. Changed all '@E' to '@F' corresponding to filter.c item #3. * version.h 1. Updated version to "4.7". 
******************************************************************************* 4.6.6 ******************************************************************************* BUG FIXES --------- * Fixed segmentation fault when parsing HTML files that contain tags longer than Tag_Name_Max_Size characters. (This bug fix shall be known as bug fix HTL.) CHANGES, file-by-file --------------------- * html.c 1. In parse_html_tag(), added tag buffer overflow check for bug fix HTL. * man/man4/swish++.conf.4 1. Added (missing) mention of "HTMLFile". * man/man4/swish++.index.4 1. Fixed typo: s/numm/null/ * version.h 1. Updated version to "4.6.6". ******************************************************************************* 4.6.5 ******************************************************************************* BUG FIXES --------- * Adding files incrementally to an index that has meta names caused a SEGFAULT. (This bug fix shall be known as bug fix IIM.) CHANGES, file-by-file --------------------- * index.c 1. In load_old_index(), performed the following substitution for bug fix IIM: s/meta_names[ *meta_name ] = parse_bcd( p );/ meta_names[ ::strdup( *meta_name ) ] = parse_bcd( p );/ * version.h 1. Updated version to "4.6.5". ******************************************************************************* 4.6.4 ******************************************************************************* BUG FIXES --------- * Files having path names longer than 255 characters weren't indexed. (This bug fix shall be known as bug fix PATH.) CHANGES, file-by-file --------------------- * do_file.c * extract.c * index.c 1. Performed the following substitution for bug fix PATH: s/NAME_MAX/PATH_MAX/ * util.h 1. Performed the following substitution for bug fix PATH: s/NAME_MAX = 255/PATH_MAX = 1024/ * version.h 1. Updated version to "4.6.4". 
******************************************************************************* 4.6.3 ******************************************************************************* BUG FIXES --------- * DD elements weren't implicitly terminated by a new <DT> tag. (This bug fix shall be known as bug fix DDDT.) CHANGES, file-by-file --------------------- * elements.c 1. Added: "dt", "/dt", to the "dd" line for bug fix DDDT. * version.h 1. Updated version to "4.6.3". ******************************************************************************* 4.6.2 ******************************************************************************* BUG FIXES --------- * When using filters, the post-filtered filename and size were stored in the index rather than the original filename and size. (This bug fix will be known as bug fix FFS.) CHANGES, file-by-file --------------------- * do_file.c 1. Added orig_file_size for bug fix FFS. 2. Moved test for encountering file during incremental indexing to before the filter filename substitutions for bug fix FFS. 3. Added orig_file_name for bug fix FFS. 4. Changed "new file_info()" call to use orig_file_name and orig_file_size for bug fix FFS. * filter.c * filter.h 1. Performed following substitution: s/target_file_/target_file_name_/ * util.h 1. Added file_size() for bug fix FFS. * version.h 1. Updated version to "4.6.2". ******************************************************************************* 4.6.1 ******************************************************************************* BUG FIXES --------- * extract(1) incorrectly required arguments for -l and -r. (This bug fix will be known as bug fix LRO.) CHANGES, file-by-file --------------------- * extract.c 1. In main(), fixed opt_spec for bug fix LRO. * version.h 1. Updated version to "4.6.1". 
******************************************************************************* 4.6 ******************************************************************************* NEW FEATURES ------------ * Added the ability to specify the extension appended to files for extract(1). (This feature will be known as feature SEE.) * Added the ability to run extract(1) as a filter. (This feature will be known as feature EF.) BUG FIXES --------- * extract(1) didn't print the file name of files that didn't exist in its error message. (This bug fix will be known as bug fix DNE.) CHANGES, file-by-file --------------------- * config.h 1. Added ExtractExtension_Default for feature SEE. * conf_var.c 1. In map_ref(), added ExtractExtension for feature SEE. 2. In map_ref(), added ExtractFilter for feature EF. * do_file.c 1. Changed to use user-specified extension for feature SEE. 2. Changed to have the ability to write to standard output for feature EF. * exit_codes.h 1. Added Exit_No_Such_File for feature EF. * extract.c 1. Added global extract_extension variable for feature SEE. 2. In main(), added -x and --extension options for feature SEE. 3. In usage(), added description of -x and --extension options for feature SEE. 4. In extract_*() functions, changed ofstream argument to ostream for feature EF. 5. Added extract_as_filter global variable for feature EF. 6. In main() and usage(), added -f and --filter option for feature EF. 7. In main(), added code for the filter case for feature EF. 8. In main(), fixed bug DNE. * ExtractExtension.h 1. Added this file for feature SEE. * ExtractFilter.h 1. Added this file for feature EF. * man/man1/extract.1 1. Added description of -x and --extension options for feature SEE. 2. Added description of -f and --filter options for feature EF. * man/man4/swish++.conf.4 * swish++.conf 1. Added ExtractExtension variable for feature SEE. 2. Added ExtractFilter variable for feature EF. * version.h 1. Updated version to "4.6". 
******************************************************************************* 4.5 ******************************************************************************* NEW FEATURES ------------ * Added the ability to index, not index, and filter files based on filename pattern rather than merely extension. (This feature will be known as feature FNP.) BUG FIXES --------- * If an HTML file doing selective non-indexing via CLASS attributes wasn't well-formed such that an HTML element having the CLASS attribute didn't end properly, then all words in all subsequent files indexed would be discarded. (This bug fix shall be known as bug fix ECC.) * The --verbosity option in index(1) wasn't recognized. (This bug fix shall be known as bug fix VLO.) CHANGES, file-by-file --------------------- * config/config.mk 1. Added -pedantic to CCFLAGS to make code cleaner. * config/config-sh 1. Changed &- to /dev/null because of a weird interaction with g++ -pedantic. * conf_var.c 1. Performed the following substitution for configuration variable names for feature FNP: s/(.+)Extension/$1File/ * do_file.c 1. Changed extension-based processing to pattern-based for feature FNP. 2. Added "true" argument to call of index_words() for bug fix ECC. * entities.c 1. Added "apos", "Scaron", "scaron", and "Yuml" for support of XHTML. * ExcludeExtension.h * IncludeExtension.c * IncludeExtension.h * FilterExtension.c * FilterExtension.h * ExcludeFile.h * IncludeFile.c * IncludeFile.h * FilterFile.c * FilterFile.h 1. Replaced *Extension files with *File equivalents for feature FNP. * exit_codes.h 1. Added Exit_End_Enum_Marker to make compile with the -pedantic option of g++. * extract.c 1. Same as conf_var.c item #1. 2. Performed following substitution for feature FNP: s/extension/pattern/ 3. Made usage() take an ostream& argument just so it parallels the way it's done in index.c. * file_info.c 1. In file_info::out(), eliminated unused num_words variable. * file_list.c 1. 
Removed #include <sys/types.h> since it's apparently not needed. * file_vector.h 1. Replaced off_t with size_t since mmap(2) uses size_t. * GNUmakefile 1. Added/removed source files for feature FNP. 2. Added ".PHONY: all" so a "make -t" doesn't "touch all". 3. Changed from using .%.d to dep/%.d dependency files since Windows doesn't like filenames beginning with a dot. 4. Split dist and distclean targets since only distclean should remove the dependencies. * html.h * index_segment.h * option_stream.h * token.h 1. Removed comma at end of enum list to make compile with the -pedantic option of g++. * html.c 1. Added mention of XHTML in comments. * index.c 1. Same as conf_var.c item #1. 2. Same as extract.c item #2. 3. In main() for the 'C', 'm', and 'M' command-line options cases, removed unnecessary strdup() since the variables are derived from conf_set that is-a string_set that uses std::string so the strings are copied anyway. 4. In load_old_index(), removed unnecessary strdup() for similar reason to item 3. 5. In index_word(), performed following substitution: s/if ( exclude_class_count )/if ( exclude_class_count > 0 )/ just because it seemed "more correct." 6. Corresponding change as index.h item #1. 7. In index_words(), removed "static bool new_file" variable for bug fix ECC. 8. In index_words(), added: if ( is_new_file ) exclude_class_count = 0; for bug fix ECC. 9. In index_words(), removed last "new_file = true;" for bug fix ECC. A. In main(), performed following substitution: s/verbose/verbosity/ for bug fix VLO. * index.h 1. Added "is_new_file" argument for bug fix ECC. * man/man1/index.1 * man/man1/extract.1 * man/man4/swish++.conf.4 1. Modified description for feature FNP. * my_set.h 1. Performed following substitution: s/key_type/T const &/ to make it compile with the -pedantic option of g++. 2. Added specialization of my_set< char const* > (see the comment for why). * pattern_map.h 1. Added this file for feature FNP. * README 1. 
Changed "extension" to "patterns" for feature FNP. * stop_words.c 1. Added "mustn't" to the list of stop-words. * swish++.conf 1. Same as conf_var.c item #1. * version.h 1. Updated version to "4.5". * word_util.c 1. Added #include <iostream> if DEBUG_is_ok_word is defined. ******************************************************************************* 4.4 ******************************************************************************* NEW FEATURES ------------ * The FilterExtension variable now allows you to specify literal % and @ characters by simply doubling the character. (This feature will be known as feature FLC.) BUG FIXES --------- * Added various #include lines and replacement for ENODATA to make it compile on FreeBSD systems. (This bug fix will be known as bug fix BSD2.) * Fixed a bug in config.pl whereby a \$ would cause all $ after it not to expand because the while loop exited prematurely. (This didn't currently matter, but it might in the future.) (This bug fix will be known as bug fix CEV.) * Fixed a bug in install-sh whereby source files in subdirectories would not be chown'd and chmod'd properly. (This didn't matter for SWISH++ the way it's distributed, but it might in the future.) (This bug fix will be known as bug fix ISD.) CHANGES, file-by-file --------------------- * config.pl 1. * config/man.mk 1. Added empty "all" rule so text versions of manual pages are not automatically built. * directory.c 1. Added #include <sys/types.h> for bug fix BSD2. * file_vector.c 1. Added #include <sys/types.h> for bug fix BSD2. 2. Added test for ENODATA and, if not available, use something else. (For bug fix BSD2.) * filter.c 1. Added code for feature FLC. 2. Added special-case code for sleep(3) for Windows. * FilterExtension.c 1. Added code for feature FLC. * GNUmakefile 1. Performed substitutions of the form: s/@cd $(DIR) && $(MAKE) $@/@$(MAKE) -C $(DIR) $@/ since GNU make has a -C option. 2. 
Added conditionals so as not to regenerate .d files when making clean, distclean, or dist. 3. Added "txt" as a target. * install-sh 1. Added code to strip directories via basename and xargs for bug fix ISD. * man/Makefile 1. Renamed to GNUmakefile. * man/GNUmakefile 1. Renamed from Makefile. 2. Replaced all targets with simpler %. * man/man1/GNUmakefile * man/man1/Makefile * man/man3/GNUmakefile * man/man3/Makefile * man/man4/GNUmakefile * man/man4/Makefile 1. Renamed Makefile to GNUmakefile. * man/man4/swish++.conf.4 1. Added description of feature FLC. * search.c * search_daemon.c * util.h 1. Added #include <ctime> for bug fix BSD2. * version.h 1. Updated version to "4.4". ******************************************************************************* 4.3.1 ******************************************************************************* BUG FIXES --------- * Indexing via standard input did NOT index all files: it still required you to specify extensions via -e. This is totally wrong. It seems it's been wrong since version 3.1. I'm surprised nobody noticed. (This bug fix will be known as bug fix SII.) * Fixed build problem on Debian Linux systems having to do with the type of the 3rd argument to accept(3). (This bug fix will be known as bug fix A3A.) CHANGES, file-by-file --------------------- * config/config-sh 1. Completely reworked to support multiple tests for the same thing for bug fix A3A. * config/src/explicit.c * config/src/mutable.c * config/src/namespaces.c * config/src/new_casts.c 1. Performed following substitution: s/DEFINE/FAIL/ for bug fix A3A. * config/socklen.c 1. Replaced by other socklen*.c files. * config/socklen_1_socklen_t.c * config/socklen_2_int.c * config/socklen_2_unsigned.c 1. Replaced socklen.c with individual tests for bug fix A3A. * do_file.c 1. Performed following substitution: s/exclude_extensions.empty()/!include_extensions.empty()/ for bug fix SII. * GNUmakefile 1. 
Added platform.h as a dependency for all .*.d files so it will get built first since it is #include'd by most other files. * httpindex.in 1. Added more options that are passed to index(1). * man/man1/httpindex.1 1. Corresponding changes for httpindex.in. * search_daemon.c 1. Replaced use of PJL_SOCKLEN_NOT_INT with PJL_SOCKLEN_TYPE for bug fix A3A. * version.h 1. Updated version to "4.3.1". ******************************************************************************* 4.3 ******************************************************************************* NEW FEATURES ------------ * 'search' has a new -P (or --pid-file) option and PidFile configuration variable to specify a file to write its process ID when running as a daemon. (This feature will be known as feature PID.) BUG FIXES --------- * The FilesGrow feature was broken because I forgot a needed set of parentheses to get the precedence right. :-( (This bug will be known as bug fix FGP.) * For 'search', the description of the -p and --word-percent option in the usage message wasn't printed unless it was compiled as a daemon (and these options have nothing to do with it being a daemon). (This bug will be known as bug fix WPO.) CHANGES, file-by-file --------------------- * config.h 1. Performed the following substitution: s/swish++.socket/search.socket/ * config/config.pl 1. Rewrote it so that it parses Makefiles directly rather than needing to be passed arguments. * config/Makefile 1. Condensed a few things. * conf_var.c 1. Added PidFile variable for feature PID. * exit_codes.h 1. Added Exit_No_Write_PID for feature PID. * FilesGrow.h 1. Added parentheses around ?: operator for bug fix FGP. * GNUmakefile 1. Added in place of Makefiles. * html.c 1. Made is_html_comment() automatically skip it. * index.c 1. In index_words(), performed following substitution: s/char/file_vector::value_type/ It should have been that way all along. * Makefile * Makefile.win32 1. Replaced by GNUmakefile. * man/man1/search.1 1. 
Added description of feature PID. 2. Added -a, --socket-address for feature TCP. 3. Made changes to daemon (daemon-type) for feature TCP. 4. Added stuff for TCP sockets for feature TCP. 5. Updated exit status codes for feature TCP. 6. s/pjl@best.com/pauljlucas@mac.com/ 7. Updated Perl book references for 3rd ed. * man/man4/swish++.conf.4 1. Added mention of PidFile. * PidFile.h 1. Added this file for feature PID. * search.c 1. Added #include "PidFile.h" for feature PID. 2. Added "char const*" argument to become_daemon() function declaration for feature PID. 3. In main(), added: PidFile pid_file_name; and code to override with a command-line option for feature PID. 5. In search_options::search_options(), added pid_file_name_arg and case for it for feature PID. 6. In usage(), added description of -P and --pid-file options for feature PID. 7. In usage(), fixed bug WPO. * searchc.in 1. Same as config.h item #1. * search_daemon.c 1. Added "#include <fstream>" for feature PID. 2. Removed "#ifndef WIN32" since the search daemon feature isn't supported for Windows anyway. 3. Added "pid_file_name" argument to become_daemon() for feature PID. 4. Added code to write the process ID for feature PID. * search.h 1. Added pid_file_name_arg to struct for feature PID. * search_options.c 1. Added "pid-file" option for feature PID. * SocketFile.c 1. Removed this file to remove absolute path name requirement. * StopWordFile.h 1. Performed following substitution: s/string/std::string/ It should have been that way all along. * swish++.conf 1. Added PidFile for feature PID. 2. Same as config.h item #1. * version.h 1. Updated version to "4.3". * www_example/search.cgi 1. Same as config.h item #1. ******************************************************************************* 4.2 ******************************************************************************* NEW FEATURES ------------ * You can now index incrementally. (This feature will be known as feature II.) 
BUG FIXES --------- * You could get a segmentation fault if you indexed an HTML file that has no title and has fewer than title_lines lines. (This bug fix will be known as bug fix FTL.) * There was a small memory leak with stop-words when search(1) ran as a daemon. (This bug fix will be known as bug fix SWL.) * When running as a daemon, the thread timeout value was set to a garbage value. (This bug fix will be known as bug fix TTG.) * In index(1), the --files-reserve option was incorrectly named --file-reserve. (This bug fix will be known as bug fix FRO.) CHANGES, file-by-file --------------------- * config/config.mk 1. Added GCC_WARNINGS for development purposes. * config.h 1. Added FilesGrow_Default for feature II. * conf_bool.h 1. Make parse_value() protected rather than private for feature II. 2. Performed the following substitution: s/string/std::string/ * conf_int.c 1. Made parse_value() thread-safe. 2. Same as conf_bool.h item #1. 3. Same as conf_bool.h item #2. * conf_set.h 1. Same as conf_bool.h item #1. 2. Added CONF_SET_ASSIGN_OPS macro. * conf_string.h 1. Added operator+=() for feature II. 2. Performed the following substitution: s/string/std::string/ * conf_var.c 1. Added definition of conf_var::~conf_var(). 2. In map_ref(), added "Incremental" and "FilesGrow" for feature II. * conf_var.h 1. Added #include <string>. 2. Added virtual ~conf_var() because the class has virtual functions. 3. Performed the following substitution: s/string/std::string/ * do_file.c 1. In do_file(), added code to check for encountering the same file when incrementally indexing for feature II. 2. In do_file(), moved the code that checks the title for null to file_info.c. * ExcludeClass.h * ExcludeExtension.h * ExcludeMeta.h * IncludeMeta.h 1. Added CONF_SET_ASSIGN_OPS. * exit_codes.h 1. Adjusted exit codes for feature II. * file_index.h * file_index.c 1. These files were deleted; their functionality was consolidated into index_segment.[ch]. * file_info.c 1. 
A num_words parameter was added to the constructor for feature II. 2. operator<< was eliminated for feature II. 3. Added file_info::out() and file_info::parse() for feature II. 4. Eliminated the class-specific operator new and moved the functionality inside the constructor. * file_info.h Everything in file_info.c plus: 1. The private data members were made public and const for feature II. 2. Added name_set_ for feature II. * file_list.c 1. In calc_size(), code was added to skip occurrences for feature II. 2. In operator++(), code was added to parse occurrences for feature II. * file_list.h 1. Performed the following substitution: s/word_index/index_segment/ for feature II. * FilesReserve.h 1. Added extern declaration. * file_vector.h 1. In end(), performed following substitution: s/size()/size_/ so g++ can inline the functions. * filter.c 1. Changed types of pos and target_pos to string::size_type and changed -1 to string::npos. (It should have been this way all along.) * html.c 1. In grep_title(), removed register storage class for 'c' since its address is taken. 2. In grep_title(), added "return 0" at the end for bug fix FTL. * Incremental.c * Incremental.h 1. Added these files for feature II. * index.c 1. Added global variables "files_grow", "incremental", and "partial_index_file_names" for feature II. 2. Added load_old_index() function for feature II. 3. In main() and usage(), changed "file-reserve" option to "files-reserve" for bug fix FRO. 4. In main(), added code to new -I and -G options for feature II. 5. In main(), added code to call load_old_index() for feature II. 6. In index_words(), removed "register" from declaration of argument 'c' since its address is taken. 7. In merge_indicies(), changed from using num_temp_files to partial_index_file_names.size() for feature II. 8. In write_file_index(), added code to write the format directly since file_info::operator<<() was eliminated. 9. 
In write_partial_index(), added code to add the partial index file name to the global variable "partial_index_file_names" for feature II. A. In write_word_index(), added code to write the occurrence data for feature II. B. In usage(), added description of new -I, --incremental, -G, and --files-grow options for feature II. * index_segment.c * index_segment.h 1. Added these files. * man/man1/index.1 * man/man1/search.1 1. Added description of incremental indexing and new supporting options and variables for feature II. * man/man4/swish++.conf.4 1. Added description of "Incremental" and "FilesGrow" variables for feature II. * man/man4/swish++.index.4 1. Updated the description of the index file format for feature II. * managed_ptr.h 1. Added various member functions. * option_stream.c 1. Removed the constructor without the ostream argument and just made the other constructor have a default value. * search.c 1. Performed the following substitutions: s/result_type/search_result_type/ s/results_type/search_results_type/ that fixed the problem with deriving sort_by_rank from binary_function. 2. Changed the types of "files", "meta_names", "stop_words", and "words" to "index_segment" for feature II. 3. In main(), added word_file_max_arg and word_percent_max_arg for feature II. 4. Moved the become_daemon() into the new search_daemon.c file. 5. In dump_single_word(), added occurrence data to output. 6. Added is_too_frequent() function for feature II. 7. In parse_query(), added a default case to the switch statement. 8. In parse_primary(), eliminated the call to strdup() for bug fix SWL. 9. In parse_primary(), added code to check to see if a word is too frequent for feature II. A. In search_options::search_options(), added code for word_file_max_arg and word_percent_max_arg for feature II. B. In service_request(), added occurrence data to dump output for the entire index. C. In usage(), added description for -f and -p options for feature II. * search.h 1. 
Added word_file_max_arg and word_percent_max_arg for feature II. * search_daemon.c 1. Moved the daemon code into this file. * search_options.c 1. Added "word-files" and "percent-files" arguments for feature II. * stop_words.c * stop_words.h 1. Added new constructor for feature II. * thread_pool.c 1. In thread_pool::thread_pool(), performed following substitution: s/timeout_( timeout_ )/timeout_( timeout )/ for bug fix TTG. * util.c * util.h 1. Moved get_index_info() to index_segment.c for feature II. * version.h 1. Updated version to "4.2". * word_info.h 1. Removed "union" in word_info::file for feature II. 2. Added initialization of rank_ data member for feature II. ******************************************************************************* 4.1 ******************************************************************************* NEW FEATURES ------------ * Generated index files are now approximately 24% smaller. (This feature will be known as feature SIF.) BUG FIXES --------- * When index(1) generated partial indices, it never removed words that occurred in more files than the allowable percentage. (This bug fix will be known as bug fix PIR.) * The -T/--temp-dir options were left out of the option specification for index(1) so you could never specify these options on the command line. (This bug fix will be known as bug fix TDO.) * Added a call to setsid() for becoming a search daemon. It should have been there all along. (This bug fix will be known as bug fix SID.) CHANGES, file-by-file --------------------- * bcd.c * bcd.h 1. Added these files for feature SIF. * config.h 1. Added #error directives to force people to read config.h and set important values for their system. * exit_codes.h 1. Added Exit_No_Unlink. * file_list.c 1. Rewrote file_list::calc_size() for bug fix PIR. I was counting spaces. Why? I don't know. There were never any spaces in the file list data. As far as I can tell, this never worked. 2. 
Rewrote file_list::const_iterator::operator++() to parse index file word data in new format for feature SIF. * file_list.h 1. Performed following substitution: s/ptr/ptr_/ 2. Performed following substitution: s/char const/unsigned char const/ for feature SIF since we now have to deal with byte values greater than 0x7F. * index.c 1. In merge_indicies() and write_word_index(), changed the format in which the index file is written from ASCII to BCD for feature SIF. 2. It seems as though the merge_indicies() code was slightly broken. While it appeared to merge properly, I don't think it ever threw out words that exceeded any thresholds. I rewrote chunks of it for bug fix PIR. 3. Factored out some code into a new is_too_frequent() function. 4. In write_meta_name_index(), changed the ASCII numerical output to BCD for feature SIF. 5. In write_full_index(), changed the comment describing the format of the index file for feature SIF. 6. In main(), added -T/--temp-dir to option spec. for bug fix TDO. * Makefile * Makefile.win32 1. Added bcd.o target and dependencies for feature SIF. 2. Added word_info.o target and dependencies. * man/man1/index.1 1. Added Exit_No_Unlink to exit codes section. * man/man4/swish++.index.4 1. Changed the description of the index file format for feature SIF. * search.c 1. In get_meta_id(), changed the parsing of the META ID from ASCII to BCD for feature SIF. 2. In become_daemon(), added call to setsid() for bug fix SID. * version.h 1. Updated version to "4.1". * word_info.c 1. Added this file to factor out the code for writing META IDs. * word_info.h 1. Added write_meta_ids() function declaration. ******************************************************************************* 4.0 ******************************************************************************* NEW FEATURES ------------ * 'search' now has the ability to run in the background as a multi-threaded daemon process functioning as a search server. 
(This feature will be known as feature MSD.) CHANGES, file-by-file --------------------- * config/config.mk 1. Added SEARCH_DAEMON, PTHREAD_LIB, and SOCKET_LIB variables for feature MSD. * config/Makefile 1. Added src/errno$(CCCEXT) dependency to $(TARGET) for feature MSD. 2. Added src/socklen$(CCCEXT) dependency to $(TARGET) for feature MSD. * config/config-sh 1. Performed following substitution: s/trap "x=$?; rm -f *$CCOEXT $TARGET; exit $x" 0 1 2 15/ trap "rm -f *$CCOEXT $TARGET; exit 1" 0 1 2 15 since it wasn't saving the exit code and always exiting with 0 (I don't know why.) This was done for feature MSD. 2. Added code to handle the new ERROR case for feature MSD. * INSTALL.win32 1. Added reference to Makefile.win32. * Makefile 1. Added new objects and dependencies for feature MSD. 2. Added new DEBUG_threads flag for feature MSD. * Makefile.win32 1. Added this not including the objects and dependencies for feature MSD until somebody helps me port MSD to Windows. * man/man1/search.1 1. Added description of feature MSD. * man/man4/swish++.conf.4 1. Added new configuration file variables for feature MSD. * README 1. Added synopsis of feature MSD. * search.c 1. Added #include of headers for socket-related stuff for feature MSD. 2. Added global "am_daemon" variable for feature MSD. 3. Added a "set<string>&" parameter to parse_meta(), parse_primary(), and parse_query() functions to collect the stop words found for feature MSD. 4. In main(), moved command-line option processing code into new search_options object for feature MSD. 5. In main(), moved search-request code into new search() and service_request() functions for feature MSD. 6. Added new become_daemon() function for feature MSD. 7. Added "out" parameters to dump_single_word() and dump_word_window() functions for feature MSD. 8. In both dump_single_word() and dump_word_window(), made lower_word a managed_vec<char> if compiled as a search daemon. 9. 
In usage() added description for new options for feature MSD. * managed_ptr.h * SearchDaemon.h * search_thread.c * search_thread.h * SocketFile.h * SocketQueueSize.h * src/socklen.c * thread_pool.c * thread_pool.h * ThreadsMax.h * ThreadsMin.h 1. New files for feature MSD. * stem_word.c 1. In stem_word(), added mutex around access to cache for feature MSD. * util.c * util.h 1. Added to_lower_r() function for feature MSD. * version.h 1. Updated version to "4.0". ******************************************************************************* 3.1 ******************************************************************************* NEW FEATURES ------------ * All executables now accept alternate long ("GNU-style") command-line options. (This feature will be known as feature LOPT.) * Added -? option to print usage ("help") message. (This feature will be known as feature HELP.) * Added new -h option and HTMLExtension configuration variable to allow filename extensions that are to be treated as HTML to be specified. (This feature will be known as feature HEXT.) * In 'search', allow max-results to be specified as 0. This allows the -R option to be eliminated. (This feature will be known as feature MR0.) * Added code to max-out various resource limits since SWISH++ is resource- intensive. This may alleviate out-of-memory conditions on some platforms. (This feature will be known as feature MAXR.) * Added code to map ISO 8859-1 (Latin 1) characters to their closest ASCII equivalent so that they are treated exactly like character entity references. This should also improve use of SWISH++ in other languages. (This feature will be known as feature ISOMAP.) BUG FIXES --------- * Feature CLASS has had a subtle bug in it since its inception in version 2.0. It was over-zealous in closing HTML elements. 
For example, given: <TABLE> <TR> <!-- 2 --> <TD> <!-- 1 --> <TABLE> <!-- 0 --> <TR> <TD> Hello </TD> </TABLE> <!-- trouble --> </TD> </TABLE> The "trouble" tag should have stopped closing elements at tag "0" but it kept closing elements "1" and "2" because </TD> and </TR> have </TABLE> in their set of close tags. The fix is to make a tag check to see whether the element it's closing is its own start tag: if so, stop. (This bug fix will be known as bug fix CLASS1.) * Really long start tags, those with lots of attributes, exceeding 128 characters, would cause a buffer overflow and, occasionally, a core dump. The fix is to change the way getting the name of tags is done. (This bug fix will be known as bug fix RLST.) CHANGES, file-by-file --------------------- * config/config.mk 1. Added suffix and rule for ".in" files. * conf_bool.h * conf_int.h * conf_set.h 1. Added reset() member function. 2. Added var_name argument to parse_value() for feature HEXT. * conf_string.h 1. Added reset() member function. 2. Changed use of "char const*" to "string" so as not to have to worry about properly managing the value. 3. Added var_name argument to parse_value() for feature HEXT. * conf_var.c 1. Added alias_name() function for feature HEXT. 2. In parse_const_value() and parse_line(), added var_name argument to parse_value() call for feature HEXT. 3. In msg(), performed following substitution: s/cerr/o/ It should have been that way all along. 4. Made parse_config_file() a member function. 5. Added reset_all() member function. 6. In map_ref(), added "HTMLExtension" for feature HEXT. * conf_var.h 1. Added alias_name() function for feature HEXT. 2. Made error() use the 'o' reference instead of cerr. 3. Added argument to warning(). 4. Added reset_all() member function. * config.h 1. Added WordPercentMax_Default. It should have been there all along. 2. Added Tag_Name_Max_Size for bug fix RLST. * do_file.c 1. Removed is_html_ext() for feature HEXT. 2. Changed MAXNAMLEN to NAME_MAX. 3.
In do_file(), changed access to include_extensions for feature HEXT. * entities.c * entities.h 1. Moved num_entities[] to word_util.[ch] for feature ISOMAP. * exit_codes.h 1. Flipped errors 50 and 51. * extract.c 1. Removed MAXNAMLEN in favor of NAME_MAX in util.h. 2. Replaced getopt() code with opt_stream code for feature LOPT. 3. In main(), added code to max-out RLIMIT_CPU for feature MAXR. 4. In main(), added code for new -? option for feature HELP. 5. In usage(), added long option descriptions for feature LOPT. 6. In usage(), added description for -? (and --help) options for features HELP and LOPT, respectively. 7. Corresponding change to file_vector.h item #1. 8. Performed following substitution: s/ERROR/error()/ 9. In extract_words(), added call to iso8859_to_ascii() for feature ISOMAP. * file_index.h 1. Corresponding changes for file_vector.h item #1. * file_info.c 1. Performed following substitution: s/num_files_reserve/files_reserve/ * file_vector.h 1. Detemplatized file_vector class to be only of char. I realized that file_vectors of other types may not have their elements suitably aligned and could cause alignment faults. * file_vector.c 1. Corresponding change to file_vector.h item #1. 2. In file_vector::init(), added code to max-out RLIMIT_VMEM for feature MAXR. 3. For Unix, error() now returns the value of the standard errno variable. 4. In open(), made it so that it won't attempt to mmap() the file if it has zero size; instead set error to ENODATA. * FilterExtension.c 1. Added missing #include <cstdlib> * FilterExtension.h 1. Added reset() member function. * html.c 1. Added #include "word_util.h" 2. Corresponding change to file_vector.h item #1. 3. Made find_attribute() insist that the "attribute" argument be passed as lower case. This way, it doesn't have to convert it to lower case. 4. Made tag_cmp() insist that the "tag" argument be passed as lower case. This way, it doesn't have to convert it to lower case. 5.
In parse_html_tag(), corresponding changes for items #3 and #4. 6. In parse_html_tag(), added code for bug fix CLASS1. 7. In parse_html_tag(), changed first call to to_lower( begin, end ) to be inline code for bug fix RLST. 8. In convert_entity(), performed following substitution: s/num_entities/iso8859_map/ for feature ISOMAP. * html.h 1. Corresponding change to file_vector.h item #1. * index.c 1. Same as extract.c item #1. 2. Same as extract.c item #2. 3. Same as extract.c item #3. 4. Same as extract.c item #4. 5. Same as extract.c item #5. 6. Same as extract.c item #6. 7. Same as extract.c item #7. 8. Same as extract.c item #8. 9. Same as extract.c item #9. A. In main(), added code to max-out RLIMIT_AS and RLIMIT_DATA for feature MAXR. B. In usage(), added missing description for -T option. C. Same as file_info.c #1 plus: s/num_files_reserve_arg/files_reserve_arg/ * index.h 1. Corresponding change to file_vector.h item #1. * IndexFile.h 1. Corresponding change to conf_string item #2. * itoa.c * itoa.h 1. Moved ltoa() into its own file. * Makefile 1. Added dependencies for option_stream.[cho] for feature LOPT. 2. Added dependencies for itoa.[ch]. 3. Made TARGET names more explicit. 4. Performed following substitution: s/the.index/swish++.index/ This should have been done in release 3.0. * man/man1/extract.1 * man/man1/index.1 * man/man1/search.1 1. Added descriptions for feature LOPT. * my_set.h 1. Eliminated my_set::key_type since it's already defined in set. 2. Made base_type private. 3. Changed what was "string_set" to "char_ptr_set" and added a new "string_set" that really is a set of strings. * option_stream.c * option_stream.h 1. Added for feature LOPT. * ResultsMax.h 1. Changed allowable lower bound for ResultsMax to 0 for feature MR0. * search.c 1. Same as extract.c item #2. 2. In main(), added code to max-out RLIMIT_AS for feature MAXR. 3. Changed use of istrstream to token_stream. 4. In parse_meta(), made tokens const. 5.
In parse_meta() and parse_optional_relop(), changed lines of the form: t.put_back(); to: query.put_back( t ); 6. In parse_primary(), used improved less_stem to eliminate conditional calls of binary_search() and equal_range(). 7. Same as extract.c item #5. 8. In main(), added call to setlocale(3) for feature LOCALE. * stem_word.h 1. Enhanced less_stem class to accept a Boolean argument whether to stem or not. 2. Made stem_word() a member function. * stem_word.c 1. Corresponding change to stem_word.h item #2. 2. Added #include "word_util.h" * stop_words.c 1. Added #include "word_util.h" 2. Performed following substitution: s/ERROR/error()/ 3. Corresponding change to file_vector.h item #1. 4. Added call to iso8859_to_ascii() for feature ISOMAP. * StopWordFile.h * TempDirectory.h 1. Corresponding change to conf_string item #2. * token.c 1. Removed token::hold() in favor of new token_stream class. 2. In operator>>(), replaced call to to_lower() with ::transform() thus making operator>>() thread-safe (since to_lower() isn't thread-safe). 3. Added #include "word_util.h" 4. Added call to iso8859_to_ascii() for feature ISOMAP. * token.h 1. Added token_stream class to hold "put back" tokens. This makes it thread-safe. * util.c 1. Moved ltoa() to itoa.c so it's in its own .o file since only index(1) uses it so there's no reason for it to be linked into extract(1) or search(1). 2. Added max_out_limit() for feature MAXR. 3. Moved is_ok_word() to word_util.h thereby making it easier for others to customize or replace it with a custom one (perhaps with different heuristics for other languages). 4. Moved parse_config_file() to be a member of conf_var. 5. Corresponding change to file_vector.h item #1. * util.h 1. Added definition of NAME_MAX to replace MAXNAMLEN to be more POSIXly correct. 2. Corresponding change as do_file.c item #1. 3. Added declaration of max_out_limit() for feature MAXR. 4. Moved declarations of ltoa() and itoa() to itoa.h. 5.
The is_ok_word(), is_vowel(), is_word_begin_char(), is_word_char(), and is_word_end_char() have been moved to word_util.h thereby making it easier for others to customize or replace them with custom ones (perhaps with different heuristics for other languages). 6. Replaced ERROR macro with error() function. 7. Added error_string() function. 8. Corresponding change to file_vector.h item #1. * version.h 1. Updated version to "3.1". * word_index.c * word_index.h 1. Corresponding change to file_vector.h item #1. * WordPercentMax.h 1. Added WordPercentMax_Default to constructor. (It should have been here since version 3.0.) * word_util.c 1. Corresponding changes to util.c item #3 and util.h item #5. 2. Moved num_entities[] definition from entities.c to word_util.c and renamed it iso8859_map[] for feature ISOMAP. * word_util.h 1. Corresponding changes to util.c item #3 and util.h item #5. 2. Moved num_entities[] declaration from entities.h to word_util.h and renamed it iso8859_map[] for feature ISOMAP. 3. Added iso8859_to_ascii() function for feature ISOMAP. * WWW.pm 1. Made regular expression for e_mail more accurate. * www_example/search.cgi 1. Added code to pass along -s option to do stemming. * www_example/search.html 1. Added checkbox for stemming. ******************************************************************************* 3.0.3 ******************************************************************************* NEW FEATURES ------------ * A -H option has been added to 'index' to dump the built-in set of recognized HTML elements to standard output (so you can check to see if a certain tag is recognized or not). (This feature will be known as feature OPTH.) * Boolean configuration file variables now accept "on" and "off" values. (This feature will be known as feature ON_OFF.) BUG FIXES --------- * There was a small memory leak when indexing META names. (This bug fix will be known as bug fix ML1.) 
* Reporting errors in a configuration file says what line number the error is on. However, the same error-reporting code is also used to print errors when command-line arguments are invalid. The line number variable wasn't cleared so it would print an erroneous line number for an invalid command-line option. (This bug fix will be known as bug fix CLN.) * Parsing of Boolean values in configuration files was completely broken. (This bug fix will be known as bug fix PBV.) * WWW::extract_description() did it wrong for ALT attributes with an empty value, i.e., ALT="". (This bug fix will be known as bug fix ADE.) CHANGES, file-by-file --------------------- * conf_bool.c 1. In parse_value(), added code to accept "on" and "off" for feature ON_OFF. 2. In parse_value(), added '!' characters before ::strcmp() calls for bug fix PBV. * conf_bool.h * conf_int.h * conf_string.h 1. Made assignment operators protected since (1) they're not inherited and (2) it's an abstract class. * conf_int.c * conf_string.c 1. Performed following substitution: s/cerr/error()/ * conf_var.c 1. Corresponding change to conf_var.h #1 2. In parse_line(), added: current_config_file_line_no_ = 0; for bug fix CLN. * conf_var.h 1. Made msg() accept an ostream& to write to. 2. Performed following substitution: s/string/std::string/ * do_file.c 1. Corresponding change to my_set.h #1. * elements.c 1. Added element_map::instance() for feature OPTH. 2. Added explicit case for element::forbidden. * elements.h 1. Corresponding change for elements.c item #1. 2. Made element_map::element_map() private for feature OPTH. 3. Added operator<<( ostream&, element_map::value_type const& ) for feature OPTH. * extract.c 1. Corresponding change to my_set.h #1. 2. In usage(), performed following substitution: s/Dump default stop-words/Dump stop-words/ since it dumps whatever stop-words are being used, not just the built-in default set. * filter.h 1. Performed following substitution: s/string/std::string/ * FilterExtension.c 1.
Performed following substitution: s/cerr/error()/ * html.c 1. In convert_entity(), changed access to the char_entity_map for feature OPTH. 2. In parse_html_tag(), corresponding change to my_set.h #1. 3. In parse_html_tag(), corresponding change to elements.c #1. 4. In parse_html_tag(), changed the way META names are looked up for bug fix ML1. Specifically, we no longer unconditionally do a strdup(): this was the source of the memory leak. * index.c 1. Added #include "elements.h" for feature OPTH. 2. In main() and usage(), added code for feature OPTH. 3. Corresponding change to my_set.h #1. 4. Corresponding change to extract.c #2. * less.h 1. Started using binary_function's first_argument_type, second_argument_type, and result_type typedefs. * Makefile 1. Added dependency for index.c on elements.h feature OPTH. * man/man1/index.1 1. Added description for new -H option for feature OPTH. 2. Mentioned which verbosity level is the default. 3. Added a reference to the "Index of Elements" in the HTML 4.0 specification. * man/man4/swish++.conf.4 1. Added "on" and "off" for feature ON_OFF. * my_set.h 1. Performed following substitution: s/find/contains/ to distinguish it from STL find() functions that return iterators. * search.c 1. Corresponding change to my_set.h #1. 2. In usage(), removed "standard out" verbiage. * stem_word.c 1. In stem_word(), removed use of char_buffer_pool. * stem_word.h 1. Corresponding change to less.h item #1. * util.c 1. Performed following substitution: s/string/std::string/ * util.h 1. Used S_ISxxx() macros for file tests rather than S_IFxxx. * version.h 1. Updated version to "3.0.3". * WWW.pm 1. Changed lines 103 and 104 from: $s =~ s/<[^>]+?ALT\s*=\s*(['"])([^>]+)\1[^>]*?>/$2/gi; $s =~ s/<[^>]+?ALT\s*=\s*(['"])([^'"]+)\1?\s*$/$2/i; to: $s =~ s/<[^>]+?ALT\s*=\s*(['"])([^>]*?)\1[^>]*?>/$2/gi; $s =~ s/<[^>]+?ALT\s*=\s*(['"])([^'"]*)\1?\s*$/$2/i; for bug fix ADE. 
******************************************************************************* 3.0.2 ******************************************************************************* BUG FIXES --------- * The -r option for index and extract was broken by release 3.0; it's fixed now. (This bug fix will be known as bug fix DASHR.) CHANGES, file-by-file --------------------- * directory.c 1. On line 104, reversed the order of the conditions to now be: if ( is_directory( path ) && recurse_subdirectories ) for bug fix DASHR. For directories, a stat(2) wasn't being performed so the is_plain_file() call in do_file() didn't work. * extract.c * index.c 1. In main(), performed following substitutions for command line argument variables: s/char*/char const*/ * search.c 1. In main(), performed following substitutions for command line argument variables: s/char*/char const*/ 2. Performed following substitutions: s/dump_match/dump_match_arg/ s/dump_window_size/dump_window_size_arg/ s/skip_results/skip_results_arg/ * version.h 1. Updated version to "3.0.2". ******************************************************************************* 3.0.1 ******************************************************************************* BUG FIXES --------- * The code failed to compile under g++ 2.95 because it caught errors that previous versions of g++ allowed to compile. (This bug fix will be known as GCC2.95.) * There were a few mistakes in the section 1 manual pages to cover all the changes to version 3.0. (This bug fix will be known as MAN3.) CHANGES, file-by-file --------------------- * elements.c 1. On line 276, added an intermediate cast to int to get rid of an error trying to convert directly from a char* to an enum for bug fix GCC2.95. * index.c 1. In rank_full_index(), added another local scope for bug fix GCC2.95. * man/man1/extract.1 * man/man1/index.1 * man/man1/search.1 1. Performed following substitution: s/the.index/swish++.index/ for bug fix MAN3. 2. Fixed some formatting errors. 
* man/man4/swish++.conf.4 1. Fixed some formatting errors. * search.c 1. Performed following substitution: s/result_type/results_type/ s/sorted_result_type/sorted_results_type/ and added new result_type type for bug fix GCC2.95. 2. In main(), performed following substitution: s/typedef vector< result_type::value_type > sorted_result_type; /typedef vector< result_type > sorted_results_type;/ for bug fix GCC2.95. * util.h 1. Rewrote is_directory() and is_plain_file() in terms of file_exists(). * version.h 1. Updated version to "3.0.1". * word_index.h 1. Added definitions for: word_index::const_iterator::operator+=() word_index::const_iterator::operator-=() for bug fix GCC2.95. ******************************************************************************* 3.0 ******************************************************************************* NEW FEATURES ------------ * SWISH++ now allows flexible file filtering for extraction and indexing. (This feature will be known as feature FFF.) * SWISH++ now allows configuration files since they were necessary for feature FFF. If I had to add them, I might as well do it right. (This feature will be known as feature CONF.) * SWISH++ now compiles and runs under Windows (95/98/NT). (This feature will be known as feature WIN32.) * 'index' now accepts a -T option that allows the directory to use for temporary files to be specified. (This feature will be known as feature TEMP.) * 'index' and 'extract' now report the number of files examined in addition to the number indexed or extracted, respectively. (This feature will be known as feature EXAM.) BUG FIXES --------- * In the admittedly rare case of a malformed HTML file ending in a '<' character (without a newline, i.e., '<' is the *VERY* last character in the file), 'index' would core-dump. (This bug fix will be known as bug fix EGT.)
CHANGES, file-by-file --------------------- * conf_bool.c * conf_bool.h * conf_int.c * conf_int.h * conf_set.c * conf_set.h * conf_string.c * conf_string.h * conf_var.c * conf_var.h * ExcludeClass.h * ExcludeExtension.h * ExcludeMeta.h * FilesReserve.h * filter.c * filter.h * FilterExtension.c * FilterExtension.h * FollowLinks.h * IncludeExtension.h * IncludeMeta.h * IndexFile.h * man/man4/swish++.conf.4 * RecurseSubdirs.h * ResultsMax.h * StemWords.h * StopWordFile.h * TitleLines.h * Verbosity.h * WordFilesMax.h * WordPercentMax.h 1. New files for feature CONF. * config.h 1. Added Config_Filename_Default for feature CONF. 2. Performed following substitution: s/the.index/swish++.index/ * config/config.mk 1. Added -DWIN32 to CCFLAGS for feature WIN32. 2. Added more comments to CCFLAGS. 3. Added CCLINK for feature WIN32. 4. Added a "You shouldn't have to change anything below this line" line. 5. Added more comments for the "Manual pages" section and the DISTILL variable. 6. Added .SUFFIXES at bottom. * config/config-sh 1. Renamed from config.sh so some versions of make don't get confused with the .sh suffix and try to build it. 2. Define PJL_NO_SYMBOLIC_LINKS if WIN32 is defined for feature WIN32. * config/Makefile 1. Removed test for bool type: bool is now a requirement of the C++ compiler. This was necessary for feature CONF since it specializes a template on bool. 2. Performed following substitution: s/config.sh/config-sh/ corresponding to config/config-sh item #1. * config/src/bool.c 1. This file was removed corresponding to config/Makefile item #1. * directory.c 1. Performed following substitutions: s/bool recurse_subdirectories/RecurseSubdirs recurse_subdirectories/ s/int verbosity/Verbosity verbosity/ for feature CONF. 2. Added PJL_NO_SYMBOLIC_LINKS for WIN32. 3. Moved definition of stat_buf to util.c. * directory.h 1. Include platform.h for new PJL_NO_SYMBOLIC_LINKS symbol. 2. Moved stat_buf and file test functions to util.h. * do_file.c 1. 
The common code between 'index' and 'extract' was moved here. 2. The increment of "num_examined_files" was added for feature EXAM. * exit_codes.h 1. New header file. * extract.c 1. Added explicit definition of MAXNAMLEN under Windows for feature WIN32. 2. Performed following substitutions: s/string_set exclude_extensions/ExcludeExtension exclude_extensions/ s/string_set include_extensions/IncludeExtension include_extensions/ for feature CONF. 3. Corresponding change to directory.c item #1. 4. Added extract_words() function to parallel index.c's index_words() function. 5. In main(), redid the way in which command line options are processed such that they take precedence over configuration file variables for feature CONF. 6. In main(), made -l option conditional on whether we're compiling under Windows or not for feature WIN32. 7. In main(), added -c option for feature CONF. 8. In main(), added code to test whether a file or directory actually exists before calling do_directory or do_file(). 9. Moved code for do_file() to do_file.c to factor out code common between extract and index. A. In usage(), added description of -c option for feature CONF. B. In usage(), made description of -l option conditional on Windows for feature WIN32. C. Changed all calls to exit(3) to use new exit code enums. D. Added "num_examined_files" global variable for feature EXAM. E. In main(), added code to print "num_examined_files" for feature EXAM. * fake_ansi.h 1. Removed __cplusplus test. 2. Removed section for bool type: bool is now a requirement of the C++ compiler. This was necessary for feature CONF since it specializes a template on bool. * file_index.h * file_index.c 1. Removed #include "fake_ansi.h" since bool is now required. * file_list.c 1. Added #include "fake_ansi.h". 2. Removed erroneous #include "html.h". * file_vector.h 1. Added #include's for Windows for feature WIN32. 2. Added conditional compilation for file_vector_base's size_type and fd_ for Windows for feature WIN32.
* file_vector.c 1. Removed #include "fake_ansi.h" since bool is now required. 2. Added conditional compilation for Windows for feature WIN32. * html.c 1. Performed following substitutions: s/no_index_class_count/exclude_class_count/ s/no_index_class_names/exclude_class_names/ 2. In parse_html_tag(), added: if ( c == end ) return; for bug fix EGT. * html.h 1. Performed following substitutions: s/no_meta_id/No_Meta_ID/ s/meta_id_not_found/Meta_ID_Not_Found/ to make all enum's have capital letters. * index.c 1. Corresponding change to extract.c item #1. 2. Corresponding change to extract.c item #2. 3. Corresponding change to extract.c item #5. 4. Corresponding change to extract.c item #6. 5. Corresponding change to extract.c item #7. 6. Corresponding change to extract.c item #8. 7. Corresponding change to extract.c item #9. 8. Corresponding change to extract.c item #A. 9. Corresponding change to extract.c item #B. A. Corresponding change to extract.c item #C. B. Performed following substitutions: s/no_index_class_count/exclude_class_count/ s/no_index_class_names/exclude_class_names/ C. Performed following substitutions: s/int num_files_reserve/FilesReserve num_files_reserve/ s/int num_title_lines/TitleLines num_title_lines/ s/int word_file_file_max/WordFilesMax word_file_max/ s/int word_file_percent_max/WordPercentMax word_percent_max/ for feature CONF. D. In main() and write_partial_index(), added "ios::binary" to "out" ofstream for feature WIN32. E. In main(), added code for -T option for feature TEMP. F. Corresponding change to extract.c item #D. G. Corresponding change to extract.c item #E. * index.h 1. Corresponding change as html.h #1. * INSTALL.unix 1. Renamed from INSTALL due to introduction of INSTALL.win32 * INSTALL.win32 1. New file for feature WIN32 * Makefile 1. Added more comments for DEBUG options. 2. Added new targets for feature CONF. 3. Redid a lot of dependencies as a result. * man/man1/index.1 1.
Added descriptions of configuration file variable for feature CONF. 2. Added Filters subsection to DESCRIPTION for feature FFF. 3. Added description of -c option for feature CONF. 4. Added caveat that the -l option is not available under Windows for feature WIN32. 5. Added description of -T option for feature TEMP. 6. Added CONFIGURATION FILE section for feature CONF. 7. Added Filters subsection to EXAMPLES for feature FFF. 8. Expanded EXIT STATUS section to list specific exit codes. 9. Added compress(1), gunzip(1), gzip(1), uncompress(1), and swish++.conf(4) to SEE ALSO section. * man/man1/extract.1 1. Added descriptions of configuration file variable for feature CONF. 5. Added caveat that the -l option is not available under Windows for feature WIN32. 6. Expanded EXIT STATUS section to list specific exit codes. 7. Added swish++.conf to FILES section for feature CONF. 8. Performed following substitution: s/the.index/swish++.index/ 9. Added swish++.conf(4) to SEE ALSO section for feature CONF. * man/man1/search.1 1. Added description of -c option for feature CONF. 2. Added CONFIGURATION FILE section for feature CONF. 3. Expanded EXIT STATUS section to list specific exit codes. 4. Added swish++.conf to FILES section for feature CONF. 5. Performed following substitution: s/the.index/swish++.index/ 6. Added swish++.conf(4) to SEE ALSO section for feature CONF. * man/man4/Makefile 1. Corresponding change to swish++.index.4 item #1. 2. Added swish++.conf.4 for feature CONF. * man/man4/swish++.index.4 1. This file was renamed from swish++.4. * search.c 1. Performed following substitution: s/bool stem_words/StemWords stem_words/ for feature CONF. 2. Corresponding change as html.h #1. 3. Corresponding change to extract.c item #C. 4. Corresponding change to extract.c item #5. 5. Corresponding change to extract.c item #A. * stop_words.c 1. Added local static variable to constructor. 2. Corresponding change to extract.c item #C. * stop_words.h 1.
Removed private static data member. * swish++.conf 1. Added template configuration for feature FFF. * token.c 1. Performed following substitution: s/fake_ansi.h/platform.h/ since bool is now required. * util.c 1. Moved stat_buf here from directory.h. 2. Added parse_config_file() for feature CONF. * util.h 1. Corresponding change to util.c item #1. 2. Moved file test functions here from directory.h. 3. Corresponding change to util.c item #2. * version.h 1. Updated version to "3.0". * word_index.c 1. Removed #include "fake_ansi.h" since bool is now required. * word_index.h 1. Removed #include "fake_ansi.h" since bool is now required. * word_info.h 1. Corresponding change as html.h #1. ******************************************************************************* 2.0.1 ******************************************************************************* BUG FIXES --------- * The code parsed HTML attributes inside HTML comments. This is (obviously) the wrong thing to do. HTML comments declarations are now really, really ignored. Honest. (This bug fix will be known as bug fix ACP.) * The code parsed HTML attributes inside <!DOCTYPE ...> declarations. This is also (obviously) the wrong thing to do. <!DOCTYPE...> declarations are now also ignored. (This bug fix will be known as bug fix EXP.) * The set of HTML end tags that close some HTML elements was incomplete. (This bug fix will be known as bug fix HC1.) CHANGES, file-by-file --------------------- * elements.c 1. For the <colgroup> element, added <colgroup> for bug fix HC1. 2. For the <td> element, added <tbody>, </tbody>, </td>, <tfoot>, </tfoot>, <tr>, and </tr> for bug fix HC1. 3. For the <tfoot> element, added <tbody> and <thead> for bug fix HC1. 4. For the <th> element, added <tbody>, </tbody>, <tfoot>, </tfoot>, </th>, <tr>, and </tr> for bug fix HC1. 5. For the <thead> element, added <tbody> and <tfoot> for bug fix HC1. 6. For the <tr> element, added <tbody>, </tbody>, <tfoot>, </tfoot>, and </thead> for bug fix HC1.
* html.c 1. In parse_html_tag(), added "if ( ... ) return;" around call to skip_html_tag() for bug fix ACP. 2. In parse_html_tag(), added check to see if first character of an HTML tag is '!' for bug fix EXP. 3. In skip_html_tag(), changed return type to "bool" and added "return" statements for bug fix ACP. * version.h 1. Updated version to "2.0.1". ******************************************************************************* 2.0 ******************************************************************************* NEW FEATURES ------------ * SWISH++ can now selectively not index text in HTML files within HTML elements that are members of specified classes. (This feature will be known as feature CLASS.) * The 'search' command now offers optional stemming. Indexing is unaffected. (This feature will be known as feature STEM.) * In all earlier versions, the number of total words reported was actually the total number of words indexed; now, it is the total number of words parsed and the former "total words" is now reported as the number of words indexed. (This feature will be known as feature NTW.) * The 'search' command now outputs an additional comment "results" followed by the total number of search results. Additionally, there is a new -R command- line option to print this alone. (This feature will be known as feature PRC.) CHANGES, file-by-file --------------------- * elements.c * elements.h 1. Added these files for feature CLASS. * html.c 1. Added #include "elements.h" for feature CLASS. 2. Added extern references to no_index_class_names and no_index_class_count corresponding to index.c #1. 3. Performed the following substitution: s/to_upper/to_lower/ to eliminate the to_upper() function entirely. 4. In grep_title(), performed the following substitution: s/TITLE/title/ so we can eliminate the to_upper() function entirely. 5. In parse_html_tag(), corresponding change for html.h #1. 6. In parse_html_tag(), added code for feature CLASS. * html.h 1. 
For parse_html_tag() function, added: bool is_new_file = false for feature CLASS. * index.c 1. Added global variables: string_set no_index_class_names; int no_index_class_count; for feature CLASS. 2. Added global variable: long num_indexed_words; for feature NTW. 3. In main(), added -C option for feature CLASS. 4. In main(), added code to print num_indexed_words for feature NTW. 5. In index_word(), performed following substitution: s/num_total_words/num_indexed_words/ for feature NTW. 6. In index_word(), added new: ++num_total_words; for feature NTW. 7. In index_word(), added code to test no_index_class_count for feature CLASS. 8. In index_words(), added: static bool new_file; variable for feature CLASS. 9. In usage(), added description for -C option for feature CLASS. A. In merge_indicies(), changed write-header code to neither allocate nor write the offsets for stop words or meta names if there are zero of them. B. In rank_full_index(), added check to see if there are no indexed words: if not, return. C. In write_full_index(), added check to see if there are no indexed words: if not, return. D. In write_full_index(), changed write-header code to neither allocate nor write the offsets for stop words or meta names if there are zero of them. * Makefile 1. Added -DDEBUG_parse_class for feature CLASS. 2. Added elements.o object for feature CLASS. 3. Added -DDEBUG_stem_word for feature STEM. 4. Added target for stem_word.o for feature STEM. * man/man1/index.1 1. Added description for -C option and examples for feature CLASS. * man/man1/search.1 1. Added description of new stemming option for feature STEM. 2. Added description of new -R option for feature PRC. * search.c 1. Added global variable: bool stem_words; for feature STEM. 2. In main(), performed following substitution: s/dDi:m:Ms:SVw:/dDi:m:Mr:RsSVw:/ for features PRC and STEM. 3. In main(), changed what was option 's' to option 'r' and added a new option 's' for feature STEM. 4. 
In main, added a new -R option for feature PRC. 5. In parse_primary(), added "less_stem" object to word_token case as well as having two exclusive calls to binary_search() and equal_range() depending upon stem_words for feature STEM. 6. In usage(), corresponding changes to items #3 and #4. * stem_word.c * stem_word.h 1. Added these files for feature STEM. * postscript.h 1. Added more comments. * util.c 1. Moved is_vowel() function to util.h and made it so that it does not call tolower(). 2. In is_ok_word(), performed following substitution: s/is_vowel( *c )/is_vowel( tolower( *c ) )/ corresponding to item #1. 3. In ltoa() and to_lower(), made use of new char_buffer_pool class. * util.h 1. Added char_buffer_pool class since its functionality is being used 3 times now. 2. Moved is_vowel() function here from util.c. 3. Added lots more comments. * version.h 1. Updated version to "2.0". ******************************************************************************* 1.7 ******************************************************************************* NEW FEATURES ------------ * Since version 1.4, SWISH++ indexed the text in the ALT attributes of AREA and IMG elements. SWISH++ now adds a few attributes. The complete set is: Attribute Element --------- ------- TITLE any ALT AREA, IMG, INPUT STANDBY OBJECT SUMMARY TABLE (This feature will be known as IEA.) * Added Word_Min_Vowels to config.h so vowel checks can be disabled (or made more stringent). (This feature will be known as feature WMV.) BUG FIXES --------- * When a given word appeared through many files, its ranks came out rather "flat" in the search results. This has been fixed. (This bug fix will be known as bug fix 10K.) CHANGES, file-by-file --------------------- * config.h 1. Added Word_Min_Vowels definition for feature WMV. * extract.c 1. Split out function extract_word() from do_file() to parallel changes in index.c. 2. Moved 'in_postscript' variable to be at file scope due to #1. 3.
In do_file(), added missing 'const' to declaration: static ext_proc_map const ext_procs; It should have been there all along. * html.c 1. Added declarations for find_attribute(), skip_html_comment() and skip_html_tag() to the top of the file. They should have been there all along. 2. In convert_entity(), added missing 'const' to declaration: static char_entity_map const char_entities; It should have been there all along. 3. Modified find_attribute() so that the 'begin' and 'end' iterators are touched only if the attribute is found. 4. Split out function skip_html_tag() from parse_html_tag() because it's cleaner that way. 5. In parse_html_tag(), was able to eliminate the 'parse_elements' parameter due to #4. 6. In parse_html_tag(), added code for feature IEA. * html.h 1. Corresponding change for html.c #5. * index.c 1. Split out function index_word() from index_words() because it's cleaner that way. 2. Performed following substitution: s/1000.0/10000.0/ for bug fix 10K. 3. In usage(), performed following substitution for the -M option: s/in index/to index/ * man/man1/index.1 1. Additions for feature IEA. * util.c 1. In is_ok_word(), added Word_Min_Vowels for feature WMV. 2. In is_ok_word(), deleted 'consonants' variable since it wasn't being used. 3. Redid to_lower() function to use multiple buffers. 4. Overloaded to_lower() function to take a pair of iterators. * util.h 1. Corresponding change for util.c #4. * version.h 1. Updated version to "1.7". ******************************************************************************* 1.6 ******************************************************************************* NEW FEATURES ------------ * The value of the CONTENT attribute for META elements can now selectively be indexed based on the value of the NAME attribute, either by explicit inclusion or exclusion. (This feature will be known as feature MIE.)
* The WWW Perl library has a new function, extract_meta(), that can extract the value of the CONTENT attribute from a META element having a given NAME attribute from an HTML file. This can be used to display meta information in search results, e.g., for a given search result, also display its author, publication date, etc. (This feature will be known as feature EMC.) BUG FIXES --------- * If parentheses were used in conjunction with 'not' in a query involving meta names, it didn't work, e.g.: search author = not hawking worked as expected, but: search author = not ( hawking ) didn't even though it is (supposed to be) equivalent. (This bug fix will be known as bug fix MNP.) CHANGES, file-by-file --------------------- * html.c 1. Added #include "my_set.h" for feature MIE. 2. At global scope, added declarations: extern string_set exclude_meta_name, include_meta_names; for feature MIE. 3. In function parse_html_tag(), added code for feature MIE. * index.c 1. Added declarations: string_set exclude_meta_name; string_set include_meta_names; for feature MIE. 2. In main(), added "m:M:" to opts[] and cases for 'm' and 'M' command-line options for feature MIE. 3. In usage(), added explanation of -m and -M options for feature MIE. * Makefile 1. Added dependency of my_set.h to html.o for feature MIE. * man/man1/index.1 1. Added description of new -m and -M command-line options for feature MIE. * man/man3/WWW.3 1. Added description for extract_meta() function for feature EMC. * search.c 1. Added "int = no_meta_id" to declarations and definitions of parse_meta() and parse_query() functions for bug fix MNP. 2. In parse_primary()'s lparen_token case, added "meta_id" to recursive call of parse_query() for bug fix MNP. * version.h 1. Updated version to "1.6". * WWW.pm 1. Added extract_meta() function for feature EMC. 2. Rewrote extract_description() in terms of extract_meta(). 
******************************************************************************* 1.5.1 ******************************************************************************* NEW FEATURES ------------ * Both 'index' and 'extract' now have a new verbosity level 4 that prints filenames that are not indexed or extracted, respectively, and why. (This feature was added to help fix bug fix HTH.) (This feature will be known as feature IEV4.) * The 'httpindex' script's -v option now works exactly like that of 'index'. (This feature will be known as feature HTV.) BUG FIXES --------- * META attribute name parsing had a bug where the find_attribute() function could occasionally run past the 'end' of where it was supposed to look. (This bug fix will be known as bug fix FAE.) * The 'httpindex' script would hang if it told 'index' to index a file and, for whatever reason, 'index' couldn't since 'index' would silently skip the file. (This bug fix will be known as bug fix HTH.) * The WWW::extract_description() function returned the first $description::chars characters of a file untouched if the file did not end with one of the filename extensions matched by the pattern /\.(?:[a-z]?html?|txt)$/i. What it should do is return a null description. (This bug fix will be known as bug fix EDN.) * The 'httpindex' script didn't test the extracted description to see if it is null: if it is, it should not attempt to overwrite the original file with the description and instead just delete the file. (This bug fix will be known as bug fix HTND.) CHANGES, file-by-file --------------------- * extract.c 1. In main(), changed upper-bound for verbosity to 4 for feature IEV4. 2. In do_file(), added additional print statements for feature IEV4. 3. In usage(), changed message to show verbosity range as 0-4 for feature IEV4. * html.c 1. In find_attribute(), made it correctly skip attribute names that don't match for bug fix FAE. 2. 
In find_attribute(), made it so that 'c' is never incremented past 'end' (as it sometimes incorrectly was) for bug fix FAE. * httpindex.in 1. Performed following substitution: s/-v3/-v4/ for bug fix HTH. 2. Added code to test the extracted description to see if it is null for bug fix HTND. 3. If a file can not be overwritten with its description (using the -d option), a warning is now merely issued rather than dying as in version 1.5. 4. Added code for feature HTV. * index.c 1. Same as extract.c #1. 2. Same as extract.c #2. 3. Same as extract.c #3. * man/man1/extract.1 * man/man1/index.1 1. Updated description for feature IEV4. * version.h 1. Updated version to "1.5.1". * WWW.pm 1. Added a "default case" to WWW::extract_description() for bug fix EDN. ******************************************************************************* 1.5 ******************************************************************************* NEW FEATURES ------------ * A new command, httpindex, has been added to assist in indexing files on remote servers. (This feature will be known as feature HTTP.) BUG FIXES --------- * The regular expressions in extract_description() in WWW.pm had some bugs. (This bug fix will be known as bug fix WRE.) * The ignore stop words feature (feature ISW) added in version 1.2 that was broken, fixed, and fixed again is being fixed yet again so that ignored words are reported even if there are no other results. (This bug fix will be known as bug fix ISW4.) CHANGES, file-by-file --------------------- * config/config.mk 1. Added PERL variable for feature HTTP. * config/config.pl 1. This Perl configuration script was added for feature HTTP. * extract.c 1. Moved #include <dirent.h> after <sys/types.h> for BSD systems. * file_vector.c 1. Modified the behavior of file_vector<T> not to return an error if the file being mapped is of zero length for feature HTTP. * httpindex.in 1. This Perl 5 script was added for feature HTTP. * index.c 1.
In do_file(), added a check to skip an empty file since file_vector<T> now opens empty files for feature HTTP. 2. Moved #include <dirent.h> after <sys/types.h> for BSD systems. * INSTALLATION 1. A third prerequisite of Perl 5 was added for feature HTTP. 2. A fourth prerequisite of wget was added for feature HTTP. * Makefile 1. A target was added for httpindex for feature HTTP. 2. The Makefile now also installs WWW.pm since it is required by httpindex. * man/man1/httpindex.1 1. Added this manual page for feature HTTP. * man/man1/Makefile 1. Added targets for new httpindex.1 manual page for feature HTTP. * search.c 1. In main(), moved: if ( skip_results >= results.size() ) return 0; past the code that prints the stop words for bug fix ISW4. * version.h 1. Updated version to "1.5". * WWW.pm 1. This file has been moved up from the subdirectory www_example. 2. In extract_description(), the regular expressions to extract the META NAME=description descriptions had missing '?'s added for bug fix WRE. 3. In extract_description(), the regular expression to remove a trailing ALT attribute was fixed for bug fix WRE. ******************************************************************************* 1.4.1 ******************************************************************************* BUG FIXES --------- * The META names that words were associated with were completely wrong. It worked in a small number of test cases (my original test cases -- figures), but not in the general case. (This bug fix will be known as bug fix MID.) * In 1.4, a given word could be associated with at most 1 meta name per file. This limitation was an oversight. It has been corrected. (This bug fix will be known as bug fix MMM.) CHANGES, file-by-file --------------------- * extract.c 1. Performed following substitution: s/string_set.h/my_set.h/ * file_list.c 1. Performed following substitution: s/meta_index/meta_id/ for bug fix MID. 2. Added code to read multiple meta-IDs for bug fix MMM. * file_list.h 1.
Changed declaration of file_list::value_type to be simply word_info::file since the structures are the same. * file_list.c 1. Same as file_list.c #1. * html.h 1. Same as file_list.c #1. * index.c 1. Same as file_list.c #1. 2. Same as extract.c #1. 3. Added remove_tmp_files() function and set it to be called via atexit() so that temporary files are removed even if the program terminates prematurely. 4. In index_words(), added code to add multiple meta-IDs for bug fix MMM. 5. In merge_indices(), added code to write multiple meta-IDs for bug fix MMM. 6. In write_meta_name_index(), added code to write out the numeric ID for META name for bug fix MID. 7. In write_word_index(), same as #4. * index.h 1. Same as file_list.c #1. * less.h 1. Added explicit default constructor since g++ 2.8.0 complains if it isn't there and you try to define a "const less" object. * my_set.h 1. This file was renamed from string_set.h. 2. Made string_set class generic for any type T since we now use a set< short > in word_info::file. 3. Changed declaration of string_set to be simply: typedef my_set< char const* > string_set; * postscript.h 1. Same as extract.c #1. * search.c 1. Same as extract.c #1. 2. Same as file_list.c #1. 3. In dump_single_word(), performed following substitution: s/less< char const* >/less< char const* > const/ since it can be const (and everything that can be const should be). 4. In dump_word_window(), same as #3. 5. Added get_meta_id() function for bug fix MID. 6. In parse_meta(), performed following substitution: s/::distance( meta_names.begin(), found.first ) /get_meta_id( found.first )/ for bug fix MID. 7. In parse_meta(), same as #3. 8. In parse_primary(), same as #3. 9. In parse_primary(), added code in while loop at end of function to check all meta-IDs associated with a word for bug fix MMM. * stop_words.h 1. Same as extract.c #1. * string_set.h 1. This file was renamed to my_set.h. (See it for additional changes.) * version.h 1. Updated version to "1.4.1".
* word_info.c 1. Same as file_list.c #1. * word_info.h 1. Changed word_info::file struct to include a set of meta-IDs for bug fix MMM. 2. Changed word_info::file struct to use shorts for occurrences and rank to conserve memory (since additional memory is now being taken up by the set of meta-IDs). 3. Gave the word_info::file struct 3 separate constructors so the minimal amount of code is executed depending on how an object is constructed. ******************************************************************************* 1.4 ******************************************************************************* NEW FEATURES ------------ * SWISH++ now indexes and can search META data. (This feature will be known as feature META.) * SWISH++ now indexes the words in ALT attributes within AREA and IMG elements. (This feature will be known as feature ALT.) * SWISH++ can now index files and directories specified via standard input instead of via the command line. When doing this, extensions of files to index need not explicitly be specified via the -e option, i.e., 'index' assumes you know what you're doing when specifying filenames. (This feature will be known as feature ISI.) * For both 'index' and 'extract', a new -r command line option was added to suppress recursively indexing files in subdirectories. This option is most useful in conjunction with the new ISI feature. (This feature will be known as feature CLR.) * Added an optimization option for determining whether a character is a "word character" by eliminating the call to strchr() in is_word_char(). This yields about a 10% performance improvement during indexing. (This feature will be known as feature WCO.) * The code for the 'index' was profiled and a couple of performance tweaks were made yielding about a 7% performance improvement. (This feature will be known as feature PPT.)
BUG FIXES --------- * A small bug whereby the last word of a file was not indexed if the last line didn't end in a newline (or a whitespace character in general) was fixed. (This bug fix will be known as bug fix ILW.) CHANGES, file-by-file --------------------- * config.h 1. Added OPTIMIZE_WORD_CHARS, OPTIMIZE_WORD_BEGIN_CHARS, and OPTIMIZE_WORD_END_CHARS for feature WCO. * directory.c 1. Added a comment regarding do_file(). 2. Added check of new global "recurse_subdirectories" variable for feature CLR. * entities.h 1. Added a comment regarding the use of "less< key_type >" with the map. * ext_proc.h 1. Performed following substitution: s/map_type::const_iterator i/map_type::const_iterator const i/ It should have been that way all along. * extract.c 1. In main(), added code for feature ISI. 2. In main(), added handling of new -r option for feature CLR. 3. In do_file(), redid main 'while' loop and added 'if's for bug fix ILW. 4. In do_file(), replaced calls to strchr() with new is_word_begin_char() and is_word_end_char() functions for feature WCO. 5. In usage(), added missing description for -E option. 6. In usage(), added description for new -r option for feature CLR. * file_info.h 1. Added current_file() function for feature META. * file_list.c 1. In operator++(), added meta-index parsing code for feature META. * file_list.h 1. Added value_type::meta_index data member for feature META. * html.c 1. Added #include "index.h" and #include "meta_map.h" for features ALT and META. 2. Throughout the entire file, improved the SEE ALSO references, added URLs. 3. Added find_attribute() for features ALT and META. 4. Performed following substitution: s/skip_html_tag/parse_html_tag/ for features ALT and META. 5. Added parse_elements parameter to parse_html_tag() and code to parse ALT attributes and META elements for features ALT and META. * html.h 1. Added definitions of no_meta_index and meta_index_not_found for feature META. 2. 
Performed following substitution: s/skip_html_tag/parse_html_tag/ for feature META. * index.c 1. Added #include "index.h" and #include "meta_map.h" for feature META. 2. Added definition of meta_names for feature META. 3. In main(), added code for feature ISI. 4. In main(), added handling of new -r option for feature CLR. 5. Refactored do_file() by splitting out the actual word indexing part into a new function index_words() for feature META. The index_words() function is now also called by parse_html_tag() to index the words in the CONTENT attribute of META elements. 6. In do_file(), replaced 3 function calls to strcmp() to see if a file is an HTML file with a call to a new, inlined is_html_ext() function for feature PPT. 7. In index_words(), redid main 'while' loop and added 'if's for bug fix ILW. 8. In index_words(), added 'if' so as not to parse '<' as the start of an HTML tag if meta_index >= 0 for feature META. 9. In index_words(), replaced calls to strchr() with new is_word_begin_char() and is_word_end_char() functions for feature WCO. A. In merge_indices(), added code to write meta index for feature META. B. In merge_indices(), redid code for writing the word index to use low ASCII characters as separators for feature META. C. Replaced a lot of 'for' loops iterating over an entire sequence with a new FOR_EACH or TRANSFORM_EACH macro. (I got tired of typing.) D. In rank_full_index(), moved the code to compute the ranks AFTER the tests to see whether a word occurs too frequently. It was originally placed before since file_count needed to be calculated, but I realized this is known ahead of time as simply info.files_.size(). E. Added write_meta_name_index() for feature META. F. In write_full_index(), added call to write_meta_name_index() for feature META. G. In write_word_index(), redid code for writing the word index to use low ASCII characters as separators for feature META. H. In usage(), added description for new -r option for feature CLR. * index.h 1.
Added this new file for feature META. * Makefile 1. Added new dependencies for feature META. * man/Makefile 1. Added missing "pdf" target. * man/man1/extract.1 1. Added description of new -r option for feature CLR. 2. Added description of feature ISI. * man/man1/index.1 1. Added description of META element indexing for feature META. 2. Improved references, added URL. 3. Added description of new -r option for feature CLR. 4. Added description of feature ISI. * man/man1/search.1 1. Added description and examples of META element searching for feature META. * man/man4/swish++ 1. Modified description of index file format for feature META. * meta_map.h 1. Added this file for feature META. * search.c 1. Added #include "html.h" for feature META. 2. Added definition of meta_names for feature META. 3. In main(), added dump_meta_names and new -M command line option for feature META. 4. In main(), used new enum for calls to word_index::set_index_file(). 5. Replaced a lot of 'for' loops iterating over an entire sequence with a new FOR_EACH or TRANSFORM_EACH macro. (I got tired of typing.) 6. In dump_word_window(), added missing description for 'match' parameter. 7. In parse_query(), performed following substitution: s/parse_primary/parse_meta/ for feature META. 8. Added parse_meta() function for feature META. 9. In parse_primary(), added meta_index parameter for feature META. A. In parse_primary(), added code to add words to result only if the meta-name matches for feature META. B. In usage(), added description of new -M option for feature META. * stop_words.c 1. In stop_word_set::stop_word_set(), redid main 'while' loop and added 'if's for bug fix ILW. * token.c 1. Reworked token::hold() to accommodate more than one put_back() in a row for feature META since parse_meta() requires two look-ahead tokens. 2. Added new case for the '=' token for feature META. * token.h 1. Added equal_token for feature META. 2. Corresponding change to token.c item #1. * util.c 1.
In is_ok_word(), performed following substitution: s/int const len = ::strlen( word )/int const len = c - word/ for feature PPT. 2. In to_lower(), replaced call to transform() with simple while loop for feature PPT. * util.h 1. Added new FOR_EACH and TRANSFORM_EACH macros since I got tired of typing. 2. Added new is_html_ext() function for feature PPT. 3. Redid is_word_char() function for feature WCO. 4. Added is_word_begin_char() and is_word_end_char() functions for feature WCO. * version.h 1. Updated version to "1.4". * word_index.h 1. Added enum for word indices to word_index class. * word_info.h 1. Added #include "html.h" for feature META. 2. Added word_info::file::meta_index_ data member and modified constructor accordingly for feature META. ******************************************************************************* 1.3.2 ******************************************************************************* BUG FIXES --------- * The ignore stop words feature (feature ISW) added in version 1.2 was slightly broken in 1.2.1; it was "fixed" in 1.2.2 (bug fix ISW2), but not quite in that if the left hand side of a query was ignored, the whole thing was. (This bug fix will be known as bug fix ISW3.) * In 'index', the check for whether filename extensions were supplied was too early in the code so the -S option didn't work. (This bug fix will be known as bug fix CFE.) CHANGES, file-by-file --------------------- * config/man.mk 1. Made "make dist" make the manual pages in PDF format in addition to text format. * index.c 1. Relocated code to check whether filename extensions were supplied for bug fix CFE. 2. In main(), used an ostream_iterator() to dump stop words. 3. In do_file(), split tests for stop-words into two separate 'if' statements so to_lower() isn't called unless absolutely necessary. * search.c 1. In parse_query(), redid ignore-handling code for bug fix ISW3. 2. In main(), used an ostream_iterator() to dump stop words. * stop_words.c 1.
Added stop-words: billions, eighteen, fifteen, fourteen, millions, ninteen, second, seconds, seventeen, sixteen, tens, third, thirteen, trillions. * util.c 1. In is_ok_word() on line 192, changed floating point calculation to integer by multiplying LHS by 100 to increase performance. 2. On the same line, performed the following substitution: s/>=/>/ so the code matches the documentation that says, "... contains more than a third capital letters ..." * version.h 1. Updated version to "1.3.2". * www_example/search.cgi 1. Removed extraneous 'o' (optimize) options from regular expressions. * www_example/WWW.pm 1. Added GNU Public License notice at top. 2. In trim_whitespace(), used map() rather than a for loop. 3. Removed extraneous 'o' (optimize) options from regular expressions. ******************************************************************************* 1.3.1 ******************************************************************************* BUG FIXES --------- * Unbeknownst to me, I introduced a bug in 1.2.2 that broke wildcard searches. (Doh!) This has been fixed. (This bug fix will be known as bug fix WCF.) CHANGES, file-by-file --------------------- * man/man1/search.1 1. Make it explicitly clear that wildcards are not permitted for the -d and -w options. * token.c 1. Moved the line: ::strcpy( t.lower_buf_, to_lower( t.buf_ ) ); before: if ( t.type_ ) return in; for bug fix WCF. * version.h 1. Updated version to "1.3.1". ******************************************************************************* 1.3 ******************************************************************************* NEW FEATURES ------------ * In "search," a "window" of words can be dumped around the query words. (This feature will be known as feature DWW.) * In "search," the -d option to dump the index for a word now dumps all the query words instead of a single word. Additionally, a stop-word used to print "stop-word"; now it prints "# ignored: " followed by the word.
(This feature will be known as feature DQW.) * In "search," the -d option to dump the index for a word now prints the comment: # not found: word if 'word' is not found in the index. (This feature will be known as feature NFW.) CHANGES, file-by-file --------------------- * directory.c 1. Changed order of #include's putting direct.h last so that it compiles OK under FreeBSD 2.2.7. * man/man1/search.1 1. Corresponding changes for features DWW, DQW, and NFW. * search.c 1. In main(), performed the following substitutions: s/char const *dump_word/bool dump_word_index/ s/dump_word/dump_word_index/ s/d:Di:m:s:SV/dDi:m:s:SV/ for feature DQW. 2. In main(), performed the following substitution: s/dDi:m:s:SV/dDi:m:s:SVw:/ 3. In main(), performed the following substitution: s/dump_entire_index || dump_stop_words || dump_word /dump_entire_index || dump_stop_words/ for feature DQW since the -d option no longer takes an argument. 4. In main(), added code to handle new -w option for feature DWW. 5. In main(), added 'while' loop to code to dump multiple words for feature DQW. 6. In dump_single_word(), performed following substitution: s/"stop-word"/"# ignored: " << word/ for feature DQW. 7. In dump_single_word(), added printing of new "not found" comment key for feature NFW. 8. Added function dump_word_window() for feature DWW. 9. In usage(), performed following substitution: s/-d word/-d/ for feature DQW. A. In usage(), added text for new -w option for feature DWW. * version.h 1. Updated version to "1.3". ******************************************************************************* 1.2.2 ******************************************************************************* NEW FEATURES ------------ * A heuristic was added not to index a word if it contains more than a threshold number of consecutive punctuation characters. (This feature will be known as feature MCP.) * Files can now be indexed by exclusion of filename extensions rather than by inclusion via a new -E command-line option. 
(This feature will be known as feature EFE.) BUG FIXES --------- * The ignore stop words feature (feature ISW) added in version 1.2 was slightly broken in 1.2.1 in that the list of ignored words was no longer reported. (This bug fix will be known as bug fix ISW2.) CHANGES, file-by-file --------------------- * config.h 1. Performed following substitution: s/Word_Hex_Min_Size/Word_Hex_Max_Size/ The original name was inconsistent with the other parameters. 2. Added "Word_Max_Consec_Puncts" for feature MCP. * config.mk 1. Performed following substitution: s/install.sh/install-sh/ so some versions of make don't get confused with the .sh suffix and try to build it. * extproc.c 1. Added definitions for WEXITSTATUS and WIFEXITED if not defined on a particular system. * extract.c 1. Corresponding change for config.h item 1. 2. Performed following variable substitution: s/extensions/include_extensions/ for feature EFE. 3. Added variable exclude_extensions for feature EFE. 4. In main(), added code to handle new -E option for feature EFE. 5. In do_file(), added check against new exclude_extensions variable for feature EFE. 6. In usage(), added text for new -E option for feature EFE. * index.c 1. Performed following variable substitution: s/extensions/include_extensions/ for feature EFE. 2. Added variable exclude_extensions for feature EFE. 3. In main(), added code to handle new -E option for feature EFE. 4. In do_file(), added check against new exclude_extensions variable for feature EFE. 5. In usage(), added text for new -E option for feature EFE. * man/man1/extract.c 1. Changed description for feature EFE. * man/man1/index.c 1. Changed description for feature MCP. 2. Changed description for feature EFE. * search.c 1. Deleted is_stop_word() function for bug fix ISW2. 2. In dump_single_word(), added code formerly in is_stop_word() here for bug fix ISW2. 3. In parse_primary(), added code formerly in is_stop_word() here for bug fix ISW2. * token.c 1. 
Changed token so that it is not converted to all lower-case for bug fix ISW2. Previously, acronyms were not recognized in lower case and keywords ("and," "or," and "not") were not recognized in upper case. 2. Added code to make a copy of the token string in all lower case. This is still needed for stop-word determination. * token.h 1. Added second buffer to hold all-lower-case version of token text for bug fix ISW2. * util.c 1. In is_ok_word(), added code for feature MCP. * version.h 1. Updated version to "1.2.2". ******************************************************************************* 1.2.1 ******************************************************************************* NEW FEATURES ------------ * In "search," the original -d option that used to dump the entire index now dumps the index entry for a single word. Correspondingly, a new -D option now does what -d used to do. (This feature will be known as feature DSW.) * In "search," the dump of the index entries now includes the rank. (This feature will be known as feature DIR.) BUG FIXES --------- * Numeric entity references were not converted to their ASCII equivalents. (I don't know how I missed this.) (This bug fix will be known as bug fix NER.) * A search query that contained only stop-words returned all files (up to the specified limit or default maximum). (This bug fix will be known as bug fix RSW.) CHANGES, file-by-file --------------------- * config/config.mk 1. Added comment at top to remind people that they must do a "make distclean" before recompiling if they change any definitions. 2. Performed the following substitution: s!/usr/ucb/install!$(ROOT)/install.sh! * entities.c 1. Added num_entities[] for bug fix NER. 2. Performed following substitutions: s/entity_map/char_entity_map/ s/entity/char_entity/ s/entity_name/name/ so as to distinguish them from the newly-added num_entities[]. 3. Added: "ETH", 'D', "eth", 'd', to char_entity_table[]. * entities.h 1. 
Added: extern char const num_entities[ 256 ]; for bug fix NER. 2. Corresponding changes for entities.c item 2. * html.c 1. Corresponding changes for entities.c item 2. 2. Made use of new num_entities[] for bug fix NER. * index.c 1. On line 400, performed the following substitution: s/*lower_word/*const lower_word/ It should have been that originally. * install.sh 1. Created this shell script to use for installs instead of having to rely on the OS having a "Berkeley-esque" install command. * man/man1/search.1 1. Changed description for feature DSW. * search.c 1. Added new find_result_type typedef. 2. Added a new dump_single_word() function for feature DSW. 3. Added "bool &ignore" parameter to parse_query() and parse_primary() functions for bug fix RSW. 4. Factored out code that determines whether a word was indexed or not into a new function is_stop_word(). 5. In main(), added code for feature DSW. 6. In main(), added code for feature DIR. 7. In parse_query(), added code to ignore stop-words properly for bug fix RSW. 8. On line 423, performed following substitution: s/iterator/const_iterator/ It should have been that originally for "const correctness." 9. In parse_primary(), made use of new is_stop_word() function corresponding to item 4. A. In parse_primary(), added code under "not" case to check to see whether the primary should be ignored for bug fix RSW. B. In usage(), added text to usage message for feature DSW. * util.c 1. On line 65, performed following substitution: s/STATIC_CAST/REINTERPRET_CAST/ It should have been that originally. * version.h 1. Updated version to "1.2.1". * www_example/search.cgi 1. Added "&'" characters to those that are not stripped from the query. 2. Added: next if /^#/; so as to ignore comments we know nothing about that future releases of SWISH++ may emit. 
******************************************************************************* 1.2 ******************************************************************************* NEW FEATURES ------------ * SWISH++ now stores the list of stop-words in the generated index file so they can be ignored on searches later. Previously, using a stop-word in a query would always yield 0 results since the stop-word isn't in the index. After thinking about it, this is just plain stupid. (This feature will be referred to as feature ISW.) * You can now specify the number of files to reserve space for on the command line for "index" overriding the default. (This feature will be referred to as feature ICF.) * You can now specify the number of lines to look into a file for HTML <TITLE> tags on the command line for "index" overriding the default. (This feature will be referred to as feature ICt.) * Added default values to usage messages. (This feature will be referred to as feature UDV.) BUG FIXES --------- * The detection of malformed queries was completely broken. I don't see how this went undetected for this long. (This bug fix will be referred to as bug fix DMQ.) * In the example WWW.pm Perl library, not all the "Unix-unfriendly" characters were stripped from filenames upon upload. (This bug fix will be referred to as bug fix UUC.) CHANGES, file-by-file --------------------- * config.h 1. Performed following substitutions: s/Title_Lines/Title_Lines_Default/ for feature ICt. 2. Performed following substitutions: s/Files_Default/Files_Reserve_Default/ for feature ICF. 3. Added: Index_Filename_Default * extract.c 1. Added code to usage() for feature UDV. 2. In do_file(), added code to check whether a word is a stop-word explicitly for feature ISW. This corresponds to the change for util.c item 2. * file_index.c 1. Moved index file header parsing code into a new function get_index_info() for feature ISW. 2. Added: #include "util.h" * file_info.c 1.
Added: extern int num_files_reserve; for feature ICF. 2. Performed following substitution: s/Files_Default/num_files_reserve/ for feature ICF. * html.c 1. Added: extern int num_title_lines; for feature ICt. 2. Performed following substitution: s/Title_Lines/num_title_lines/ * index.c 1. Added write_stop_word_index() function for feature ISW. 2. Added: int num_files_reserve = Files_Reserve_Default; for feature ICF. 3. Added: int num_title_lines = Title_Lines_Default; for feature ICt. 4. Performed following substitutions: s/total_words/num_total_words/ s/unique_words/num_unique_words/ 5. Performed following substitution: s/"the.index"/Index_Filename_Default/ 6. Added 'F' option to command line parsing code and usage message for feature ICF. 7. Added 't' option to command line parsing code and usage message for feature ICt. 8. In do_file(), added code to check whether a word is a stop-word explicitly for feature ISW. This corresponds to the change for util.c item 2. 9. In merge_indices(), removed extra_stop_words and am now using stop_words since they all have to be written to the index file together. This was done for feature ISW. A. In merge_indices(), added code to write additional header information for the stop-words. B. In rank_full_index(), now add computed stop-words to global set so they can all be written to the index file together. This was done for feature ISW. C. In write_full_index(), added code to write additional header information for the stop-words. D. Added code to usage() for feature UDV. * Makefile 1. Added new dependencies for feature ISW. * man/man1/index.1 1. Added description of new option for feature ICF. 2. Added description of new option for feature ICt. * man/man1/search.1 1. Added description of comments "search" outputs for feature ISW. 2. Added description of new -S option for feature ISW. * man/man4/swish++.4 1. Added description of new index file format for feature ISW. * search.c 1. 
Added definitions: word_index stop_words; string_set stop_words_found; for feature ISW. 2. Performed following substitution: s/"the.index"/Index_Filename_Default/ 3. In main(), added code for new -S option to dump the stop- words from an index file. This was done for feature ISW. 4. In main(), added test of EOF for the query_stream to ensure the entire query is parsed successfully for bug fix DMQ. 5. In main(), added code to output stop-words ignored in the query for feature ISW. 6. In parse_query(), changed: if ( !parse_primary( query, temp1 ) ) break; to: if ( !parse_primary( query, temp1 ) ) return false; for bug fix DMQ. 7. In parse_optional_relop(), changed code in default case by adding a check for a ')' token for bug fix DMQ. 8. In parse_primary(), added code to search stop-words for a word in a query and ignore it for feature ISW. 9. In parse_primary() for the lparen_token case, performed following substitution: s/lparen_token/rparen_token/ for bug fix DMQ. A. Added code to usage() for feature UDV. * stop_words.c 1. Added "let's". 2. In constructor, changed use of "new" to "strdup". 3. Change corresponding to util.c item 3. * util.c 1. Added function get_index_info() to extract number of offset information of an index file for feature ISW. 2. In is_ok_word(), removed check for stop-words for feature ISW. The calling code must now check for stop-words itself. This was necessary because "search" checks for stop-words differently than either "index" or "extract" does. 3. Added function: char const *to_lower( char const *s ) for feature ISW. 4. Added "missing": #include <cstring> * util.h 1. Change corresponding to util.c item 1. 2. Change corresponding to util.c item 2. * version.h 1. Updated version to "1.2". * word_index.c 1. Added int parameter since a word_index is now used for both the regular word index (0) and the new stop-word index (1). 2. Moved index file header parsing code into a new function get_index_info() in util.c for feature ISW. 
* word_index.h 1. Added int parameter to both constructor and set_file_index() since a word_index is now used for both the regular word index (0) and the new stop-word index (1). * www_example/WWW.pm 1. In parse_multipart(), added $'()*/\ characters to those stripped from filenames for bug fix UUC. * www_example/search.cgi 1. Added code to handle ignored words returned by "search" for feature ISW. ******************************************************************************* 1.1 ******************************************************************************* NEW FEATURES ------------ * SWISH++ is now out of beta test. (Nobody has submitted a bug report in a while.) * From "index," you can now dump the built-in default set of stop-words to a file to edit and then use to index. (This feature will be referred to as feature ESW.) * Some example Perl 5 code for interfacing SWISH++ to a web-based search form has been provided. (This feature will be referred to as feature W3E.) BUG FIXES --------- * The definition of the THIS macro in fake_ansi.h was just wrong and there is no way to fix it; so it and all references to it have been deleted. (This bug fix will be referred to as bug fix XTHIS.) CHANGES, file-by-file --------------------- * extract.c 1. In main(), added code to process the new command-line options of -s for feature ESW. 2. In usage(), augmented message for feature ESW. * fake_ansi.h 1. Deleted definition of THIS macro for bug fix XTHIS. * file_list.c 1. Deleted references to THIS macro formerly defined in fake_ansi.h and defined a local version instead for bug fix XTHIS. * index.c 1. In main(), added code to process the two new command-line options of -s and -S for feature ESW. 2. In usage(), augmented message for feature ESW. * Makefile 1. Added specific build rules for stop_words.c for feature ESW. 2. Added dependency on stop_words.h to index.c for feature ESW. 3. Cleaned up rules for "clean," "dist," and "distclean." * man/Makefile 1. 
Added provision to build man3 subdirectory for feature W3E. * man/man1/index.1 1. Added descriptions of new command-line options for feature ESW. 2. Added missing description of additional processing done for HTML files. * man/man3/Makefile * man/man3/www.3 1. New files for W3E. * stop_words.c 1. Added global pointer to stop-word set for feature ESW. 2. Added constructor for stop_word_set to initialize the set of stop-words either from the built-in default set or from a file. * stop_words.h 1. New file for ESW. * string_set.h 1. Changed definition of string_set to be derived from rather than contain a std::set for feature ESW. * util.c 1. Moved stop_word_set definitions to stop_words.c for feature ESW. * version.h 1. Updated version to "1.1". * www_example/WWW.pm 1. Added form data parsing library in Perl 5 for feature W3E. * www_example/search.cgi * www_example/search.html 1. Added example code for feature W3E. ******************************************************************************* 1.1b3 ******************************************************************************* BUG FIXES --------- * Fixed a bug where unbalanced quotes inside comments would cause a core dump. After rereading the HTML 4.0 specification regarding comments, quotes are not to be balanced or otherwise treated specially inside comments. (This bug fix will be referred to as bug fix CQU.) CHANGES, file-by-file --------------------- * ext_proc.c 1. In process_file(), made pid_error static as it should have been all along. * html.c 1. Added inclusion of util.h to access to_upper() function for bug fix CQU. 2. Added following functions for bug fix CQU: is_html_comment() skip_html_comment() tag_cmp() 3. In grep_title(), changed for loop to while loop to have more precise control over when the iterator is advanced for bug fix CQU. 4. In grep_title(), now check to see if an HTML tag is a comment. 5. In grep_title(), replaced code to check title tag by a call to the new tag_cmp() function. 6. 
In skip_html_tag(), added calls to is_html_comment() and skip_html_comment() since comments must be skipped differently. (For bug fix CQU.) * Makefile 1. Added util.h to html.o dependencies for bug fix CQU. 2. Added "the.index" to the $(RM) line for the clean target. 3. Deleted the second erroneous dist target. * itoa.c 1. Deleted this extraneous file. * util.c 1. In ltoa(), made Buf_Size and Num_Buffers static as they should have been all along. * util.h 1. Added to_upper() inline function for bug fix CQU. * version.h 1. Updated version to "1.1b3". ******************************************************************************* 1.1b2 ******************************************************************************* NEW FEATURES ------------ * For HTML files having titles longer than Title_Max_Size in length, the last three characters are replaced by an ellipsis ("..."). (This feature will be referred to as feature ELL.) BUG FIXES --------- * Fixed a core dump in grep_title() for HTML files having titles that exceed Title_Max_Size in length. (This bug fix will be referred to as bug fix GT1.) CHANGES, file-by-file --------------------- * file_vector.c 1. Performed following substitution: s/sysent.h/unistd.h/ for portability. * html.c 1. Added code for feature ELL. 2. Fixed grep_title() for bug fix GT1. * version.h 1. Updated version to "1.1b2". ******************************************************************************* 1.1b1 ******************************************************************************* NEW FEATURES ------------ * The search command has a new -s option to specify the number of initial results to skip. Used in conjunction with -m, results can be returned in "pages." (This feature will be referred to as feature SSR.) CHANGES, file-by-file --------------------- * search.c 1. Added comment for sort_by_rank struct. This was an omission. 2. Added -s option in main() for feature SSR. 3. Added skip_results variable in main() for feature SSR. 4. 
Added -s option in usage() for feature SSR. 5. Removed extra semicolon in usage() that cause only part of the usage message to print. * version.h 1. Updated version to "1.1b1". * man/man1/search.1 1. Added description of -s option for feature SSR. ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/charsets/�����������������������������������������������������������������������������0000755�0000765�0000000�00000000000�10746421420�013265� 5����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/charsets/charsets.h�������������������������������������������������������������������0000644�0000765�0000000�00000003453�10030464526�015257� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** charsets/charsets.h ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef charsets_H #define charsets_H // local #include "encoded_char.h" encoded_char_range::charset_type const US_ASCII = 0; encoded_char_range::charset_type const ISO_8859_1 = 0; encoded_char_range::charset_type const UNKNOWN_CHARSET = reinterpret_cast<encoded_char_range::charset_type>( ~0 ); #ifdef CHARSET_utf7 encoded_char_range::value_type charset_utf7( encoded_char_range::pointer begin, encoded_char_range::pointer &pos, encoded_char_range::pointer end ); #endif #ifdef CHARSET_utf8 encoded_char_range::value_type charset_utf8( encoded_char_range::pointer begin, encoded_char_range::pointer &pos, encoded_char_range::pointer end ); #endif #ifdef CHARSET_utf16 encoded_char_range::value_type charset_utf16be( encoded_char_range::pointer begin, encoded_char_range::pointer &pos, encoded_char_range::pointer end ); encoded_char_range::value_type charset_utf16le( encoded_char_range::pointer begin, encoded_char_range::pointer &pos, encoded_char_range::pointer end ); #endif #endif /* charsets_H */ /* vim:set noet sw=8 ts=8: */ ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/charsets/GNUmakefile������������������������������������������������������������������0000644�0000765�0000000�00000002663�10036607774�015360� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # charsets/GNUmakefile # # Copyright (C) 2002 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ########## You shouldn't have to change anything below this line. ############# TARGET:= libcharsets.a .PHONY: all all: $(TARGET) ROOT:= .. include $(ROOT)/config/config.mk INCLUDES:= -I$(ROOT) CFLAGS:= $(CCFLAGS) $(DEBUGFLAGS) $(INCLUDES) SOURCES:= $(foreach charset,$(CHARSET_LIST),$(charset).c) ## # Build rules ## $(TARGET): $(SOURCES:.c=.o) $(RM) $@ $(AR) $@ $^ -$(RANLIB) $@ # Don't do the "include" if the goal contains the word "clean," i.e., either # the "clean" or "distclean" goal. 
ifneq ($(findstring clean,$(MAKECMDGOALS)),clean) -include $(SOURCES:%.c=.%.d) endif ## # Utility rules ## clean: $(RM) *.o $(TEMPLATE_REPOSITORY) distclean: clean $(RM) $(TARGET) .*.d �����������������������������������������������������������������������������swish++-6.1.5/charsets/README�����������������������������������������������������������������������0000644�0000765�0000000�00000001416�07475323775�014172� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������=============================================================================== README for SWISH++ charsets =============================================================================== The files in this directory handle some values for the charset parameter of the Content-Type header (RFC 2045 section 5) used by the mail indexer, to wit: utf7 and utf8. The values us-ascii and iso8859-1 are easily handled internally; other values are currently not handled at all. The reason these files are here in this subdirectory rather than in mod/mail is because, some day, there may be a way to specify non-mail files that are in, say, the UTF-8 character set to be decoded. If this ever happens, then additional non-mail decoders could be added to this subdirectory, e.g., UTF-16. 
��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/charsets/unicode.h��������������������������������������������������������������������0000644�0000765�0000000�00000003537�10030464526�015074� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** charsets/unicode.h ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef unicode_H #define unicode_H // local #include "iso8859-1.h" typedef unsigned long ucs4; //***************************************************************************** // // SYNOPSIS // inline char unicode_to_ascii( ucs4 c ) // // DESCRIPTION // // Convert a 32-bit Unicode character to its closest 7-bit ASCII // equivalent. (This mostly means that accents are stripped.) If there // is no closest equivalent, ' ' (space) is returned. 
// // This function exists to ensure that the value of the character used // to index iso8859_1_map[] is within range. // // PARAMETERS // // c The character to be converted. // // RETURN VALUE // // Returns said character. // // SEE ALSO // // International Standards Organization. "ISO 8859-1: Information // Processing -- 8-bit single-byte coded graphic character sets -- Part 1: // Latin alphabet No. 1," 1987. // //***************************************************************************** { return c < sizeof( iso8859_1_map ) / sizeof( iso8859_1_map[0] ) ? iso8859_1_to_ascii( c ) : ' '; } #endif /* unicode_H */ /* vim:set noet sw=8 ts=8: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/charsets/utf16.c����������������������������������������������������������������������0000644�0000765�0000000�00000006165�10263526062�014410� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** charsets/utf16.c ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifdef CHARSET_utf16 // local #include "encoded_char.h" #include "unicode.h" //***************************************************************************** // // SYNOPSIS // encoded_char_range::value_type charset_utf16be( encoded_char_range::pointer, encoded_char_range::pointer &c, encoded_char_range::pointer end ) // // DESCRIPTION // // Convert a UTF-16-big-endian encoded character sequence to its ASCII // equivalent. // // PARAMETERS // // c A pointer marking the position of the character to decode. It // is left after the decoded character. // // end A pointer marking the end of the entire encoded range. // // RETURN VALUE // // Returns the decoded character or ' ' upon error. // // SEE ALSO // // The Unicode Consortium. "Encoding Forms," The Unicode Standard 3.0, // section 2.3, Addison-Wesley, 2000. // //***************************************************************************** { if ( c == end || c+1 == end ) return ' '; ucs4 const u = (static_cast<ucs4>( c[0] ) << 8) | c[1]; c += 2; return unicode_to_ascii( u ); } //***************************************************************************** // // SYNOPSIS // encoded_char_range::value_type charset_utf16le( encoded_char_range::pointer, encoded_char_range::pointer &c, encoded_char_range::pointer end ) // // DESCRIPTION // // Convert a UTF-16-little-endian encoded character sequence to its ASCII // equivalent. // // PARAMETERS // // c A pointer marking the position of the character to decode. It // is left after the decoded character. // // end A pointer marking the end of the entire encoded range. // // RETURN VALUE // // Returns the decoded character or ' ' upon error. // // SEE ALSO // // The Unicode Consortium. "Encoding Forms," The Unicode Standard 3.0, // section 2.3, Addison-Wesley, 2000. 
// //***************************************************************************** { if ( c == end || c+1 == end ) return ' '; ucs4 const u = (static_cast<ucs4>( c[1] ) << 8) | c[0]; c += 2; return unicode_to_ascii( u ); } #endif /* CHARSET_utf16 */ /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/charsets/utf7.c�����������������������������������������������������������������������0000644�0000765�0000000�00000022747�10166052462�014334� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** charsets/utf7.c ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/

#ifdef CHARSET_utf7

// standard
#include <cstring>
#include <iostream>

// local
#include "encoded_char.h"
#include "platform.h"
#include "unicode.h"

using namespace std;

namespace {
    //
    // Decoder state shared between successive calls to charset_utf7().
    // charset_utf7() decodes a group of Base64 characters into up to three
    // bytes at once and hands them out one call at a time; these members
    // record how many decoded bytes are available and where that group
    // started in the encoded range.
    //
    class utf7_decoder : public encoded_char_range::decoder {
    public:
        // Number of valid decoded bytes currently buffered (0 = none).
        int buf_count_;

        // Position in the encoded range that the first buffered byte
        // corresponds to; 0 when nothing has been decoded yet.
        pointer prev_c_;
    private:
        // Discards any buffered state.  NOTE(review): presumably invoked
        // through the encoded_char_range::decoder base when a new range is
        // started -- the base class is not visible here; confirm.
        virtual void reset() {
            buf_count_ = 0;
            prev_c_ = 0;
        }
    };
} // namespace

//*****************************************************************************
//
// SYNOPSIS
//
    encoded_char_range::value_type charset_utf7(
        encoded_char_range::pointer begin,
        encoded_char_range::pointer &c,
        encoded_char_range::pointer end
    )
//
// DESCRIPTION
//
//      Convert a UTF-7-encoded character sequence to its ASCII equivalent.
//
// PARAMETERS
//
//      begin   A pointer marking the beginning of the entire encoded range.
//
//      c       A pointer marking the position of the character to decode.  It
//              is left after the decoded character.
//
//      end     A pointer marking the end of the entire encoded range.
//
// RETURN VALUE
//
//      Returns the decoded character or ' ' upon error.
//
// SEE ALSO
//
//      The Unicode Consortium.  "Encoding Forms," The Unicode Standard 3.0,
//      section 2.3, Addison-Wesley, 2000.
//
//      Ned Freed and Nathaniel S. Borenstein.  "RFC 2045: Multipurpose
//      Internet Mail Extensions (MIME) Part One: Format of Internet Message
//      Bodies," Section 6.8, "Base64 Content-Transfer-Encoding," RFC 822
//      Extensions Working Group of the Internet Engineering Task Force,
//      November 1996.
//
//      David Goldsmith and Mark Davis.  "RFC 2152: UTF-7, a mail-safe
//      transformation format of Unicode," Network Working Group of the
//      Internet Engineering Task Force, May 1997.
//
// NOTE
//
//      This code is based on the decode_base64() function as part of "encdec
//      1.1" by Jörgen Hägg <jh@efd.lth.se>, 1993.
// //***************************************************************************** { int const Bits_Per_Char = 6; // by definition of Base64 encoding static char const set_B[] = // modified Base64 alphabet "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789+/"; // '=' is omitted intentionally static encoded_char_range::value_type buf[ 3 ]; // group-of-4 -> 3 chars static utf7_decoder decoder; ////////// Return previously decoded character //////////////////////////// // // See if the pointer is less than a buffer's-worth away from the previous // pointer: if so, simply return the already-decoded character. // return_decoded_char: encoded_char_range::difference_type const delta = c - decoder.prev_c_; if ( delta >= 0 && delta < decoder.buf_count_ ) { if ( ++c != end && delta == decoder.buf_count_ - 1 ) if ( ++c != end && *c == '-' ) { // // From RFC 2152, Rule 2: // // As a special case, if the sequence terminates with the // character '-' then that character is absorbed ... // ++c; } return buf[ delta ]; } ////////// Determine whether current character is encoded ///////////////// // // We need to "sync" by looking backwards and finding the first character // that is not in set B (meaning it's not encoded) and then going forwards // again to know whether the character at the current postition is encoded // or not. // bool encoded = false; encoded_char_range::pointer const orig_c = c; while ( true ) { if ( ::strchr( set_B, *c ) ) { // // The character is in set B: hence, we don't know whether it's // encoded or not. // if ( c == begin ) { // // We ran into "begin" before being able to sync: this is // weird. Set the position one past the original position to // try to skip this weirdness and return something innocuous // like a space since we have to return something. // c = orig_c + 1; return ' '; } --c; continue; } // // The character isn't in set B: hence, it's not encoded. 
Now work our // way back to the original position looking for '+' and '-' characters // to keep track of whether the character at the original position is // encoded or not. // bool just_saw_plus = false; while ( c < orig_c ) switch ( *c++ ) { case '+': just_saw_plus = true; encoded = true; break; case '-': encoded = false; // no break; default: just_saw_plus = false; } // // OK, we're back at the original position knowing whether the // character here is encoded or not. // switch ( *c ) { case '+': if ( !encoded ) { // // This '+' isn't encoded so it's the start of an encoded // sequence. // if ( ++c == end ) { // // We unexpectedly ran into the "end": try to do // something sensible and simply return the plus as if // it weren't encoded after all. // return '+'; } encoded = true; } break; case '-': encoded = false; if ( just_saw_plus ) { // // Ibid.: // // ... as a special case, the sequence "+-" may be // used to encode the character '+'. // ++c; return '+'; } else if ( encoded ) { // // Ibid.: // // As a special case, if the sequence terminates with // the character '-' then that character is absorbed // ... // // Therefore, return the next character unless we ran into // "end" in which case return something innocuous like a // space since we have to return something. // return ++c == end ? ' ' : *c++; } } if ( !encoded ) { // // The character isn't encoded: return it as-is. // return *c++; } break; } ////////// Decode a UTF-7 character /////////////////////////////////////// // // Calculate a combined value of the encoded 6-bit characters. // register ucs4 value = 0; register int i; for ( i = 0; i <= 3; ++i ) { // // Find the character in set B. // if ( char const *const a = ::strchr( set_B, *c ) ) { value += (a - set_B) << ((3 - i) * Bits_Per_Char); ++c; } else { // // We encountered a character not in set B: stop. Ibid: // // ... 
octets are to be interpreted as elements of the // Modified Base64 alphabet until a character not in that // alphabet is encountered. // break; } } i = 4 - i; if ( i <= 2 ) { // // Now that we have a combined value, break it back apart but in 8-bit // chunks, i.e., ordinary characters. // value >>= 8 * i; for ( int j = 2 - i; j >= 0; --j ) { buf[ j ] = value & 255; value >>= 8; } decoder.buf_count_ = 3 - i; } else { // // The encoded sequence was bad, e.g. +6. // decoder.buf_count_ = 0; } // // Remember the position of the pointer marking the beginning of the range // of characters that have been decoded. If we subsequently are asked to // decode a character in the range [i,i+buf_count), we can simply return // the character. // decoder.prev_c_ = c = orig_c + 1; goto return_decoded_char; } #endif /* CHARSET_utf7 */ /* vim:set et sw=4 ts=4: */ �������������������������swish++-6.1.5/charsets/utf8.c�����������������������������������������������������������������������0000644�0000765�0000000�00000012226�10215701731�014317� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** charsets/utf8.c ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifdef CHARSET_utf8 // local #include "encoded_char.h" #include "unicode.h" //***************************************************************************** // // SYNOPSIS // encoded_char_range::value_type charset_utf8( encoded_char_range::pointer /* begin */, encoded_char_range::pointer &c, encoded_char_range::pointer end ) // // DESCRIPTION // // Convert a UTF-8-encoded character sequence to its ASCII equivalent. // // PARAMETERS // // begin An pointer marking the beginning of the entire encoded range. // // c A pointer marking the position of the character to decode. // It is left after the decoded character. // // end A pointer marking the end of the entire encoded range. // // RETURN VALUE // // Returns the decoded character or ' ' upon error. // // SEE ALSO // // The Unicode Consortium. "Encoding Forms," The Unicode Standard 3.0, // section 2.3, Addison-Wesley, 2000. // // Francois Yergeau. "RFC 2279: UTF-8, a transformation format of ISO // 10646," Network Working Group of the Internet Engineering Task Force, // January 1998. // //***************************************************************************** { // // If the byte value is in the ASCII range, we can simply return the // character. // if ( static_cast<unsigned char>( *c ) <= 127u ) return *c++; // // Make sure we're at the first byte of the UTF-8 character; if not, "sync" // by skipping characters until we're at a first byte. Only the first byte // has the bit pattern 11xxxxxx so it's easy to find. // while ( (static_cast<unsigned char>( *c ) & 0xC0u) != 0xC0u ) { if ( c == end ) { // // We ran into "end" before being able to sync: this is weird. // Return something innocuous like a space since we have to return // something. 
// return ' '; } ++c; } if ( (static_cast<unsigned char>( *c ) & 0xFEu) == 0xFEu ) { // // The octets FE and FF are explicity forbidden: skip over the // offending byte and return something innocuous like a space since we // have to return something. // ++c; return ' '; } // // Using a static table to know how many bytes are in the UTF-8 character // is the fastest way. // static char const trailing_bytes_table[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C0-CF 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D0-DF 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // E0-EF 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, // F0-FF }; int const trailing_bytes = trailing_bytes_table[ static_cast<unsigned char>( *c ) ]; register ucs4 u = 0; switch ( trailing_bytes ) { case 5: u += static_cast<unsigned char>( *c++ ); u <<= 6; if ( c == end ) break; case 4: u += static_cast<unsigned char>( *c++ ); u <<= 6; if ( c == end ) break; case 3: u += static_cast<unsigned char>( *c++ ); u <<= 6; if ( c == end ) break; case 2: u += static_cast<unsigned char>( *c++ ); u <<= 6; if ( c == end ) break; case 1: u += static_cast<unsigned char>( *c++ ); u <<= 6; if ( c == end ) break; case 0: u += static_cast<unsigned char>( *c++ ); } static unsigned long const offset_table[] = { 0x0u, 0x3080u, 0xE2080u, 0x3C82080u, 0xFA082080u, 0x82082080u }; u -= offset_table[ trailing_bytes ]; return unicode_to_ascii( u ); } 
#endif /* CHARSET_utf8 */ /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/classic_formatter.c�������������������������������������������������������������������0000644�0000765�0000000�00000005573�10300243532�015323� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** classic_formatter.c ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // local #include "classic_formatter.h" #include "file_info.h" #include "index_segment.h" #include "platform.h" #include "ResultSeparator.h" #include "util.h" /* for FOR_EACH */ extern index_segment directories; //***************************************************************************** // // SYNOPSIS // classic_formatter::~classic_formatter() // // DESCRIPTION // // Destroy a classic_formatter. // // NOTE // // This is out-of-line only because it's virtual. // //***************************************************************************** { // do nothing } //***************************************************************************** // // SYNOPSIS // void classic_formatter::pre( stop_word_set const &stop_words ) const // // DESCRIPTION // // Output search-result "meta" information: the set of stop words found in // the query (if any) and the number of results. // // PARAMETERS // // stop_words The set of stop words. // //***************************************************************************** { // Print stop-words, if any. if ( !stop_words.empty() ) { out_ << "# ignored:"; FOR_EACH( stop_word_set, stop_words, word ) out_ << ' ' << *word; out_ << '\n'; } out_ << "# results: " << results_ << '\n'; } //***************************************************************************** // // SYNOPSIS // void classic_formatter::result( int rank, file_info const &fi ) const // // DESCRIPTION // // Output an individual search result's information: it's rank, path, // size, and title. // // PARAMETERS // // rank The rank (1-100) of the result. // // fi The search result's file information. 
// //***************************************************************************** { out_ << rank << result_separator << directories[ fi.dir_index() ] << '/' << fi.file_name() << result_separator << fi.size() << result_separator << fi.title() << '\n'; } /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/classic_formatter.h�������������������������������������������������������������������0000644�0000765�0000000�00000003170�10300243532�015317� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** classic_formatter.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef classic_formatter_H #define classic_formatter_H // local #include "results_formatter.h" //***************************************************************************** // // SYNOPSIS // class classic_formatter : public results_formatter // // DESCRIPTION // // A classic_formatter is-a results_formatter for formatting search // results in the "classic" SWISH++ format. // //***************************************************************************** { public: classic_formatter( std::ostream &o, int results ) : results_formatter( o, results ) { } virtual ~classic_formatter(); virtual void pre( stop_word_set const& ) const; virtual void result( int rank, file_info const& ) const; }; #endif /* classic_formatter_H */ /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_bool.c���������������������������������������������������������������������������0000644�0000765�0000000�00000005054�10166052461�013562� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_bool.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <iostream> // local #include "auto_vec.h" #include "conf_bool.h" #include "exit_codes.h" #include "platform.h" #include "util.h" using namespace PJL; using namespace std; extern char const* me; //***************************************************************************** // // SYNOPSIS // conf<bool>::conf( char const *name, bool default_value ) : // // DESCRIPTION // // Construct (initialize) a conf<bool>. // // PARAMETERS // // name The name of the configuration variable. // // default_value The default value for the configuration variable. // //***************************************************************************** conf_var( name ), default_value_( default_value ), value_( default_value ) { // do nothing else } //***************************************************************************** // // SYNOPSIS // /* virtual */ void conf<bool>::parse_value( char *line ) // // DESCRIPTION // // Parse a Boolean value from the line of text. Acceptable values // (regardless of case) are: f, false, n, no, off, on, t, true, y, yes // // PARAMETERS // // line The line of text to be parsed. 
// //***************************************************************************** { auto_vec<char> const lower( to_lower_r( line ) ); if ( *lower ) { if ( !::strcmp( lower, "false" ) || !::strcmp( lower, "no" ) || !::strcmp( lower, "off" ) || ( lower[1] == '\0' && (*lower == 'f' || *lower == 'n') ) ) { operator=( false ); return; } if ( !::strcmp( lower, "true" ) || !::strcmp( lower, "on" ) || !::strcmp( lower, "yes" ) || ( lower[1] == '\0' && (*lower == 't' || *lower == 'y') ) ) { operator=( true ); return; } } error() << '"' << name() << "\" is not one of: " "f, false, n, no, off, on, t, true, y, yes\n"; ::exit( Exit_Config_File ); } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_bool.h���������������������������������������������������������������������������0000644�0000765�0000000�00000004470�10263525236�013573� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_bool.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef conf_bool_H #define conf_bool_H // standard #include <string> // local #include "conf_var.h" //***************************************************************************** // // SYNOPSIS // template<> class conf<bool> : public conf_var // // DESCRIPTION // // A conf<bool> is-a conf_var for containing the value of a Boolean // configuration variable. // //***************************************************************************** { public: operator bool() const { return value_; } protected: conf( char const *name, bool default_value ); conf<bool>& operator=( bool new_value ) { value_ = new_value; return *this; } CONF_VAR_ASSIGN_OPS( conf<bool> ) virtual void parse_value( char *line ); private: bool const default_value_; bool value_; virtual void reset() { value_ = default_value_; } }; #define CONF_BOOL_ASSIGN_OPS(T) \ T& operator=( bool b ) { \ conf<bool>::operator=( b ); \ return *this; \ } \ T& operator=( std::string const &s ) { \ conf<bool>::operator=( s ); \ return *this; \ } \ T& operator=( char const *s ) { \ conf<bool>::operator=( s ); \ return *this; \ } #endif /* conf_bool_H */ /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_enum.c���������������������������������������������������������������������������0000644�0000765�0000000�00000005740�10166052461�013575� 
0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_enum.c ** ** Copyright (C) 2000 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <iostream> // local #include "auto_vec.h" #include "conf_enum.h" #include "exit_codes.h" #include "platform.h" #include "util.h" using namespace PJL; using namespace std; extern char const* me; //***************************************************************************** // // SYNOPSIS // conf_enum::conf_enum( char const *name, char const *const legal_values[] ) : // // DESCRIPTION // // Construct (initialize) a conf_enum. // // PARAMETERS // // name The name of the configuration variable. // // legal_values The set of legal values. 
// //***************************************************************************** conf<std::string>( name, legal_values[0] ), legal_values_( legal_values ) { // do nothing else } //***************************************************************************** // // SYNOPSIS // bool conf_enum::is_legal( char const *value, ostream &err ) const // // DESCRIPTION // // Checks to see if a given value is legal, i.e., among the pre-determined // set of legal values. // // PARAMETERS // // value The value to be checked. // // err The ostream to write an error message to, if any. // // RETURN VALUE // // Returns true only if the value is legal. // //***************************************************************************** { if ( *value ) { auto_vec<char> const lower( to_lower_r( value ) ); for ( char const *const *v = legal_values_; *v; ++v ) if ( !::strcmp( lower, *v ) ) return true; } err << error << '"' << name() << "\" is not one of: "; bool comma = false; for ( char const *const *v = legal_values_; *v; ++v ) { if ( comma ) err << ", "; else comma = true; err << *v; } err << '\n'; return false; } //***************************************************************************** // // SYNOPSIS // /* virtual */ void conf_enum::parse_value( char *line ) // // DESCRIPTION // // Parse an enum value from the line of text. It must be one of the legal // values. // // PARAMETERS // // line The line of text to be parsed. 
// //***************************************************************************** { if ( !is_legal( line ) ) ::exit( Exit_Config_File ); auto_vec<char> lower( to_lower_r( line ) ); conf<string>::parse_value( lower ); } ��������������������������������swish++-6.1.5/conf_enum.h���������������������������������������������������������������������������0000644�0000765�0000000�00000003737�10263525236�013611� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_enum.h ** ** Copyright (C) 2000 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef conf_enum_H #define conf_enum_H // local #include "conf_string.h" //***************************************************************************** // // SYNOPSIS // class conf_enum : public conf<std::string> // // DESCRIPTION // // A conf_enum is-a conf<std::string> whose value must be of a // pre-determined set of legal values. 
// //***************************************************************************** { public: bool is_legal( char const*, std::ostream& = std::cerr ) const; protected: conf_enum( char const *name, char const *const legal_values[] ); CONF_STRING_ASSIGN_OPS( conf_enum ) virtual void parse_value( char *line ); private: char const *const *const legal_values_; }; #define CONF_ENUM_ASSIGN_OPS(T) \ T& operator=( std::string const &s ) { \ conf_enum::operator=( s ); \ return *this; \ } \ T& operator=( char const *s ) { \ conf_enum::operator=( s ); \ return *this; \ } #endif /* conf_enum_H */ /* vim:set et sw=4 ts=4: */ ���������������������������������swish++-6.1.5/conf_filter.c�������������������������������������������������������������������������0000644�0000765�0000000�00000005750�10166052461�014117� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_filter.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // standard #include <cstdlib> /* for exit(2) */ #include <cstring> // local #include "exit_codes.h" #include "conf_filter.h" #include "platform.h" using namespace std; //***************************************************************************** // // SYNOPSIS // /* virtual */ void conf_filter::parse_value( char *line ) // // DESCRIPTION // // Parse a conf_filter configuration file line. The format of such a line // is: // // pattern command // // where "pattern" is a pattern and "command" is the command-line for // executing the filter on a file. // // Furthermore, ensure the filter contains % and @ filename substitutions. // // PARAMETERS // // line The line to be parsed. // //***************************************************************************** { char const *const pattern = ::strtok( line, " \r\t" ); if ( !pattern ) { error() << "no pattern\n"; ::exit( Exit_Config_File ); } char const *const command = ::strtok( 0, "\n" ); if ( !command ) { error() << "no filter command\n"; ::exit( Exit_Config_File ); } // // Check a filter command's %@ substitutions to ensure they're valid, // that there are at least two of them, and that exactly one of them is // a @ meaning the target filename. Also ignore %% or @@ respresenting // literal @ or %, respectively. // bool found_target = false; int num_substitutions = 0; for ( register char const* s = command; *s && ( s = ::strpbrk( s, "%@" ) ); ++s ) { if ( s[0] == s[1] ) { // %% or @@ ... ++s; // ... 
skip past it continue; } if ( *s == '@' ) if ( found_target ) { error() << "more than one @\n"; ::exit( Exit_Config_File ); } else { found_target = true; continue; } switch ( s[1] ) { case 'b': case 'B': case 'e': case 'E': case 'f': case 'F': ++num_substitutions; continue; } error() << "non-[bBeEfF%] character after %\n"; ::exit( Exit_Config_File ); } if ( num_substitutions < 1 ) { error() << "at least 1 substitution is required\n"; ::exit( Exit_Config_File ); } if ( !found_target ) { error() << "filter does not contain required @\n"; ::exit( Exit_Config_File ); } map_.insert( map_type::value_type( ::strdup( pattern ), value_type( ::strdup( command ) ) ) ); } ������������������������swish++-6.1.5/conf_filter.h�������������������������������������������������������������������������0000644�0000765�0000000�00000004152�10263525236�014122� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_filter.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef conf_filter_H #define conf_filter_H // standard #include <string> // local #include "conf_var.h" #include "filter.h" #include "pattern_map.h" //***************************************************************************** // // SYNOPSIS // class conf_filter : public conf_var // // DESCRIPTION // // A conf_filter is-a conf_var for mapping a filename pattern to a filter // (being a Unix process called via command-line). Certain filename // patterns need to be filtered first, e.g., uncompressed. // //***************************************************************************** { public: typedef char const* key_type; typedef filter value_type; typedef value_type const* const_pointer; const_pointer operator[]( key_type key ) const { map_type::const_iterator const i = map_.find( key ); return i != map_.end() ? &i->second : 0; } const_pointer operator[]( std::string const &key ) const { return operator[]( key.c_str() ); } protected: conf_filter( char const *name ) : conf_var( name ) { } typedef pattern_map< value_type > map_type; map_type map_; virtual void parse_value( char *line ); virtual void reset() { map_.clear(); } }; #endif /* conf_filter_H */ /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_int.c����������������������������������������������������������������������������0000644�0000765�0000000�00000006257�10166052461�013427� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_int.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <cstdlib> /* for atoi(3) */ #include <cstring> #include <iostream> // local #include "auto_vec.h" #include "conf_int.h" #include "exit_codes.h" #include "platform.h" #include "util.h" using namespace PJL; using namespace std; extern char const* me; //***************************************************************************** // // SYNOPSIS // conf<int>::conf( char const *name, int default_value, int min, int max ) : // // DESCRIPTION // // Construct (initialize) a conf<int>. // // PARAMETERS // // name The name of the configuration variable. 
// //***************************************************************************** conf_var( name ), default_value_( default_value ), min_( min ), max_( max ), value_( default_value ) { // do nothing else } //***************************************************************************** // // SYNOPSIS // conf<int>& conf<int>::operator=( int new_value ) // // DESCRIPTION // // Assign a new value to the configuration variable, but only if its value // is within the legal range; otherwise complain. // // PARAMETERS // // new_value The potential new value. // //***************************************************************************** { if ( new_value >= min_ && new_value <= max_ ) { value_ = new_value; return *this; } error() << '"' << name() << "\" value \"" << new_value << "\" not in range [" << min_ << '-'; if ( max_ == INT_MAX ) cerr << "infinity"; else cerr << max_; cerr << "]\n"; ::exit( Exit_Config_File ); } //***************************************************************************** // // SYNOPSIS // /* virtual */ void conf<int>::parse_value( char *line ) // // DESCRIPTION // // Parse an integer value from a configuration file line. If successful, // assign the value to ourselves; otherwise complain. The string // "infinity" (regardless of case) is accepted as a legal value. // // PARAMETERS // // line The line to be parsed. 
// //***************************************************************************** { if ( !line || !*line ) { error() << '"' << name() << "\" has no value\n"; ::exit( Exit_Config_File ); } auto_vec<char> const lower( to_lower_r( line ) ); if ( !::strcmp( lower, "infinity" ) ) { operator=( INT_MAX ); return; } int const n = ::atoi( line ); if ( n || *line == '0' ) { operator=( n ); return; } error() << '"' << name() << "\" has a non-numeric value\n"; ::exit( Exit_Config_File ); } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_int.h����������������������������������������������������������������������������0000644�0000765�0000000�00000005301�10263526011�013414� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_int.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef conf_int_H #define conf_int_H // system #include <climits> /* for INT_MAX */ #include <string> // local #include "conf_var.h" //***************************************************************************** // // SYNOPSIS // template<> class conf<int> : public conf_var // // DESCRIPTION // // A conf<int> is-a conf_var for containing the value of an integer // configuration variable. // //***************************************************************************** { public: conf<int>& operator++() { ++value_; return *this; } conf<int> operator++( int ) { conf<int> tmp = *this; ++value_; return tmp; } conf<int>& operator--() { --value_; return *this; } conf<int> operator--( int ) { conf<int> tmp = *this; --value_; return tmp; } operator int() const { return value_; } protected: conf( char const *name, int default_value, int min = 0, int max = INT_MAX ); conf<int>& operator=( int ); CONF_VAR_ASSIGN_OPS( conf<int> ) virtual void parse_value( char *line ); private: int const default_value_, min_, max_; int value_; virtual void reset() { value_ = default_value_; } }; #define CONF_INT_ASSIGN_OPS(T) \ T& operator=( int i ) { \ conf<int>::operator=( i ); \ return *this; \ } \ T& operator=( std::string const &s ) { \ conf<int>::operator=( s ); \ return *this; \ } \ T& operator=( char const *s ) { \ conf<int>::operator=( s ); \ return *this; \ } #endif /* conf_int_H */ /* vim:set et sw=4 ts=4: */ 
�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_percent.c������������������������������������������������������������������������0000644�0000765�0000000�00000003131�10166052461�014261� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_percent.c ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <cstring> // local #include "conf_percent.h" #include "exit_codes.h" #include "platform.h" using namespace std; //***************************************************************************** // // SYNOPSIS // /* virtual */ void conf_percent::parse_value( char *line ) // // DESCRIPTION // // Parse an integer value from a configuration file line. 
If the value is // followed by a '%' then it's a percentage. // // PARAMETERS // // line The line to be parsed. // //***************************************************************************** { conf<int>::parse_value( line ); if ( is_percentage_ = !!::strchr( line, '%' ) ) { int const value = *this; if ( value < 0 || value > 101 ) { error() << '"' << name() << "\" value \"" << value << "\" not in range [0-101]%\n"; ::exit( Exit_Config_File ); } } } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_percent.h������������������������������������������������������������������������0000644�0000765�0000000�00000004653�10263525236�014303� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_percent.h ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef conf_percent_H #define conf_percent_H // local #include "conf_int.h" //***************************************************************************** // // SYNOPSIS // class conf_percent : public conf<int> // // DESCRIPTION // // A conf_percent is-a conf<int> for containing the value of an integer // either an as absolute number or a percentage. // //***************************************************************************** { public: int operator()( int size ) { return size + (is_percentage_ ? size * operator int() / 100 : operator int() ); } protected: conf_percent( char const *name, int default_value, int min = 0, int max = INT_MAX ); CONF_INT_ASSIGN_OPS( conf_percent ) virtual void parse_value( char *line ); private: bool is_percentage_; }; inline conf_percent::conf_percent( char const *name, int default_value, int min, int max ) : conf<int>( name, default_value, min, max ) { } #define CONF_PERCENT_ASSIGN_OPS(T) \ T& operator=( int i ) { \ conf_percent::operator=( i ); \ return *this; \ } \ T& operator=( std::string const &s ) { \ conf_percent::operator=( s ); \ return *this; \ } \ T& operator=( char const *s ) { \ conf_percent::operator=( s ); \ return *this; \ } #endif /* conf_percent_H */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������swish++-6.1.5/conf_set.c����������������������������������������������������������������������������0000644�0000765�0000000�00000002543�10166052461�013422� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_set.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <cstring> // local #include "conf_set.h" #include "platform.h" using namespace std; //***************************************************************************** // // SYNOPSIS // /* virtual */ void conf_set::parse_value( char *line ) // // DESCRIPTION // // Parse the line of text by splitting it into words that are separated by // whitespace. // // PARAMETERS // // line The line of text to be parsed. 
// //***************************************************************************** { for ( register char const *s; s = ::strtok( line, " \r\t" ); line = 0 ) insert( s ); } �������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_set.h����������������������������������������������������������������������������0000644�0000765�0000000�00000003634�10263525236�013434� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_set.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef conf_set_H #define conf_set_H // local #include "conf_var.h" #include "my_set.h" //***************************************************************************** // // SYNOPSIS // class conf_set : public conf_var, public PJL::string_set // // DESCRIPTION // // A conf_set is-a conf_var and-a string_set for containing a set of // configuration variable values. 
// //***************************************************************************** { protected: conf_set( char const *name ) : conf_var( name ) { } CONF_VAR_ASSIGN_OPS( conf_set ) virtual void parse_value( char *line ); private: virtual void reset() { clear(); } }; #define CONF_SET_ASSIGN_OPS(T) \ T& operator=( char const *s ) { \ conf_set::operator=( s ); \ return *this; \ } \ T& operator=( std::string const &s ) { \ conf_set::operator=( s ); \ return *this; \ } #endif /* conf_set_H */ /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������swish++-6.1.5/conf_string.c�������������������������������������������������������������������������0000644�0000765�0000000�00000004446�10166052461�014141� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_string.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // standard #include <cstring> #include <iostream> // local #include "conf_string.h" #include "exit_codes.h" #include "platform.h" #include "util.h" /* for error() */ using namespace std; extern char const* me; //***************************************************************************** // // SYNOPSIS // conf<std::string>::conf( char const *name, char const *default_value ) : // // DESCRIPTION // // Construct a conf<std::string> setting its name and default value. // // PARAMETERS // // name The name of the configuration variable. // // default_value Its default value. // //***************************************************************************** conf_var( name ), default_value_( default_value ), value_( default_value ) { // do nothing else } //***************************************************************************** // // SYNOPSIS // /* virtual */ void conf<std::string>::parse_value( char *line ) // // DESCRIPTION // // Parse a single string value from the line of text. // // PARAMETERS // // line The line of text to be parsed. // //***************************************************************************** { if ( !line || !*line ) { error() << '"' << name() << "\" has no value\n"; ::exit( Exit_Config_File ); } // // If the first non-whitespace character is a quote and the last // non-whitespace character is the SAME quote, strip the quotes. 
// if ( *line == '\'' || *line == '"' ) { char *const last = line + ::strlen( line ) - 1; if ( *line == *last ) ++line, *last = '\0'; } value_ = line; } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_string.h�������������������������������������������������������������������������0000644�0000765�0000000�00000007647�10263525236�014157� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_string.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef conf_string_H #define conf_string_H // standard #include <string> // local #include "conf_var.h" //***************************************************************************** // // SYNOPSIS // template<> class conf<std::string> : public conf_var // // DESCRIPTION // // A conf<std::string> is-a conf_var for containing a configuration // variable string value. // //***************************************************************************** { public: int length() const { return value_.length(); } int size() const { return value_.size(); } operator char const*() const { return value_.c_str(); } friend bool operator==( conf<std::string> const &i, conf<std::string> const &j ) { return i.value_ == j.value_; } friend bool operator!=( conf<std::string> const &i, conf<std::string> const &j ) { return !( i == j ); } friend bool operator==( conf<std::string> const &i, std::string const &s ) { return i.value_ == s; } friend bool operator!=( conf<std::string> const &i, std::string const &s ) { return !( i == s ); } friend bool operator==( std::string const &s, conf<std::string> const &i ) { return i == s; } friend bool operator!=( std::string const &s, conf<std::string> const &i ) { return i != s; } friend bool operator==( conf<std::string> const &i, char const *s ) { return i.value_ == s; } friend bool operator!=( conf<std::string> const &i, char const *s ) { return !( i == s ); } friend bool operator==( char const *s, conf<std::string> const &i ) { return i == s; } friend bool operator!=( char const *s, conf<std::string> const &i ) { return i != s; } protected: conf( char const *name, char const *default_value = "" ); CONF_VAR_ASSIGN_OPS( conf<std::string> ) conf<std::string>& operator+=( std::string const &s ) { value_ += s; return *this; } conf<std::string>& operator+=( char const *s ) { value_ += s; return *this; } virtual void parse_value( char *line ); private: std::string const default_value_; std::string value_; virtual void reset() { value_ = 
default_value_; } }; #define CONF_STRING_ASSIGN_OPS(T) \ T& operator=( std::string const &s ) { \ conf<std::string>::operator=( s ); \ return *this; \ } \ T& operator=( char const *s ) { \ conf<std::string>::operator=( s ); \ return *this; \ } \ T& operator+=( std::string const &s ) { \ conf<std::string>::operator+=( s ); \ return *this; \ } \ T& operator+=( char const *s ) { \ conf<std::string>::operator+=( s ); \ return *this; \ } #endif /* conf_string_H */ /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������������swish++-6.1.5/conf_var.c����������������������������������������������������������������������������0000644�0000765�0000000�00000027252�10337000177�013420� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_var.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // standard #include <cctype> #include <cstring> #include <iostream> // local #include "config.h" #include "conf_var.h" #include "exit_codes.h" #include "mmap_file.h" #include "platform.h" #include "util.h" using namespace std; using namespace PJL; extern char const *me; int conf_var::current_config_file_line_no_ = 0; //***************************************************************************** // // SYNOPSIS // conf_var::conf_var( char const *var_name ) // // DESCRIPTION // // Construct a configuration variable by mapping a variable name to an // instance of a conf_var (really, an instance of some derived class). // Only a single instance of any given variable may exist. // // PARAMETERS // // var_name The name of the variable. // //***************************************************************************** : name_( var_name ) { conf_var *&var = map_ref()[ to_lower( name_ ) ]; if ( var ) internal_error << "conf_var::conf_var(): \"" << name_ << "\" registered more than once" << report_error; var = this; } //***************************************************************************** // // SYNOPSIS // /* virtual */ conf_var::~conf_var() // // DESCRIPTION // // Destructs a conf_var. It is out-of-line only because it's virtual // (so its address is taken and put into the vtbl). // //***************************************************************************** { // do nothing } //***************************************************************************** // // SYNOPSIS // /* static */ conf_var::map_type& conf_var::map_ref() // // DESCRIPTION // // Define and initialize (exactly once) a static data member for conf_var // and return a reference to it. The reason for this function is to // guarantee that the map is initialized before its first use across all // translation units, something that would not guaranteed if it were a // static data member initialized at file scope. 
// // We also load the map with all configuration variable names (in lower // case so variables in config files will be case-insensitive) so we can // tell the difference between a variable that doesn't exist (and that we // will complain about to the user) and one that simply isn't used in a // particular executable (and will be silently ignored). // // RETURN VALUE // // Returns a reference to a static instance of an initialized map_type. // // SEE ALSO // // Margaret A. Ellis and Bjarne Stroustrup. "The Annotated C++ // Reference Manual." Addison-Wesley, Reading, MA, 1990. p. 19. // //***************************************************************************** { static map_type m; static bool init; if ( !init ) { init = true; // must set this before init_mod_vars() init_mod_vars(); // defined in init_mod_vars.c static char const *const var_name_table[] = { "associatemeta", "excludefile", "excludemeta", "extractextension", "extractfile", "extractfilter", "filesgrow", "filesreserve", "filterfile", "followlinks", "includefile", "includemeta", "incremental", "indexfile", "recursesubdirs", "resultsformat", "resultseparator", "resultsmax", "stemwords", "stopwordfile", "tempdirectory", "titlelines", "verbosity", "wordfilesmax", "wordpercentmax", "wordthreshold", #ifdef FEATURE_word_pos "storewordpositions", "wordsnear", #endif #ifdef SEARCH_DAEMON "group", #ifdef __APPLE__ "launchdcooperation", #endif "pidfile", "searchbackground", "searchdaemon", "socketaddress", "socketfile", "socketqueuesize", "sockettimeout", "threadsmax", "threadsmin", "threadtimeout", "user", #endif /* SEARCH_DAEMON */ 0, }; for ( register char const *const *v = var_name_table; *v; ++v ) m[ *v ] = 0; } return m; } //***************************************************************************** // // SYNOPSIS // ostream& conf_var::msg( ostream &o, char const *label ) // // DESCRIPTION // // Emit the standard message preamble to the given ostream, that is the // program name, a line number if a 
configuration file is being parsed, // and a label. // // PARAMETERS // // o The ostream to write the message to. // // label The label to emit, usually "error" or "warning". // // RETURN VALUE // // Returns the ostream passed in. // //***************************************************************************** { o << me; if ( current_config_file_line_no_ ) { // // This is set only by parse_line() during the parsing of a // configuration file. // o << ": config file line " << current_config_file_line_no_; current_config_file_line_no_ = 0; } return o << ": " << label << ": "; } //***************************************************************************** // // SYNOPSIS // void conf_var::parse_const_value( char const *line ) // // DESCRIPTION // // Parse a line that can't be modified by simply copying it and calling // parse_value() on the copy. // //***************************************************************************** { char *const line_copy = new_strdup( line ); parse_value( line_copy ); delete[] line_copy; } //***************************************************************************** // // SYNOPSIS // /* static */ void conf_var::parse_file( char const *file_name ) // // DESCRIPTION // // Parse the lines in a configuration file setting variables accordingly. // // PARAMETERS // // file_name The name of the configuration file to parse. // //***************************************************************************** { mmap_file const conf_file( file_name ); if ( !conf_file ) { if ( !::strcmp( file_name, ConfigFile_Default ) ) { // // The configuration file couldn't be opened; however, the file // name is the default, so assume that none is being used and // simply return. 
// return; } cerr << "could not read configuration from \"" << file_name << '"' << error_string( conf_file.error() ); ::exit( Exit_Config_File ); } conf_file.behavior( mmap_file::bt_sequential ); register int line_no = 0; register mmap_file::const_iterator c = conf_file.begin(); while ( c != conf_file.end() ) { // // Find the end of the line. // mmap_file::const_iterator const nl = find_newline( c, conf_file.end() ); if ( nl == conf_file.end() ) break; ++line_no; // // See if the line is entirely whitespace optionally followed by a // comment starting with '#': if so, skip it. If we don't end up // skipping it, leading whitespace will have been skipped. // for ( ; c != nl; ++c ) { if ( is_space( *c ) ) continue; if ( *c == '#' ) goto next_line; break; } if ( c != nl ) { // // The line has something on it worth parsing further: copy it // (less leading and trailing whitespace) to a modifyable buffer // and null-terminate it to make that task easier. // char buf[ 256 ]; ptrdiff_t len = nl - c; ::strncpy( buf, c, len ); while ( len > 0 ) if ( is_space( buf[ len - 1 ] ) ) --len; else break; buf[ len ] = '\0'; parse_line( buf, line_no ); } next_line: c = skip_newline( nl, conf_file.end() ); } } //***************************************************************************** // // SYNOPSIS // /* static */ void conf_var::parse_line( char *line, int line_no ) // // DESCRIPTION // // Parse a non-comment or non-blank line from a the configuration file, // the first word of which is the variable name. Look up the variable in // our map and delegate the parsing of the rest of the line to an instance // of a derived class that knows how to parse its own line format. // // PARAMETERS // // line A line from a configuration file to be parsed. // // line_no The line number of the line. 
// //***************************************************************************** { current_config_file_line_no_ = line_no; ::strtok( line, " \r\t" ); // just the variable name map_type::const_iterator const i = map_ref().find( to_lower( line ) ); if ( i == map_ref().end() ) { warning() << '"' << line << "\" unrecognized; ignored\n"; return; } if ( i->second ) { // // Chop off trailing newline and remove leading whitespace from value. // register char *value = ::strtok( 0, "\r\n" ); while ( *value && is_space( *value ) ) ++value; i->second->parse_value( value ); } // else // // This config. variable is not used by the current executable: silently // ignore it. // current_config_file_line_no_ = 0; } //***************************************************************************** // // SYNOPSIS // /* static */ void conf_var::register_var( char const *name ) // // DESCRIPTION // // Register a variable name for so that it can be used in a config. file. // Derived indexing modules use this function to register module-specific // variables. // // PARAMETERS // // name The name of the variable. // //***************************************************************************** { map_type::const_iterator const i = map_ref().find( name ); if ( i != map_ref().end() ) internal_error << "conf_var::register_var(): \"" << name << "\" registered more than once" << report_error; map_ref()[ name ] = 0; } //***************************************************************************** // // SYNOPSIS // /* static */ void conf_var::reset_all() // // DESCRIPTION // // Reset all configuration variables to their default values. 
// //***************************************************************************** { map_type &m = map_ref(); TRANSFORM_EACH( map_type, m, i ) if ( i->second ) i->second->reset(); } /* vim:set et sw=4 ts=4: */ ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/conf_var.h����������������������������������������������������������������������������0000644�0000765�0000000�00000010036�10263525236�013423� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** conf_var.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef conf_var_H #define conf_var_H // standard #include <iostream> #include <map> #include <string> // local #include "less.h" //***************************************************************************** // // SYNOPSIS // class conf_var // // DESCRIPTION // // A conf_var is the abstract base class for all configuration file // variables. It parses a given line and, based on the variable name, // which is the first word on the line, dispatches to an appropriate // instance of a derived class to parse the rest of the line. // //***************************************************************************** { public: static void parse_file( char const *file_name ); char const* name() const { return name_; } static void register_var( char const *name ); protected: conf_var( char const *var_name ); virtual ~conf_var(); virtual void parse_value( char *line ) = 0; // Derived classes must define this to parse a line // and set their value. void parse_const_value( char const *line ); // This is a convenience function that can be called // to parse a line where the line is const. This is // used by operator=(). virtual void reset() = 0; // Reset value to default. static void reset_all(); // Reset all configuration variables to defaults. static std::ostream& error ( std::ostream& = std::cerr ); static std::ostream& warning( std::ostream& = std::cerr ); private: // // Note that the declaration of std::map has a default "Compare" template // parameter of "less< key_type >" and, since we've included less.h above // that defines "less< char const* >", C-style string comparisons work // properly. 
// typedef std::map< char const*, conf_var* > map_type; char const *const name_; static int current_config_file_line_no_; static void init_mod_vars(); // generated by init_vars-sh static map_type& map_ref(); static std::ostream& msg( std::ostream&, char const *label ); static void parse_line( char *line, int line_no ); }; ////////// Inlines //////////////////////////////////////////////////////////// inline std::ostream& conf_var::error( std::ostream &o ) { return msg( o, "error" ); } inline std::ostream& conf_var::warning( std::ostream &o ) { return msg( o, "warning" ); } // // We define this macro for convenience since operator=() is not inherited. // #define CONF_VAR_ASSIGN_OPS(T) \ T& operator=( std::string const &s ) { \ parse_const_value( s.c_str() ); \ return *this; \ } \ T& operator=( char const *s ) { \ parse_const_value( s ); \ return *this; \ } // // This template declaration allows specializations for T later. // template< class T > class conf; #endif /* conf_var_H */ /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/�������������������������������������������������������������������������������0000755�0000765�0000000�00000000000�10746421524�012723� 5����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/config-sh����������������������������������������������������������������������0000755�0000765�0000000�00000010172�10166052462�014524� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#! /bin/sh ## # SWISH++ # config-sh -- Configuration script # # Copyright (C) 1998 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ## # This code is Bourne Shell for maximal portability. ## ########### You shouldn't have to change anything below this line. 
############ ME=$0; ME=`expr $ME : '.*/\(.*\)'` USAGE="usage: $ME target.h target.mk compiler [ compiler-options ]" N=3 [ $# -lt $N ] && { echo $USAGE >&2; exit 1; } TARGET_H=$1 TARGET_MK=$2 CC=$3 shift $N CFLAGS="$* -c" DATE=`date` echo echo "$ME: checking C++ compiler $CC" trap "rm -f *.o $TARGET_H $TARGET_MK; exit 1" 0 1 2 15 ########## .h target preamble ################################################# cat > $TARGET_H <<! /* ** SWISH++ ** `basename $TARGET_H` ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* ** Note: This configuration file was automatically generated for the C++ ** compiler $CC on $DATE. */ #ifndef platform_H #define platform_H #ifdef WIN32 #define PJL_NO_SYMBOLIC_LINKS #endif ! ########## .mk target preamble ################################################ cat > $TARGET_MK <<! ## # SWISH++ # `basename $TARGET_MK` # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ## # Note: This make file was automatically generated for the C++ compiler # $CC on $DATE. ## ! ########## The main code ###################################################### define() { definition=`grep $1 $file | cut -f3-` [ -z "$definition" ] && return symbol=`echo $definition | cut -f1 -d' '` { echo "#ifndef $symbol"; echo "#define $definition"; echo "#endif"; } >> $TARGET_H value=`echo $definition | cut -f2 -d' '` echo "$symbol:=$value" >> $TARGET_MK } for file in src/*.c do echo "+ checking for \"`grep TEST $file | cut -f3-`\"..." if $CC $CFLAGS $file >/dev/null 2>/dev/null then define PASS continue fi if grep ERROR $file >&- 2>&- then echo '------------------------------------------------------------------------------' grep ERROR $file | cut -f3- echo '------------------------------------------------------------------------------' exit 1 fi define FAIL done echo ########## .h target epilogue ################################################# cat >> $TARGET_H <<! #endif /* platform_H */ ! 
trap "x=$?; rm -f *.o; exit $x" 0 1 2 15 ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/config.mk����������������������������������������������������������������������0000644�0000765�0000000�00000023453�10304360062�014515� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # config/config.mk # # Copyright (C) 1998 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ## # Note: If you later redefine any of these variables, you *MUST* first # do a "make distclean" before you do another "make". ## ############################################################################### # # OS selection # # Uncomment whichever line represents your OS. 
If your OS isn't listed # here, first try none; then, if that doesn't work, pick Linux and see if # that works. # ############################################################################### #FREE_BSD:=1 LINUX:=1 #MAC_OS_X:=1 #SOLARIS:=1 #WIN32:=1 ############################################################################### # # SWISH++ stuff # ############################################################################### MOD_LIST:= html id3 latex mail man rtf # The indexing modules you want built into index(1). If you want # to index mail files and you want to be able to index HTML # attachments, then you need to build-in "html"; similarly, if # you want to be able to index RTF attachments, then you need to # build-in "rtf". CHARSET_LIST:= utf7 utf8 utf16 # The character sets you want index(1) to be able to decode. # Note that "us-ascii" and "iso8859-1" are implicitly included. # # If you have no intention of indexing mail or news files, then # you do not need utf7. # # If you have no intention of indexing either mail, news, or MP3 # files and being able to index ID3 tags containing Unicode # characters, then you do not need either utf8 or utf16. ENCODING_LIST:= base64 quoted_printable # The Content-Transfer-Encodings you want index(1) to be able to # decode only when MOD_LIST contains "mail". ## READ THIS -> Including any character sets or encodings requires more # processing PER CHARACTER and therefore will be slower for ALL # files (not just those that are encoded). So if you don't need # any character sets (other than ISO 8859-1) or encodings, do NOT # compile them in. FEATURE_LIST:= word_pos # The set of optional features you want built into SWISH++: # # 1. word_pos: Store word positions during indexing needed to do # "near" searches. Storing said data approximately doubles # the size of the generated index. # Leave the following lines alone! 
CHARSET_DEFS:= $(foreach charset,$(CHARSET_LIST),-DCHARSET_$(charset)) ENCODING_DEFS:= $(foreach encoding,$(ENCODING_LIST),-DENCODING_$(encoding)) FEATURE_DEFS:= $(foreach feature,$(FEATURE_LIST),-DFEATURE_$(feature)) MOD_DEFS:= $(foreach mod,$(MOD_LIST),-DMOD_$(mod)) # These too! ifneq ($(CHARSET_DEFS),"") DECODING:= -DIMPLEMENT_DECODING endif ifneq ($(ENCODING_DEFS),"") DECODING:= -DIMPLEMENT_DECODING endif ifndef WIN32 # The search daemon ability is not currently supported for # Windows. The only way it will ever be is if somebody # volunteers to port the socket and multithreading code. SEARCH_DAEMON:= -DSEARCH_DAEMON -DMULTI_THREADED -D_REENTRANT # These definitions will build search(1) with the ability to run # in the background as a multi-threaded daemon process. Comment # this out if you have no need for this feature. Currently, the # daemon ability is supported only for Unix and not Windows. # # In order to build with the search daemon ability, use of the C # global variable "errno" must be thread-safe. This usually # means that "errno" is defined as a macro rather than an integer # variable. Different operating systems have different ways to # enable this. For example, Solaris requires that you #define # _REENTRANT; FreeBSD has it enabled automatically; for all other # operating systems, check your documentation. (Start with # error(3) and intro(2).) ifdef SEARCH_DAEMON ifdef FREE_BSD PTHREAD_LINK:= -pthread else PTHREAD_LINK:= -lpthread endif # Library to link against for POSIX threads if building with the # search daemon ability. ifdef LINUX SEARCH_DAEMON+= -D_XOPEN_SOURCE=500 # Linux needs this to define more POSIX thread functions. endif ifdef SOLARIS SOCKET_LINK:= -lsocket -lnsl # Library to link against for sockets if building with the search # daemon ability. endif endif # SEARCH_DAEMON endif # WIN32 ifeq ($(findstring id3,$(MOD_LIST)),id3) ZLIB_LINK:= -lz # Library to link against for zlib compression if building with # the ID3 module. 
endif ############################################################################### # # General stuff # ############################################################################### AR:= ar rv # The command (plus arguments) to create archive libraries; # usually "ar rv". RM:= rm -fr # The command to remove files recursively and ignore errors; # usually "rm -fr" for Unix or "erase" for Windows. PERL:= /usr/local/bin/perl # The full path to the Perl 5 executable; usually "/bin/perl" or # "/usr/local/bin/perl" for Unix or "\Perl\bin\perl" for # Windows. RANLIB:= ranlib # The command to generate library tables-of-contents; usually # "ranlib". If your OS doesn't need this done to libraries, you # can still leave this here since errors from this command are # ignored in the makefiles. SHELL:= /bin/sh # The shell to spawn for subshells; usually "/bin/sh". STRIP:= strip # The command to strip symbolic information from executables; # usually "strip". You can leave this defined even if your OS # doesn't have it or any equivalent since any errors from this # command are ignored in the makefiles. ############################################################################### # # C++ compiler # ############################################################################### CC:= g++ # The C++ compiler you are using; usually "CC" or "g++". #DEBUG:= true ifdef DEBUG OPTIM:= -g else OPTIM:= -O2 # The optimization level. Many compilers allow a digit after the # O to specify the level of optimization; if so, set yours to the # highest number your compiler allows without eliciting problems # in the optimizer. # # Using g++, -O3 under Cynwin under Windows produces bad code; # -O3 with 3.0 causes the optimizer to take ridiculously long and # use most of the CPU and memory. # # If SWISH++ doesn't work correctly with optimization on, but it # works just fine with it off, then there is a bug in your # compiler's optimizer. 
ifeq ($(findstring g++,$(CC)),g++) OPTIM+= -fomit-frame-pointer endif endif # DEBUG CCFLAGS:= -I. $(CHARSET_DEFS) $(ENCODING_DEFS) $(DECODING) $(MOD_DEFS) \ $(FEATURE_DEFS) $(SEARCH_DAEMON) $(OPTIM) # Flags for the C++ compiler. ifdef LINUX CCFLAGS+= -D_BSD_SOURCE endif ifeq ($(findstring g++,$(CC)),g++) CCFLAGS+= -fno-exceptions # Since SWISH++ doesn't use exceptions, turn off code generation # for them to save space in the executables. ifneq ($(findstring word_pos,$(FEATURE_LIST)),word_pos) CCFLAGS+= -fno-rtti # SWISH++ uses RTTI only for the word_pos feature, so, if the # feature isn't being compiled in, turn off code generation for # RTTI to save space in the executables. endif endif ifeq ($(findstring g++,$(CC)),g++) #CCFLAGS+= -fmessage-length=0 -W -Wcast-align -Wcast-qual -Wnon-virtual-dtor -Wpointer-arith -Wreorder -Wswitch -Wtraditional -Wuninitialized -Wunreachable-code -Wunused #-Winline -Wshadow endif # Warning flags specific to g++. Unless you are modifying the # source code, you should leave this commented out. ifdef SOLARIS TEMPLATE_REPOSITORY:= SunWS_cache # The name of the directory used as C++ template repository. # Using Sun's CC, this is usually "SunWS_cache". If another # compiler is being used that doesn't use such a repository # (e.g., g++), it's harmless to leave this as-is. endif ############################################################################### # # Installation # ############################################################################### INSTALL:= $(ROOT)/install-sh # Install command; usually "$(ROOT)/install-sh". I_ROOT:= /usr/local # The top-level directory of where SWISH++ will be installed. I_BIN:= $(I_ROOT)/bin # Where executables are installed; usually "$(I_ROOT)/bin". I_ETC:= $(I_ROOT)/etc # Where .conf files are installed; usually "$(I_ROOT)/etc". I_LIB:= $(I_ROOT)/lib # Where libraries are installed; usually "$(I_ROOT)/lib". I_MAN:= $(I_ROOT)/man # Where manual pages are installed; usually "$(I_ROOT)/man". 
I_OWNER:= -o bin # The owner of the installed files. I_GROUP:= -g bin # The group of the installed files. I_MODE:= -m 644 # File permissions for regular files (non executables). I_XMODE:= -m 755 # File permissions for eXecutables and directories. MKDIR:= $(INSTALL) $(I_OWNER) $(I_GROUP) $(I_XMODE) -d # Command used to create a directory. ########## You shouldn't have to change anything below this line. ############# # $(ROOT) is defined by the Makefile including this. # Must not use := here! ifeq ($(findstring g++,$(CC)),g++) MAKEDEPEND= $(CC) -MM $(CFLAGS) else MAKEDEPEND= $(PERL) $(ROOT)/config/makedepend.pl $(CFLAGS) endif .%.d : %.c $(ROOT)/platform.h $(SHELL) -ec '$(MAKEDEPEND) $< | sed "s!\([^:]*\):!\1 $@ : !g" > $@; [ -s $@ ] || $(RM) $@' ifneq ($(findstring platform,$(TARGET)),platform) $(ROOT)/platform.h $(ROOT)/config/platform.mk: @$(MAKE) -C $(ROOT)/config endif .SUFFIXES: .in % :: %.in $(PERL) $(ROOT)/config/config.pl $< < $(ROOT)/config/config.mk # vim:set noet sw=8 ts=8: ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/config.pl����������������������������������������������������������������������0000644�0000765�0000000�00000005017�10034714110�014512� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # config/config.pl: Perl script configuration script # # Copyright (C) 2000 Paul J. 
Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ########## You shouldn't have to change anything below this line. ############# use File::Basename; use File::Find; $ME = basename( $0 ); # basename of executable ## # Check command-line arguments. ## $#ARGV + 1 == 1 or die "usage: $ME file.in\n"; ( $IN_FILE ) = @ARGV; ## # Populate a key/value variable substitution hash from standard input. ## while ( <STDIN> ) { next if /^\s*#/; # skip comments chop; next unless /^\s*(\w+)\s*:?=\s*([^#]+)\s*/;# skip non-assignment lines my( $k, $v ) = ( $1, $2 ); # got an assignment $v =~ s/\s+$//; # remove trailing whitespace ## # Perform variable expansion on the RHS of the assignment allowing # either $VAR or $(VAR), the latter for 'make' variables. Do NOT # expand \$other, i.e., a literal $. (See also "Programming Perl," p. # 69.) # # In order not to expand \$other, change all \$ to <DOLLAR>, i.e., some # character sequence not containing a $ and most likely not otherwise # appearing in the string. When done, change them back. ## $v =~ s/\\\$/<DOLLAR>/g; $v = "$`$kv{ $1 }$'" while $v =~ /\$\(?(\w+)\)?/; $v =~ s/<DOLLAR>/\$/g; $kv{ $k } = $v; } ## # Perform substitutions in file or files. 
## sub substitute { return unless /\.in$/ && -T $_; unless ( open( FILE_IN, $_ ) ) { warn "$ME: can not read $_\n"; return; } my( $mode, $uid, $gid ) = (stat( _ ))[2,4,5]; my $file_out = $_; $file_out =~ s/\.in$//; unless ( open( FILE_OUT, ">$file_out" ) ) { warn "$ME: can not write $file_out\n"; close( FILE_IN ); return; } while ( <FILE_IN> ) { s/%%$k%%/$v/g while ( $k, $v ) = each %kv; print FILE_OUT; } close( FILE_OUT ); close( FILE_IN ); chmod( "0$mode", $file_out ); chown( $uid, $gid, $file_out ); } find( \&substitute, $IN_FILE ); # vim:set noet sw=8 ts=8: �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/GNUmakefile��������������������������������������������������������������������0000644�0000765�0000000�00000002153�10034714037�014770� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # config/GNUmakefile # # Copyright (C) 2000 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ########## You shouldn't have to change anything below this line. ############# ROOT:= .. TARGET:= $(ROOT)/platform.h platform.mk .PNONY: all all: $(TARGET) include $(ROOT)/config/config.mk ## # Build rules ## $(TARGET): src/*.c ./config-sh $(TARGET) $(CC) $(CCFLAGS) ## # Utility rules ## clean: distclean: clean $(RM) $(TARGET) # vim:set noet sw=8 ts=8: ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/makedepend.pl������������������������������������������������������������������0000644�0000765�0000000�00000006420�10304356614�015354� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # PJL Script Library # config/makedepend.pl # # Copyright (C) 2005 Paul J. 
Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ########## You shouldn't have to change anything below this line. ############# use File::Basename; $ME = basename( $0 ); # basename of executable sub usage { die "usage: $ME [-I dir]... file\n"; } ## # Go though all command-line arguments and capture -I options; ignore all # others; stop when we encounter a non-option. ## my $arg_is_next; my @dirs = ( '.' ); while ( defined( $_ = shift( @ARGV ) ) ) { /(.+)/ && $arg_is_next && do { push( @dirs, $1 ); $arg_is_next = 0; next; }; /^[^-]/ && do { unshift( @ARGV, $_ ); last; }; /^-I(.*)/ && do { if ( $1 ) { push( @dirs, $1 ); } else { $arg_is_next = 1; } next; }; } $#ARGV + 1 == 1 or usage(); ( $SOURCE_FILE ) = @ARGV; push( @files, $SOURCE_FILE ); # prime the pump map { s!/$!!; } @dirs; # get rid of trailing /'s map { $dir_set{ $_ } = 1; } @dirs; ## # Process all files. ## while ( my $file = shift( @files ) ) { ## # See if the current file contains a path in a subdirectory, e.g.: # # #include "sub/dir/foo.h" # # If so, add the subdirectory to the list/set of directories. ## my $file_dir = dirname( $file ); unless ( exists $dir_set{ $file_dir } ) { $dir_set{ $file_dir } = 1; push( @dirs2, $file_dir ); } ## # Look in all directories in @dirs for the current file. 
## my $found_dir; for my $dir ( @dirs ) { my $path = "$dir/$file"; if ( open( SOURCE, $path ) ) { $found_dir = $dir; goto FOUND; } for my $dir2 ( @dirs2 ) { $path = "$dir/$dir2/$file"; if ( open( SOURCE, $path ) ) { $found_dir = "$dir/$dir2"; goto FOUND; } } next; FOUND: $dep_set{ $path } = 1; last; } die "$ME: error: can not open $file\n" unless $found_dir; ## # Pluck files #include'd and add them to the list of files to process only # if we haven't seen them before. ## while ( <SOURCE> ) { next unless /^#\s*include\s+"([^"]+)"/; push( @files, $1 ) unless exists $dep_set{ "$found_dir/$1" }; } close( SOURCE ); } ## # Print the file and its dependencies. ## ( $OBJECT_FILE = $SOURCE_FILE ) =~ s/\.\w+$/\.o/; print "$OBJECT_FILE : ", join( ' ', keys %dep_set ), "\n"; # vim:set et sw=4 ts=4: ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/man.mk�������������������������������������������������������������������������0000644�0000765�0000000�00000005067�10034714037�014031� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # config/man.mk # # Copyright (C) 1998 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## NROFF= nroff -man # Command for formatting Unix manual pages for a terminal screen; # usually "nroff -man". TROFF= troff -man # Command for formatting Unix manual pages for a phototypesetter; # usually "troff -man". DPOST= /usr/lib/lp/postscript/dpost ifneq ($(wildcard $(DPOST)),) TO_PS= $(DPOST) else TO_PS= grops endif # Command to convert troff output to PostScript; usually # "/usr/lib/lp/postscript/dpost" for Solaris or "grops" for # systems using groff. TO_TXT= col -b # Command to strip all non-text characters from nroff output to # generate plain text versions of manual pages; usually # "col -b". ########## You shouldn't have to change anything below this line. ############# include $(ROOT)/config/config.mk # $(SECT) is defined by the makefile including this PAGES:= $(wildcard *.$(SECT)) TARGET_TXT:= $(PAGES:.$(SECT)=.txt) TARGET_PDF:= $(PAGES:.$(SECT)=.pdf) TARGET_PS:= $(PAGES:.$(SECT)=.ps) ## # Build rules ## .SUFFIXES: .SUFFIXES: .$(SECT) .pdf .ps .txt %.txt : %.$(SECT) $(NROFF) $< | $(TO_TXT) > $@ %.ps : %.$(SECT) $(TROFF) $< | $(TO_PS) > $@ # Use this function to try to locate Acrobat Distiller since it produces # better PDF than Ghostscript. 
pathsearch = $(firstword $(wildcard $(addsuffix /$(1),$(subst :, ,$(PATH))))) %.pdf : %.ps ifneq ($(call pathsearch,distill),) distill $< else gs -q -dNOPAUSE -sDEVICE=pdfwrite -sOutputFile=$@ $< -c quit endif all: text txt: $(TARGET_TXT) pdf: $(TARGET_PDF) ps : $(TARGET_PS) all: text pdf ps ## # Install rules ## install: $(I_MAN)/man$(SECT) $(INSTALL) $(I_OWNER) $(I_GROUP) $(I_MODE) $(PAGES) $? $(I_MAN)/man$(SECT): $(MKDIR) $@ uninstall: cd $(I_MAN)/man$(SECT) && $(RM) $(PAGES) ## # Utility rules ## clean distclean: $(RM) $(TARGET_TXT) $(TARGET_PDF) $(TARGET_PS) dist: $(MAKE) text pdf $(RM) $(TARGET_PS) # vim:set noet sw=8 ts=8: �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/mod.mk�������������������������������������������������������������������������0000644�0000765�0000000�00000002653�10303236400�014023� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # config/mod.mk # # Copyright (C) 2001 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ########## You shouldn't have to change anything below this line. ############# TARGET:= libmod_$(notdir $(CURDIR)).a .PHONY: all all: $(TARGET) include $(ROOT)/config/config.mk INCLUDES:= -I$(ROOT) CFLAGS:= $(CCFLAGS) $(DEBUGFLAGS) $(INCLUDES) SOURCES:= $(wildcard *.c) ## # Build rules ## $(TARGET): $(SOURCES:.c=.o) $(RM) $@ $(AR) $@ $^ -$(RANLIB) $@ # Don't do the "include" if the goal contains the word "clean," i.e., either # the "clean" or "distclean" goal. ifneq ($(findstring clean,$(MAKECMDGOALS)),clean) -include $(SOURCES:%.c=.%.d) endif ## # Utility rules ## clean: $(RM) *.o $(TEMPLATE_REPOSITORY) distclean: clean $(RM) $(TARGET) .*.d # vim:set noet sw=8 ts=8: �������������������������������������������������������������������������������������swish++-6.1.5/config/src/���������������������������������������������������������������������������0000755�0000765�0000000�00000000000�10746421420�013505� 5����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/src/errno.c��������������������������������������������������������������������0000644�0000765�0000000�00000000562�10030464471�015000� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** errno.c ** ** TEST thread-safe errno ** ERROR "errno" does not appear to be thread-safe; see the comment for ** ERROR SEARCH_DAEMON in config.mk. */ #ifdef SEARCH_DAEMON #include <cerrno> #ifndef errno This is an intentional error to get compilation to fail if errno is not defined as a macro. #endif #endif /* SEARCH_DAEMON */ /* vim:set noet sw=8 ts=8: */ ����������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/src/madvise.c������������������������������������������������������������������0000644�0000765�0000000�00000000341�10030464471�015276� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** madvise.c ** ** TEST madvise(2) ** FAIL PJL_NO_MADVISE */ #include <sys/types.h> #include <sys/mman.h> void f() { // // See if madvise(2) exists. 
// ::madvise( 0, 0, MADV_NORMAL ); } /* vim:set noet sw=8 ts=8: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/src/socklen_1_socklen_t.c������������������������������������������������������0000644�0000765�0000000�00000000543�10030464471�017571� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** socklen_1_socklen_t.c ** ** TEST socket length type is socklen_t ** PASS PJL_SOCKLEN_TYPE socklen_t */ #include <sys/types.h> #include <sys/socket.h> void f() { // // See if this socket implementation takes a socklen_t 3rd argument to // accept(2). 
// socklen_t len; ::accept( 1, (struct sockaddr*)0, &len ); } /* vim:set noet sw=8 ts=8: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/src/socklen_2_int.c������������������������������������������������������������0000644�0000765�0000000�00000000506�10030464471�016402� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** socklen_2_int.c ** ** TEST socket length type is int ** PASS PJL_SOCKLEN_TYPE int */ #include <sys/types.h> #include <sys/socket.h> void f() { // // See if this socket implementation takes an int 3rd argument to // accept(2). // int len; ::accept( 1, (struct sockaddr*)0, &len ); } /* vim:set noet sw=8 ts=8: */ ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/src/socklen_2_unsigned.c�������������������������������������������������������0000644�0000765�0000000�00000000537�10030464471�017430� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** socklen_2_unsigned.c ** ** TEST socket length type is unsigned ** PASS PJL_SOCKLEN_TYPE unsigned */ #include <sys/types.h> #include <sys/socket.h> void f() { // // See if this socket implementation takes an unsigned 
3rd argument to // accept(2). // unsigned len; ::accept( 1, (struct sockaddr*)0, &len ); } /* vim:set noet sw=8 ts=8: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config/src/zlib.c���������������������������������������������������������������������0000644�0000765�0000000�00000000347�10030464471�014614� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** zlib.c ** ** TEST zlib.h ** PASS HAVE_ZLIB */ #include <zlib.h> void f() { unsigned char dest[ 1024 ], src[ 1024 ]; unsigned long dest_len; uncompress( dest, &dest_len, src, sizeof src ); } /* vim:set noet sw=8 ts=8: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/config.h������������������������������������������������������������������������������0000644�0000765�0000000�00000027224�10165135413�013075� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** config.h ** ** Copyright (C) 1998 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef config_H #define config_H ////////// Word determination ///////////////////////////////////////////////// int const Word_Hard_Min_Size = 3; int const Word_Hard_Max_Size = 25; // The minimum and maximum lengths a word must be in order even to // bother doing more aggressive checks on it to determine if it should // be indexed. int const Word_Min_Size = 4; // The minimum length a non-acronym word must be in order to be // considered for indexing. int const Word_Min_Vowels = 1; // The minimum number of vowels a word must have in order to be // indexed. int const Word_Hex_Max_Size = 4; // The maximum length a string composed entirely of hexadecimal digits // i.e., ASCII hex data, can be before it is discarded. Note that the // word "cafe" is a legitimate English word composed entirely of // hexedecimal digits. This parameter is used only by extract(1) in // extract.c. // I don't think there is a word in English that has more than... 
int const   Word_Max_Consec_Consonants = 7;
    // ...this many consecutive consonants (like "symphysis")

int const   Word_Max_Consec_Vowels = 5;
    // ...this many consecutive vowels (like "queueing")

int const   Word_Max_Consec_Same = 2;
    // ...this many of the same alphabetic character consecutively

int const   Word_Max_Consec_Puncts = 1;
    // ...this many punctuation characters in a row

// Characters that are permissible in words: letters must be lower case and
// upper case letters would be redundant.
//
char const  Word_Chars[] = "&'-0123456789abcdefghijklmnopqrstuvwxyz_";
    // Characters that may be in a word.  Note that '&' is here so
    // acronyms like "AT&T" are treated as one word.  Unlike SWISH-E, ';'
    // does not need to be here to recognize and convert character entity
    // references.

#define OPTIMIZE_WORD_CHARS 1
    // If you are using the default set of characters, that is the
    // alphanumerics and "&'-_" characters, then having this macro set to
    // 1 will optimize the is_word_char() function yielding about a 10%
    // performance improvement; alternatively, you can also edit that
    // function to keep the optimization if you are not using the default
    // set of characters.  See word_util.h for details.

char const  Word_Begin_Chars[] = "0123456789abcdefghijklmnopqrstuvwxyz";
    // Characters that may begin a word; should be a subset of the above.

#define OPTIMIZE_WORD_BEGIN_CHARS 1
    // Same deal as with OPTIMIZE_WORD_CHARS.

char const  Word_End_Chars[] = "0123456789abcdefghijklmnopqrstuvwxyz";
    // Characters that may end a word; usually the same as the above.

#define OPTIMIZE_WORD_END_CHARS 1
    // Same deal as with OPTIMIZE_WORD_CHARS.

#ifdef SEARCH_DAEMON
////////// Search server daemon parameters ////////////////////////////////////

char const  SocketFile_Default[] = "/tmp/search.socket";
    // Default name of the Unix domain socket file; this can be overridden
    // either in a config. file or on the command line.

int const   SocketPort_Default = 1967;
    // Default port number of the TCP socket; this can be overridden
    // either in a config. file or on the command line.

int const   SocketQueueSize_Default = 511;
    // Maximum number of queued connections for a socket.  From [Stevens
    // 1998], p. 96:
    //
    //      Historically, sample code always shows a backlog of 5,
    //      as that was the maximum value supported by 4.2BSD.
    //      This was adequate in the 1980s when busy servers would
    //      handle only a few hundred connections per day.  But
    //      with the growth of the World Wide Web (WWW), where busy
    //      servers handle millions of connections per day, this
    //      small number is completely inadequate.  Busy HTTP
    //      servers must specify a much larger backlog, and newer
    //      kernels must support larger values.
    //
    // Unfortunately, Stevens doesn't say what a good value is.  The
    // default 511 value is taken from httpd.h in Apache:
    //
    //      It defaults to 511 instead of 512 because some systems
    //      store it as an 8-bit datatype; 512 truncated to 8-bits
    //      is 0, while 511 is 255 when truncated.
    //
    // If it's good enough for Apache, it's good enough for us.  This can
    // be overridden either in a config. file or on the command line.

int const   SocketTimeout_Default = 10;     // seconds
    // The number of seconds a client has to complete a search request
    // before being disconnected.  This is to prevent a client from
    // connecting, not completing a request, and causing the thread
    // servicing the request to wait forever.  This can be overridden
    // either in a config. file or on the command line.

int const   ThreadsMin_Default = 5;
    // The minimum number of simultaneous threads; this can be overridden
    // either in a config. file or on the command line.

int const   ThreadsMax_Default = 100;
    // The maximum number of simultaneous threads; this can be overridden
    // either in a config. file or on the command line.

int const   ThreadTimeout_Default = 30;     // seconds
    // The number of seconds until an idle spare thread times out and
    // destroys itself.  This can be overridden either in a config.
    // file or on the command line.

char const  User_Default[] = "nobody";
char const  Group_Default[] = "nobody";
    // The user and group to switch to after initialization (if root to
    // begin with).  This can be overridden either in a config. file or
    // on the command line.

#endif

////////// Miscellaneous parameters ///////////////////////////////////////////

char const  ConfigFile_Default[] = "swish++.conf";
    // Default name of the configuration file; this can be overridden on
    // the command line.

char const  ExtractExtension_Default[] = "txt";
    // Default extension to append to filenames during extraction.  This
    // can be overridden either in a config. file or on the command line.

int const   FilesGrow_Default = 100;
    // Default number of files to grow reserved space for when
    // incrementally indexing.  This can be overridden either in a config.
    // file or on the command line.

int const   FilesReserve_Default = 1000;
    // Default maximum number of files to reserve space for; see
    // file_info.c for details.  This can be overridden either in a
    // config. file or on the command line.

int const   Fork_Attempts = 5;
    // Number of times to try to fork before giving up.  This parameter is
    // used only in filter.c.

int const   Fork_Sleep = 5;                 // seconds
    // Number of seconds to sleep before retrying to fork.  This parameter
    // is used only in filter.c.

char const  IndexFile_Default[] = "swish++.index";
    // Default name of the index file generated/searched; can be
    // overridden either in a config. file or on the command line.

int const   ResultsMax_Default = 100;
    // Default maximum number of search results; this can be overridden
    // either in a config. file or on the command line.

char const  ShellFilenameDelimChars[] = " \t&;<>|";
    // Characters in a Unix shell command that delimit file names.  Note
    // that this says "file" (not "path") names.
char const ShellFilenameEscapeChars[] = " !\"#$&'()*/;<>?[\\]^`{|}~"; // Characters in a file name that must be escaped when passed to a // Unix shell. This is a superset of what are commonly referred to as // "meta-characters" because the space and tab characters are included. // Note again that this says "file" (not "path") name. #ifdef __CYGWIN__ char const TempDirectory_Default[] = "/temp"; #else #error You have not set TempDirectory_Default for your system. #error Comment out these lines after you have set it. char const TempDirectory_Default[] = "/tmp"; #endif // Default directory to use for temporary files during indexing. If // your OS mounts swap space via /tmp (e.g., Solaris), as indexing // progresses and more files get created in /tmp, you will have less // swap space, indexing will get slower, and you may run out of // memory. If this is the case, you can either change this default // here for all users (preferred) or override it either in a config. // file or on the command line to use a directory on a real // filesystem, i.e., one on a physical disk, e.g., /var/tmp on some // OSs. The directory must exist. int const TitleLines_Default = 12; // Specifies the maximum number of lines into a file for its "title" // (whatever that means for a given file format); this can be // overridden either in a config. file or on the command line. int const Title_Max_Size = 200; // Maximum length of a file "title" (whatever that means for a given // file format). #ifdef FEATURE_word_pos int const WordsNear_Default = 10; // The maximum number of words apart two words can be to be considered // "near" each other; this can be overridden either in a config. file // or on the command line. #endif int const WordPercentMax_Default = 100; // Default maximum percentage of files a word may occur in before it // is discarded as being too frequent; this can be overridden either // in a config. file or on the command line.
#error You have not set WordThreshold_Default for your system. #error Comment out these lines after you have set it. int const WordThreshold_Default = 250000; // The word count past which partial indicies are generated and merged // since all the words are too big to fit into memory at the same // time. If you index and your machine begins to swap like mad, lower // this value. The above works OK in a 64MB machine. A rule of thumb // is to add 250000 words for each additional 64MB of RAM you have. // These numbers are for a SPARC machine running Solaris. Other // machines running other operating systems use memory differently. // You simply have to experiment. Only the super-user can increase // this either in a config. file or on the command line. #endif /* config_H */ /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/Copying�������������������������������������������������������������������������������0000644�0000765�0000000�00000043076�07062631526�013025� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������ GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 675 Mass Ave, Cambridge, MA 02139, USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 
Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. 
If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. 
You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. 
If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. 
(This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. 
Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. 
Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS Appendix: How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
<one line to give the program's name and a brief idea of what it does.> Copyright (C) 19yy <name of author> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) 19yy name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. <signature of Ty Coon>, 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. 
If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/copying.dj����������������������������������������������������������������������������0000644�0000765�0000000�00000003774�07156314372�013463� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������This is the file "copying.dj". It does NOT apply to any sources or binaries copyrighted by UCB Berkeley, the Free Software Foundation, or any other agency besides DJ Delorie and others who have agreed to allow their sources to be distributed under these terms. Copyright Information for sources and executables that are marked Copyright (C) DJ Delorie 7 Kim Lane Rochester NH 03867-2954 This document is Copyright (C) DJ Delorie and may be distributed verbatim, but changing it is not allowed. Source code copyright DJ Delorie is distributed under the terms of the GNU General Public Licence, with the following exceptions: * Sources used to build crt0.o, gcrt0.o, libc.a, libdbg.a, and libemu.a are distributed under the terms of the GNU Library General Public License, rather than the GNU GPL. 
* Any existing copyright or authorship information in any given source file must remain intact. If you modify a source file, a notice to that effect must be added to the authorship information in the source file. * Runtime binaries, as provided by DJ in DJGPP, may be distributed without sources ONLY if the recipient is given sufficient information to obtain a copy of djgpp themselves. This primarily applies to go32-v2.exe, emu387.dxe, and stubedit.exe. * Runtime objects and libraries, as provided by DJ in DJGPP, when linked into an application, may be distributed without sources ONLY if the recipient is given sufficient information to obtain a copy of djgpp themselves. This primarily applies to crt0.o and libc.a. ----- Changes to source code copyright BSD or FSF by DJ Delorie fall under the terms of the original copyright. A copy of the files "COPYING" and "COPYING.LIB" are included with this document. If you did not receive a copy of these files, you may obtain one from whence this document was obtained, or by writing: Free Software Foundation 675 Mass Ave Cambridge, MA 02139 USA ����swish++-6.1.5/directory.c���������������������������������������������������������������������������0000644�0000765�0000000�00000016666�10263525237�013645� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** directory.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* ** Note that this file is #include'd into index.c and extract.c because ** it generates different code depending on which one it's compiled into. */ // standard #include <cstring> #include <iostream> #include <queue> #include <sys/types.h> /* needed by dirent.h */ #include <dirent.h> // local #include "directory.h" #include "my_set.h" #include "platform.h" #include "RecurseSubdirs.h" #include "util.h" #include "Verbosity.h" using namespace PJL; using namespace std; extern void do_file( char const *file_name, int dir_index ); #ifdef __CYGWIN__ // // The directory separator character ('/' for Unix) is apparantly transformed // into '\' for Windows by the intermediate Windows port of POSIX functions. // However, in the case where '/' is inserted into a string and that string is // printed, the mere printing won't do the transformation. Hence, this file // contains the one place in all of the SWISH++ code where we need to use '\' // explicitly when compiling under Windows. // char const Dir_Sep_Char = '\\'; #else char const Dir_Sep_Char = '/'; #endif /* __CYGWIN__ */ #ifdef INDEX dir_set_type dir_set; #endif #ifndef PJL_NO_SYMBOLIC_LINKS #include "FollowLinks.h" FollowLinks follow_symbolic_links; #endif #ifdef INDEX //***************************************************************************** // // SYNOPSIS // int check_add_directory( char const *dir_path ) // // DESCRIPTION // // Check to see if the given directory has been added to the list of // directories encountered: if not, add it. 
// // PARAMETERS // // dir_path The full path of a directory. The string must point to // storage that will last for the duration of the program. // // RETURN VALUE // // Returns the index number of the directory. // //***************************************************************************** { pair< dir_set_type::iterator, bool > const p = dir_set.insert( dir_set_type::value_type( dir_path, 0 ) ); if ( p.second ) { // // We really did insert a new directory: set the index to the actual // value. // p.first->second = dir_set.size() - 1; } return p.first->second; } //***************************************************************************** // // SYNOPSIS // void do_check_add_file( char const *file_name ) // // SYNOPSIS // // In the cases where a file is indexed directly from either the command // line or via standard input, its directory has to be added to dir_set. // //***************************************************************************** { char *const dir_path = new_strdup( file_name ); char *const slash = ::strrchr( dir_path, Dir_Sep_Char ); int dir_index; // // Check for the case of "./file": the directory "." doesn't need to be // added since it's automatically added. // if ( slash && (slash > dir_path + 1 || *dir_path != '.' ) ) { *slash = '\0'; dir_index = check_add_directory( dir_path ); } else { delete[] dir_path; dir_index = 0; } do_file( file_name, dir_index ); } #endif /* INDEX */ //***************************************************************************** // // SYNOPSIS // void do_directory( char const *dir_path ) // // DESCRIPTION // // Call do_file() for every file in the given directory; it will queue // subdirectories encountered and call do_directory() on them. It will // not follow symbolic links unless explicitly told to do so. // // This function uses a queue and recurses only once so as not to have too // many directories open concurrently. 
This has the side-effect of // indexing in a breadth-first order rather than depth-first. // // PARAMETERS // // dir_path The full path of the directory of the files and // subdirectories to index. The string must point to storage // that will last for the duration of the program. // //***************************************************************************** { typedef queue<char const*> dir_queue_type; static dir_queue_type dir_queue; static int recursion; if ( verbosity > 1 ) { if ( verbosity > 2 ) cout << '\n'; cout << dir_path << flush; } #ifndef PJL_NO_SYMBOLIC_LINKS if ( is_symbolic_link( dir_path ) && !follow_symbolic_links ) { if ( verbosity > 3 ) cout << " (skipped: symbolic link)"; if ( verbosity > 1 ) cout << '\n'; return; } #endif DIR *const dir_p = ::opendir( dir_path ); if ( !dir_p ) { if ( verbosity > 3 ) cout << " (skipped: can not open)"; if ( verbosity > 1 ) cout << '\n'; return; } if ( verbosity > 1 ) { if ( verbosity > 2 ) cout << ':'; cout << '\n'; } #ifdef INDEX int const dir_index = check_add_directory( dir_path ); #endif // // Have a buffer for the full path to a file in a directory. For each // file, simply strcpy() the file name into place one character past the // '/'. // char path[ PATH_MAX + 1 ]; ::strcpy( path, dir_path ); char *file = path + ::strlen( path ); *file++ = Dir_Sep_Char; struct dirent const *dir_ent; while ( dir_ent = ::readdir( dir_p ) ) { // // See if the name is "." or "..": if so, skip it. // if ( dir_ent->d_name[0] == '.' ) { if ( !dir_ent->d_name[1] ) continue; if ( dir_ent->d_name[1] == '.' && !dir_ent->d_name[2] ) continue; } ::strcpy( file, dir_ent->d_name ); if ( is_directory( path ) && recurse_subdirectories ) dir_queue.push( new_strdup( path ) ); else { // // Note that do_file() is called in the case where 'path' is a // directory and recurse_subdirectories is false. This is OK since // do_file() checks for and only does plain files. 
It's also // desirable to call do_file() so we don't have to repeat the code // to print verbose information for 'path'. // #ifdef INDEX do_file( path, dir_index ); #else do_file( path ); #endif } } ::closedir( dir_p ); if ( recursion ) return; ////////// Do all subdirectories ////////////////////////////////////////// while ( !dir_queue.empty() ) { char const *const dir_path = dir_queue.front(); dir_queue.pop(); ++recursion; do_directory( dir_path ); --recursion; } } /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������swish++-6.1.5/directory.h���������������������������������������������������������������������������0000644�0000765�0000000�00000002302�10263525237�013630� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** directory.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef directory_H #define directory_H // standard #include <map> // local #include "less.h" typedef std::map< char const*, int > dir_set_type; extern dir_set_type dir_set; // // This contains a map of all directory paths and an int that gives its // index (the directory number in the order encountered). #endif /* directory_H */ /* vim:set et sw=4 ts=4: */ ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/do_file.c�����������������������������������������������������������������������������0000644�0000765�0000000�00000020156�10203731236�013217� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** do_file.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ /* ** Note that this file is #include'd into index.c and extract.c because ** it generates different code depending on which one it's compiled into. */ // standard #include <vector> // local #include "encoded_char.h" //***************************************************************************** // // SYNOPSIS // #ifdef INDEX void do_file( char const *file_name, int dir_index ) #else void do_file( char const *file_name ) #endif // // DESCRIPTION // // Either index or extract text from the given file, but only if its // extension is among (not among) the specified set. It will not follow // symbolic links unless either the FollowLinks config. file variable or // the -l command-line option was given. // // For extraction, the algorithm is loosely based on what the Unix // strings(1) command does except it goes a bit further to discard things // like Encapsulated PostScript and raw hex data. // // PARAMETERS // // file_name The file to process. // //***************************************************************************** { char const *const orig_base_name = pjl_basename( file_name ); ++num_examined_files; if ( verbosity > 3 ) // print base name of file cout << " " << orig_base_name << flush; ////////// Simple checks to see if we should process the file ///////////// if ( !is_plain_file() ) { // // We're able to use the zero-argument form of is_plain_file() because // the stat_buf is cached by the call to file_exists() in both index.c // and extract.c just before the call to do_file(). // if ( verbosity > 3 ) cout << " (skipped: not plain file)\n"; return; } #ifdef INDEX // // Record the size of the original (non-filtered) file here before we call // is_symbolic_link() below. This is the size that is stored in the index. 
// off_t const orig_file_size = file_size(); #endif /* INDEX */ #ifndef PJL_NO_SYMBOLIC_LINKS if ( is_symbolic_link( file_name ) && !follow_symbolic_links ) { // // Despite the above comment for is_plain_file(), we have to use the // one-argument form is is_symbolic_link() because we need to call // lstat(2) rather than stat(2). // if ( verbosity > 3 ) cout << " (skipped: symbolic link)\n"; return; } #endif /* PJL_NO_SYMBOLIC_LINKS */ #ifdef INDEX // // If incrementally indexing, it's possible that we've encountered the file // before. // if ( incremental && file_info::seen_file( file_name ) ) { if ( verbosity > 3 ) cout << " (skipped: encountered before)\n"; return; } #endif /* INDEX */ ////////// Perform filter name substitution(s) //////////////////////////// typedef vector< filter > filter_list_type; filter_list_type filter_list; #ifdef INDEX char const *const orig_file_name = file_name; #endif while ( true ) { // // Determine if the file needs to be filtered and, if so, set the // filename to what it would become if it were filtered. // FilterFile::const_pointer const f = file_filters[ file_name ]; if ( !f ) break; filter_list.push_back( *f ); file_name = filter_list.back().substitute( file_name ); } char const *const base_name = pjl_basename( file_name ); // // Skip the file if it matches one of the set of unacceptable patterns. // if ( exclude_patterns.matches( base_name ) ) { if ( verbosity > 3 ) cout << " (skipped: file excluded)\n"; return; } // // See if the filename pattern is included. // #ifdef INDEX IncludeFile::const_iterator const #else ExtractFile::const_iterator const #endif include_pattern = include_patterns.find( base_name ); // // Skip the file if the set of acceptable patterns doesn't contain the // candidate, but only if there was at least one acceptable pattern // specified. 
// bool const found_pattern = include_pattern != include_patterns.end(); if ( !include_patterns.empty() && !found_pattern ) { if ( verbosity > 3 ) cout << " (skipped: file not included)\n"; return; } #ifdef EXTRACT ostream *out; ofstream extracted_file; if ( extract_as_filter ) { // // We're running as a filter: write to standard output. // out = &cout; } else { // // We're not running as a filter: check to see if the extracted file // already exists; if so, skip extraction entirely. // if ( ::strlen( file_name ) + extract_extension.length() > PATH_MAX ) { if ( verbosity > 3 ) cout << " (skipped: " << extract_extension << " file-name too long)\n"; return; } char extracted_file_name[ PATH_MAX + 1 ]; ::strcpy( extracted_file_name, file_name ); ::strcat( extracted_file_name, extract_extension ); if ( file_exists( extracted_file_name ) ) { if ( verbosity > 3 ) cout << " (skipped: " << extract_extension << " file already exists)\n"; return; } extracted_file.open( extracted_file_name ); if ( !extracted_file ) { if ( verbosity > 3 ) cout << " (skipped: can not create " << extract_extension << " file)\n"; return; } out = &extracted_file; } #endif /* EXTRACT */ // // Execute the filter(s) on the file. // FOR_EACH( filter_list_type, filter_list, f ) if ( !( file_name = f->exec() ) ) { if ( verbosity > 3 ) cout << " (skipped: could not filter)\n"; return; } // // We can (finally!) open the (possibly post-filtered) file. // mmap_file const file( file_name ); if ( !file ) { if ( verbosity > 3 ) cout << " (skipped: can not open)\n"; return; } file.behavior( mmap_file::bt_sequential ); if ( verbosity == 3 ) // print base name of file cout << " " << orig_base_name << flush; #ifdef INDEX if ( file.empty() ) { // // Don't waste a file_info entry on it. 
// if ( verbosity > 2 ) cout << " (0 words)\n"; return; } ////////// Index the file ///////////////////////////////////////////////// #ifdef IMPLEMENT_DECODING encoded_char_range::decoder::reset_all(); #endif indexer *const i = found_pattern ? include_pattern->second : indexer::text_indexer(); file_info *const fi = new file_info( orig_file_name, dir_index, orig_file_size, i->find_title( file ) ); #ifdef FEATURE_word_pos word_pos = 0; #endif i->index_file( file ); if ( verbosity > 2 ) cout << " (" << fi->num_words() << " words)\n"; if ( words.size() >= word_threshold ) write_partial_index(); #endif /* INDEX */ #ifdef EXTRACT ////////// Extract the file /////////////////////////////////////////////// ++num_extracted_files; extract_words( file.begin(), file.end(), *out ); #endif /* EXTRACT */ } /* vim:set et sw=4 ts=4: */ ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/enc_int.c�����������������������������������������������������������������������������0000644�0000765�0000000�00000005536�10263526043�013246� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** enc_int.c ** ** Copyright (C) 2003 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <iostream> // local #include "enc_int.h" #include "platform.h" using namespace std; //***************************************************************************** // // SYNOPSIS // int dec_int( register unsigned char const *&p ) // // DESCRIPTION // // Decode an integer from an encoded byte sequence. See the comment for // enc_int() for details of the encoding scheme. // // PARAMETERS // // p A pointer to the start of the encoded integer. After an integer is // decoded, it is left one past the last byte. // // RETURN VALUE // // The integer. // //***************************************************************************** { register unsigned n = 0; do { n = (n << 7) | (*p & 0x7Fu); } while ( *p++ & 0x80u ); return n; } //***************************************************************************** // // SYNOPSIS // ostream& enc_int( ostream &o, register unsigned n ) // // DESCRIPTION // // Write an unsigned integer to the given ostream in an encoded format. // The format uses a varying number of bytes. For a given byte, only the // lower 7 bits are used for data; the high bit, if set, is used to // indicate whether the integer continues into the next byte. 
The encoded // integer is written to the given ostream starting with the most // significant byte. // // PARAMETERS // // o The ostream to write to. // // n The integer to be written. // // RETURN VALUE // // Returns the passed-in ostream. // //***************************************************************************** { unsigned char buf[ 20 ]; // // Encode the integer (in reverse because it's easier) just like atoi(). // register unsigned char *p = buf + sizeof buf; do { *--p = 0x80u | (n & 0x7Fu); } while ( n >>= 7 ); buf[ sizeof buf - 1 ] &= 0x7Fu; // clear last "continuation bit" return o.write( reinterpret_cast<char*>( p ), buf + sizeof buf - p ); } /* vim:set et sw=4 ts=4: */ ������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/enc_int.h�����������������������������������������������������������������������������0000644�0000765�0000000�00000002231�10166052462�013241� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** enc_int.h ** ** Copyright (C) 2003 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef enc_int_H #define enc_int_H // standard #include <iostream> // local #include "omanip.h" int dec_int( unsigned char const*& ); std::ostream& enc_int( std::ostream&, unsigned ); inline PJL::omanip<unsigned> enc_int( unsigned n ) { return PJL::omanip<unsigned>( enc_int, n ); } #endif /* enc_int_H */ /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/encoded_char.c������������������������������������������������������������������������0000644�0000765�0000000�00000003445�10300243715�014214� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** encoded_char.c ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifdef IMPLEMENT_DECODING // local #include "encoded_char.h" encoded_char_range::decoder::set_type encoded_char_range::decoder::set_; //***************************************************************************** // // SYNOPSIS // encoded_char_range::decoder::~decoder() // // DESCRIPTION // // Destroy an encoded_char_range::decoder. // // NOTE // // This is out-of-line only because it's virtual. // //***************************************************************************** { // do nothing } //***************************************************************************** // // SYNOPSIS // void encoded_char_range::decoder::reset_all() // // SYNOPSIS // // Reset all the registered decoders. // //***************************************************************************** { for ( set_type::iterator i = set_.begin(); i != set_.end(); ++i ) (*i)->reset(); } #endif /* IMPLEMENT_DECODING */ /* vim:set et sw=4 ts=4: */ ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/encoded_char.h������������������������������������������������������������������������0000644�0000765�0000000�00000030535�10300243715�014221� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** encoded_char.h ** ** Copyright (C) 2000 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef encoded_char_H #define encoded_char_H // standard #include <iterator> #include <set> // local #include "iso8859-1.h" #include "util.h" /* for to_lower() */ //***************************************************************************** // // SYNOPSIS // class encoded_char_range // // DESCRIPTION // // An encoded_char_range is an abstraction that contains a range of // characters in memory that are encoded according to some scheme, either // a Content-Transfer-Encoding (such as Quoted-Printable or Base64) or a // character set (such as UTF-7 or UTF-8). A const_iterator is used to // iterate over the range and, when dereferenced, decodes a character. // // However, doing this is a serious performance hit since it has to be // done for every single character examined. Hence, the code is #ifdef'd // for MOD_id3 and MOD_mail: if neither is used, there's no need for any // special decoding. 
// //***************************************************************************** { public: typedef ptrdiff_t difference_type; typedef char value_type; typedef value_type const* pointer; typedef value_type (*charset_type )( pointer, pointer&, pointer ); typedef value_type (*encoding_type)( pointer, pointer&, pointer ); class const_iterator; friend class const_iterator; encoded_char_range( pointer begin, pointer end, charset_type = 0, encoding_type = 0 ); encoded_char_range( const_iterator const &pos ); encoded_char_range( const_iterator const &begin, const_iterator const &end ); // default copy constructor is fine // default assignment operator is fine const_iterator begin() const; pointer begin_pos() const { return begin_; } void begin_pos( pointer p ) { begin_ = p; } void begin_pos( const_iterator const& ); const_iterator end() const; pointer end_pos() const { return end_; } void end_pos( pointer p ) { end_ = p; } void end_pos( const_iterator const& ); #ifdef IMPLEMENT_DECODING class decoder; #endif protected: encoded_char_range() { } pointer begin_; pointer end_; #ifdef IMPLEMENT_DECODING charset_type charset_; encoding_type encoding_; #endif }; //***************************************************************************** // // SYNOPSIS // class encoded_char_range::const_iterator : public encoded_char_range, public std::iterator< std::forward_iterator_tag, encoded_char_range::value_type const > // // DESCRIPTION // // An encoded_char_range::const_iterator is (not surprisingly) an iterator // for an encoded_char_range. It might seem a bit odd to have an iterator // derived from the container class it's an iterator for (that's because // it is odd), but the iterator needs access to all its data members and // going through an extra level of indirection by having a pointer to it // would be slower. 
// //***************************************************************************** { public: typedef encoded_char_range::difference_type difference_type; typedef encoded_char_range::value_type value_type; typedef encoded_char_range::pointer pointer; const_iterator() { } const_iterator( pointer begin, pointer end, charset_type = 0, encoding_type = 0 ); // default copy constructor is fine // default assignment operator is fine value_type operator*() const; const_iterator& operator++(); const_iterator operator++(int); bool at_end() const { return pos_ == end_; } pointer pos() const { return pos_; } pointer& pos() { return pos_; } pointer prev_pos() const { return prev_; } friend bool operator==( const_iterator const&, const_iterator const& ); friend bool operator==( const_iterator const&, pointer ); private: mutable pointer pos_; mutable pointer prev_; #ifdef IMPLEMENT_DECODING mutable value_type ch_; mutable bool decoded_; mutable int delta_; #endif const_iterator( encoded_char_range const*, pointer start_pos ); friend class encoded_char_range; // for access to c'tor above #ifdef IMPLEMENT_DECODING void decode() const; #endif }; #ifdef IMPLEMENT_DECODING //***************************************************************************** // // SYNOPSIS // class encoded_char_range::decoder // // DESCRIPTION // // An encoded_char_range::decoder is used to keep decoders' state between // calls and reset state between files to their initial states just before // starting to index a file. 
// //***************************************************************************** { public: typedef encoded_char_range::value_type value_type; typedef encoded_char_range::pointer pointer; virtual ~decoder(); static void reset_all(); protected: decoder() { set_.insert( this ); } virtual void reset() = 0; private: typedef std::set< decoder* > set_type; static set_type set_; }; #endif /* IMPLEMENT_DECODING */ ////////// encoded_char_range inlines ///////////////////////////////////////// // I hate lots of typing. #define ECR encoded_char_range #define ECR_CI ECR::const_iterator inline ECR::ECR( pointer begin, pointer end, charset_type charset, encoding_type encoding ) : begin_( begin ), end_( end ) #ifdef IMPLEMENT_DECODING , charset_( charset ), encoding_( encoding ) #endif { } inline ECR::ECR( const_iterator const &i ) : begin_( i.pos_ ), end_( i.end_ ) #ifdef IMPLEMENT_DECODING , charset_( i.charset_ ), encoding_( i.encoding_ ) #endif { } inline ECR::ECR( const_iterator const &begin, const_iterator const &end ) : begin_( begin.pos_ ), end_( end.pos_ ) #ifdef IMPLEMENT_DECODING , charset_( begin.charset_ ), encoding_( begin.encoding_ ) #endif { } inline ECR_CI ECR::begin() const { return const_iterator( this, begin_ ); } inline ECR_CI ECR::end() const { return const_iterator( this, end_ ); } inline void ECR::begin_pos( const_iterator const &i ) { begin_ = i.pos_; } inline void ECR::end_pos( const_iterator const &i ) { end_ = i.pos_; } ////////// encoded_char_range::const_iterator inlines ///////////////////////// inline ECR_CI::const_iterator( pointer begin, pointer end, charset_type charset, encoding_type encoding ) : encoded_char_range( begin, end, charset, encoding ), pos_( begin ) #ifdef IMPLEMENT_DECODING , decoded_( false ) #endif { } inline ECR_CI::const_iterator( ECR const *ecr, pointer start_pos ) : encoded_char_range( start_pos, ecr->end_ #ifdef IMPLEMENT_DECODING , ecr->charset_, ecr->encoding_ #endif ), pos_( start_pos ) #ifdef IMPLEMENT_DECODING , 
decoded_( false ) #endif { } #ifdef IMPLEMENT_DECODING //***************************************************************************** // // SYNOPSIS // inline void ECR_CI::decode() const // // DESCRIPTION // // Decode the character at the iterator's current position according to // the character range's content-transfer-encoding. // // RETURN VALUE // // Returns the decoded character. // //***************************************************************************** { // // Remember the current position to allow the decoders to advance // through the encoded text. This allows the delta to be computed so // the iterator can be incremented later. // pointer c = pos_; // // A mail message can have both an encoding and a non-ASCII or // non-ISO-8859-1 charset simultaneously, e.g., base64-encoded UTF-8. // (In practice, this particular case should never happen since UTF-7 // should be used instead; but you get the idea.) // // However, handling both an encoding and such a charset simultaneously // is a real pain because both can use multiple characters to decode a // single character and keeping track of both positions is messy and I // didn't feel like thinking about this just now. // // Hence, a current caveat is that a mail message or attachment can // have EITHER an encoding OR a non-ASCII/ISO-8859-1 character set, but // not both. If it does, the encoding takes precedence. // if ( encoding_ ) ch_ = (*encoding_)( begin_, c, end_ ); else if ( charset_ ) ch_ = (*charset_)( begin_, c, end_ ); else ch_ = iso8859_1_to_ascii( *c++ ); delta_ = c - pos_; } #endif /* IMPLEMENT_DECODING */ //***************************************************************************** // // SYNOPSIS // inline ECR::value_type ECR_CI::operator*() const // // DESCRIPTION // // Dereference an encoded_char_range::const_iterator at its current // position. // // RETURN VALUE // // Returns the decoded character. 
// //***************************************************************************** { #ifdef IMPLEMENT_DECODING if ( !decoded_ ) { decode(); decoded_ = true; } return ch_; #else return iso8859_1_to_ascii( *pos_ ); #endif } //***************************************************************************** // // SYNOPSIS // inline ECR_CI& ECR_CI::operator++() // // DESCRIPTION // // Pre-increment the iterator's position by one. // // RETURN VALUE // // Returns a reference to the given object. // //***************************************************************************** { #ifdef IMPLEMENT_DECODING if ( decoded_ ) { // // The character at the current position has previously been // decoded so we know the delta. However, since we're about to // increment the position to the next character, that character // will no longer have been decoded, so set decoded_ to false. // decoded_ = false; } else { // // The character at the current position has not previously // been decoded so we don't know the delta: call decode() to // calculate the delta only. We can't set decoded_ to true // since we're about to increment the position to the next // character and that character hasn't been decoded. // decode(); } #endif prev_ = pos_; #ifdef IMPLEMENT_DECODING pos_ += delta_; #else ++pos_; #endif return *this; } //***************************************************************************** // // SYNOPSIS // inline ECR_CI ECR_CI::operator++(int) // // DESCRIPTION // // Post-increment the iterator's position by one. // // RETURN VALUE // // Returns a reference to the original (pre-incremented) object. // //***************************************************************************** { ECR_CI const temp = *this; return ++*this, temp; } //***************************************************************************** // // Equality operators. 
// //***************************************************************************** inline bool operator==( ECR_CI const &e1, ECR_CI const &e2 ) { return e1.pos_ == e2.pos_; } inline bool operator==( ECR_CI const &e, ECR_CI::pointer p ) { return e.pos_ == p; } inline bool operator==( ECR_CI::pointer p, ECR_CI const &e ) { return e == p; } inline bool operator!=( ECR_CI const &e1, ECR_CI const &e2 ) { return !( e1 == e2 ); } inline bool operator!=( ECR_CI const &e, ECR_CI::pointer p ) { return !( e == p ); } inline bool operator!=( ECR_CI::pointer p, ECR_CI const &e ) { return e != p; } //***************************************************************************** // // SYNOPSIS // inline char *to_lower( ECR const &range ) // // DESCRIPTION // // Return a pointer to a string converted to lower case taking the // encoding of the characters into account; the original string is // untouched. The string returned is from an internal pool of string // buffers. The time you get into trouble is if you hang on to more then // Num_Buffers strings. This doesn't normally happen in practice, // however. // // PARAMETERS // // c The iterator to use. // // RETURN VALUE // // A pointer to the lower-case string. 
// //***************************************************************************** { extern char_buffer_pool<128,5> lower_buf; register char *p = lower_buf.next(); for ( ECR_CI c = range.begin(); !c.at_end(); ++c ) *p++ = to_lower( *c ); *p = '\0'; return lower_buf.current(); } #undef ECR_CI #undef ECR #endif /* encoded_char_H */ /* vim:set noet sw=8 ts=8: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/encodings/����������������������������������������������������������������������������0000755�0000765�0000000�00000000000�10746421420�013422� 5����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/encodings/base64.c��������������������������������������������������������������������0000644�0000765�0000000�00000017001�10166052462�014653� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** encodings/base64.c ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifdef ENCODING_base64 // standard #include <cstring> // local #include "encoded_char.h" #include "platform.h" #include "util.h" using namespace std; namespace { class base64_decoder : public encoded_char_range::decoder { public: pointer prev_c_; private: virtual void reset() { prev_c_ = 0; } }; } // namespace //***************************************************************************** // // SYNOPSIS // encoded_char_range::value_type encoding_base64( encoded_char_range::pointer begin, encoded_char_range::pointer &c, encoded_char_range::pointer end ) // // DESCRIPTION // // Convert a base64-encoded character sequence to its single-character // equivalent. Ideally, we want to decode only a single character, but // Base64 encoding forces us to decode 3 characters at a time because they // are encoded as a unit into 4 bytes. This makes this code a major pain // and slow because characters have to be able to be decoded with random // access, i.e., wherever the pointer is positioned. // // An approach other than the one implemented here would have been to // decode the entire range into a buffer in one shot, but this could use a // lot of memory if the range is large (and the indexer already uses lots // of memory). Additionally, mapping the iterator position from encoded // space to decoded space would have been tricky and just as much of a // pain. // // Anywhere a space is returned it's because we've encountered an error // condition and the function has to return "something" and a space is // innocuous. 
// // PARAMETERS // // begin An pointer marking the beginning of the entire encoded range. // // c An pointer marking the position of the character to decode. It // is left after the decoded character. // // end An pointer marking the end of the entire encoded range. // // RETURN VALUE // // Returns the decoded character or ' ' upon error. // // SEE ALSO // // Ned Freed and Nathaniel S. Borenstein. "RFC 2045: Multipurpose // Internet Mail Extensions (MIME) Part One: Format of Internet Message // Bodies," Section 6.8, "Base64 Content-Transfer-Encoding," RFC 822 // Extensions Working Group of the Internet Engineering Task Force, // November 1996. // // NOTE // // This code is based on the decode_base64() function as part of "encdec // 1.1" by Jörgen Hägg <jh@efd.lth.se>, 1993. // //***************************************************************************** { int const Bits_Per_Char = 6; // by definition of Base64 encoding static encoded_char_range::value_type buf[ 3 ]; // group-of-4 -> 3 chars static base64_decoder decoder; // // See if the pointer is less than a buffer's-worth away from the previous // pointer: if so, simply return the already-decoded character. // encoded_char_range::difference_type delta = c - decoder.prev_c_; if ( delta >= 0 && delta < sizeof buf ) { // // We advance the pointer 1 position for the first 2 characters but 2 // positions for the 3rd since we have to skip over the 4th character // used in the encoded version of the characters. // return_decoded_char: if ( ++c != end && delta == 2 ) ++c; return buf[ delta ]; } // // If we're positioned at a newline, skip over it. // encoded_char_range::pointer line_begin = skip_newline( c, end ); if ( line_begin == end ) { // // We ran into the end: return something innocuous like a space since // we have to return something. 
// reached_end: c = end; return ' '; } if ( line_begin == c && line_begin > begin ) { // // Both line_begin hasn't moved (meaning we didn't just skip over a // newline) and we're not at the beginning of the encoded char range: // we need to "sync" by finding the beginning of the line to know where // the groups-of-4 encoded characters start. // while ( line_begin > begin && !is_space( *line_begin ) ) --line_begin; if ( line_begin > begin ) ++line_begin; } else { // // Either line_begin moved or we're at "begin": in either case, we're // at the beginning of a line. Just skip "c" over the newline also. // c = line_begin; } // // Calculate where the start of the group-of-4 encoded characters is. // delta = c - line_begin; encoded_char_range::difference_type const delta4 = delta & ~3u; encoded_char_range::pointer const group = line_begin + delta4; if ( group + 1 == end || group + 2 == end || group + 3 == end ) { // // Well-formed Base64-encoded text should always been in groups of 4 // characters. This text isn't: stop. // goto reached_end; } // // Determine the number of characters actually encoded into the 4 by // looking for padding characters ('='). // int const num_chars = group[2] == '=' ? 1 : group[3] == '=' ? 2 : 3; // // Calculate a combined value of the encoded 6-bit characters. // register unsigned value = 0; register int i; for ( i = 0; i <= num_chars; ++i ) { static char const alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789+/="; // // Find the character in the Base64 alphabet. // if ( char const *const a = ::strchr( alphabet, group[i] ) ) value += (a - alphabet) << ((3 - i) * Bits_Per_Char); else { // // From RFC 2045, section 6.8: // // Any characters outside of the base64 alphabet are to be // ignored in base64-encoded data. // /* do nothing */; } } // // Now that we have a combined value, break it back apart but in 8-bit // chunks, i.e., ordinary characters. 
// for ( i = 2; i >= 0; --i ) { buf[ i ] = value & 255; value >>= 8; } // // Pretend to have decoded a single character and that it took only a // single byte to do it. Additionally, remember the position of the // pointer marking the beginning of the range of characters that have been // decoded. If we subsequently are asked to decode a character in the // range [i,i+3), we can simply return the character. // decoder.prev_c_ = c; delta -= delta4; goto return_decoded_char; } #endif /* ENCODING_base64 */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/encodings/encodings.h�����������������������������������������������������������������0000644�0000765�0000000�00000003021�10030464515�015536� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** encodings/encodings.h ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef encodings_H #define encodings_H // local #include "encoded_char.h" encoded_char_range::encoding_type const Seven_Bit = 0; encoded_char_range::encoding_type const Eight_Bit = 0; // treaded like Seven_Bit encoded_char_range::encoding_type const Binary = reinterpret_cast<encoded_char_range::encoding_type>( ~0 ); #ifdef ENCODING_base64 encoded_char_range::value_type encoding_base64( encoded_char_range::pointer begin, encoded_char_range::pointer &pos, encoded_char_range::pointer end ); #endif #ifdef ENCODING_quoted_printable encoded_char_range::value_type encoding_quoted_printable( encoded_char_range::pointer begin, encoded_char_range::pointer &pos, encoded_char_range::pointer end ); #endif #endif /* encodings_H */ /* vim:set noet sw=8 ts=8: */ ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/encodings/GNUmakefile�����������������������������������������������������������������0000644�0000765�0000000�00000002670�10036607726�015510� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # encodings/GNUmakefile # # Copyright (C) 2002 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ########## You shouldn't have to change anything below this line. ############# TARGET:= libencodings.a .PHONY: all all: $(TARGET) ROOT:= .. include $(ROOT)/config/config.mk INCLUDES:= -I$(ROOT) CFLAGS:= $(CCFLAGS) $(DEBUGFLAGS) $(INCLUDES) SOURCES:= $(foreach encoding,$(ENCODING_LIST),$(encoding).c) ## # Build rules ## $(TARGET): $(SOURCES:.c=.o) $(RM) $@ $(AR) $@ $^ -$(RANLIB) $@ # Don't do the "include" if the goal contains the word "clean," i.e., either # the "clean" or "distclean" goal. 
ifneq ($(findstring clean,$(MAKECMDGOALS)),clean) -include $(SOURCES:%.c=.%.d) endif ## # Utility rules ## clean: $(RM) *.o $(TEMPLATE_REPOSITORY) distclean: clean $(RM) $(TARGET) .*.d ������������������������������������������������������������������������swish++-6.1.5/encodings/quoted_printable.c����������������������������������������������������������0000644�0000765�0000000�00000011435�10263525153�017135� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** encodings/quoted_printable.c ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifdef ENCODING_quoted_printable // standard #include <cstring> // local #include "encoded_char.h" #include "platform.h" #include "util.h" using namespace std; //***************************************************************************** // // SYNOPSIS // encoded_char_range::value_type encoding_quoted_printable( encoded_char_range::pointer, encoded_char_range::pointer &c, encoded_char_range::pointer end ) // // DESCRIPTION // // Convert a quoted-printable character sequence to its single-character // equivalent. However, if it's just a "soft line break," return the // character after it, i.e., make it seem as if the soft line break were // not there. // // Anywhere a space is returned it's because we've encountered an error // condition and the function has to return "something" and a space is // innocuous. // // PARAMETERS // // c An pointer marking the position of the character to decode. It // is left after the decoded character. // // end An pointer marking the end of the entire encoded range. // // RETURN VALUE // // Returns the decoded character or ' ' upon error. // // SEE ALSO // // Ned Freed and Nathaniel S. Borenstein. "RFC 2045: Multipurpose // Internet Mail Extensions (MIME) Part One: Format of Internet Message // Bodies," Section 6.7, "Quoted-Printable Content-Transfer-Encoding," RFC // 822 Extensions Working Group of the Internet Engineering Task Force, // November 1996. // //***************************************************************************** { // // Check to see if the character at the current position is an '=': if not, // the character is an ordinary character; if so, the character is a // quoted-printable encoded character and needs to be decoded. 
// if ( *c != '=' ) return *c++; if ( ++c == end ) return ' '; encoded_char_range::value_type h1; while ( true ) { h1 = *c++; if ( h1 == '\r' ) { // // The '=' was the last character on a line so this is supposed to // be a "soft line break": we therefore have to skip over it // entirely making things appear as though it's not even there by // returning the character after the break. // if ( c == end || *c == '\n' && ++c == end ) return ' '; if ( *c != '=' ) return *c++; // // The character after the soft line break just so happens to be // another '=' so we have to start all over again. // if ( ++c == end ) return ' '; continue; } if ( h1 == '\n' ) { // // Although "soft line breaks" are supposed to be represented by // CR-LF pairs, we're being robust here and allowing just an LF by // itself. // if ( c == end ) return ' '; if ( *c != '=' ) return *c++; if ( ++c == end ) return ' '; continue; } break; } if ( !is_xdigit( h1 ) || c == end ) { // // If it's not a hexadecimal digit or it's the last character, it's // malformed. // return ' '; } encoded_char_range::value_type const h2 = *c++; if ( !is_xdigit( h2 ) ) { // // This shouldn't happen in proper quoted-printable text. // return ' '; } return static_cast<encoded_char_range::value_type>( // // We're being robust by ensuring the hexadecimal characters are upper // case. // ( is_digit( h1 ) ? h1 - '0' : toupper( h1 ) - 'A' + 10 ) << 4 | ( is_digit( h2 ) ? 
h2 - '0' : toupper( h2 ) - 'A' + 10 ) ); } #endif /* ENCODING_quoted_printable */ /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/encodings/README����������������������������������������������������������������������0000644�0000765�0000000�00000001434�07475324104�014312� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������=============================================================================== README for SWISH++ encodings =============================================================================== The files in this directory handle the values for the Content-Transfer-Encoding header (RFC 2045 section 6) used by the mail indexer, to wit: quoted-printable and base64. The values 7bit and 8bit are easily handled internally as ASCII and ISO 8859-1, respectively; the values binary, and x-token are not handled at all. The reason these files are here in this subdirectory rather than in mod/mail is because, some day, there may be a way to specify non-mail files that are, say, base-64 encoded to be decoded. If this ever happens, then additional non-mail decoders could be added to this subdirectory. 
������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/ExcludeFile.c�������������������������������������������������������������������������0000644�0000765�0000000�00000003067�10166052461�014015� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** ExcludeFile.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <cstring> // local #include "ExcludeFile.h" #include "platform.h" #include "util.h" /* for new_strdup() */ using namespace std; //***************************************************************************** // // SYNOPSIS // void ExcludeFile::parse_value( char *line ) // // DESCRIPTION // // Parse the line of text by splitting it into words that are separated by // whitespace. // // PARAMETERS // // line The line of text to be parsed. 
// //***************************************************************************** { for ( register char const *s; s = ::strtok( line, " \r\t" ); line = 0 ) insert( new_strdup( s ) ); } /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/ExcludeFile.h�������������������������������������������������������������������������0000644�0000765�0000000�00000003653�10166044112�014015� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** ExcludeFile.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef ExcludeFile_H #define ExcludeFile_H // local #include "conf_var.h" #include "pattern_map.h" //***************************************************************************** // // SYNOPSIS // class ExcludeFile : public conf_var, public pattern_map< bool > // // DESCRIPTION // // An ExcludeFile is-a conf_var containing the set of filename patterns to // exclude during either indexing or extraction. // // This is the same as either index's or extract's -E command-line option. // // NOTE // // The bool template parameter is not used here. It's simply to reuse // the code for pattern_map as-is. // //***************************************************************************** { public: ExcludeFile() : conf_var( "ExcludeFile" ) { } CONF_VAR_ASSIGN_OPS( ExcludeFile ) void insert( char const *pattern ) { pattern_map<bool>::insert( pattern, false ); } private: virtual void parse_value( char *line ); virtual void reset() { clear(); } }; extern ExcludeFile exclude_patterns; #endif /* ExcludeFile_H */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������swish++-6.1.5/ExcludeMeta.h�������������������������������������������������������������������������0000644�0000765�0000000�00000003000�10166044112�014006� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** ExcludeMeta.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef ExcludeMeta_H #define ExcludeMeta_H // local #include "conf_set.h" //***************************************************************************** // // SYNOPSIS // class ExcludeMeta : public conf_set // // DESCRIPTION // // An ExcludeMeta is-a conf_set containing the set of META names to // exclude during indexing. // // This is the same as index's -M command-line option. // //***************************************************************************** { public: ExcludeMeta() : conf_set( "ExcludeMeta" ) { } CONF_SET_ASSIGN_OPS( ExcludeMeta ) }; extern ExcludeMeta exclude_meta_names; #endif /* ExcludeMeta_H */ /* vim:set et sw=4 ts=4: */ swish++-6.1.5/exit_codes.h��������������������������������������������������������������������������0000644�0000765�0000000�00000003774�10032070406�013753� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** exit_codes.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef exit_codes_H #define exit_codes_H // exit(3) status codes enum { // common to all executables Exit_Internal_Error = 127, Exit_Success = 0, Exit_Config_File = 1, Exit_Usage = 2, // unique to index Exit_No_Open_Temp = 10, Exit_No_Write_Index = 11, Exit_No_Write_Temp = 12, Exit_Not_Root = 13, // unique to extract Exit_No_Such_File = 20, // common between index and extract Exit_No_Read_Stopwords = 30, // common between index and search Exit_No_Read_Index = 40, // unique to search Exit_Malformed_Query = 50, #ifdef FEATURE_word_pos Exit_No_Word_Pos_Data = 51, #endif #ifdef SEARCH_DAEMON Exit_No_Write_PID = 60, Exit_No_Host_or_IP = 61, Exit_No_TCP_Socket = 62, Exit_No_Unix_Socket = 63, Exit_No_Unlink = 64, Exit_No_TCP_Bind = 65, Exit_No_Unix_Bind = 66, Exit_No_TCP_Listen = 67, Exit_No_Unix_Listen = 68, Exit_No_Select = 69, Exit_No_Accept = 70, Exit_No_Fork = 71, Exit_No_Change_Dir = 72, Exit_No_Create_Thread = 73, Exit_No_Create_Thread_Key = 74, Exit_No_Detach_Thread = 75, Exit_No_Init_Thread_Condition = 76, Exit_No_Init_Thread_Mutex = 77, Exit_No_User = 78, Exit_No_Group = 79, #endif Exit_End_Enum_Marker }; #endif /* exit_codes_H */ /* vim:set noet sw=8 ts=8: */ ����swish++-6.1.5/extract.c�����������������������������������������������������������������������������0000644�0000765�0000000�00000040066�10203731203�013264� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** extract.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // standard #include <cstdlib> /* for exit(2) */ #include <cstring> #include <fstream> #include <iomanip> /* for setfill(), setw() */ #include <iostream> #include <string> #include <sys/types.h> #include <time.h> // local #include "platform.h" #include "config.h" #include "ExcludeFile.h" #include "exit_codes.h" #include "ExtractExtension.h" #include "ExtractFile.h" #include "ExtractFilter.h" #include "FilterFile.h" #ifndef PJL_NO_SYMBOLIC_LINKS #include "FollowLinks.h" #endif #include "mmap_file.h" #include "option_stream.h" #include "postscript.h" #include "RecurseSubdirs.h" #include "StopWordFile.h" #include "stop_words.h" #include "util.h" #include "Verbosity.h" #include "version.h" #include "word_util.h" using namespace PJL; using namespace std; ExcludeFile exclude_patterns; // do not extract these ExtractFile include_patterns; // do extract these ExtractFilter extract_as_filter; ExtractExtension extract_extension; FilterFile file_filters; static bool in_postscript; char const* me; // executable name int num_examined_files; int num_extracted_files; RecurseSubdirs recurse_subdirectories; Verbosity verbosity; // how much to print static bool extract_word( char *word, int len, ostream& ); static void extract_words( mmap_file::const_iterator begin, mmap_file::const_iterator end, ostream& ); static ostream& usage( ostream& = cerr ); #define EXTRACT #include "do_file.c" #include "directory.c" //***************************************************************************** // // SYNOPSIS // int main( int argc, char *argv[] ) // // DESCRIPTION // // Parse the command line, initialize, call other functions ... the usual // things that are done in main(). // // PARAMETERS // // argc The number of arguments. // // argv A vector of the arguments; argv[argc] is null. Aside from the // options below, the arguments are the names of the files and // directories to be extracted. // // SEE ALSO // // Bjarne Stroustrup. "The C++ Programming Language, 3rd ed." 
// Addison-Wesley, Reading, MA, 1997. pp. 116-118. // //***************************************************************************** { me = ::strrchr( argv[0], '/' ); // determine base name... me = me ? me + 1 : argv[0]; // ...of executable #ifdef RLIMIT_CPU /* SVR4, 4.2+BSD */ // // Max-out the amount of CPU time we can run since extraction can take a // while. // max_out_limit( RLIMIT_CPU ); #endif /////////// Process command-line options ////////////////////////////////// static option_stream::spec const opt_spec[] = { "help", 0, '?', "config", 1, 'c', "pattern", 1, 'e', "no-pattern", 1, 'E', "filter", 0, 'f', #ifndef PJL_NO_SYMBOLIC_LINKS "follow-links", 0, 'l', #endif "no-recurse", 0, 'r', "stop-file", 1, 's', "dump-stop", 0, 'S', "verbose", 1, 'v', "version", 0, 'V', "extension", 1, 'x', 0 }; char const* config_file_name_arg = ConfigFile_Default; bool dump_stop_words_opt = false; bool extract_as_filter_opt = false; char const* extract_extension_arg = 0; #ifndef PJL_NO_SYMBOLIC_LINKS bool follow_symbolic_links_opt = false; #endif bool recurse_subdirectories_opt = false; StopWordFile stop_word_file_name; char const* stop_word_file_name_arg = 0; char const* verbosity_arg = 0; option_stream opt_in( argc, argv, opt_spec ); for ( option_stream::option opt; opt_in >> opt; ) switch ( opt ) { case '?': // Print help. cerr << usage; case 'c': // Specify config. file. config_file_name_arg = opt.arg(); break; case 'e': { // Filename pattern(s) to extract. char *a = opt.arg(); for ( char *pat; pat = ::strtok( a, "," ); ) { include_patterns.insert( pat, 0 ); a = 0; } break; } case 'E': { // Filename pattern(s) not to extract. char *a = opt.arg(); for ( char *pat; pat = ::strtok( a, "," ); ) { exclude_patterns.insert( pat ); a = 0; } break; } case 'f': // Run as a filter. extract_as_filter_opt = true; break; #ifndef PJL_NO_SYMBOLIC_LINKS case 'l': // Follow symbolic links during extraction. 
follow_symbolic_links_opt = true; break; #endif case 'r': // Specify whether to extract recursively. recurse_subdirectories_opt = true; break; case 's': // Specify stop-word list. stop_word_file_name_arg = opt.arg(); break; case 'S': // Dump stop-word list. dump_stop_words_opt = true; break; case 'v': // Specify verbosity level. verbosity_arg = opt.arg(); break; case 'V': // Display version and exit. cout << "SWISH++ " << version << endl; ::exit( Exit_Success ); case 'x': // Specify filename extension to append. extract_extension_arg = opt.arg(); break; default: // Bad option. cerr << usage; } argc -= opt_in.shift(), argv += opt_in.shift(); // // First, parse the config. file (if any); then override variables // specified on the command line with options. // conf_var::parse_file( config_file_name_arg ); if ( extract_as_filter_opt ) extract_as_filter = true; if ( extract_as_filter ) { // // When running as a filter, patterns aren't used. We clear them here // in case some were set via an IncludeFile directive in a // configuration file. That let's us get away with not having to // special-case the code in do_file(). // exclude_patterns.clear(); include_patterns.clear(); } if ( extract_extension_arg ) extract_extension = extract_extension_arg; if ( *extract_extension != '.' ) // prepend '.' if needed extract_extension = string( "." 
) + (char const*)extract_extension; #ifndef PJL_NO_SYMBOLIC_LINKS if ( follow_symbolic_links_opt ) follow_symbolic_links = true; #endif if ( recurse_subdirectories_opt ) recurse_subdirectories = false; if ( stop_word_file_name_arg ) stop_word_file_name = stop_word_file_name_arg; if ( verbosity_arg ) verbosity = verbosity_arg; /////////// Deal with stop-words ////////////////////////////////////////// stop_words = new stop_word_set( stop_word_file_name ); if ( dump_stop_words_opt ) { ::copy( stop_words->begin(), stop_words->end(), ostream_iterator< char const* >( cout, "\n" ) ); ::exit( Exit_Success ); } /////////// Extract specified directories and files /////////////////////// bool const using_stdin = *argv && (*argv)[0] == '-' && !(*argv)[1]; if ( !( extract_as_filter || using_stdin ) && include_patterns.empty() && exclude_patterns.empty() ) error() << "filename patterns must be specified when not" " a filter nor\nusing standard input\n" << usage; if ( !argc ) cerr << usage; ////////// Extract text from specified files ////////////////////////////// time_t time = ::time( 0 ); // Go! if ( extract_as_filter ) { // // Do a single file. // if ( !file_exists( *argv ) ) { error() << *argv << " does not exist\n"; ::exit( Exit_No_Such_File ); } do_file( *argv ); } else if ( using_stdin ) { // // Read file/directory names from standard input. // char file_name[ PATH_MAX + 1 ]; while ( cin.getline( file_name, PATH_MAX ) ) { if ( !file_exists( file_name ) ) { if ( verbosity > 3 ) cout << " " << file_name << " (skipped: does not exist)\n"; continue; } if ( is_directory() ) do_directory( new_strdup( file_name ) ); else do_file( file_name ); } } else { // // Read file/directory names from command line. // for ( ; *argv; ++argv ) { if ( !file_exists( *argv ) ) { if ( verbosity > 3 ) cout << " " << *argv << " (skipped: does not exist)\n"; continue; } if ( is_directory() ) do_directory( *argv ); else do_file( *argv ); } } if ( verbosity ) { time = ::time( 0 ) - time; // Stop! 
cout << '\n' << me << ": done:\n " << setfill('0') << setw(2) << (time / 60) << ':' << setw(2) << (time % 60) << " (min:sec) elapsed time\n " << num_examined_files << " files, " << num_extracted_files << " extracted\n\n"; } ::exit( Exit_Success ); } //***************************************************************************** // // SYNOPSIS // bool extract_word( register char *word, register int len, ostream &out ) // // DESCRIPTION // // Potentially extract the given word. // // PARAMETERS // // word The candidate word to be extracted. // // len The length of the word since it is not null-terminated. // // out The ostream to write the word to. // // RETURN VALUE // // Returns true only if the word was extracted. // //***************************************************************************** { if ( len < Word_Hard_Min_Size ) return false; word[ len ] = '\0'; ////////// Look for Encapsulated PostScript code and skip it ////////////// if ( in_postscript ) { if ( !::strcmp( word, "%%Trailer" ) ) in_postscript = false; return false; } static postscript_comment_set const postscript_comments; if ( postscript_comments.contains( word ) ) { in_postscript = true; return false; } static postscript_operator_set const postscript_operators; if ( postscript_operators.contains( word ) ) return false; ////////// Strip chars not in Word_Begin_Chars/Word_End_Chars ///////////// for ( register int i = len - 1; i >= 0; --i ) { if ( is_word_end_char( word[ i ] ) ) break; --len; } if ( len < Word_Hard_Min_Size ) return false; word[ len ] = '\0'; while ( *word ) { if ( is_word_begin_char( *word ) || *word == '%' ) break; --len, ++word; } if ( len < Word_Hard_Min_Size ) return false; ////////// Discard what looks like ASCII hex data ///////////////////////// if ( len > Word_Hex_Max_Size && (int)::strspn( word, "0123456789abcdefABCDEF" ) == len ) return false; ////////// Stop-word checks /////////////////////////////////////////////// if ( !is_ok_word( word ) || stop_words->contains( 
to_lower( word ) ) ) return false; out << word << '\n'; return true; } //***************************************************************************** // // SYNOPSIS // void extract_words( register mmap_file::const_iterator c, register mmap_file::const_iterator end, ostream &out ) // // DESCRIPTION // // Extract the words between the given iterators. // // PARAMETERS // // c The iterator marking the beginning of the text to extract. // // end The iterator marking the end of the text to extract. // // out The ostream to write the words to. // //***************************************************************************** { char word[ Word_Hard_Max_Size + 1 ]; int len; int num_words = 0; bool in_word = false; in_postscript = false; while ( c != end ) { register char const ch = *c++; ////////// Collect a word ///////////////////////////////////////////// if ( is_word_char( ch ) || ch == '%' ) { if ( !in_word ) { // start a new word word[ 0 ] = ch; len = 1; in_word = true; continue; } if ( len < Word_Hard_Max_Size ) { // continue same word word[ len++ ] = ch; continue; } in_word = false; // too big: skip chars while ( c != end && is_word_char( *c++ ) ) ; continue; } if ( in_word ) { // // We ran into a non-word character, so extract the word up to, but // not including, it. // in_word = false; num_words += extract_word( word, len, out ); } } if ( in_word ) { // // We ran into 'end' while still accumulating characters into a word, // so just extract what we've got. // num_words += extract_word( word, len, out ); } if ( verbosity > 2 ) cout << " (" << num_words << " words)" << endl; } //***************************************************************************** // // Miscellaneous function(s) // //***************************************************************************** ostream& usage( ostream &err ) { err << "usage: " << me << " [options] dir ... file ...\n" "options: (unambiguous abbreviations may be used for long options)\n" "========\n" "-? 
| --help : Print this help message\n" "-c f | --config-file f : Name of configuration file [default: " << ConfigFile_Default << "]\n" "-e p | --pattern p : Filename pattern to extract [default: none]\n" "-E p | --no-pattern p : Filename pattern not to extract [default: none]\n" "-f | --filter : Filter one file to standard output [default: no]\n" #ifndef PJL_NO_SYMBOLIC_LINKS "-l | --follow-links : Follow symbolic links [default: no]\n" #endif "-r | --no-recurse : Don't extract subdirectories [default: do]\n" "-s f | --stop-file f : Stop-word file to use instead of built-in default\n" "-S | --dump-stop : Dump stop-words, exit\n" "-v v | --verbosity v : Verbosity level [0-4; default: 0]\n" "-V | --version : Print version number, exit\n" "-x e | --extension e : Extension to append to filename [default: txt]\n"; ::exit( Exit_Usage ); return err; // just to make compiler happy } /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/ExtractExtension.h��������������������������������������������������������������������0000644�0000765�0000000�00000003220�10166052461�015127� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** ExtractExtension.h ** ** Copyright (C) 1998 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef ExtractExtension_H #define ExtractExtension_H // local #include "config.h" #include "conf_string.h" //***************************************************************************** // // SYNOPSIS // class ExtractExtension : public conf<std::string> // // DESCRIPTION // // An ExtractExtension is-a conf<std::string> containing the extension to // append to filenames during extraction. // // This is the same as extract's -x command-line option. 
// //***************************************************************************** { public: ExtractExtension() : conf<std::string>( "ExtractExtension", ExtractExtension_Default ) { } CONF_STRING_ASSIGN_OPS( ExtractExtension ) }; extern ExtractExtension extract_extension; #endif /* ExtractExtension_H */ /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/ExtractFile.c�������������������������������������������������������������������������0000644�0000765�0000000�00000003072�10166052461�014032� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** ExtractFile.c ** ** Copyright (C) 2000 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // standard #include <cstring> // local #include "ExtractFile.h" #include "platform.h" #include "util.h" /* for new_strdup() */ using namespace std; //***************************************************************************** // // SYNOPSIS // void ExtractFile::parse_value( char *line ) // // DESCRIPTION // // Parse the line of text by splitting it into words that are separated by // whitespace. // // PARAMETERS // // line The line of text to be parsed. // //***************************************************************************** { for ( register char const *s; s = ::strtok( line, " \r\t" ); line = 0 ) insert( new_strdup( s ), 0 ); } /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/ExtractFile.h�������������������������������������������������������������������������0000644�0000765�0000000�00000003310�10166044112�014024� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** ExtractFile.h ** ** Copyright (C) 2000 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef ExtractFile_H #define ExtractFile_H // local #include "conf_var.h" #include "pattern_map.h" //***************************************************************************** // // SYNOPSIS // class ExtractFile : public conf_var, public pattern_map< bool > // // DESCRIPTION // // An ExtractFile is-a conf_var containing the set of filename patterns to // include during extraction. The bool template parameter isn't used. // // This is the same as extract's -e command-line option. // //***************************************************************************** { public: ExtractFile() : conf_var( "ExtractFile" ) { } CONF_VAR_ASSIGN_OPS( ExtractFile ) private: virtual void parse_value( char *line ); virtual void reset() { clear(); } }; extern ExtractFile include_patterns; #endif /* ExtractFile_H */ /* vim:set et sw=4 ts=4: */ ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/ExtractFilter.h�����������������������������������������������������������������������0000644�0000765�0000000�00000003147�10166044112�014402� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** ExtractFilter.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef ExtractFilter_H #define ExtractFilter_H // local #include "conf_bool.h" //***************************************************************************** // // SYNOPSIS // class ExtractFilter : public conf<bool> // // DESCRIPTION // // An ExtractFilter is-a conf<bool> containing the Boolean value // indicating whether to extract as a filter, i.e., extract a single file // to standard output. // // This is the same as extract's -f command-line option. 
// //***************************************************************************** { public: ExtractFilter() : conf<bool>( "ExtractFilter", false ) { } CONF_BOOL_ASSIGN_OPS( ExtractFilter ) }; extern ExtractFilter extract_as_filter; #endif /* ExtractFilter_H */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/fdbuf.c�������������������������������������������������������������������������������0000644�0000765�0000000�00000015151�10166052462�012710� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** PJL C++ Library ** fdbuf.h ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // standard #include <cerrno> #include <cstring> #include <unistd.h> // local #include "fdbuf.h" using namespace std; namespace PJL { //***************************************************************************** // // SYNOPSIS // void fdbuf::init( int fd ) // // DESCRIPTION // // Initialize an fdbuf. // //***************************************************************************** { fd_ = fd; setg( rbuf_, rbuf_ + buf_size, rbuf_ + buf_size ); setp( wbuf_, wbuf_ + buf_size ); } //***************************************************************************** // // SYNOPSIS // int fdbuf::sync() // // DESCRIPTION // // Synchronize the write buffer by writing it out. // // RETURN VALUE // // On success, returns 0; on failure, returns -1. // //***************************************************************************** { int const len = pptr() - pbase(); if ( write_buf( wbuf_, len ) != len ) return -1; pbump( -len ); return 0; } //***************************************************************************** // // SYNOPSIS // fdbuf::int_type fdbuf::overflow( int_type c ) // // DESCRIPTION // // This function is called when a single character is to be output. // // PARAMETERS // // c The single character to be stuffed into the buffer unless it is // EOF. // // RETURN VALUE // // On success, returns the passed-in character; on failure, returns EOF. // //***************************************************************************** { if ( sync() ) return EOF; if ( c != EOF ) { *pptr() = c; pbump( 1 ); } return c; } //***************************************************************************** // // SYNOPSIS // fdbuf::int_type fdbuf::underflow() // // DESCRIPTION // // This function is called when the buffer is underflowed, i.e., more // characters need to be read from the source. // // RETURN VALUE // // Returns the next character from the source or EOF on error. 
// //***************************************************************************** { if ( gptr() == egptr() ) { // // The get-pointer is at the end-pointer meaning that the read-buffer // is exhausted: read more. // ssize_t bytes; while ( true ) { if ( (bytes = ::read( fd_, rbuf_, buf_size )) > 0 ) { // // We read some bytes so stop. // break; } if ( bytes < 0 && (errno == EAGAIN || errno == EINTR) ) continue; return EOF; } setg( rbuf_, rbuf_, rbuf_ + bytes ); } return *gptr(); } //***************************************************************************** // // SYNOPSIS // streamsize fdbuf::write_buf( char const *buf, streamsize len ) // // DESCRIPTION // // Write a buffer to a Unix file descriptor. // // PARAMETERS // // buf The buffer to be written. // // len The length of the buffer. // // RETURN VALUE // // On success, returns the number of bytes written; on failure, returns // -1. // // SEE ALSO // // W. Richard Stevens. "Unix Network Programming, Vol 1, 2nd ed." // Prentice-Hall, Upper Saddle River, NJ, 1998. // //***************************************************************************** { streamsize total_bytes_written = 0; while ( true ) { ssize_t const bytes_written = ::write( fd_, buf, len ); if ( bytes_written >= 0 ) { // // Account for partial-writes in case the Unix file descriptor // happens to be attached to a socket. From [Stevens 1998], p. 77: // // Stream sockets (e.g., TCP sockets) exhibit a behavior with // the read and write functions that differs from normal file // I/O. A read or write on a stream socket might input or // output fewer bytes than requested, but this is not an error // condition. The reason is that buffer limits might be // reached for the kernel. All that is required is for the // caller to invoke the read or write function again, to input // or output the remaining bytes. // // Note that if the file descriptor isn't attached to a socket, the // entire buffer will be written, so the loop will exit. 
// total_bytes_written += bytes_written; if ( bytes_written == len ) return total_bytes_written; len -= bytes_written; buf += bytes_written; continue; } if ( errno == EINTR ) // interrupt-proof continue; return -1; } } //***************************************************************************** // // SYNOPSIS // streamsize fdbuf::xsputn( char const *buf, streamsize len ) // // DESCRIPTION // // This function is called to output an entire buffer (as opposed to a // single character). // // RETURN VALUE // // On success, returns the number of bytes written; on failure, returns // -1. // //***************************************************************************** { if ( epptr() - pptr() >= len ) { // // The contents of buf will fit into the existing put-buffer so just // put it there. // ::memcpy( pptr(), buf, len ); pbump( len ); return len; } // // The contents of buf will not fit into the existing put-buffer so // syncronize what's currently there then write out the contents of buf // directly. // return sync() ? 
-1 : write_buf( buf, len ); } } // namespace PJL /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/fdbuf.h�������������������������������������������������������������������������������0000644�0000765�0000000�00000005430�10166052462�012714� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** PJL C++ Library ** fdbuf.h ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef fdbuf_H #define fdbuf_H // local #include "platform.h" // standard #include <iostream> #include <streambuf> namespace PJL { //***************************************************************************** // // SYNOPSIS // class fdbuf : public std::streambuf // // DESCRIPTION // // An fdbuf is-a streambuf that is used to be attached to a Unix file // descriptor. // // SEE ALSO // // Nicolai M. Josuttis. "The C++ Starndard Library: A Tutorial and // Reference," Addison-Wesley, 1999, pp. 672-676. // // Angelika Langer and Klaus Kreft. "Standard C++ IOStreams and Locales: // Advanced Programmer's Guide and Reference," Addison-Wesley, 2000, pp. // 225-244. // //***************************************************************************** { enum { buf_size = 1024 }; public: explicit fdbuf( int fd = -1 ) { init( fd ); } ~fdbuf() { if ( fd_ > -1 ) sync(); } // default copy constructor is fine // default assignment is fine void attach( int fd ); // In the case where an fdbuf object was constructed using the default // constructor, and therefore not attached to any file descriptor, // this function is used to attach an fdbuf to one at some later time. 
protected: typedef int int_type; virtual int_type overflow( int_type c ); virtual int sync(); virtual int_type underflow(); std::streamsize write_buf( char const*, std::streamsize ); virtual std::streamsize xsputn( char const *buf, std::streamsize len ); private: int fd_; char rbuf_[ buf_size ]; char wbuf_[ buf_size ]; void init( int fd ); }; /////////// Inlines /////////////////////////////////////////////////////////// inline void fdbuf::attach( int fd ) { if ( fd_ > -1 ) sync(); init( fd ); } } // namespace PJL #endif /* fdbuf_H */ /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/file_info.c���������������������������������������������������������������������������0000644�0000765�0000000�00000007350�10263526010�013547� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** file_info.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
// standard
#include <cstring>

// local
#include "config.h"
#include "directory.h"
#include "enc_int.h"
#include "FilesReserve.h"
#include "file_info.h"
#include "platform.h"
#include "util.h"                       /* for new_strdup() */

using namespace std;

// The registry of every file_info constructed from a path name, in the order
// encountered, and the set of all path names seen so far.
file_info::list_type        file_info::list_;
file_info::name_set_type    file_info::name_set_;

// The number of list_ slots reserved up front (see the constructor below).
FilesReserve                files_reserve;

//*****************************************************************************
//
// SYNOPSIS
//
        file_info::file_info(
            char const *path_name, int dir_index, size_t file_size,
            char const *title, int num_words
        )
//
// DESCRIPTION
//
//      Construct a file_info.  If a title is given, use it; otherwise set the
//      title to be (just) the file name (not the path name).
//
//      Additionally record its address in a list so the entire list can be
//      iterated over later in the order encountered.  The first time through,
//      reserve files_reserve slots for files.  If exceeded, the vector will
//      automatically grow, but with a slight performance penalty.
//
//      Neither the duplicated path name nor a duplicated title is freed
//      anywhere in this file.
//
// PARAMETERS
//
//      path_name   The full path name of the file.
//
//      dir_index   The numerical index of the directory.
//
//      file_size   The size of the file in bytes.
//
//      title       The title of the file only if not null.
//
//      num_words   The number of words in the file.
//
//*****************************************************************************
    : dir_index_( dir_index ),
      file_name_(
        //
        // First duplicate the entire path name and put it into the set of
        // files encountered; then make file_name_ point to the base name
        // inside the same string, i.e., it shares storage.
        //
        pjl_basename( *name_set_.insert( new_strdup( path_name ) ).first )
      ),
      size_( file_size ), num_words_( num_words ),
      title_(
        //
        // If there was a title given, use that; otherwise the title is the
        // file name.  Note that it too shares storage.  (This relies on
        // file_name_ having been initialized already, i.e., on it being
        // declared before title_ in the class.)
        //
        title ? new_strdup( title ) : file_name_
      )
{
    // Only the very first file_info finds the list empty and reserves space.
    if ( list_.empty() )
        list_.reserve( files_reserve );
    list_.push_back( this );
}

//*****************************************************************************
//
// SYNOPSIS
//
        file_info::file_info( unsigned char const *p )
//
// DESCRIPTION
//
//      Construct a file_info from the raw data inside an index file.  The
//      fields are read from consecutive positions: directory index, file name
//      (null-terminated), size, number of words, title.
//
//      The initializers below all share and modify p, so they are correct
//      only because members are initialized in declaration order (dir_index_,
//      file_name_, size_, num_words_, title_).  dec_int() is assumed to
//      advance p past the integer it decodes -- see enc_int.h to confirm.
//
//      Unlike the other constructor, this one does not add the object to
//      list_: file_name_ and title_ simply point into the raw data.
//
// PARAMETERS
//
//      p   The pointer to the raw file_info data.
//
//*****************************************************************************
    : dir_index_( dec_int( p ) ),
      file_name_( reinterpret_cast<char const*>( p ) ),
      // skip over the file name and its terminating null before decoding
      size_( dec_int( p += ::strlen( reinterpret_cast<char const*>( p ) ) + 1 ) ),
      num_words_( dec_int( p ) ),
      title_( reinterpret_cast<char const*>( p ) )
{
    // do nothing else
}
/* vim:set et sw=4 ts=4: */
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef file_info_H #define file_info_H // standard #include <cstddef> /* for size_t */ #include <vector> // local #include "my_set.h" //***************************************************************************** // // SYNOPSIS // class file_info // // DESCRIPTION // // This is used to contain information for every file encountered during // indexing. A static data member keeps track of all dynamically // allocated instances so thay can be iterated over later. // //***************************************************************************** { public: typedef std::vector< file_info* > list_type; typedef list_type::const_iterator const_iterator; typedef PJL::char_ptr_set name_set_type; file_info( char const *path_name, int dir_index, size_t file_size, char const *title, int num_words = 0 ); file_info( unsigned char const *ptr_into_index_file ); int dir_index() const { return dir_index_; } char const* file_name() const { return file_name_; } int num_words() const { return num_words_; } size_t size() const { return size_; } char const* title() const { return title_; } static const_iterator begin() { return list_.begin();} static const_iterator end() { return list_.end(); } static int current_index() { return list_.size() - 1; } static void inc_words() { ++list_.back()->num_words_; } static file_info* ith_info( int i ) { return list_[ i ]; } static int num_files() { return list_.size(); } static bool seen_file( char const *file_name ) { return name_set_.contains( file_name ); } private: int const dir_index_; char const *const 
file_name_; size_t const size_; int num_words_; char const *const title_; static list_type list_; static name_set_type name_set_; }; #endif /* file_info_H */ /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/file_list.c���������������������������������������������������������������������������0000644�0000765�0000000�00000012114�10263526043�013567� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** file_list.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // local #include "enc_int.h" #include "file_list.h" #include "word_markers.h" file_list::byte const file_list::const_iterator::end_value = 0; //***************************************************************************** // // SYNOPSIS // file_list::size_type file_list::calc_size() const // // DESCRIPTION // // Calculate the size of the file list (the number of files the word is // in) and cache the result. // // RETURN VALUE // // Returns the size. // //***************************************************************************** { size_ = 0; // // It would be nice if there were a way to calculate the size of the file // list other than by just marching though it. Since this should be as // fast as possible, a much simplified version of the dec_int() code has // been inlined here by hand -- a few times. (We also don't care what the // actual numbers are, so there's no point in computing them, so we save // having to do two shifts, and logical or for each file.) // register byte const *p = ptr_; while ( true ) { ++size_; while ( *p++ & 0x80 ) ; // skip file index while ( *p++ & 0x80 ) ; // skip occurrences while ( *p++ & 0x80 ) ; // skip rank bool more_lists = true; while ( more_lists ) { // // At this point, p must be pointing to a marker. // switch ( *p++ ) { // skip marker case Stop_Marker: return size_; case Word_Entry_Continues_Marker: more_lists = false; break; default: // must be a list marker while ( *p != Stop_Marker ) while ( *p++ & 0x80 ) ; ++p; } } } } //***************************************************************************** // // SYNOPSIS // file_list::const_iterator& file_list::const_iterator::operator++() // // DESCRIPTION // // Advance a file_list::const_iterator. // // RETURN VALUE // // Reference to itself as is standard practice for iterators. // // SEE ALSO // // index.c write_full_index() for a description of the index file // format. 
// //***************************************************************************** { if ( !c_ || c_ == &end_value ) { // // If c_'s value is the "already at end" value (null), or the "just hit // end" value, set to the "already at end" value. // c_ = 0; return *this; } v_.index_ = dec_int( c_ ); v_.occurrences_ = dec_int( c_ ); v_.rank_ = dec_int( c_ ); if ( !v_.meta_ids_.empty() ) v_.meta_ids_.clear(); #ifdef FEATURE_word_pos if ( v_.pos_deltas_.empty() ) v_.pos_deltas_.reserve( v_.occurrences_ ); else v_.pos_deltas_.clear(); #endif while ( true ) { // // At this point, c_ must be pointing to a marker. // switch ( *c_++ ) { case Stop_Marker: // // Reached the end of file list: set iterator to the "just hit // end" value. // c_ = &end_value; // no break; case Word_Entry_Continues_Marker: return *this; case Meta_Name_List_Marker: while ( *c_ != Stop_Marker ) v_.meta_ids_.insert( dec_int( c_ ) ); break; #ifdef FEATURE_word_pos case Word_Pos_List_Marker: while ( *c_ != Stop_Marker ) v_.pos_deltas_.push_back( dec_int(c_) ); break; #endif default: // // Encountered a list marker we don't know about: we are // decoding a possibly future index file format that has new // list types. Since we don't know what to do with it, just // skip all the numbers in it. 
// while ( *c_ != Stop_Marker ) dec_int( c_ ); } ++c_; // skip Stop_Marker } } /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/file_list.h���������������������������������������������������������������������������0000644�0000765�0000000�00000007013�10166052462�013577� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** file_list.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef file_list_H #define file_list_H // standard #include <cstddef> /* for ptrdiff_t */ #include <iterator> // local #include "index_segment.h" #include "word_info.h" //***************************************************************************** // // SYNOPSIS // class file_list // // DESCRIPTION // // This class, given a index_segment::const_iterator, accesses the list of // files the word is in. Once an instance is created, the list of files // can be iterated over. // // SEE ALSO // // index_segment.h // word_info.h // index.c write_full_index() for a description of the index file // format. // //***************************************************************************** { typedef unsigned char byte; // for convenience public: ////////// typedefs /////////////////////////////////////////////////// typedef int size_type; typedef ptrdiff_t difference_type; typedef word_info::file value_type; typedef value_type const* const_pointer; typedef value_type const& const_reference; ////////// constructors /////////////////////////////////////////////// file_list( index_segment::const_iterator const &iter ) : ptr_( reinterpret_cast<byte const*>( *iter ) ), size_( -1 ) // -1 = "haven't computed yet" { while ( *ptr_++ ) ; // skip past word } ////////// iterators ////////////////////////////////////////////////// class const_iterator; friend class const_iterator; class const_iterator : public std::iterator< std::forward_iterator_tag, value_type > { public: const_iterator() { } const_reference operator* () const { return v_; } const_pointer operator->() const { return &v_; } const_iterator& operator++(); const_iterator operator++(int) { const_iterator const temp = *this; return ++*this, temp; } friend bool operator==( const_iterator const &i, const_iterator const &j ) { return i.c_ == j.c_; } friend bool operator!=( const_iterator const &i, const_iterator const &j ) { return !( i == j ); } // default copy constructor is OK // default assignment operator is OK private: 
const_iterator( byte const *p ) : c_( p ) { if ( c_ ) operator++(); } byte const *c_; value_type v_; static byte const end_value; friend class file_list; }; ////////// member functions /////////////////////////////////////////// const_iterator begin() const { return const_iterator( ptr_ ); } const_iterator end() const { return const_iterator( 0 ); } size_type size() const; private: byte const *ptr_; mutable size_type size_; size_type calc_size() const; }; ////////// inlines //////////////////////////////////////////////////////////// inline file_list::size_type file_list::size() const { return size_ != -1 ? size_ : calc_size(); } #endif /* file_list_H */ /* vim:set noet sw=8 ts=8: */ ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/FilesGrow.h���������������������������������������������������������������������������0000644�0000765�0000000�00000003141�10166044112�013515� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** FilesGrow.h ** ** Copyright (C) 1998 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef FilesGrow_H #define FilesGrow_H // local #include "config.h" #include "conf_percent.h" //***************************************************************************** // // SYNOPSIS // class FilesGrow : public conf_percent // // DESCRIPTION // // A FilesGrow is-a conf_percent containing either the absolute number or // percentage of files to grow reserved space for when incrementally // indexing. // // This is the same as index's -g command-line option. 
// //***************************************************************************** { public: FilesGrow() : conf_percent( "FilesGrow", FilesGrow_Default, 1 ) { } CONF_PERCENT_ASSIGN_OPS( FilesGrow ) }; extern FilesGrow files_grow; #endif /* FilesGrow_H */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/FilesReserve.h������������������������������������������������������������������������0000644�0000765�0000000�00000003116�10166044112�014214� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** FilesReserve.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef FilesReserve_H #define FilesReserve_H // local #include "config.h" #include "conf_int.h" //***************************************************************************** // // SYNOPSIS // class FilesReserve : public conf<int> // // DESCRIPTION // // A FilesReserve is-a conf<int> containing the initial number of files to // reserve space for; see file_info.c for details. // // This is the same as index's -F command-line option. // //***************************************************************************** { public: FilesReserve() : conf<int>( "FilesReserve", FilesReserve_Default ) { } CONF_INT_ASSIGN_OPS( FilesReserve ) }; extern FilesReserve files_reserve; #endif /* FilesReserve_H */ /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/filter.c������������������������������������������������������������������������������0000644�0000765�0000000�00000021011�10263526011�013071� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** filter.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <cstdlib> /* for system(3) */ #include <cstring> #include <unistd.h> /* for sleep(3) */ // local #include "config.h" #include "filter.h" #include "platform.h" #include "util.h" using namespace std; //***************************************************************************** // // SYNOPSIS // static void escape_filename( string &s ) // // DESCRIPTION // // Escape all characters in a filename for passing to a shell. // // PARAMETERS // // s The string containing the filename to escape. It is modified // in-place. // //***************************************************************************** { register string::size_type pos = 0; while ( (pos = s.find_first_of( ShellFilenameEscapeChars, pos )) != string::npos ) { s.insert( pos, 1, '\\' ); pos += 2; } } //***************************************************************************** // // SYNOPSIS // static void unescape_filename( string &s ) // // DESCRIPTION // // Unescape all '\' characters in a filename for not passing to a shell. // // PARAMETERS // // s The string containing the filename to unescape. It is modified // in-place. // //***************************************************************************** { register string::size_type pos = 0; while ( (pos = s.find( '\\', pos )) != string::npos ) s.erase( pos++, 1 ); } //***************************************************************************** // // SYNOPSIS // char const* filter::exec() const // // DESCRIPTION // // Filter a file with a command by by calling system(3C). 
// // RETURN VALUE // // If successful, returns the post-filtered file name; null otherwise. // // SEE ALSO // // system(3C) // //***************************************************************************** { if ( command_.empty() ) { // // This should never happen: substitute() was never called on this // filter. If this happens, the programmer goofed. // internal_error << "filter::exec(): command is empty" << report_error; } int exit_code; int attempt_count = 0; while ( ( exit_code = ::system( command_.c_str() ) ) == -1 ) { // // Try a few times before giving up in case the system is temporarily // busy. // if ( ++attempt_count > Fork_Attempts ) return 0; ::sleep( Fork_Sleep ); } return exit_code ? 0 : target_file_name_.c_str(); } //***************************************************************************** // // SYNOPSIS // char const *filter::substitute( char const *file_name ) // // DESCRIPTION // // Substitute the filename (or parts thereof) into our command template // wherever % occurs. // // PARAMETERS // // file_name The name of the file to be substituted into the command. // // RETURN VALUE // // Returns the target file name. // //***************************************************************************** { string esc_file_name( file_name ); escape_filename( esc_file_name ); // // Determine the base name of the file in case we need it for 'b' or 'B' // substitutions. // char const *const base_name = pjl_basename( esc_file_name.c_str() ); // // For this kind of string manipulation, the C++ string class is much // easier to use than the C str*() functions. // string::size_type target_pos = string::npos; command_ = command_template_; register string::size_type pos = 0; while ( (pos = command_.find_first_of( "%@", pos )) != string::npos ) { if ( pos + 1 >= command_.length() ) { // // The % or @ is the last character in the command so it can't be // substituted. 
This is weird, but be lenient by assuming the user // knows what s/he's doing and simply stop rather than return an // error. // break; } if ( command_[ pos ] == command_[ pos + 1 ] ) { // // We encountered either a %% or @@ to represent a literal % or @, // respectively. Simply erase one of them and skip any // substitution. // command_.erase( pos++, 1 ); continue; } if ( command_[ pos ] == '@' ) { // // We found the substitution that represents the target filename: // make a note. Note that we don't have to check to see if we've // already set target_pos (meaning there was more than one @ // substitution) because that illegal situation would have been // caught by FilterFile::parse_value(). // target_pos = pos; command_.erase( pos, 1 ); continue; } // // Perform a substitution. // switch ( command_[ pos + 1 ] ) { case 'b': // basename of filename command_.replace( pos, 2, base_name ); pos += ::strlen( base_name ); break; case 'B': { // basename minus last extension string no_ext = base_name; no_ext.erase( no_ext.rfind( '.' ) ); command_.replace( pos, 2, no_ext ); pos += no_ext.length(); break; } case 'e': { // filename extension string ext = esc_file_name; ext.erase( 0, ext.rfind( '.' ) ); command_.replace( pos, 2, ext ); pos += ext.length(); break; } case 'E': { // second-to-last filename extension string ext = esc_file_name; string::size_type const x = ext.rfind( '.' ); if ( x != string::npos ) { ext.erase( x ); ext.erase( 0, ext.rfind( '.' ) ); command_.replace( pos, 2, ext ); pos += ext.length(); } break; } case 'f': // entire filename command_.replace( pos, 2, esc_file_name ); pos += esc_file_name.length(); break; case 'F': { // filename minus last extension string no_ext = esc_file_name; no_ext.erase( no_ext.rfind( '.' 
) ); command_.replace( pos, 2, no_ext ); pos += no_ext.length(); break; } } } if ( target_pos == string::npos ) { // // This should never happen: the command template should have been // checked by FilterFile::parse_line() for the existence of an @. If // this happens, the programmer goofed. // internal_error << "filter::substitute(): target_pos == string::npos" << report_error; } // // Find the first character that delimits the target file name (that is not // escaped). // pos = target_pos; while ( (pos = command_.find_first_of( ShellFilenameDelimChars, pos )) != string::npos ) if ( command_[ pos - 1 ] == '\\' ) ++pos; else break; target_file_name_ = string( command_, target_pos, pos ); // // Having shell meta-characters and whitespace automatically escaped was // good for executing the filter(s), but it's not good for actually opening // the file since no shell is involved. Therefore, we must now unescape // the final file-name. // unescape_filename( target_file_name_ ); return target_file_name_.c_str(); } /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/filter.h������������������������������������������������������������������������������0000644�0000765�0000000�00000004513�10263526011�013106� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** filter.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef filter_H #define filter_H // standard #include <string> #include <unistd.h> /* for unlink(2) */ //***************************************************************************** // // SYNOPSIS // class filter // // DESCRIPTION // // A filter is a light-weight class that contains a Unix command-line and // knows how to execute itself on a file to create a filtered file. The // destructor deletes the filtered file. // //***************************************************************************** { public: explicit filter( char const *command ) : command_template_( command ) {} ~filter(); // default copy constructor is fine // default assignment operator is fine char const* substitute( char const *file_name ); char const* substitute( std::string const &file_name ); char const* exec() const; private: char const* command_template_; // // The above really should be const, but then we'd need to define a // non-default operator=(). 
It's simpler and harmless just to leave the // const out. // std::string command_; std::string target_file_name_; }; ////////// Inlines //////////////////////////////////////////////////////////// inline filter::~filter() { if ( !target_file_name_.empty() ) ::unlink( target_file_name_.c_str() ); } inline char const* filter::substitute( std::string const &file_name ) { return substitute( file_name.c_str() ); } #endif /* filter_H */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/FilterFile.h��������������������������������������������������������������������������0000644�0000765�0000000�00000003030�10166044112�013636� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** FilterFile.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef FilterFile_H #define FilterFile_H // local #include "conf_filter.h" //***************************************************************************** // // SYNOPSIS // class FilterFile : public conf_filter // // DESCRIPTION // // A FilterFile is-a conf_filter for mapping a filename pattern to a // filter (being a Unix process called via command-line). Certain // filename patterns need to be filtered first, e.g., converting PDF to // text. // //***************************************************************************** { public: FilterFile() : conf_filter( "FilterFile" ) { } }; extern FilterFile file_filters; #endif /* FilterFile_H */ /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/fnmatch.c�����������������������������������������������������������������������������0000644�0000765�0000000�00000006003�10166052462�013236� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* Copyright (C) 1995 DJ Delorie, see COPYING.DJ for details */ /* Added this C-style comment since the C++-style comment alone isn't good * enough to comment out a C preprocessor directive. 
-- pjl //#include <libc/stubs.h> */ #include <unistd.h> #include <string.h> #include <ctype.h> #include "fnmatch.h" /* changed to use "" rather than <> -- pjl */ using namespace std; #define EOS '\0' static const char *rangematch(const char *pattern, char test, int nocase); #define isslash(c) ((c) == '\\' || (c) == '/') static const char * find_slash(const char *s) { while (*s) { if (isslash(*s)) return s; s++; } return 0; } static const char * rangematch(const char *pattern, char test, int nocase) { char c, c2; int negate, ok; if ((negate = (*pattern == '!'))) ++pattern; for (ok = 0; (c = *pattern++) != ']';) { if (c == 0) return 0; /* illegal pattern */ if (*pattern == '-' && (c2 = pattern[1]) != 0 && c2 != ']') { if (c <= test && test <= c2) ok = 1; if (nocase && toupper(c) <= toupper(test) && toupper(test) <= toupper(c2)) ok = 1; pattern += 2; } else if (c == test) ok = 1; else if (nocase && (toupper(c) == toupper(test))) ok = 1; } return ok == negate ? NULL : pattern; } int fnmatch(const char *pattern, const char *string, int flags) { char c; char test; for (;;) switch ((c = *pattern++)) { case 0: return *string == 0 ? 0 : FNM_NOMATCH; case '?': if ((test = *string++) == 0 || (isslash(test) && (flags & FNM_PATHNAME))) return(FNM_NOMATCH); break; case '*': c = *pattern; /* collapse multiple stars */ while (c == '*') c = *++pattern; /* optimize for pattern with * at end or before / */ if (c == 0) { if (flags & FNM_PATHNAME) return find_slash(string) ? 
FNM_NOMATCH : 0; else return 0; } else if (isslash(c) && flags & FNM_PATHNAME) { if ((string = find_slash(string)) == NULL) return FNM_NOMATCH; break; } /* general case, use recursion */ while ((test = *string) != 0) { if (fnmatch(pattern, string, flags) == 0) return(0); if (isslash(test) && flags & FNM_PATHNAME) break; ++string; } return FNM_NOMATCH; case '[': if ((test = *string++) == 0 || (isslash(test) && flags & FNM_PATHNAME)) return FNM_NOMATCH; if ((pattern = rangematch(pattern, test, flags & FNM_NOCASE)) == NULL) return FNM_NOMATCH; break; case '\\': if (!(flags & FNM_NOESCAPE) && pattern[1] && strchr("*?[\\", pattern[1])) { if ((c = *pattern++) == 0) { c = '\\'; --pattern; } if (c != *string++) return FNM_NOMATCH; break; } /* FALLTHROUGH */ default: if (isslash(c) && isslash(*string)) { string++; break; } if (flags & FNM_NOCASE) { if (toupper(c) != toupper(*string++)) return FNM_NOMATCH; } else { if (c != *string++) return FNM_NOMATCH; } break; } } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/fnmatch.h�����������������������������������������������������������������������������0000644�0000765�0000000�00000001255�10030465557�013252� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* 
Copyright (C) 1995 DJ Delorie, see COPYING.DJ for details */ #ifndef __dj_include_fnmatch_h_ #define __dj_include_fnmatch_h_ #ifdef __cplusplus extern "C" { #endif #ifdef FNM_NOESCAPE #undef FNM_NOESCAPE #endif #ifdef FNM_PATHNAME #undef FNM_PATHNAME #endif #ifdef FNM_PERIOD #undef FNM_PERIOD #endif #ifdef FNM_NOCASE #undef FNM_NOCASE #endif #define FNM_NOESCAPE 0x01 #define FNM_PATHNAME 0x02 #define FNM_PERIOD 0x04 #define FNM_NOCASE 0x08 #ifdef FNM_NOMATCH #undef FNM_NOMATCH #endif #define FNM_NOMATCH 1 int fnmatch( const char *pattern, const char *string, int flags ); #ifdef __cplusplus } #endif #endif /* !__dj_include_fnmatch_h_ */ /* vim:set noet sw=8 ts=8: */ ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/FollowLinks.h�������������������������������������������������������������������������0000644�0000765�0000000�00000003233�10254211003�014052� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** FollowLinks.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef PJL_NO_SYMBOLIC_LINKS #ifndef FollowLinks_H #define FollowLinks_H // local #include "conf_bool.h" //***************************************************************************** // // SYNOPSIS // class FollowLinks : public conf<bool> // // DESCRIPTION // // A FollowLinks is-a conf<bool> containing the Boolean value indicating // whether to follow symbolic links either during indexing or extraction. // // This is the same as either index's or extract's -l command-line option. // //***************************************************************************** { public: FollowLinks() : conf<bool>( "FollowLinks", false ) { } CONF_BOOL_ASSIGN_OPS( FollowLinks ) }; extern FollowLinks follow_symbolic_links; #endif /* FollowLinks_H */ #endif /* PJL_NO_SYMBOLIC_LINKS */ /* vim:set et sw=4 ts=4: */ ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/GNUmakefile���������������������������������������������������������������������������0000644�0000765�0000000�00000016272�10256554435�013544� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # GNUmakefile # # Copyright (C) 2000 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## CPP_TARGET:= index search extract BIN_TARGET:= $(CPP_TARGET) SCRIPTS:= $(patsubst %.in,%,$(wildcard scripts/*.in)) INITD_TARGET:= searchd TARGET:= $(BIN_TARGET) $(SCRIPTS) .PHONY: all all: $(TARGET) ROOT:= . include $(ROOT)/config/config.mk # Don't do the "include" if the goal contains the word "clean" or "dist". ifneq ($(findstring clean,$(MAKECMDGOALS)),clean) ifneq ($(findstring dist,$(MAKECMDGOALS)),dist) -include $(ROOT)/config/platform.mk endif endif DEBUGFLAGS:= # Various debug flags; add to DEBUGFLAGS for debugging. # -DDEBUG_eval_query # -DDEBUG_id3v2 # -DDEBUG_is_ok_word # -DDEBUG_parse_query # -DDEBUG_stem_word # -DDEBUG_threads CCFLAGS+= $(DEBUGFLAGS) CFLAGS:= $(CCFLAGS) ########## You shouldn't have to change anything below this line. 
############# SUBDIRS:= config man mod/* scripts ifdef CHARSET_LIST SUBDIRS+= charsets CHARSET_LIB:= charsets/libcharsets.a CHARSET_LIB_PATH:= -Lcharsets CHARSET_LINK:= -lcharsets endif ifdef ENCODING_LIST SUBDIRS+= encodings ENCODING_LIB:= encodings/libencodings.a ENCODING_LIB_PATH:= -Lencodings ENCODING_LINK:= -lencodings endif MOD_LIBS:= $(foreach mod,$(MOD_LIST),mod/$(mod)/libmod_$(mod).a) MOD_LIB_PATHS:= $(foreach mod,$(MOD_LIST),-Lmod/$(mod)) MOD_LINK:= $(foreach mod,$(MOD_LIST),-lmod_$(mod)) I_SOURCES:= enc_int.c \ mmap_file.c \ conf_var.c \ conf_bool.c \ conf_filter.c \ conf_int.c \ conf_percent.c \ conf_set.c \ conf_string.c \ encoded_char.c \ ExcludeFile.c \ file_info.c \ file_list.c \ filter.c \ IncludeFile.c \ IncludeMeta.c \ indexer.c \ index_segment.c \ init_modules.c \ init_mod_vars.c \ iso8859-1.c \ itoa.c \ option_stream.c \ stop_words.c \ TempDirectory.c \ util.c \ word_info.c \ WordThreshold.c \ word_util.c \ index.c ifdef WIN32 I_SOURCES+= fnmatch.c # see the comment in pattern_map.h endif I_OBJECTS:= $(I_SOURCES:.c=.o) ifndef HAVE_ZLIB ZLIB_LINK:= endif I_LINK:= $(MOD_LINK) $(ENCODING_LINK) $(CHARSET_LINK) $(PTHREAD_LINK) \ -lm $(ZLIB_LINK) S_SOURCES:= enc_int.c \ mmap_file.c \ conf_var.c \ conf_bool.c \ conf_enum.c \ conf_int.c \ conf_string.c \ index_segment.c \ init_mod_vars.c \ file_info.c \ file_list.c \ iso8859-1.c \ option_stream.c \ query_node.c \ query.c \ ResultsFormat.c \ results_formatter.c \ classic_formatter.c \ xml_formatter.c \ token.c \ stem_word.c \ util.c \ word_info.c \ word_util.c \ search.c ifdef SEARCH_DAEMON ## # See the comment in config/config.mk regarding the search daemon ability. 
## S_SOURCES+= fdbuf.c \ Group.c \ thread_pool.c \ search_thread.c \ search_daemon.c \ SearchDaemon.c \ SocketAddress.c \ User.c endif S_OBJECTS:= $(S_SOURCES:.c=.o) S_LINK:= $(SOCKET_LINK) $(PTHREAD_LINK) E_SOURCES:= mmap_file.c \ conf_var.c \ conf_bool.c \ conf_filter.c \ conf_int.c \ conf_set.c \ conf_string.c \ ExcludeFile.c \ ExtractFile.c \ filter.c \ index_segment.c \ init_mod_vars.c \ iso8859-1.c \ option_stream.c \ postscript.c \ stop_words.c \ util.c \ word_util.c \ extract.c ifdef WIN32 E_SOURCES+= fnmatch.c # see the comment in pattern_map.h endif E_OBJECTS:= $(E_SOURCES:.c=.o) E_LINK:= $(PTHREAD_LINK) LIB_TARGET:= WWW.pm ## # Build rules ## extract: $(E_OBJECTS) $(CC) $(CFLAGS) -o $@ $^ $(E_LINK) index: $(I_OBJECTS) $(CHARSET_LIB) $(ENCODING_LIB) $(MOD_LIBS) $(CC) $(CFLAGS) $(CHARSET_LIB_PATH) $(ENCODING_LIB_PATH) \ $(MOD_LIB_PATHS) -o $@ $(I_OBJECTS) $(I_LINK) init_modules.c: mod/*/mod_*.h init_modules-sh ./init_modules-sh > $@ || $(RM) $@ init_mod_vars.c: mod/*/vars init_mod_vars-sh ./init_mod_vars-sh > $@ || $(RM) $@ search: $(S_OBJECTS) $(CC) $(CFLAGS) -o $@ $^ $(S_LINK) $(CHARSET_LIB): FORCE @$(MAKE) -C $(dir $@) DEBUGFLAGS="$(DEBUGFLAGS)" $(ENCODING_LIB): FORCE @$(MAKE) -C $(dir $@) DEBUGFLAGS="$(DEBUGFLAGS)" $(MOD_LIBS): FORCE @$(MAKE) -C $(dir $@) DEBUGFLAGS="$(DEBUGFLAGS)" $(SCRIPTS): FORCE @$(MAKE) -C $(dir $@) .PHONY: FORCE # Don't do the "include" if the goal contains the word "clean", "dist", or # "uninstall". 
ifneq ($(findstring clean,$(MAKECMDGOALS)),clean) ifneq ($(findstring dist,$(MAKECMDGOALS)),dist) ifneq ($(findstring uninstall,$(MAKECMDGOALS)),uninstall) -include $(I_SOURCES:%.c=.%.d) $(E_SOURCES:%.c=.%.d) $(S_SOURCES:%.c=.%.d) endif endif endif ps pdf txt: @$(MAKE) -C man $@ ## # Install rules ## INITD_DIR:= $(firstword $(shell ls -d /etc/init.d /etc/rc.d/init.d 2>/dev/null)) LEVEL_DIR= $(firstword $(shell ls -d /etc/rc$1.d /etc/rc.d/rc$1.d 2>/dev/null)) install: install_bin install_lib install_man install_conf install_bin: $(BIN_TARGET) $(I_BIN) $(INSTALL) $(I_OWNER) $(I_GROUP) $(I_XMODE) $(BIN_TARGET) $(I_BIN) cd $(I_BIN) && $(STRIP) $(CPP_TARGET) @$(MAKE) -C scripts $@ install_lib: $(I_LIB) $(INSTALL) $(I_OWNER) $(I_GROUP) $(I_MODE) $(LIB_TARGET) $(I_LIB) install_man: @$(MAKE) -C man install install_conf: $(I_ETC) $(INSTALL) $(I_OWNER) $(I_GROUP) $(I_MODE) swish++.conf $(I_ETC) $(I_BIN) $(I_ETC) $(I_LIB): $(MKDIR) $@ install_sysv: scripts/$(INITD_TARGET) $(INSTALL) $(I_OWNER) $(I_GROUP) $(I_XMODE) $< $(INITD_DIR) $(RM) $(call LEVEL_DIR,1)/K99$(INITD_TARGET) $(RM) $(call LEVEL_DIR,2)/K99$(INITD_TARGET) $(RM) $(call LEVEL_DIR,3)/S99$(INITD_TARGET) $(RM) $(call LEVEL_DIR,5)/S99$(INITD_TARGET) $(RM) $(call LEVEL_DIR,6)/K99$(INITD_TARGET) ln -s ../init.d/$(INITD_TARGET) $(call LEVEL_DIR,1)/K99$(INITD_TARGET) ln -s ../init.d/$(INITD_TARGET) $(call LEVEL_DIR,2)/K99$(INITD_TARGET) ln -s ../init.d/$(INITD_TARGET) $(call LEVEL_DIR,3)/S99$(INITD_TARGET) ln -s ../init.d/$(INITD_TARGET) $(call LEVEL_DIR,5)/S99$(INITD_TARGET) ln -s ../init.d/$(INITD_TARGET) $(call LEVEL_DIR,6)/K99$(INITD_TARGET) uninstall: cd $(I_BIN) && $(RM) $(BIN_TARGET) cd $(I_LIB) && $(RM) $(LIB_TARGET) $(RM) $(INITD_DIR)/$(INITD_TARGET) \ $(call LEVEL_DIR,1)/K99$(INITD_TARGET) \ $(call LEVEL_DIR,2)/K99$(INITD_TARGET) \ $(call LEVEL_DIR,3)/S99$(INITD_TARGET) \ $(call LEVEL_DIR,5)/S99$(INITD_TARGET) \ $(call LEVEL_DIR,6)/K99$(INITD_TARGET) @$(MAKE) -C man $@ @$(MAKE) -C scripts $@ ## # Utility rules 
## MAKE_SUBDIRS= for dir in $(SUBDIRS); \ do [ -f $$dir/*akefile ] && $(MAKE) -C $$dir $1; \ done clean: $(RM) *.o core $(TEMPLATE_REPOSITORY) swish++.index @$(call MAKE_SUBDIRS,$@) distclean: clean $(RM) $(TARGET) .*.d init_modules.c @$(call MAKE_SUBDIRS,$@) dist: distclean @if [ -d CVS ]; then \ echo; echo "Won't 'make $@' in source tree!"; echo; exit 1; \ fi find . -name .cvsignore -exec $(RM) {} \; $(RM) *.xcode* # vim:set noet sw=8 ts=8: ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/Group.c�������������������������������������������������������������������������������0000644�0000765�0000000�00000003450�10166052461�012714� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** Group.c ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <sys/types.h> /* needed by FreeBSD systems */ #include <grp.h> /* for getgrnam(3) */ // local #include "platform.h" #include "Group.h" #include "exit_codes.h" #include "util.h" using namespace std; //***************************************************************************** // // SYNOPSIS // void Group::parse_value( char *line ) // // DESCRIPTION // // Parse a group name and look it up to ensure it's valid. // // PARAMETERS // // line The line of text to be parsed. // //***************************************************************************** { conf<string>::parse_value( line ); char const *const group_name = operator char const*(); struct group const *const g = ::getgrnam( group_name ); if ( !g ) { error() << '"' << group_name << "\" does not exist" << endl; ::exit( Exit_No_Group ); } gid_ = g->gr_gid; } /* vim:set et sw=4 ts=4: */ ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/Group.h�������������������������������������������������������������������������������0000644�0000765�0000000�00000004221�10254211410�012703� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** Group.h ** ** Copyright (C) 2001 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifdef SEARCH_DAEMON #ifndef Group_H #define Group_H // standard #include <sys/types.h> #include <unistd.h> /* for gid_t */ // local #include "config.h" #include "conf_string.h" //***************************************************************************** // // SYNOPSIS // class Group : public conf<std::string> // // DESCRIPTION // // A Group is-a conf<string> containing the group name of the user we // should run as after initialization (if we're root to begin with). // // This is the same as search's -G command-line option. 
// //***************************************************************************** { public: Group(); CONF_STRING_ASSIGN_OPS( Group ); bool change_to_gid() const; gid_t gid() const { return gid_; } private: virtual void parse_value( char *line ); gid_t gid_; }; extern Group group; ////////// Inlines //////////////////////////////////////////////////////////// inline Group::Group() : conf<std::string>( "Group", Group_Default ), gid_( ::getegid() ) { } inline bool Group::change_to_gid() const { if ( ::geteuid() == 0 /* root */ && gid_ != ::getgid() ) return ::setgid( gid_ ) == 0; return true; } #endif /* Group_H */ #endif /* SEARCH_DAEMON */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/IncludeFile.c�������������������������������������������������������������������������0000644�0000765�0000000�00000003655�10166052461�014012� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** IncludeFile.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <cstring> // local #include "exit_codes.h" #include "IncludeFile.h" #include "platform.h" #include "util.h" /* for error(), new_strdup() */ using namespace std; //***************************************************************************** // // SYNOPSIS // void IncludeFile::parse_value( char *line ) // // DESCRIPTION // // Parse the line of text of the form: // // mod_name pattern1 pattern2 ... // // PARAMETERS // // line The line of text to be parsed. // //***************************************************************************** { char const *const mod_name = ::strtok( line, " \r\t" ); if ( !mod_name ) { error() << "no indexer module name\n"; ::exit( Exit_Config_File ); } indexer *const i = indexer::find_indexer( mod_name ); if ( !i ) { error() << '"' << mod_name << "\": no such indexing module\n"; ::exit( Exit_Config_File ); } for ( register char const *s; s = ::strtok( 0, " \r\t" ); ) insert( new_strdup( s ), i ); } /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������swish++-6.1.5/IncludeFile.h�������������������������������������������������������������������������0000644�0000765�0000000�00000003430�10166044113�014001� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** IncludeFile.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef IncludeFile_H #define IncludeFile_H // local #include "conf_var.h" #include "indexer.h" #include "pattern_map.h" //***************************************************************************** // // SYNOPSIS // class IncludeFile : public conf_var, public pattern_map< indexer* > // // DESCRIPTION // // An IncludeFile is-a conf_var containing the set of filename patterns to // include during indexing. Additionally, each pattern is mapped to the // indexer that indexes that kind of file. // // This is the same as either index's -e command-line option. 
// //***************************************************************************** { public: IncludeFile() : conf_var( "IncludeFile" ) { } CONF_VAR_ASSIGN_OPS( IncludeFile ) private: virtual void parse_value( char *line ); virtual void reset() { clear(); } }; extern IncludeFile include_patterns; #endif /* IncludeFile_H */ /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/IncludeMeta.c�������������������������������������������������������������������������0000644�0000765�0000000�00000005007�10166052461�014012� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** IncludeMeta.c ** ** Copyright (C) 2000 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // standard #include <cstring> // local #include "auto_vec.h" #include "IncludeMeta.h" #include "exit_codes.h" #include "platform.h" #include "util.h" /* for error(), new_strdup() */ using namespace PJL; using namespace std; //***************************************************************************** // // SYNOPSIS // void IncludeMeta::parse_value( char *line ) // // DESCRIPTION // // Parse the line of text by splitting it into words that are separated by // whitespace. Additionally, "words" can be further split by '=' to do // meta name reassignment, e.g.: // // adr=address // // PARAMETERS // // line The line of text to be parsed. // //***************************************************************************** { auto_vec<char> lower( to_lower_r( line ) ); char *p = lower; for ( register char const *meta_name; meta_name = ::strtok( p, " \r\t" ); p = 0 ) { // // See if the meta name contains a reassignment: if so, chop it at the // '='. // // The const_cast<>() is needed for Sun's (wrong!) implementation of // strchr() returning char const*. Its use is harmless for other // compilers. // register char *reassign = const_cast<char*>( ::strchr( meta_name, '=' ) ); if ( reassign ) { *reassign = '\0'; if ( !*++reassign ) { error() << "name expected after '='\n"; ::exit( Exit_Config_File ); } } char *const m = new_strdup( meta_name ); insert( value_type( m, reassign ? 
new_strdup( reassign ) : m )); } } /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/IncludeMeta.h�������������������������������������������������������������������������0000644�0000765�0000000�00000003370�10166052461�014020� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** IncludeMeta.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef IncludeMeta_H #define IncludeMeta_H // standard #include <map> // local #include "conf_var.h" #include "less.h" //***************************************************************************** // // SYNOPSIS // class IncludeMeta : public conf_var, public std::map< char const*, char const* > // // DESCRIPTION // // An IncludeMeta is-a conf_var and a map containing the set of meta names // (and their possible reassigned names) to include during indexing. // // This is the same as index's -m command-line option. // //***************************************************************************** { public: IncludeMeta() : conf_var( "IncludeMeta" ) { } CONF_VAR_ASSIGN_OPS( IncludeMeta ) virtual void parse_value( char *line ); private: virtual void reset() { clear(); } }; extern IncludeMeta include_meta_names; #endif /* IncludeMeta_H */ /* vim:set et sw=4 ts=4: */ ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/Incremental.h�������������������������������������������������������������������������0000644�0000765�0000000�00000003023�10166044113�014055� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** Incremental.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef Incremental_H #define Incremental_H // local #include "conf_bool.h" //***************************************************************************** // // SYNOPSIS // class Incremental : public conf<bool> // // DESCRIPTION // // An Incremental is-a conf<bool> containing the Boolean value indicating // whether to index incrementally. // // This is the same as index's -I command-line option. // //***************************************************************************** { public: Incremental() : conf<bool>( "Incremental", false ) { } CONF_BOOL_ASSIGN_OPS( Incremental ) }; extern Incremental incremental; #endif /* Incremental_H */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/index.c�������������������������������������������������������������������������������0000644�0000765�0000000�00000127153�10254341251�012732� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** index.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ // standard #include <cmath> /* for log(3) */ #include <cstdlib> /* for exit(2) */ #include <cstring> #include <fstream> #include <iomanip> /* for setfill(), setw() */ #include <iostream> #include <iterator> #include <string> #include <time.h> #include <sys/time.h> /* needed by FreeBSD systems */ #include <sys/resource.h> /* for RLIMIT_* */ #include <sys/types.h> #include <unistd.h> /* for unlink(2) */ #include <vector> // local #include "platform.h" #include "AssociateMeta.h" #include "config.h" #include "enc_int.h" #include "ExcludeFile.h" #include "ExcludeMeta.h" #include "exit_codes.h" #include "file_info.h" #include "file_list.h" #include "FilesGrow.h" #include "FilesReserve.h" #include "FilterFile.h" #ifndef PJL_NO_SYMBOLIC_LINKS #include "FollowLinks.h" #endif #include "IncludeFile.h" #include "IncludeMeta.h" #include "Incremental.h" #include "indexer.h" #include "IndexFile.h" #include "index_segment.h" #include "itoa.h" #include "meta_map.h" #include "mmap_file.h" #include "option_stream.h" #include "RecurseSubdirs.h" #include "StopWordFile.h" #include "stop_words.h" #ifdef FEATURE_word_pos #include "StoreWordPositions.h" #endif #include "TempDirectory.h" #include "TitleLines.h" #include "util.h" #include "Verbosity.h" #include "version.h" #include "WordFilesMax.h" #include "word_info.h" #include "word_markers.h" #include "WordPercentMax.h" #include "WordThreshold.h" #include "word_util.h" using namespace PJL; using namespace std; static long const Rank_Factor = 10000000; // A scaling factor used in rank calculation. Empirically, // this was found to be a good number: higher, the results // don't get better; lower, the results get more striated. 
AssociateMeta associate_meta; ExcludeFile exclude_patterns; // do not index these IncludeFile include_patterns; // do index these ExcludeMeta exclude_meta_names; // meta names not to index IncludeMeta include_meta_names; // meta names to index FilesGrow files_grow; FilterFile file_filters; Incremental incremental; char const* me; // executable name meta_map meta_names; static int num_examined_files; static int num_temp_files; TitleLines num_title_lines; unsigned long num_total_words; // over all files indexed unsigned long num_indexed_words; // over all files indexed static unsigned long num_unique_words; // over all files indexed static vector<string> partial_index_file_names; RecurseSubdirs recurse_subdirectories; string temp_file_name_prefix; Verbosity verbosity; // how much to print word_map words; // the index being generated WordFilesMax word_files_max; WordPercentMax word_percent_max; WordThreshold word_threshold; #ifdef FEATURE_word_pos StoreWordPositions store_word_positions; int word_pos; // ith word in file #endif static void load_old_index( char const *index_file_name ); static void merge_indicies( ostream& ); static void rank_full_index(); extern "C" void remove_temp_files( void ); static ostream& usage( ostream& = cerr ); static void write_dir_index( ostream&, off_t* ); static void write_file_index( ostream&, off_t* ); static void write_full_index( ostream& ); static void write_meta_name_index( ostream&, off_t* ); static void write_partial_index(); static void write_stop_word_index( ostream&, off_t* ); static void write_word_index( ostream&, off_t* ); #define INDEX #include "do_file.c" #include "directory.c" //***************************************************************************** // // SYNOPSIS // inline void my_write( ostream &o, void const *buf, size_t len ) // // DESCRIPTION // // In the latest g++ implementation of the ANSI C++ standard library, // ostream::write() now apparantly requires a char* rather than a void*. 
// This function is to do the case in one place because I'm lazy. // // PARAMETERS // // o The ostream to write to. // // buf The buffer to be written. // // len The length of the buffer. // //***************************************************************************** { o.write( reinterpret_cast<char const*>( buf ), len ); } //***************************************************************************** // // SYNOPSIS // int main( int argc, char *argv[] ) // // DESCRIPTION // // Parse the command line, initialize, call other functions ... the usual // things that are done in main(). // // PARAMETERS // // argc The number of arguments. // // argv A vector of the arguments; argv[argc] is null. Aside from the // options below, the arguments are the names of the files and // directories to be indexed. // // SEE ALSO // // Bjarne Stroustrup. "The C++ Programming Language, 3rd ed." // Addison-Wesley, Reading, MA, 1997. pp. 116-118. // //***************************************************************************** { me = ::strrchr( argv[0], '/' ); // determine base name... me = me ? me + 1 : argv[0]; // ...of executable ////////// Max-out various system resources /////////////////////////////// #ifdef RLIMIT_AS /* SVR4 */ // // Max-out out out total memory potential. // max_out_limit( RLIMIT_AS ); #endif #ifdef RLIMIT_CPU /* SVR4, 4.3+BSD */ // // Max-out the amount of CPU time we can run since indexing can take a // while. // max_out_limit( RLIMIT_CPU ); #endif #ifdef RLIMIT_DATA /* SVR4, 4.3+BSD */ // // Max-out our heap allocation potential. // max_out_limit( RLIMIT_DATA ); #endif #ifdef RLIMIT_FSIZE // // Max-out the file-size creation potential. // max_out_limit( RLIMIT_FSIZE ); #endif #ifdef RLIMIT_NOFILE /* SVR4 */ // // Max-out the number of file descriptors we can have open to be able to // merge as many partial indicies as possible. 
// max_out_limit( RLIMIT_NOFILE ); #elif defined( RLIMIT_OFILE ) /* 4.3+BSD name for NOFILE */ max_out_limit( RLIMIT_OFILE ); #endif /////////// Process command-line options ////////////////////////////////// static option_stream::spec const opt_spec[] = { "help", 0, '?', "no-assoc-meta", 0, 'A', "config-file", 1, 'c', "pattern", 1, 'e', "no-pattern", 1, 'E', "file-max", 1, 'f', "files-reserve", 1, 'F', "files-grow", 1, 'g', "index-file", 1, 'i', "incremental", 0, 'I', #ifndef PJL_NO_SYMBOLIC_LINKS "follow-links", 0, 'l', #endif "meta", 1, 'm', "no-meta", 1, 'M', "percent-max", 1, 'p', #ifdef FEATURE_word_pos "no-pos-data", 0, 'P', #endif "no-recurse", 0, 'r', "stop-file", 1, 's', "dump-stop", 0, 'S', "title-lines", 1, 't', "temp-dir", 1, 'T', "verbosity", 1, 'v', "version", 0, 'V', "word-threshold", 1, 'W', 0 }; char const* config_file_name_arg = ConfigFile_Default; bool dump_stop_words_opt = false; char const* files_grow_arg = 0; char const* files_reserve_arg = 0; #ifndef PJL_NO_SYMBOLIC_LINKS bool follow_symbolic_links_opt = false; #endif bool incremental_opt = false; IndexFile index_file_name; char const* index_file_name_arg = 0; bool no_associate_meta_opt = false; bool no_word_pos_opt = false; char const* num_title_lines_arg = 0; bool recurse_subdirectories_opt = false; StopWordFile stop_word_file_name; char const* stop_word_file_name_arg = 0; TempDirectory temp_directory; char const* temp_directory_arg = 0; char const* verbosity_arg = 0; char const* word_files_max_arg = 0; char const* word_percent_max_arg = 0; char const* word_threshold_arg = 0; option_stream::spec *const all_options = indexer::all_mods_options( opt_spec ); option_stream opt_in( argc, argv, all_options ); for ( option_stream::option opt; opt_in >> opt; ) switch ( opt ) { case '?': // Print help. cerr << usage; case 'A': // Don't associate meta names. no_associate_meta_opt = true; break; case 'c': // Specify config. file. 
config_file_name_arg = opt.arg(); break; case 'e': { // Filename pattern(s) to index. if ( !::strtok( opt.arg(), ":" ) ) { error() << "no indexer module name\n"; ::exit( Exit_Usage ); } indexer *const i = indexer::find_indexer( opt.arg() ); if ( !i ) { error() << '"' << opt.arg() << "\": " "no such indexing module\n"; ::exit( Exit_Usage ); } for ( char *pat; pat = ::strtok( 0, "," ); ) include_patterns.insert( pat, i ); break; } case 'E': { // Filename pattern(s) not to index. char *a = opt.arg(); for ( char *pat; pat = ::strtok( a, "," ); ) { exclude_patterns.insert( pat ); a = 0; } break; } case 'f': // Specify the word/file file maximum. word_files_max_arg = opt.arg(); break; case 'F': // Specify files to reserve space for. files_reserve_arg = opt.arg(); break; case 'g': // Specify files to reserve space for growth. files_grow_arg = opt.arg(); break; case 'i': // Specify index file overriding the default. index_file_name_arg = opt.arg(); break; case 'I': // specify incremental indexing. incremental_opt = true; break; #ifndef PJL_NO_SYMBOLIC_LINKS case 'l': // Follow symbolic links during indexing. follow_symbolic_links_opt = true; break; #endif case 'm': // Specify meta name(s) to index. include_meta_names.parse_value( opt.arg() ); break; case 'M': // Specify meta name(s) not to index. exclude_meta_names.insert( to_lower( opt.arg() ) ); break; case 'p': // Specify the word/file percentage. word_percent_max_arg = opt.arg(); break; #ifdef FEATURE_word_pos case 'P': // Don't store word position data. no_word_pos_opt = true; break; #endif case 'r': // Specify whether to index recursively. recurse_subdirectories_opt = true; break; case 's': // Specify stop-word list. stop_word_file_name_arg = opt.arg(); break; case 'S': // Dump stop-word list. dump_stop_words_opt = true; break; case 't': // Specify number of title lines. num_title_lines_arg = opt.arg(); break; case 'T': // Specify temp. directory. 
temp_directory_arg = opt.arg(); break; case 'v': // Specify verbosity level. verbosity_arg = opt.arg(); break; case 'V': // Display version and exit. cout << "SWISH++ " << version << endl; ::exit( Exit_Success ); case 'W': // Word threshold. word_threshold_arg = opt.arg(); break; default: // Any indexing module claim the option? if ( !indexer::any_mod_claims_option( opt ) ) cerr << usage; } delete[] all_options; argc -= opt_in.shift(), argv += opt_in.shift(); // // First, parse the config. file (if any); then override variables with // options specified on the command line. // conf_var::parse_file( config_file_name_arg ); if ( files_grow_arg ) files_grow = files_grow_arg; if ( files_reserve_arg ) files_reserve = files_reserve_arg; #ifndef PJL_NO_SYMBOLIC_LINKS if ( follow_symbolic_links_opt ) follow_symbolic_links = true; #endif if ( incremental_opt ) incremental = true; if ( index_file_name_arg ) index_file_name = index_file_name_arg; if ( no_associate_meta_opt ) associate_meta = false; #ifdef FEATURE_word_pos if ( no_word_pos_opt ) store_word_positions = false; #endif if ( num_title_lines_arg ) num_title_lines = num_title_lines_arg; if ( recurse_subdirectories_opt ) recurse_subdirectories = false; if ( stop_word_file_name_arg ) stop_word_file_name = stop_word_file_name_arg; if ( temp_directory_arg ) temp_directory = temp_directory_arg; if ( verbosity_arg ) verbosity = verbosity_arg; if ( word_files_max_arg ) word_files_max = word_files_max_arg; if ( word_percent_max_arg ) word_percent_max = word_percent_max_arg; if ( word_threshold_arg ) word_threshold = word_threshold_arg; indexer::all_mods_post_options(); /////////// Dump stuff if requested /////////////////////////////////////// if ( dump_stop_words_opt ) { stop_words = new stop_word_set(); ::copy( stop_words->begin(), stop_words->end(), ostream_iterator< char const* >( cout, "\n" ) ); ::exit( Exit_Success ); } /////////// Index specified directories and files ///////////////////////// temp_file_name_prefix = 
temp_directory; if ( *temp_file_name_prefix.rbegin() != '/' ) temp_file_name_prefix += '/'; temp_file_name_prefix += string( itoa( ::getpid() ) ) + string( "." ); bool const using_stdin = *argv && (*argv)[0] == '-' && !(*argv)[1]; if ( !using_stdin && include_patterns.empty() && exclude_patterns.empty() ) error() << "filename patterns must be specified " "when not using standard input\n" << usage; if ( !argc ) cerr << usage; if ( incremental ) { load_old_index( index_file_name ); index_file_name += ".new"; } else { stop_words = new stop_word_set( stop_word_file_name ); // // In the case where several files (and no directories) are indexed, // there would be no directory; however, every file must be in a // directory, so add the directory "." here and now to the list of // directories. // check_add_directory( "." ); } ofstream out( index_file_name, ios::out | ios::binary ); if ( !out ) { error() << "can not write index to \"" << index_file_name << "\"\n"; ::exit( Exit_No_Write_Index ); } time_t time = ::time( 0 ); // Go! if ( using_stdin ) { // // Read file/directory names from standard input. // char file_name[ PATH_MAX + 1 ]; while ( cin.getline( file_name, PATH_MAX ) ) { if ( !file_exists( file_name ) ) { if ( verbosity > 3 ) cout << " " << file_name << " (skipped: does not exist)\n"; continue; } if ( is_directory() ) do_directory( new_strdup( file_name ) ); else do_check_add_file( file_name ); } } else { // // Read file/directory names from command line. // for ( ; *argv; ++argv ) { if ( !file_exists( *argv ) ) { if ( verbosity > 3 ) cout << " " << *argv << " (skipped: does not exist)\n"; continue; } if ( is_directory() ) do_directory( *argv ); else do_check_add_file( *argv ); } } if ( partial_index_file_names.empty() ) { rank_full_index(); write_full_index( out ); } else { if ( words.size() ) { // // Since we created partial indicies, write any remaining words to // their own partial index so the merge code doesn't have a special // case. 
// write_partial_index(); } merge_indicies( out ); } out.close(); if ( verbosity ) { time = ::time( 0 ) - time; // Stop! cout << '\n' << me << ": done:\n " << setfill('0') << setw(2) << (time / 60) << ':' << setw(2) << (time % 60) << " (min:sec) elapsed time\n " << num_examined_files << " files, " << file_info::num_files() << " indexed\n " << num_total_words << " words, " << num_indexed_words << " indexed, " << num_unique_words << " unique\n\n"; } ::exit( Exit_Success ); } //***************************************************************************** // // SYNOPSIS // bool is_too_frequent( char const *word, int file_count ) // // DESCRIPTION // // Checks to see if the word is too frequent by either exceeding the // maximum number or percentage of files it can be in. // // PARAMETERS // // word The word to be checked. // // file_count The number of files the word occurs in. // // RETURN VALUE // // Returns true only if the word is too frequent. // //***************************************************************************** { if ( file_count > word_files_max ) { if ( verbosity > 2 ) cout << "\n \"" << word << "\" discarded (" << file_count << " files)" << flush; return true; } int const wfp = file_count * 100 / file_info::num_files(); if ( wfp >= word_percent_max ) { if ( verbosity > 2 ) cout << "\n \"" << word << "\" discarded (" << wfp << "%)" << flush; return true; } return false; } //***************************************************************************** // // SYNOPSIS // void load_old_index( char const *index_file_name ) // // DESCRIPTION // // Load the stop-word, file, directory, and meta-name indicies from an // existing index file. // // PARAMETERS // // index_file_name The name of the index file to load. 
// //***************************************************************************** { mmap_file const index_file( index_file_name ); if ( !index_file ) { error() << "could not read index from \"" << index_file_name << '"' << error_string( index_file.error() ); ::exit( Exit_No_Read_Index ); } ////////// Load old stop words //////////////////////////////////////////// stop_words = new stop_word_set( index_file ); ////////// Load old directories /////////////////////////////////////////// index_segment old_dirs( index_file, index_segment::isi_dir ); FOR_EACH( index_segment, old_dirs, d ) check_add_directory( new_strdup( *d ) ); ////////// Load old files ///////////////////////////////////////////////// index_segment old_files( index_file, index_segment::isi_file ); if ( files_reserve <= old_files.size() ) { // // Add the FilesGrow configuration variable to the FilesReserve // configuration variable to allow room for growth. // files_reserve = files_grow( old_files.size() ); } FOR_EACH( index_segment, old_files, f ) { unsigned char const* u = reinterpret_cast<unsigned char const*>( *f ); int const dir_index = dec_int( u ); char const *const file_name = reinterpret_cast<char const*>(u); while ( *u++ ) ; // skip past filename size_t const size = dec_int( u ); int const num_words = dec_int( u ); char const *const title = reinterpret_cast<char const*>( u ); string const dir_str( old_dirs[ dir_index ] ); string const path( dir_str + '/' + file_name ); new file_info( path.c_str(), dir_index, size, title, num_words); } ////////// Load old meta names //////////////////////////////////////////// index_segment old_meta_names( index_file, index_segment::isi_meta_name ); FOR_EACH( index_segment, old_meta_names, m ) { unsigned char const* p = reinterpret_cast<unsigned char const*>( *m ); while ( *p++ ) ; // skip past meta name meta_names[ new_strdup( *m ) ] = dec_int( p ); } partial_index_file_names.push_back( index_file_name ); } 
//***************************************************************************** // // SYNOPSIS // inline int rank( int file_index, int occurences_in_file, double factor ) // // DESCRIPTION // // Compute the rank of a word in a file. This equation was taken from the // one used in SWISH-E whose author thinks (?) it is the one taken from // WAIS. I can't find this equation in the refernece cited below, // although that reference does list a different equation. But, if it // ain't broke, don't fix it. // // PARAMETERS // // file_index Which file we're dealing with. // // occurences_in_file The number of times the word occurs in a given // file. // // factor This should be precomputed to be the value of // Rank_Factor divided by the total number of // occurrences across all files. This number is // constant for a given word, hence the // precomputation. // // RETURN VALUE // // Returns a rank greater than zero. // // SEE ALSO // // Gerard Salton. "Automatic Text Processing: the transformation, // analysis, and retrieval of information by computer." Addison-Wesley, // Reading, MA. pp. 279-280. // //***************************************************************************** { int const r = int( ( ::log( occurences_in_file ) + 10 ) * factor / file_info::ith_info( file_index )->num_words() ); return r > 0 ? r : 1; } //***************************************************************************** // // SYNOPSIS // void merge_indicies( ostream &o ) // // DESCRIPTION // // Perform an n-way merge of the partial word index files. It first // determines the number of unique words in all the partial indicies, then // merges them all together and performs ranking at the same time. // // PARAMETERS // // o The ostream to write the index to. 
// //***************************************************************************** { vector< mmap_file > index( partial_index_file_names.size() ); vector< index_segment > words( partial_index_file_names.size() ); vector< index_segment::const_iterator > word( partial_index_file_names.size() ); register int i, j; ////////// Reopen all the partial indicies //////////////////////////////// ::atexit( &remove_temp_files ); i = 0; FOR_EACH( vector<string>, partial_index_file_names, file_name ) { index[ i ].open( file_name->c_str() ); if ( !index[ i ] ) { error() << "can not reopen temp. file \"" << *file_name << '"' << error_string( index[i].error() ); ::exit( Exit_No_Open_Temp ); } words[ i ].set_index_file( index[ i ], index_segment::isi_word ); ++i; } ////////// Must determine the number of unique words first //////////////// if ( verbosity > 1 ) cout << me << ": determining unique words..." << flush; for ( i = 0; i < partial_index_file_names.size(); ++i ) { // Start off assuming that all the words are unique. num_unique_words += words[ i ].size(); word[ i ] = words[ i ].begin(); } while ( true ) { // Find at least two non-exhausted indicies noting the first. register int n = 0; for ( j = 0; j < partial_index_file_names.size(); ++j ) if ( word[ j ] != words[ j ].end() ) if ( !n++ ) i = j; else if ( n >= 2 ) break; if ( n < 2 ) // couldn't find at least 2 break; // Find the lexographically least word. for ( j = i + 1; j < partial_index_file_names.size(); ++j ) if ( word[ j ] != words[ j ].end() ) if ( ::strcmp( *word[ j ], *word[ i ] ) < 0 ) i = j; file_list const list( word[ i ] ); int file_count = list.size(); // See if there are any duplicates and eliminate them. for ( j = i + 1; j < partial_index_file_names.size(); ++j ) if ( word[ j ] != words[ j ].end() ) if ( !::strcmp( *word[ j ], *word[ i ] ) ) { // // The two words are the same: add the second word's file // count to that of the first. 
// --num_unique_words; file_list const list( word[ j ] ); file_count += list.size(); ++word[ j ]; } if ( is_too_frequent( *word[ i ], file_count ) ) { // // The word occurs too frequently: consider it a stop word. // stop_words->insert( *word[ i ] ); --num_unique_words; } ++word[ i ]; } ////////// Write index file header //////////////////////////////////////// #define WRITE_HEADER #include "index_header.c" #undef WRITE_HEADER ////////// Merge the indicies ///////////////////////////////////////////// if ( verbosity > 1 ) cout << '\n' << me << ": merging partial indicies..." << flush; for ( i = 0; i < partial_index_file_names.size(); ++i ) word[ i ] = words[ i ].begin(); // reset all iterators int word_index = 0; while ( true ) { ////////// Find the next word ///////////////////////////////////////// // Find at least two non-exhausted indicies noting the first. register int n = 0; for ( j = 0; j < partial_index_file_names.size(); ++j ) { for ( ; word[ j ] != words[ j ].end(); ++word[ j ] ) if ( !stop_words->contains( *word[ j ] ) ) break; if ( word[ j ] != words[ j ].end() ) if ( !n++ ) i = j; else if ( n >= 2 ) break; } if ( n < 2 ) // couldn't find at least 2 break; // Find the lexographically least word. for ( j = i + 1; j < partial_index_file_names.size(); ++j ) if ( word[ j ] != words[ j ].end() ) if ( ::strcmp( *word[ j ], *word[ i ] ) < 0 ) i = j; word_offset[ word_index++ ] = o.tellp(); o << *word[ i ] << '\0'; ////////// Calc. 
total occurrences in all indicies //////////////////// int total_occurrences = 0; for ( j = i; j < partial_index_file_names.size(); ++j ) { if ( word[ j ] == words[ j ].end() ) continue; if ( ::strcmp( *word[ j ], *word[ i ] ) ) continue; file_list const list( word[ j ] ); FOR_EACH( file_list, list, file ) total_occurrences += file->occurrences_; } double const factor = (double)Rank_Factor / total_occurrences; ////////// Copy all index info and compute ranks ////////////////////// bool continues = false; for ( j = i; j < partial_index_file_names.size(); ++j ) { if ( word[ j ] == words[ j ].end() ) continue; if ( ::strcmp( *word[ j ], *word[ i ] ) ) continue; file_list const list( word[ j ] ); FOR_EACH( file_list, list, file ) { if ( continues ) o << Word_Entry_Continues_Marker; else continues = true; o << enc_int( file->index_ ) << enc_int( file->occurrences_ ) << enc_int( rank(file->index_, file->occurrences_, factor) ); if ( !file->meta_ids_.empty() ) file->write_meta_ids( o ); #ifdef FEATURE_word_pos if ( !file->pos_deltas_.empty() ) file->write_word_pos( o ); #endif } if ( j != i ) ++word[ j ]; } o << Stop_Marker; ++word[ i ]; } ////////// Copy remaining words from last non-exhausted index ///////////// for ( j = 0; j < partial_index_file_names.size(); ++j ) { if ( word[ j ] == words[ j ].end() ) continue; for ( ; word[ j ] != words[ j ].end(); ++word[ j ] ) { if ( stop_words->contains( *word[ j ] ) ) continue; word_offset[ word_index++ ] = o.tellp(); o << *word[ j ] << '\0'; ////////// Calc. 
total occurrences in all indicies //////////////// int total_occurrences = 0; file_list const list( word[ j ] ); FOR_EACH( file_list, list, file ) total_occurrences += file->occurrences_; double const factor = (double)Rank_Factor / total_occurrences; ////////// Copy all index info and compute ranks ////////////////// bool continues = false; FOR_EACH( file_list, list, file ) { if ( continues ) o << Word_Entry_Continues_Marker; else continues = true; o << enc_int( file->index_ ) << enc_int( file->occurrences_ ) << enc_int( rank(file->index_, file->occurrences_, factor) ); if ( !file->meta_ids_.empty() ) file->write_meta_ids( o ); #ifdef FEATURE_word_pos if ( !file->pos_deltas_.empty() ) file->write_word_pos( o ); #endif } o << Stop_Marker; } } write_stop_word_index( o, stop_word_offset ); write_dir_index ( o, dir_offset ); write_file_index ( o, file_offset ); write_meta_name_index( o, meta_name_offset ); ////////// Go back and write the computed offsets ///////////////////////// #define REWRITE_HEADER #include "index_header.c" #undef REWRITE_HEADER if ( verbosity > 1 ) cout << '\n'; } //***************************************************************************** // // SYNOPSIS // void rank_full_index() // // DESCRIPTION // // Compute the rank of all files for all words in the index. This // function is used only when partial indicies are not generated. Also // removes words that occur too frequently. // //***************************************************************************** { if ( words.empty() ) return; if ( verbosity > 1 ) cout << '\n' << me << ": ranking index..." << flush; for ( word_map::iterator w = words.begin(); w != words.end(); ) { word_info &info = w->second; if ( is_too_frequent( w->first.c_str(), info.files_.size() ) ) { // // The word occurs too frequently: consider it a stop word. // stop_words->insert( new_strdup( w->first.c_str() ) ); words.erase( w++ ); continue; } ++w; // // Compute the rank for this word in every file it's in. 
// double const factor = (double)Rank_Factor / info.occurrences_; TRANSFORM_EACH( word_info::file_list, info.files_, file ) file->rank_ = rank( file->index_, file->occurrences_, factor ); } if ( verbosity > 1 ) cout << '\n'; } //***************************************************************************** // // SYNOPSIS // void remove_temp_files( void ) // // DESCRIPTION // // Remove the temporary partial index files. This function is called via // atexit(3). // // NOTE // // This function is declared extern "C" since it is called via the C // library function atexit(3) and, because it's a C function, it expects C // linkage. // //***************************************************************************** { for ( int i = 0; i < num_temp_files; ++i ) { string const temp_file_name = temp_file_name_prefix + itoa( i ); ::unlink( temp_file_name.c_str() ); } } //***************************************************************************** // // SYNOPSIS // void write_dir_index( ostream &o, register off_t *offset ) // // DESCRIPTION // // Write the directory index to the given ostream recording the offsets as // it goes. // // PARAMETERS // // o The ostream to write the index to. // // offset A pointer to a built-in vector where to record the offsets. // //***************************************************************************** { // // First, order the directories by their index using a temporary vector. // typedef vector< char const* > dir_list_type; dir_list_type dir_list( dir_set.size() ); FOR_EACH( dir_set_type, dir_set, dir ) dir_list[ dir->second ] = dir->first; // // Now write them out in order. 
// register int dir_index = 0; FOR_EACH( dir_list_type, dir_list, dir ) { offset[ dir_index++ ] = o.tellp(); o << *dir << '\0'; } } //***************************************************************************** // // SYNOPSIS // void write_file_index( ostream &o, register off_t *offset ) // // DESCRIPTION // // Write the file index to the given ostream recording the offsets as it // goes. // // PARAMETERS // // o The ostream to write the index to. // // offset A pointer to a built-in vector where to record the offsets. // //***************************************************************************** { register int file_index = 0; for ( file_info::const_iterator i = file_info::begin(); i != file_info::end(); ++i ) { offset[ file_index++ ] = o.tellp(); o << enc_int( (*i)->dir_index() ) << (*i)->file_name() << '\0' << enc_int( (*i)->size() ) << enc_int( (*i)->num_words() ) << (*i)->title() << '\0'; } } //***************************************************************************** // // SYNOPSIS // void write_full_index( ostream &o ) // // DESCRIPTION // // Write the index to the given ostream. The index file is written in // such a way so that it can be mmap'd and used instantly with no parsing // or other processing. // // PARAMETERS // // o The ostream to write the index to. // // SEE ALSO // // swish++.index(4) // //***************************************************************************** { if ( !( num_unique_words = words.size() ) ) return; if ( verbosity > 1 ) cout << me << ": writing index..." 
<< flush; #define WRITE_HEADER #include "index_header.c" #undef WRITE_HEADER write_word_index ( o, word_offset ); write_stop_word_index( o, stop_word_offset ); write_dir_index ( o, dir_offset ); write_file_index ( o, file_offset ); write_meta_name_index( o, meta_name_offset ); #define REWRITE_HEADER #include "index_header.c" #undef REWRITE_HEADER if ( verbosity > 1 ) cout << '\n'; } //***************************************************************************** // // SYNOPSIS // void write_meta_name_index( ostream &o, register off_t *offset ) // // DESCRIPTION // // Write the meta name index to the given ostream recording the offsets as // it goes. // // PARAMETERS // // o The ostream to write the index to. // // offset A pointer to a built-in vector where to record the offsets. // //***************************************************************************** { register int meta_index = 0; FOR_EACH( meta_map, meta_names, m ) { offset[ meta_index++ ] = o.tellp(); o << m->first << '\0' << enc_int( m->second ); } } //***************************************************************************** // // SYNOPSIS // void write_partial_index() // // DESCRIPTION // // Write a partial index to a temporary file. The format of a partial // index file is: // // long num_words; // off_t word_offset[ num_words ]; // (word index) // // The partial word index is in the same format as the complete index. // //***************************************************************************** { string const temp_file_name = temp_file_name_prefix + itoa( num_temp_files++ ); ofstream o( temp_file_name.c_str(), ios::out | ios::binary ); if ( !o ) { error() << "can not write temp. file \"" << temp_file_name << "\"\n"; ::exit( Exit_No_Write_Temp ); } partial_index_file_names.push_back( temp_file_name ); if ( verbosity > 1 ) cout << '\n' << me << ": writing partial index..." 
<< flush; long const num_words = words.size(); off_t *const word_offset = new off_t[ num_words ]; // Write dummy data as a placeholder until the offsets are computed. my_write( o, &num_words, sizeof( num_words ) ); streampos const word_offset_pos = o.tellp(); my_write( o, word_offset, num_words * sizeof( word_offset[0] ) ); write_word_index( o, word_offset ); // Go back and write the computed offsets. o.seekp( word_offset_pos ); my_write( o, word_offset, num_words * sizeof( word_offset[0] ) ); delete[] word_offset; words.clear(); if ( verbosity > 1 ) cout << "\n\n"; } //***************************************************************************** // // SYNOPSIS // void write_stop_word_index( ostream &o, register off_t *offset ) // // DESCRIPTION // // Write the stop-word index to the given ostream recording the offsets as // it goes. // // PARAMETERS // // o The ostream to write the index to. // // offset A pointer to a built-in vector where to record the offsets. // //***************************************************************************** { register int word_index = 0; FOR_EACH( stop_word_set, *stop_words, word ) { offset[ word_index++ ] = o.tellp(); o << *word << '\0'; } } //***************************************************************************** // // SYNOPSIS // void write_word_index( ostream &o, register off_t *offset ) // // DESCRIPTION // // Write the word index to the given ostream recording the offsets as it // goes. // // PARAMETERS // // o The ostream to write the index to. // // offset A pointer to a built-in vector where to record the offsets. 
// //***************************************************************************** { register int word_index = 0; FOR_EACH( word_map, words, w ) { offset[ word_index++ ] = o.tellp(); o << w->first << '\0'; bool continues = false; word_info const &info = w->second; FOR_EACH( word_info::file_list, info.files_, file ) { if ( continues ) o << Word_Entry_Continues_Marker; else continues = true; o << enc_int( file->index_ ) << enc_int( file->occurrences_ ) << enc_int( file->rank_ ); if ( !file->meta_ids_.empty() ) file->write_meta_ids( o ); #ifdef FEATURE_word_pos if ( !file->pos_deltas_.empty() ) file->write_word_pos( o ); #endif } o << Stop_Marker; } } //***************************************************************************** // // Miscellaneous function(s) // //***************************************************************************** ostream& usage( ostream &err ) { err << "usage: " << me << " [options] dir ... file ...\n" "options: (unambiguous abbreviations may be used for long options)\n" "========\n" "-? 
| --help : Print this help message\n" "-A | --no-assoc-meta : Don't associate meta names [default: do]\n" "-c f | --config-file f : Name of configuration file [default: " << ConfigFile_Default << "]\n" "-e m:p | --pattern m:p : Module and file pattern to index [default: none]\n" "-E p | --no-pattern p : File pattern not to index [default: none]\n" "-f n | --word-files n : Word/file maximum [default: infinity]\n" "-F n | --files-reserve n : Reserve space for number of files [default: " << FilesReserve_Default << "]\n" "-g n | --files-grow n : Number or percentage to grow by [default: " << FilesGrow_Default << "]\n" "-i f | --index-file f : Name of index file to use [default: " << IndexFile_Default << "]\n" "-I | --incremental : Add files/words to index [default: replace]\n" #ifndef PJL_NO_SYMBOLIC_LINKS "-l | --follow-links : Follow symbolic links [default: don't]\n" #endif "-m m | --meta m : Meta name to index [default: all]\n" "-M m | --no-meta m : Meta name not to index [default: none]\n" "-p n | --word-percent n : Word/file percentage [default: 100]\n" #ifndef FEATURE_word_pos "-P | --no-pos-data : Don't store word position data [default: do]\n" #endif "-r | --no-recurse : Don't index subdirectories [default: do]\n" "-s f | --stop-file f : Stop-word file to use instead of built-in default\n" "-S | --dump-stop : Dump built-in stop-words, exit\n" "-t n | --title-lines n : Lines to look for titles [default: " << TitleLines_Default << "]\n" "-T d | --temp-dir d : Directory for temporary files [default: " << TempDirectory_Default << "]\n" "-v n | --verbosity n : Verbosity level [0-4; default: 0]\n" "-V | --version : Print version number, exit\n" "-W n | --word-threshold n : Words to make partial indicies [default: " << WordThreshold_Default << "]\n"; indexer::all_mods_usage( err ); ::exit( Exit_Usage ); return err; // just to make the compiler happy } /* vim:set et sw=4 ts=4: */ 
���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/index_header.c������������������������������������������������������������������������0000644�0000765�0000000�00000006757�10263526043�014254� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** index_header.c ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* ** Note that this file is #include'd into index.c because it would be ** annoying to make a function out of it. 
*/ #ifdef WRITE_HEADER long const num_dirs = dir_set.size(); long const num_files = file_info::num_files(); long const num_stop_words = stop_words->size(); long const num_meta_names = meta_names.size(); off_t *const word_offset = new off_t[ num_unique_words ]; off_t *const stop_word_offset = num_stop_words ? new off_t[ num_stop_words ] : 0; off_t *const dir_offset = new off_t[ num_dirs ]; off_t *const file_offset = new off_t[ num_files ]; off_t *const meta_name_offset = num_meta_names ? new off_t[ num_meta_names ] : 0; my_write( o, &num_unique_words, sizeof( num_unique_words ) ); streampos const word_offset_pos = o.tellp(); my_write( o, word_offset, num_unique_words * sizeof( word_offset[0] ) ); my_write( o, &num_stop_words, sizeof( num_stop_words ) ); streampos const stop_word_offset_pos = o.tellp(); if ( num_stop_words ) my_write( o, stop_word_offset, num_stop_words * sizeof( stop_word_offset[0] ) ); my_write( o, &num_dirs, sizeof( num_dirs ) ); streampos const dir_offset_pos = o.tellp(); my_write( o, dir_offset, num_dirs * sizeof( dir_offset[0] ) ); my_write( o, &num_files, sizeof( num_files ) ); streampos const file_offset_pos = o.tellp(); my_write( o, file_offset, num_files * sizeof( file_offset[0] ) ); my_write( o, &num_meta_names, sizeof( num_meta_names ) ); streampos const meta_name_offset_pos = o.tellp(); if ( num_meta_names ) my_write( o, meta_name_offset, num_meta_names * sizeof( meta_name_offset[0] ) ); #endif #ifdef REWRITE_HEADER o.seekp( word_offset_pos ); my_write( o, word_offset, num_unique_words * sizeof( word_offset[0] ) ); if ( num_stop_words ) { o.seekp( stop_word_offset_pos ); my_write( o, stop_word_offset, num_stop_words * sizeof( stop_word_offset[0] ) ); } o.seekp( dir_offset_pos ); my_write( o, dir_offset, num_dirs * sizeof( dir_offset[0] ) ); o.seekp( file_offset_pos ); my_write( o, file_offset, num_files * sizeof( file_offset[0] ) ); if ( num_meta_names ) { o.seekp( meta_name_offset_pos ); my_write( o, meta_name_offset, num_meta_names * 
sizeof( meta_name_offset[0] ) ); } delete[] word_offset; delete[] stop_word_offset; delete[] dir_offset; delete[] file_offset; delete[] meta_name_offset; #endif /* vim:set et sw=4 ts=4: */ �����������������swish++-6.1.5/index_segment.c�����������������������������������������������������������������������0000644�0000765�0000000�00000004050�10263526043�014446� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** index_segment.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // local #include "index_segment.h" using namespace PJL; //***************************************************************************** // // SYNOPSIS // void index_segment::set_index_file( mmap_file const &file, segment_id id ) // // DESCRIPTION // // Set the index file to use by setting data members to the proper // positions within the index file. 
// // CAVEAT // // Ideally, this function would be part of the constructor, but the name // of the index file can be passed in via the command line and that isn't // parsed until after the instance is constructed. // // SEE ALSO // // index.c write_full_index() -- format of index file // //***************************************************************************** { register mmap_file::const_iterator c = begin_ = file.begin(); register size_type const *p = reinterpret_cast<size_type const*>( c ); num_entries_ = p[ 0 ]; for ( int i = id; i > 0; --i ) { c += sizeof( num_entries_ ) + num_entries_ * sizeof( off_t ); p = reinterpret_cast<size_type const*>( c ); num_entries_ = p[ 0 ]; } offset_ = reinterpret_cast<off_t const*>( &p[ 1 ] ); } /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/index_segment.h�����������������������������������������������������������������������0000644�0000765�0000000�00000011561�10166052462�014461� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** index_segment.h ** ** Copyright (C) 1998 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef index_segment_H #define index_segment_H // standard #include <cstddef> /* for ptrdiff_t */ #include <iterator> #include <sys/types.h> /* for off_t */ // local #include "mmap_file.h" //***************************************************************************** // // SYNOPSIS // class index_segment // // DESCRIPTION // // An instance of this class is used to access either the word, stop-word, // file, or meta-name index portions of a generated index. // // By implementing fully-blown random access iterators for it, the STL // algorithms work, in particular binary_search() and equal_range() that // are used to do look ups. 
// //***************************************************************************** { public: ////////// typedefs /////////////////////////////////////////////////////// typedef unsigned long size_type; typedef ptrdiff_t difference_type; typedef char const* value_type; typedef char const** const_pointer; typedef char const* const_reference; enum segment_id { isi_word = 0, isi_stop_word = 1, isi_dir = 2, isi_file = 3, isi_meta_name = 4 }; ////////// constructors /////////////////////////////////////////////////// index_segment() { } index_segment( PJL::mmap_file const &file, segment_id id ) { set_index_file( file, id ); } ////////// member functions /////////////////////////////////////////////// void set_index_file( PJL::mmap_file const&, segment_id ); size_type size() const { return num_entries_; } const_reference operator[]( size_type i ) const { return begin_ + offset_[ i ]; } ////////// iterators ////////////////////////////////////////////////////// class const_iterator; friend class const_iterator; class const_iterator : public std::iterator< std::random_access_iterator_tag, value_type > { private: index_segment const *index_; size_type i_; const_iterator( index_segment const *index, size_type i ) : index_( index ), i_( i ) { } friend class index_segment; public: const_iterator() { } const_reference operator*() const { return (*index_)[ i_ ]; } const_iterator& operator++() { return ++i_, *this; } const_iterator& operator--() { return --i_, *this; } const_iterator operator++(int) { return const_iterator( index_, i_++ ); } const_iterator operator--(int) { return const_iterator( index_, i_-- ); } const_iterator& operator+=( int n ) { return i_ += n, *this; } const_iterator& operator-=( int n ) { return i_ -= n, *this; } friend bool operator==( const_iterator const &i, const_iterator const &j ) { return i.i_ == j.i_; } friend bool operator!=( const_iterator const &i, const_iterator const &j ) { return !( i == j ); } friend const_iterator operator+( const_iterator 
const &i, int n ) { return const_iterator( i.index_, i.i_ + n ); } friend const_iterator operator-( const_iterator const &i, int n ) { return const_iterator( i.index_, i.i_ - n ); } friend difference_type operator-( const_iterator const &i, const_iterator const &j ) { return i.i_ - j.i_; } // default copy constructor is OK // default assignment operator is OK }; const_iterator begin() const { return const_iterator( this, 0 ); } const_iterator end() const { return const_iterator( this, num_entries_ ); } private: PJL::mmap_file::const_iterator begin_; size_type num_entries_; off_t const *offset_; }; #endif /* index_segment_H */ /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/indexer.c�����������������������������������������������������������������������������0000644�0000765�0000000�00000042725�10300243532�013255� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** indexer.c ** ** Copyright (C) 2000 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include <cstring> // local #include "config.h" #include "encoded_char.h" #include "file_info.h" #include "ExcludeMeta.h" #include "IncludeMeta.h" #include "indexer.h" #include "iso8859-1.h" #include "meta_map.h" #include "platform.h" #include "stop_words.h" #include "StoreWordPositions.h" #include "util.h" #include "word_info.h" #include "word_util.h" using namespace PJL; using namespace std; extern unsigned long num_indexed_words; extern unsigned long num_total_words; #ifdef FEATURE_word_pos extern int word_pos; #endif extern word_map words; int indexer::suspend_indexing_count_ = 0; indexer* indexer::text_indexer_ = 0; //***************************************************************************** // // SYNOPSIS // indexer::indexer( char const *mod_name ) // // DESCRIPTION // // Construct an indexer module by adding its name to the map of indexers. // // PARAMETERS // // mod_name The name of the indexer module. // //***************************************************************************** { indexer *&i = map_ref()[ to_lower( mod_name ) ]; if ( i ) { internal_error << "indexer::indexer(\"" << mod_name << "\"): " "registered more than once" << report_error; } i = this; } //***************************************************************************** // // SYNOPSIS // indexer::~indexer() // // DESCRIPTION // // Destroy an indexer. // // NOTE // // This is out-of-line only because it's virtual. 
// //***************************************************************************** { // do nothing } //***************************************************************************** // // SYNOPSIS // /* static */ option_stream::spec* indexer::all_mods_options( option_stream::spec const *main_spec ) // // DESCRIPTION // // Build a combined option specification of the main indexing options plus // any additional ones of indexing modules. // // PARAMETERS // // main_spec The option specification for the main part of index(1). // // RETURN VALUE // // Returns a pointer to an array of option_stream::spec. It should be // deleted after use (with delete[]). // //***************************************************************************** { option_stream::spec const *s; ////////// Count all options ////////////////////////////////////////////// int option_count = 0; for ( s = main_spec; s->long_name; ++s ) ++option_count; FOR_EACH( map_type, map_ref(), mod ) if ( s = mod->second->option_spec() ) while ( s->long_name ) ++option_count, ++s; ////////// Make combined option_spec ////////////////////////////////////// option_stream::spec *const combined_spec = new option_stream::spec[ option_count + 1 ]; option_stream::spec *c = combined_spec; for ( s = main_spec; s->long_name; ++s ) *c++ = *s; FOR_EACH( map_type, map_ref(), mod ) if ( s = mod->second->option_spec() ) while ( s->long_name ) *c++ = *s++; c->long_name = 0; c->arg_type = 0; c->short_name = 0; return combined_spec; } //***************************************************************************** // // SYNOPSIS // /* static */ void indexer::all_mods_post_options() // // DESCRIPTION // // This function is called to give all indexer modules a chance to do // things just after command-line options have been processed. 
// //***************************************************************************** { TRANSFORM_EACH( map_type, map_ref(), mod ) mod->second->post_options(); } //***************************************************************************** // // SYNOPSIS // /* static */ bool indexer::any_mod_claims_option( option_stream::option const &opt ) // // DESCRIPTION // // This function is callled to see if any indexing module claims a given // option. // // RETURN VALUE // // Returns true only if any indexing module claims the option. // //***************************************************************************** { TRANSFORM_EACH( map_type, map_ref(), mod ) if ( mod->second->claims_option( opt ) ) return true; return false; } //***************************************************************************** // // SYNOPSIS // /* static */ void indexer::all_mods_usage( ostream &o ) // // DESCRIPTION // // Write all indexing-module-specific usage options, if any, to a given // ostream. // // PARAMETERS // // o The ostream to write the usage messages to. // //***************************************************************************** { FOR_EACH( map_type, map_ref(), mod ) mod->second->usage( o ); } //***************************************************************************** // // SYNOPSIS // /* virtual */ bool indexer::claims_option( option_stream::option const& ) // // DESCRIPTION // // See if an indexing module claims an option. The default doesn't. A // derived indexer that does should override this function. // // PARAMETERS // // Not used. // // RETURN VALUE // // Returns false. 
// //***************************************************************************** { return false; } //***************************************************************************** // // SYNOPSIS // /* static */ int indexer::find_meta( char const *meta_name ) // // DESCRIPTION // // Look up a meta name to get its associated unique integer ID; if the // meta name didn't exist, add it, or perhaps a reassigned name of it. // However, if the name is either among the set of meta names to exclude // or not among the set to include, forget it. // // PARAMETERS // // meta_name The meta name to find. // // RETURN VALUE // // Returns the ID of the given meta name or Meta_ID_None if the meta name // is either excluded or not included. // //***************************************************************************** { if ( exclude_meta_names.contains( meta_name ) ) return Meta_ID_None; if ( !include_meta_names.empty() ) { // // There were meta names explicitly given: see if the meta name is // among them. If not, forget it; if so, possibly reassign the name. // IncludeMeta::const_iterator const m = include_meta_names.find( meta_name ); if ( m == include_meta_names.end() ) return Meta_ID_None; meta_name = m->second; } // // Look up the meta name to get its associated unique integer ID. // meta_map::const_iterator const i = meta_names.find( meta_name ); if ( i != meta_names.end() ) return i->second; // // New meta name: add it. Do this in two statements intentionally because // C++ doesn't guarantee that the RHS of assignment is evaluated first. 
// int const meta_id = meta_names.size(); return meta_names[ new_strdup( meta_name ) ] = meta_id; } //***************************************************************************** // // SYNOPSIS // /* virtual */ char const* indexer::find_title( mmap_file const& ) const // // DESCRIPTION // // Define the default find_title() function that simply returns null // indicating that the file has no meaningful title (like plain text // files don't). // // RETURN VALUE // // Returns null. // //***************************************************************************** { return 0; } //***************************************************************************** // // SYNOPSIS // /* static */ void indexer::index_word( register char *word, register int len, int meta_id ) // // DESCRIPTION // // Potentially index the given word. // // PARAMETERS // // word The candidate word to be indexed. // // len The length of the word since it is not null-terminated. // // meta_id The numeric ID of the meta name the word, if indexed, is to // be associated with. // //***************************************************************************** { ++num_total_words; #ifdef FEATURE_word_pos ++word_pos; #endif if ( len < Word_Hard_Min_Size ) return; if ( suspend_indexing_count_ > 0 ) { // // A derived indexer class has called suspend_indexing(), so do nothing // more. // // This facility is currently used by HTML_indexer to indicate that the // word is within an HTML or XHTML element's begin/end tags whose begin // tag's CLASS attribute value is among the set of class names not to // index, so do nothing. 
// return; } ////////// Strip chars not in Word_Begin_Chars/Word_End_Chars ///////////// for ( register int i = len - 1; i >= 0; --i ) { if ( is_word_end_char( word[ i ] ) ) break; --len; } if ( len < Word_Hard_Min_Size ) return; word[ len ] = '\0'; while ( *word ) { if ( is_word_begin_char( *word ) ) break; --len, ++word; } if ( len < Word_Hard_Min_Size ) return; ////////// Stop-word checks /////////////////////////////////////////////// if ( !is_ok_word( word ) ) return; char const *const lower_word = to_lower( word ); if ( stop_words->contains( lower_word ) ) return; ////////// Add the word /////////////////////////////////////////////////// file_info::inc_words(); ++num_indexed_words; word_info &wi = words[ lower_word ]; ++wi.occurrences_; if ( !wi.files_.empty() ) { // // We've seen this word before: determine whether we've seen it before // in THIS file, and, if so, increment the number of occurrences. // word_info::file &last_file = wi.files_.back(); if ( last_file.index_ == file_info::current_index() ) { ++last_file.occurrences_; goto skip_push_back; } } // // First time word occurred in current file. // wi.files_.push_back( word_info::file( file_info::current_index() ) ); skip_push_back: word_info::file &last_file = wi.files_.back(); if ( meta_id != Meta_ID_None ) last_file.meta_ids_.insert( meta_id ); #ifdef FEATURE_word_pos if ( store_word_positions ) last_file.add_word_pos( word_pos ); #endif } //***************************************************************************** // // SYNOPSIS // /* virtual */ void indexer::index_words( encoded_char_range const &e, int meta_id ) // // DESCRIPTION // // Index the words between the given iterators. The text is assumed to be // plain text. // // PARAMETERS // // e The encoded text to index. // // meta_id The numeric ID of the meta name the words index are to be // associated with. 
// //***************************************************************************** { char word[ Word_Hard_Max_Size + 1 ]; bool in_word = false; int len; encoded_char_range::const_iterator c = e.begin(); while ( !c.at_end() ) { register char const ch = iso8859_1_to_ascii( *c++ ); ////////// Collect a word ///////////////////////////////////////////// if ( is_word_char( ch ) ) { if ( !in_word ) { // start a new word word[ 0 ] = ch; len = 1; in_word = true; continue; } if ( len < Word_Hard_Max_Size ) { // continue same word word[ len++ ] = ch; continue; } in_word = false; // too big: skip chars while ( !c.at_end() && is_word_char( iso8859_1_to_ascii( *c++ ) ) ) ; continue; } if ( in_word ) { // // We ran into a non-word character, so index the word up to, but // not including, it. // in_word = false; index_word( word, len, meta_id ); } } if ( in_word ) { // // We ran into 'end' while still accumulating characters into a word, // so just index what we've got. // index_word( word, len, meta_id ); } } //***************************************************************************** // // SYNOPSIS // /* static */ indexer::map_type& indexer::map_ref() // // DESCRIPTION // // Define and initialize (exactly once) a static data member for indexer // and return a reference to it. The reason for this function is to // guarantee that the map is initialized before its first use across all // translation units, something that would not guaranteed if it were a // static data member initialized at file scope. // // We also initialize the map with pointers to the singleton instances of // all derived class indexers. // // RETURN VALUE // // Returns a reference to a static instance of an initialized map_type. // // SEE ALSO // // Margaret A. Ellis and Bjarne Stroustrup. "The Annotated C++ // Reference Manual." Addison-Wesley, Reading, MA, 1990. p. 19. 
// //***************************************************************************** { static map_type m; static bool init; if ( !init ) { init = true; // must set this before init_modules() init_modules(); // defined in init_modules.c static indexer text( "text" ); text_indexer_ = &text; } return m; } //***************************************************************************** // // SYNOPSIS // /* virtual */ option_stream::spec const* indexer::option_spec() const // // DESCRIPTION // // Return a module-specific option specification. The default returns // none. A derived indexer that has its own command-line options should // override this function. // // RETURN VALUE // // Returns null. // //***************************************************************************** { return 0; } //***************************************************************************** // // SYNOPSIS // /* virtual */ void indexer::post_options() // // DESCRIPTION // // The default does nothing after command-line options are processed. // //***************************************************************************** { // do nothing } //***************************************************************************** // // SYNOPSIS // char* indexer::tidy_title( char const *begin, char const *end ) // // DESCRIPTION // // "Tidy up" a title string by trimming leading and trailing whitespace, // squeezing multiple spaces to single spaces, and converting all // non-space whitespace characters to spaces. // // Additionally, if the length of the title exceeds Title_Max_Size, then // the title is truncated and the last 3 characters of the truncated title // are replaced with an ellipsis ("..."). // // PARAMETERS // // begin The pointer to the beginning of the title. // // end The pointer to one past the end of the title. // // RETURN VALUE // // Returns the title. 
// //***************************************************************************** { // Remove leading spaces while ( begin < end && is_space( *begin ) ) ++begin; // Remove trailing spaces while ( begin < --end && is_space( *end ) ) ; ++end; // Squeeze/convert multiple whitespace characters to single spaces. static char title[ Title_Max_Size + 1 ]; int consec_spaces = 0, len = 0; while ( begin < end ) { char c = *begin++; if ( is_space( c ) ) { if ( ++consec_spaces >= 2 ) continue; c = ' '; } else consec_spaces = 0; title[ len++ ] = c; if ( len == Title_Max_Size ) { ::strcpy( title + Title_Max_Size - 3, "..." ); break; } } title[ len ] = '\0'; return title; } //***************************************************************************** // // SYNOPSIS // /* virtual */ void indexer::usage( ostream& ) const // // DESCRIPTION // // Print a module-specific usage message. The default prints nothing. A // derived indexer that has its own command-line options should override // this function. // // PARAMETERS // // Not used. // //***************************************************************************** { // do nothing } /* vim:set et sw=4 ts=4: */ �������������������������������������������swish++-6.1.5/indexer.h�����������������������������������������������������������������������������0000644�0000765�0000000�00000016711�10300243532�013256� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** indexer.h ** ** Copyright (C) 2000 Paul J. 
Lucas
**
**      This program is free software; you can redistribute it and/or modify
**      it under the terms of the GNU General Public License as published by
**      the Free Software Foundation; either version 2 of the License, or
**      (at your option) any later version.
**
**      This program is distributed in the hope that it will be useful,
**      but WITHOUT ANY WARRANTY; without even the implied warranty of
**      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**      GNU General Public License for more details.
**
**      You should have received a copy of the GNU General Public License
**      along with this program; if not, write to the Free Software
**      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#ifndef indexer_H
#define indexer_H

// standard
#include <iostream>
#include <map>
#include <string>

// local
#include "encoded_char.h"
#include "mmap_file.h"
#include "option_stream.h"
#include "util.h"
#include "word_util.h"

// Special meta ID values; both are negative.
enum {
    Meta_ID_None        = -1,
    Meta_ID_Not_Found   = -2
};

//*****************************************************************************
//
// SYNOPSIS
//
        class indexer
//
// DESCRIPTION
//
//      An indexer is the base class for all other indexers.  By itself, it
//      can only index plain text files.
//
//      The model used is that singleton instances of indexers are created once
//      at program initialization time and NOT that indexers are created and
//      destroyed for every file indexed.
//
//*****************************************************************************
{
public:
    virtual ~indexer();

    static bool     any_mod_claims_option(
                        PJL::option_stream::option const&
                    );
                    // See if any indexing module claims a given option.

    static PJL::option_stream::spec*
                    all_mods_options( PJL::option_stream::spec const* );
                    // Returns a combined option specification of the main
                    // indexing options plus any additional ones of indexing
                    // modules.  The result is allocated with new[] and should
                    // be deleted by the caller with delete[].

    static void     all_mods_post_options();
                    // Give all indexer modules a chance to do things just after
                    // command-line options have been processed.

    static void     all_mods_usage( std::ostream& );
                    // Print additional usage messages, if any, for all indexing
                    // modules.

    static indexer* find_indexer( char const *mod_name );
                    // Given a module name (case is irrelevant), return its
                    // indexer.  The lookup is done with the map's operator[],
                    // so a name with no registered module yields null and
                    // leaves a null entry in the map.

    static int      find_meta( char const *meta_name );
                    // Look up a meta name to get its associated ID; if it doesn't
                    // exist, add it.  However, if the name is either among the
                    // set of meta names to exclude or not among the set to
                    // include, forget it.

    virtual char const*
                    find_title( PJL::mmap_file const& ) const;
                    // By default, a file has no title, so the file's base name
                    // becomes its title.  If a particular file type can have
                    // something better for a title, the derived indexer class
                    // should override this function.

    void            index_file( PJL::mmap_file const& );
                    // This is the main entry point: this is called to index the
                    // given file.

    static void     index_word( char*, int len, int = Meta_ID_None );
                    // Once a word has been parsed, this is the function to be
                    // called from within index_words() to index it, potentially.
                    // This is not virtual intentionally for performance.  The
                    // word buffer is modified and must have room for at least
                    // len + 1 characters.

    virtual void    index_words(
                        encoded_char_range const&,
                        int meta_id = Meta_ID_None
                    );
                    // Index the words in the given encoded character range and
                    // associate them with the given meta ID.  The default
                    // indexes a run of plain text.  A derived indexer will
                    // override this.

    static indexer* text_indexer();
                    // Return the plain text indexer.
protected:
    indexer( char const *mod_name );
                    // Registers the new indexer under the given module name.

    static void     suspend_indexing();
    static void     resume_indexing();
                    // These control whether index_word() above will actually
                    // index words.  This is useful to avoid indexing selected
                    // portions of files while still going through the motions of
                    // collecting word statistics.  Suspend/resume calls may nest.

    virtual bool    claims_option( PJL::option_stream::option const& );
                    // See if an indexing module claims an option.  The default
                    // doesn't.  A derived indexer that does should override this
                    // function.

    virtual PJL::option_stream::spec const*
                    option_spec() const;
                    // Return a module-specific option specification.  The default
                    // returns none.  A derived indexer that has its own
                    // command-line options should override this function.

    virtual void    post_options();
                    // See if an indexing module needs to do anything just after
                    // command-line options have been processed.  The default does
                    // nothing.  A derived indexer that needs to should override
                    // this function.

    static char*    tidy_title( char const *begin, char const *end );
                    // "Tidy up" a title string by trimming leading and trailing
                    // whitespace as well as converting all non-space whitespace
                    // characters to spaces.  A derived indexer that overrides
                    // find_title() should call this on the result to tidy it up.
                    // The result lives in a static buffer that the next call
                    // overwrites.

    virtual void    usage( std::ostream& ) const;
                    // Print a module-specific usage message.  The default prints
                    // nothing.  A derived indexer that has its own command-line
                    // options should override this function.
private:
    //
    // The map key needs to be a string rather than a char* because the module
    // names have to be converted to lower case strings.  Since the lower case
    // names have to be stored separately from the original module names
    // (i.e., the original char*'s can't be used), strings might as well be
    // used.
// typedef std::map< std::string, indexer* > map_type; indexer( indexer const& ); // forbid initialization indexer& operator=( indexer const& ); // forbid assignment static int suspend_indexing_count_; static indexer* text_indexer_; static void init_modules(); // generated by init_modules-sh static map_type& map_ref(); }; ////////// Inline functions /////////////////////////////////////////////////// inline indexer* indexer::find_indexer( char const *mod_name ) { return map_ref()[ to_lower( mod_name ) ]; } inline void indexer::index_file( PJL::mmap_file const &file ) { suspend_indexing_count_ = 0; encoded_char_range const e( file.begin(), file.end() ); index_words( e ); } inline indexer* indexer::text_indexer() { return text_indexer_; } inline void indexer::suspend_indexing() { ++suspend_indexing_count_; } inline void indexer::resume_indexing () { if ( suspend_indexing_count_ ) --suspend_indexing_count_; } #endif /* indexer_H */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������swish++-6.1.5/IndexFile.h���������������������������������������������������������������������������0000644�0000765�0000000�00000003013�10166052461�013467� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** IndexFile.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. 
** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef IndexFile_H #define IndexFile_H // local #include "config.h" #include "conf_string.h" //***************************************************************************** // // SYNOPSIS // class IndexFile : public conf<std::string> // // DESCRIPTION // // An IndexFile is-a conf<std::string> containing the name of the index // file to use. // // This is the same as index's or search's -i command-line option. // //***************************************************************************** { public: IndexFile() : conf<std::string>( "IndexFile", IndexFile_Default ) { } CONF_STRING_ASSIGN_OPS( IndexFile ) }; #endif /* IndexFile_H */ /* vim:set et sw=4 ts=4: */ ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/init_mod_vars-sh����������������������������������������������������������������������0000755�0000765�0000000�00000004552�07463312221�014652� 0����������������������������������������������������������������������������������������������������ustar 
�pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#! /bin/sh ## # SWISH++ # init_mod_vars-sh # # Copyright (C) 2001 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## cat <<! /* ** SWISH++ ** init_mod_vars.c ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* ** Note: This file was automatically generated on: ** `date`. 
*/ // local #include "conf_var.h" //***************************************************************************** // // SYNOPSIS // /* static */ void conf_var::init_mod_vars() // // DESCRIPTION // // This function is a place to bundle together the construction of the // singleton instances of indexer modules. The base indexer() constructor // will add pointers to them into the static mod_name-to-instance map. // //***************************************************************************** { ! for header in mod/*/mod_*.h do VARS_FILE=`dirname $header`/vars [ -f $VARS_FILE ] || continue cat $VARS_FILE | while read var_name do cat <<! #ifdef `basename $header | sed -e 's/\.h$//' -e 's/mod_/MOD_/'` conf_var::register_var( "$var_name" ); #endif ! done done cat <<! } ! ������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/init_modules-sh�����������������������������������������������������������������������0000755�0000765�0000000�00000005103�07463313236�014510� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#! /bin/sh ## # SWISH++ # init_modules-sh # # Copyright (C) 2001 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## cat <<! /* ** SWISH++ ** init_modules.c ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* ** Note: This file was automatically generated on: ** `date`. */ // local #include "indexer.h" ! MOD_HEADERS=`echo mod/*/mod_*.h` for header in $MOD_HEADERS do cat <<! #ifdef `basename $header | sed -e 's/\.h$//' -e 's/mod_/MOD_/'` #include "$header" #endif ! done cat <<! //***************************************************************************** // // SYNOPSIS // /* static */ void indexer::init_modules() // // DESCRIPTION // // This function is a place to bundle together the construction of the // singleton instances of indexer modules. The base indexer() constructor // will add pointers to them into the static mod_name-to-instance map. // //***************************************************************************** { ! for header in $MOD_HEADERS do CLASS=`fgrep " : public indexer" $header | sed 's/.*class *\([A-Za-z_][A-Za-z0-9_]*\).*/\1/'` cat <<! 
#ifdef `basename $header | sed -e 's/\.h$//' -e 's/mod_/MOD_/'` static $CLASS ${CLASS}_instance; #endif ! done cat <<! // The plain text indexer is initialized in indexer::map_ref(). } ! �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/install-sh����������������������������������������������������������������������������0000755�0000765�0000000�00000006126�07267213515�013472� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#! /bin/sh ## # install-sh # # Copyright (C) 2000 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
## CP=cp CHOWN=chown CHGRP=chgrp CHMOD=chmod MKDIR=mkdir # uncomment the next line to test script #ECHO=echo ########## You shouldn't have to change anything below this line. ############# ME=`basename $0` usage() { echo "usage: $ME [-o owner] [-g group] [-m mode] file ... dir" >&2 echo " $ME [-o owner] [-g group] [-m mode] -d dir" >&2 } ############################################################################### # # Process command-line options # # We don't rely on getopt because it varies and/or is broken on some # systems. # ############################################################################### while [ -n "$1" ] do if [ -z "$A" ] then case $1 in -d*) if [ x"$1" = x-d ] then A=$1 else dir=`expr $1 : '-d\(.*\)'` fi ;; -g*) if [ x"$1" = x-g ] then A=$1 else group=`expr $1 : '-g\(.*\)'` fi ;; -m*) if [ x"$1" = x-m ] then A=$1 else mode=`expr $1 : '-m\(.*\)'` fi ;; -o*) if [ x"$1" = x-o ] then A=$1 else owner=`expr $1 : '-o\(.*\)'` fi ;; -*) echo "$ME: illegal option: $1" >&2 usage; exit 1 ;; *) if [ -z "$src" ] then src=$1 else src="$src $dst"; dst=$1 fi ;; esac else ##### options that have an argument case $1 in -*) break ;; esac case $A in -d) dir=$1 ;; -g) group=$1 ;; -m) mode=$1 ;; -o) owner=$1 ;; esac unset A fi shift done [ -n "$A" ] && { echo "$ME: $A requires an argument" >&2 usage; exit 2 } ############################################################################### # # Go! 
# ############################################################################### change() { [ -n "$owner" ] && $ECHO $CHOWN $owner $* 2>/dev/null [ -n "$group" ] && $ECHO $CHGRP $group $* 2>/dev/null [ -n "$mode" ] && $ECHO $CHMOD $mode $* 2>/dev/null } if [ -n "$dir" ] then # don't rely on mkdir -p being available echo $dir | tr '/' '\012' | while read subdir do [ -z "$subdir" ] && subdir=/ if [ -z "$path" ] then path=$subdir elif [ "$path" = / ] then path="/$subdir" else path="$path/$subdir" fi [ -d "$path" ] || { $ECHO $MKDIR $path; change $path; } done else [ -z "$src" ] && { echo "$ME: no source file(s) specified" >&2 usage; exit 3 } [ -z "$dst" ] && { echo "$ME: no destination directory specified" >&2 usage; exit 4 } $ECHO $CP $src $dst $ECHO cd $dst change `echo $src | xargs -n1 basename` fi ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/INSTALL.unix��������������������������������������������������������������������������0000644�0000765�0000000�00000010340�10265220105�013453� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������=============================================================================== Prerequisites for building SWISH++ for Unix =============================================================================== SWISH++ has the following software as 
prerequisites: 1. A C++ compiler. SWISH++ has been built using g++ 3.4.x (earlier 3.x versions may work also); g++ 2.95.x and earlier versions are no longer supported. Other non-g++, ANSI C++ compilers should work also. 2. A version of STL, the ANSI/ISO C++ Standard Template Library. If you are using g++ or egcs, then you also need (the matching version of) libstd++. If you are not using g++, then you need to have a port of STL that works with your compiler. Ports for various platforms are available from: http://www.stlport.org/ For more about STL in general, see: http://www.cs.rpi.edu/~musser/stl.html http://www.sgi.com/Technology/STL/ 3. GNU make 3.79.1 or later. 4. Perl 5 (or later), but only if you intend on using either httpindex, splitmail, or WWW.pm. 5. The GNU wget command, but only if you intend on using httpindex. All GNU software is freely available from: ftp://prep.ai.mit.edu/pub/gnu/ as well as many other mirror sites. =============================================================================== Building SWISH++ for Unix =============================================================================== 1. Edit the "config.h" file to your liking. This file controls how the software runs. You REALLY need to understand and properly set TempDirectory and WordThreshold. Improperly set, SWISH++ can take hours to index whereas it should only take minutes. Once you understand them, comment out the #error lines. (The #error lines are there intentionally to force you to edit the file.) 2. Go to the "config" directory and edit the "config.mk" file as necessary. This file controls how the software is compiled. 3. Go back to the top-level directory and type "make". If everything works out, the software will be built. 4. Type "make install". 
=============================================================================== Running SWISH++'s search(1) as a daemon for Unix =============================================================================== If you are going to run search(1) as a daemon and you want it started automatically on system boot, you need to call searchmonitor from your system's start-up scripts. For SysV-like systems, e.g. Linux and Solaris, type "make install_sysv" and then edit the variable definitions in the beginning of /etc/init.d/searchd if necessary. For most other types of Unix systems, e.g., FreeBSD, add a call to searchmonitor in /etc/rc.local like: /path/to/searchmonitor -c /path/to/swish++.conf -s /path/to/search & In either case, also edit /etc/swish++.conf to your liking. If you are running search(1) under Solaris and you expect that it will be heavily used, read README.Solaris. If you are running search(1) under Mac OS X 10.4 (Tiger) or later, you should use launchd(8) to start it. Included in this distribution (in the search.plist file) is a sample launchd propertly list file for doing this. If you want to use SWISH++ to index your incoming mail automatically, you need to use Procmail to split messages into individual files. Included in this distribution (in the procmailrc file) is a sample Procmail recipe for doing this. =============================================================================== Notes =============================================================================== 1. Don't complain to me or bother asking me for help if you get either "No such file or directory" for or errors in the standard C++ headers. It means that your C++ compiler and/or libraries are improperly installed. I know nothing about your OS or how your system is (mis)configured. Complain to your sysadmin: the person who botched the installation and whose job it is to fix it. 2. Don't bother asking me for precompiled binaries for a particular OS -- I don't have them. 3. 
Don't ask me questions like, "Can SWISH++ do this?" The documentation describing SWISH++ is complete. If you read the documentation and what you are looking for is not there, then SWISH++ doesn't do it. ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/INSTALL.win32�������������������������������������������������������������������������0000644�0000765�0000000�00000005144�10031435250�013440� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������=============================================================================== Prerequisites for Building SWISH++ for Microsoft Windows 98/NT/2000/XP =============================================================================== SWISH++ has the following software as prerequisites: 1. The Cygwin environment. "The Cynwin tools are ports of the popular GNU development tools and utilities for Windows 95, 98, and NT." It is available from: http://cygwin.com/ 2. Perl 5 (or later), but only if you intend on using either httpindex, searchc, or WWW.pm. A free binary distribution is available from: http://www.activestate.com/ActivePerl/ 3. The GNU wget command, but only if you intend on using httpindex. 
A free binary distribution is available from: http://www.qtm.net/~twegscheid/wget.html =============================================================================== Building SWISH++ for Microsoft Windows 98/NT/2000/XP =============================================================================== Note: the multi-threaded daemon server feature of search(1) is currently NOT supported under Windows, nor will it be until somebody familiar with multi-threaded programming under Windows volunteers to port it. 1. Edit the "config.h" file to your liking. This file controls how the software runs. You REALLY need to understand and properly set Word_Threshold. Improperly set, SWISH++ can take hours to index whereas it should only take minutes. Once you understand them, comment out the #error lines. (The #error lines are there intentionally to force you to edit the file.) 2. Go to the "config" directory and edit the "config.mk" file as necessary. This file controls how the software is compiled. 3. Go back to the top-level directory and type "make". If everything works out, the software will be built. =============================================================================== Notes =============================================================================== 1. Don't complain to me or bother asking me for help if you get either "No such file or directory" for or errors in the standard C++ headers. It means that your C++ compiler and/or libraries are improperly installed. I know nothing about your OS or how your system is (mis)configured. Complain to your sysadmin: the person who botched the installation and whose job it is to fix it. 2. Don't ask me questions like, "Can SWISH++ do this?" The documentation describing SWISH++ is complete. If you read the documentation and what you are looking for is not there, then SWISH++ doesn't do it. 
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/iso8859-1.c���������������������������������������������������������������������������0000644�0000765�0000000�00000005314�10263526011�013102� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** iso8859-1.c ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // // All characters are from the ISO 8859-1 character set mapped to 7-bit ASCII. // // This file has to be at the top-level rather than in the charsets // subdirectory where it ought to be because "search" and "extract" use it // stand-alone. 
// extern char const iso8859_1_map[] = { ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0 ' ', '\t','\n','\v','\f','\r',' ', ' ', // | ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // | ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // | ' ', '!', '"', '#', '$', '%', '&', '\'',// 32 '(', ')', '*', '+', ',', '-', '.', '/', // | '0', '1', '2', '3', '4', '5', '6', '7', // | '8', '9', ':', ';', '<', '=', '>', '?', // | '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', // | 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', // | 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', // | 'X', 'Y', 'Z', '[', '\\',']', '^', '_', // | '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', // | 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', // | 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', // | 'x', 'y', 'z', '{', '|', '}', '~', ' ', // | ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 128 ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // | ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // | ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // | ' ', '!', ' ', '#', ' ', ' ', '|', ' ', // 160 ' ', ' ', ' ', '"', ' ', '-', ' ', ' ', // | ' ', ' ', '2', '3', '\'',' ', ' ', '.', // | ' ', '1', ' ', '"', ' ', ' ', ' ', '?', // | 'A', 'A', 'A', 'A', 'A', 'A', 'E', 'C', // 192 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', // | 'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ', // | 'O', 'U', 'U', 'U', 'U', 'Y', ' ', 's', // | 'a', 'a', 'a', 'a', 'a', 'a', 'e', 'c', // | 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', // | 'd', 'n', 'o', 'o', 'o', 'o', 'o', ' ', // | 'o', 'u', 'u', 'u', 'u', 'y', ' ', 'y', // 255 }; /* vim:set et sw=4 ts=4: */ ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/iso8859-1.h���������������������������������������������������������������������������0000644�0000765�0000000�00000003602�10263526011�013105� 
0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** iso8859-1.h ** ** Copyright (C) 2002 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef iso8859_1_H #define iso8859_1_H extern char const iso8859_1_map[ 256 ]; //***************************************************************************** // // SYNOPSIS // inline char iso8859_1_to_ascii( char c ) // // DESCRIPTION // // Convert an 8-bit ISO 8859-1 (Latin 1) character to its closest 7-bit // ASCII equivalent. (This mostly means that accents are stripped.) // // This function exists to ensure that the value of the character used to // index the iso8859_1_map[] vector declared above is unsigned. // // PARAMETERS // // c The character to be converted. // // RETURN VALUE // // Returns said character. // // SEE ALSO // // International Standards Organization. "ISO 8859-1: Information // Processing -- 8-bit single-byte coded graphic character sets -- Part 1: // Latin alphabet No. 1," 1987. 
// //***************************************************************************** { return iso8859_1_map[ static_cast<unsigned char>( c ) ]; } #endif /* iso8859_1_H */ /* vim:set et sw=4 ts=4: */ ������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/itoa.c��������������������������������������������������������������������������������0000644�0000765�0000000�00000004603�10263526011�012550� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** PJL C++ Library ** itoa.c ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // local #include "itoa.h" #include "platform.h" #include "util.h" /* for char_buffer_pool<> */ using namespace std; namespace PJL { //***************************************************************************** // // SYNOPSIS // char const* ltoa( register long n ) // // DESCRIPTION // // Convert a long integer to a string. 
The string returned is from an // internal pool of string buffers. The time you get into trouble is if // you hang on to more then Num_Buffers strings. This doesn't normally // happen in practice, however. // // PARAMETERS // // n The long integer to be converted. // // RETURN VALUE // // A pointer to the string. // // CAVEAT // // This function is NOT thread-safe because it uses char_buffer_pool which // isn't. // // SEE ALSO // // Brian W. Kernighan, Dennis M. Ritchie. "The C Programming Language, // 2nd ed." Addison-Wesley, Reading, MA, 1988. pp. 63-64. // //***************************************************************************** { static char_buffer_pool<25,5> buf; register char *s = buf.next(); bool const is_neg = n < 0; if ( is_neg ) n = -n; do { // generate digits in reverse *s++ = n % 10 + '0'; } while ( n /= 10 ); if ( is_neg ) *s++ = '-'; *s = '\0'; // now reverse the string for ( register char *t = buf.current(); t < s; ++t ) { char const tmp = *--s; *s = *t; *t = tmp; } return buf.current(); } } // namespace PJL /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/itoa.h��������������������������������������������������������������������������������0000644�0000765�0000000�00000002147�10166052462�012564� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** PJL C++ Library ** itoa.h ** ** Copyright (C) 1998 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef itoa_H #define itoa_H namespace PJL { // // Conversion from long and int to string: opposite of atol and atoi. // extern char const* ltoa( long ); inline char const* itoa( int n ) { return PJL::ltoa( n ); } } // namespace PJL #endif /* itoa_H */ /* vim:set et sw=4 ts=4: */ �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/LaunchdCooperation.h������������������������������������������������������������������0000644�0000765�0000000�00000003317�10254340562�015410� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** SWISH++ ** LaunchdCooperation.h ** ** Copyright (C) 2005 Paul J. 
Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #if defined( SEARCH_DAEMON ) && defined( __APPLE__ ) #ifndef LaunchdCooperation_H #define LaunchdCooperation_H // local #include "conf_bool.h" //***************************************************************************** // // SYNOPSIS // class LaunchdCooperation : public conf<bool> // // DESCRIPTION // // A LaunchdCooperation is-a conf<bool> containing the Boolean value // indicating whether to cooperate with Mac OS X's launchd. // // This is the same as search's -l command-line option. 
// //***************************************************************************** { public: LaunchdCooperation() : conf<bool>( "LaunchdCooperation", false ) { } CONF_BOOL_ASSIGN_OPS( LaunchdCooperation ) }; extern LaunchdCooperation launchd_cooperation; #endif /* LaunchdCooperation_H */ #endif /* SEARCH_DAEMON && __APPLE__ */ /* vim:set et sw=4 ts=4: */ �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/less.h��������������������������������������������������������������������������������0000644�0000765�0000000�00000005152�10166052462�012575� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* ** PJL C++ Library ** less.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef less_H #define less_H // standard #include <cstring> #include <functional> namespace std { //***************************************************************************** // // SYNOPSIS // template<> struct less< char const* > : std::binary_function< char const*, char const*, bool > // // DESCRIPTION // // Specialize the binary_function "less" so that C-style strings (char // const*) will work properly with STL containers. // // SEE ALSO // // binary_function(3), less(3), strcmp(3) // // Bjarne Stroustrup. "The C++ Programming Language, 3rd ed." // Addison-Wesley, Reading, MA, 1997. p. 468. // //***************************************************************************** { less() { } // This default constructor doesn't need to be defined, but g++ 2.8.0 // complains if it isn't and you try to define a "const less" object. result_type operator()( first_argument_type a, second_argument_type b ) const { return std::strcmp( a, b ) < 0; } }; //***************************************************************************** // // SYNOPSIS // template< class T > struct less_n; template<> struct less_n< char const* > : less< char const* > // // DESCRIPTION // // A less_n is-a less< char const* > that compares C-style strings, but // only for a certain maximum length. 
// // SEE ALSO // // less(3), strncmp(3) // //***************************************************************************** { less_n< char const* >( int len ) : n_( len ) { } result_type operator()( first_argument_type a, second_argument_type b ) const { return std::strncmp( a, b, n_ ) < 0; } private: int const n_; }; } // namespace std #endif /* less_H */ /* vim:set et sw=4 ts=4: */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/man/����������������������������������������������������������������������������������0000755�0000765�0000000�00000000000�10746421420�012224� 5����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/man/GNUmakefile�����������������������������������������������������������������������0000644�0000765�0000000�00000001763�10044357034�014305� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # man/GNUmakefile # # Copyright (C) 1998 Paul J. 
Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ########## You shouldn't have to change anything below this line. ############# ROOT:= .. include $(ROOT)/config/config.mk SUBDIRS:= man* ## # Build rules ## all %: @for dir in $(SUBDIRS); do $(MAKE) -C $$dir $@; done # vim:set noet sw=8 ts=8: �������������swish++-6.1.5/man/man1/�����������������������������������������������������������������������������0000755�0000765�0000000�00000000000�10746421524�013065� 5����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/man/man1/extract.1��������������������������������������������������������������������0000644�0000765�0000000�00000025466�07745117742�014647� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" .\" SWISH++ .\" 
extract.1 .\" .\" Copyright (C) 1998 Paul J. Lucas .\" .\" This program is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by .\" the Free Software Foundation; either version 2 of the License, or .\" (at your option) any later version. .\" .\" This program is distributed in the hope that it will be useful, .\" but WITHOUT ANY WARRANTY; without even the implied warranty of .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the .\" GNU General Public License for more details. .\" .\" You should have received a copy of the GNU General Public License .\" along with this program; if not, write to the Free Software .\" Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. .\" .\" --------------------------------------------------------------------------- .\" define code-start macro .de cS .sp .nf .RS 5 .ft CW .ta .5i 1i 1.5i 2i 2.5i 3i 3.5i 4i 4.5i 5i 5.5i .. .\" define code-end macro .de cE .ft 1 .RE .fi .if !'\\$1'0' .sp .. .\" --------------------------------------------------------------------------- .TH \f3extract\fP 1 "November 1, 2002" "SWISH++" .SH NAME extract \- SWISH++ text extractor .SH SYNOPSIS .B extract [ .I options ] .I directory... .I file... .SH DESCRIPTION .B extract is the SWISH++ text extractor, a utility to extract what text there is from a (mostly) binary file (similar to the .BR strings (1) command) prior to indexing. Original files are untouched. .PP Text is extracted from the specified files and files in the specified directories; text from files in subdirectories of specified directories is also extracted by default (unless the .BR \-r , .BR \-\-no-recurse , .BR \-f , or .B \-\-filter option or the .B RecurseSubdirs or .B ExtractFilter variable is given). 
.PP Ordinarily, text is extracted from files either only if their filename matches one of the patterns in the set specified with either the .B \-e or .B \-\-pattern option or the .B IncludeFile variable (unless standard input is used; see next paragraph) or is not among the set specified with either the .B \-E or .B \-\-no-pattern option or the .B ExcludeFile variable. .PP If there is a single filename of `\f(CW-\f1', the list of directories and files to extract is instead taken from standard input (one per line). In this case, filename patterns of files to extract need not be specified explicitly: all files, regardless of whether they match a pattern (unless they are among the set not to extract specified with either the .B \-E or .B \-\-no-pattern option or the .B ExcludeFile variable), are extracted, i.e., .B extract assumes you know what you're doing when specifying filenames in this manner. .PP Ordinarily, the text extracted from a file is written to another file in the same directory having the same filename but with the ``\f(CW.txt\fP'' extension appended by default, e.g., ``\f(CWfoo.doc\fP'' becomes ``\f(CWfoo.doc.txt\fP'' after extraction. (See also the .B \-x or .B \-\-extension option or the .B ExtractExtension variable.) However, extraction is not performed if the extracted text file exists. .PP If either the .B \-f or .B \-\-filter option or the .B ExtractFilter variable is given, then only a single file specified on the command line is extracted to standard output. In this case, filename patterns are not used and the existence of an extracted text file is irrelevant. .SS Filters Via the .B FilterFile configuration file variable, files having particular patterns can be filtered prior to extraction. (See the examples in .BR swish++.conf (4).) .SS Character Mapping and Word Determination .B extract performs the same character mapping, character entity conversions, and word determination heuristics used by .BR index (1) but also additionally: .TP 4 1. 
Considers all PostScript Level 2 operators that are not also English words to be stop words. Such words in a file usually indicate an encapsulated PostScript (EPS) file and as such should not be indexed. .TP 2. Looks specifically for encapsulated PostScript (EPS) data, that is, everything between one of \f(CW%%BeginSetup\fP, \f(CW%%BoundingBox\fP, \f(CW%%Creator\fP, \f(CW%%EndComments\fP, or \f(CW%%Title\fP and \f(CW%%Trailer\fP, and discards it. .TP 3. Discards strings of ASCII hex data \f(CWWord_Hex_Min_Size\fP characters or longer, e.g., ``\f(CW7F454C46\fP.'' (Default is 5.) .SS Motivation .B extract was developed to be able to index non-text files in proprietary formats such as Microsoft Office documents. There are a couple of reasons why the functionality of .B extract isn't simply built into .BR index (1): .TP 4 1. Users who do not need to index such documents shouldn't have to pay the performance penalty for doing the extra checks for PostScript and hex data. .TP 2. While .BR index (1) can uncompress files on the fly using filters also, uncompressing them every time indexing is performed is excessive. Text extraction, on the other hand, is done only once per file; if the file is updated, the text-extracted version should be deleted and recreated. .SH OPTIONS Options begin with either a `\f(CW-\f1' for short options or a ``\f(CW--\f1'' for long options. Either a `\f(CW-\f1' or ``\f(CW--\f1'' by itself explicitly ends the options; however, the difference is that `\f(CW-\f1' is returned as the first non-option whereas ``\f(CW--\f1'' is skipped entirely. Long option names may be abbreviated so long as the abbreviation is unambiguous. .PP For a short option that takes an argument, the argument is either taken to be the remaining characters of the same option, if any, or, if not, is taken from the next option unless said option begins with a `\f(CW-\f1'.
.PP Short options that take no arguments can be grouped (but the last option in the group can take an argument), e.g., \f(CW-lrv4\fP is equivalent to \f(CW-l -r -v4\fP. .PP For a long option that takes an argument, the argument is either taken to be the characters after a `\f(CW=\fP', if any, or, if not, is taken from the next option unless said option begins with a `\f(CW-\fP'. .TP 18 .B \-? .br .ns .TP .B \-\-help Print the usage (``help'') message and exit. .TP .BI \-c c .br .ns .TP .BI \-\-config-file= c The name of the configuration file, .IR c , to use. (Default is \f(CWswish++.conf\f1 in the current directory.) A configuration file is not required: if none is specified and the default does not exist, none is used; however, if one is specified and it does not exist, then this is an error. .TP .BI \-e p [, p ...] .br .ns .TP .BI \-\-pattern= p [, p ...] A filename pattern (or set of patterns separated by commas), .IR p , of files to extract text from. Case is significant. Multiple .B \-e or .B \-\-pattern options may be specified. .TP .BI \-E p [, p ...] .br .ns .TP .BI \-\-no-pattern= p [, p ...] A filename pattern or patterns, .IR p , of files .I not to extract text from. Case is significant. Multiple .B \-E or .B \-\-no-pattern options may be specified. .TP .B \-f .br .ns .TP .B \-\-filter Extract a single file to standard output and exit. .TP .B \-l .br .ns .TP .B \-\-follow-links Follow symbolic links during extraction. The default is not to follow them. (This option is not available under Microsoft Windows since it doesn't support symbolic links.) 
.TP .B \-r .br .ns .TP .B \-\-no-recurse Do not recursively extract the files in subdirectories, that is: when a directory is encountered, all the files in that directory are extracted (modulo the filename patterns specified via the .BR \-e , .BR \-\-pattern , .BR \-E , or .B \-\-no-pattern options or the .B IncludeFile or .B ExcludeFile variables) but subdirectories encountered are ignored and therefore the files contained in them are not extracted. (This option is most useful when specifying the directories and files to extract via standard input.) The default is to extract the files in subdirectories recursively. .TP .BI \-s f .br .ns .TP .BI \-\-stop-file= f The name of a file, .IR f , containing the set of stop-words to use instead of the built-in set. Whitespace, including blank lines, and characters starting with \f(CW#\f1 and continuing to the end of the line (comments) are ignored. .TP .B \-S .br .ns .TP .B \-\-dump-stop Dump the built-in set of stop-words to standard output and exit. .TP .BI \-v v .br .ns .TP .BI \-\-verbosity= v The verbosity level, .IR v , for printing additional information to standard output during extraction. The verbosity levels, 0-4, are: .PP .RS 18 .PD 0 .TP 4 0 No output is generated (except for errors). .TP 1 Only run statistics (elapsed time, number of files, word count) are printed. .TP 2 Directories are printed as extraction progresses. .TP 3 Directories and files are printed with a word-count for each file. .TP 4 Same as 3 but also prints all files that are not extracted and why. .RE .PD .RE .TP 18 .B \-V .br .ns .TP .B \-\-version Print the version number of .BR SWISH++ and exit. .TP .BI \-x e .br .ns .TP .BI \-\-extension= e The extension to append to filenames during extraction. (It can be specified with or without the dot; default is \f(CWtxt\f1.) .SH CONFIGURATION FILE The following variables can be set in a configuration file. Variables and command-line options can be mixed.
.PP .RS 5 .PD 0 .TP 18 .B ExcludeFile Same as .B \-E or .B \-\-no-pattern .TP .B ExtractExtension Same as .B \-x or .B \-\-extension .TP .B ExtractFilter Same as .B \-f or .B \-\-filter .TP .B FilterAttachment (See FILTERS in .BR swish++.conf (4).) .TP .B FilterFile (See FILTERS in .BR swish++.conf (4).) .TP .B FollowLinks Same as .B \-l or .B \-\-follow-links .TP .B IncludeFile Same as .B \-e or .B \-\-pattern .TP .B RecurseSubdirs Same as .B \-r or .B \-\-no-recurse .TP .B StopWordFile Same as .B \-s or .B \-\-stop-file .TP .B Verbosity Same as .B \-v or .B \-\-verbosity .PD .RE .SH EXAMPLES .SS Extraction To extract text from all Microsoft Office files on a web server: .cS cd /home/www/htdocs extract -v3 -e '*.doc' -e '*.ppt' -e '*.xls' . .cE .SS Filters (See the examples in .BR swish++.conf (4).) .SH EXIT STATUS Exits with one of the values given below: .PP .RS 5 .PD 0 .TP 5 0 Success. .TP 1 Error in configuration file. .TP 2 Error in command-line options. .TP 20 File to extract does not exist. .TP 30 Unable to read stop-word file. .PD .RE .SH CAVEATS .TP 4 1. Text extraction is not perfect, nor can be. .TP 2. As with .BR index (1), the word-determination heuristics employed are heavily geared for English. Using SWISH++ as-is to extract files in non-English languages is not recommended. .SH FILES .PD 0 .TP 18 \f(CWswish++.conf\f1 default configuration file name .PD .SH SEE ALSO .BR index (1), .BR search (1), .BR strings (1), .BR swish++.conf (4), .BR glob (7) .PP Adobe Systems Incorporated. .I PostScript Language Reference Manual, 2nd ed. Addison-Wesley, Reading, MA. pp. 346-359. .PP International Standards Organization. ``ISO/IEC 9945-2: Information Technology -- Portable Operating System Interface (POSIX) -- Part 2: Shell and Utilities,'' 1993. .SH AUTHOR Paul J. 
Lucas .RI < pauljlucas@mac.com > ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/man/man1/GNUmakefile������������������������������������������������������������������0000644�0000765�0000000�00000001634�10044357035�015137� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## # SWISH++ # man/man1/GNUmakefile # # Copyright (C) 1998 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ########## You shouldn't have to change anything below this line. ############# ROOT:= ../.. 
SECT:= 1 include $(ROOT)/config/man.mk # vim:set noet sw=8 ts=8: ����������������������������������������������������������������������������������������������������swish++-6.1.5/man/man1/httpindex.1������������������������������������������������������������������0000644�0000765�0000000�00000007165�10274035501�015157� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" .\" SWISH++ .\" httpindex.1 .\" .\" Copyright (C) 1998 Paul J. Lucas .\" .\" This program is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by .\" the Free Software Foundation; either version 2 of the License, or .\" (at your option) any later version. .\" .\" This program is distributed in the hope that it will be useful, .\" but WITHOUT ANY WARRANTY; without even the implied warranty of .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the .\" GNU General Public License for more details. .\" .\" You should have received a copy of the GNU General Public License .\" along with this program; if not, write to the Free Software .\" Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. .\" .\" --------------------------------------------------------------------------- .\" define code-start macro .de cS .sp .nf .RS 5 .ft CW .ta .5i 1i 1.5i 2i 2.5i 3i 3.5i 4i 4.5i 5i 5.5i .. .\" define code-end macro .de cE .ft 1 .RE .fi .if !'\\$1'0' .sp .. .\" --------------------------------------------------------------------------- .TH \f3httpindex\f1 1 "August 2, 2005" "SWISH++" .SH NAME httpindex \- HTTP front-end for SWISH++ indexer .SH SYNOPSIS .B wget [ .I options ] .I URL... 
.B 2>&1 | httpindex [ .I options ] .SH DESCRIPTION .B httpindex is a front-end for .BR index (1) to index files copied from remote servers using .BR wget (1). The files (in a copy of the remote directory structure) can be kept, deleted, or replaced with their descriptions after indexing. .SH OPTIONS .SS wget Options The .BR wget (1) options that are .B required are: .BR \-A , .BR \-nv , .BR \-r , and .BR \-x ; the ones that are .B highly recommended are: .BR \-l , .BR \-nh , .BR \-t , and .BR \-w . (See the EXAMPLE.) .SS httpindex Options .B httpindex accepts the same short options as .BR index (1) except for .BR \-H , .BR \-I , .BR \-l , .BR \-r , .BR \-S , and .BR \-V . .PP The following options are unique to .BR httpindex : .TP .B \-d Replace the text of local copies of retrieved files with their descriptions after they have been indexed. This is useful to display file descriptions in search results without having to have complete copies of the remote files thus saving filesystem space. (See the \f(CWextract_description()\f1 function in .BR WWW (3) for details about how descriptions are extracted.) .TP .B \-D Delete the local copies of retrieved files after they have been indexed. This prevents your local filesystem from filling up with copies of remote files. .SH EXAMPLE To index all HTML and text files on a remote web server keeping descriptions locally: .cS wget -A html,txt -linf -t2 -rxnv -nh -w2 http://www.foo.com 2>&1 | httpindex -d -e'html:*.html,text:*.txt' .cE Note that you need to redirect .BR wget (1)'s output from standard error to standard output in order to pipe it to .BR httpindex . .SH EXIT STATUS Exits with a value of zero only if indexing completed successfully; non-zero otherwise.
.SH CAVEATS In addition to those for .BR index (1), .B httpindex does not correctly handle the use of multiple .BR \-e , .BR \-E , .BR \-m , or .B \-M options (because the Perl script uses the standard \f(CWGetOpt::Std\f1 package for processing command-line options that doesn't). The last of any of those options ``wins.'' .PP The work-around is to use multiple values for those options seperated by commas to a single one of those options. For example, if you want to do: .cS httpindex -e'html:*.html' -e'text:*.txt' .cE do this instead: .cS httpindex -e'html:*.html,text:*.txt' .cE .SH SEE ALSO .BR index (1), .BR wget (1), .BR WWW (3) .SH AUTHOR Paul J. Lucas .RI < pauljlucas@mac.com > �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������swish++-6.1.5/man/man1/index.1����������������������������������������������������������������������0000644�0000765�0000000�00000104636�10030732342�014255� 0����������������������������������������������������������������������������������������������������ustar �pjl�����������������������������wheel������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" .\" SWISH++ .\" index.1 .\" .\" Copyright (C) 2003 Paul J. Lucas .\" .\" This program is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by .\" the Free Software Foundation; either version 2 of the License, or .\" (at your option) any later version. 
.\" .\" This program is distributed in the hope that it will be useful, .\" but WITHOUT ANY WARRANTY; without even the implied warranty of .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the .\" GNU General Public License for more details. .\" .\" You should have received a copy of the GNU General Public License .\" along with this program; if not, write to the Free Software .\" Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. .\" .\" --------------------------------------------------------------------------- .\" define code-start macro .de cS .sp .nf .RS 5 .ft CW .ta .5i 1i 1.5i 2i 2.5i 3i 3.5i 4i 4.5i 5i 5.5i .. .\" define code-end macro .de cE .ft 1 .RE .fi .if !'\\$1'0' .sp .. .\" --------------------------------------------------------------------------- .TH \f3index\f1 1 "March 25, 2004" "SWISH++" .SH NAME index \- SWISH++ indexer .SH SYNOPSIS .B index [ .I options ] .I directory... .I file... .SH DESCRIPTION .B index is the SWISH++ file indexer. It indexes the specified files and files in the specified directories; files in subdirectories of specified directories are also indexed by default (unless either the .B \-r or .B \-\-no-recurse option or the .B RecurseSubdirs variable is given). Files are indexed either only if their filename matches one of the patterns in the set specified with either the .B \-e or .B \-\-pattern option or the .B IncludeFile variable (unless standard input is used; see next paragraph) or is not in the set specified with either the .B \-E or .B \-\-no-pattern option or the .B ExcludeFile variable. .P If there is a single filename of `\f(CW-\f1', the list of directories and files to index is instead taken from standard input (one per line). 
In this case, filename patterns of files to index need not be specified explicitly: all files, regardless of whether they match a pattern (unless they are in the set not to index specified with either the .B \-E or .B \-\-no-pattern option or the .B ExcludeFile variable), are indexed, i.e., .B index assumes you know what you're doing when specifying filenames in this manner. .P In any case, care must be taken not to specify files or subdirectories in directories that are also specified: since directories are recursively indexed by default (unless either the .B \-r or .B \-\-no-recurse option or the .B RecurseSubdirs variable is given), explicitly specifying a subdirectory or file in a directory that is also specified will result in those files being indexed more than once. .SS Character Mapping Characters in the ISO 8859-1 (Latin 1) character set are mapped to their closest ASCII equivalent before further examination and indexing. (Individual indexing modules may also do their own character mapping.) .SS Word Determination Stop words, words that occur too frequently or have no information content, are not indexed. (There is a default built-in set of a few hundred such English words.) Additionally, several heuristics are used to determine which words should not be indexed. .P First, a word is checked to see if it looks like an acronym. A word is considered an acronym only if it starts with a capital letter and is composed exclusively of capital letters, digits, and punctuation symbols, e.g., ``AT&T.'' If a word looks like an acronym, it is indexed and no further checks are done. .P Second, there are several other checks that are applied. A word is not indexed if it: .TP 4 1. Is less than \f(CWWord_Min_Size\f1 letters. (Default is 4.) .TP 2. Contains less than \f(CWWord_Min_Vowels\f1 vowels. (Default is 1.) .TP 3. Contains more than \f(CWWord_Max_Consec_Same\f1 of the same character consecutively (not including digits). (Default is 2.) .TP 4. 
Contains more than \f(CWWord_Max_Consec_Consonants\f1 consecutive consonants. (Default is 5.) .TP 5. Contains more than \f(CWWord_Max_Consec_Vowels\f1 consecutive vowels. (Default is 4.) .TP 6. Contains more than \f(CWWord_Max_Consec_Puncts\f1 consecutive punctuation characters. (Default is 1.) .SS Filters Via the .B FilterFile configuration file variable, files matching particular patterns can be filtered prior to indexing. Via the .B FilterAttachment configuration file variable, e-mail attachments whose MIME types match particular patterns can be filtered prior to indexing. (See FILTERS in .BR swish++.conf (4).) .SS Incremental Indexing In order to add words from new documents to an existing index, either the entire set of documents can be reindexed or the new documents alone can be incrementally indexed. In many cases, reindexing everything is sufficient since .B index is really fast. For a very large document set, however, this may use too many resources. .P However, there is a pitfall for incremental indexing: if any of the .BR \-f , .BR \-\-word-files , .BR \-p , or .B \-\-word-percent options or .B WordFilesMax or .B WordPercentMax variables are used, then words that are too frequent are discarded. If new documents are added containing very few of those words, then they could no longer be too frequent. However, there is no way to get them back since they were discarded. .P The way around this problem is not to discard any words by specifying 101%. However, because no words are discarded, the size of the index file will be larger, perhaps significantly so. .P It is possible that, in practice, the loss of words may not be that important, especially if new documents are very similar to old documents and words that were too frequent in the old set would also be too frequent in the new set. .P Another way around this problem is to do periodic full indexing.
.SH INDEXING MODULES .B index is written in a modular fashion where different types of files have different indexing modules. Currently, there are 7 modules: Text (plain text), HTML (HTML and XHTML), ID3 (ID3 tags found in MP3 files), LaTeX, Mail (RFC 822 and Usenet News), Manual (Unix manual pages in .BR nroff (1) with .BR man (7) macros), and RTF (Rich Text Format). .SS Text Module This module simply indexes plain text files performing character mapping and word determination as has already been described. .SS HTML and XHTML Module Additional processing is done for HTML and XHTML files. The additional processing is: .TP 4 1. Character and numeric (decimal and hexadecimal) entity references are converted to their ASCII character equivalents before further examination and indexing. For example, ``r&eacute;sum&eacute;'' becomes ``resume'' before indexing. .TP 2. If a matched set of \f(CW<TITLE>\f1 ... \f(CW</TITLE>\f1 tags is found within the first \f(CWTitleLines\f1 lines of the file (default is 12), then the text between the tags is stored in the generated index file as the file's title rather than the file's name. (Every non-space whitespace character in the title is converted to a space; leading and trailing spaces are removed.) .TP 3. If an HTML or XHTML element contains a \f(CWCLASS\f1 attribute whose value is among the set of class names specified as those not to index (via one or more of either the .B \-C or .B \-\-no-class option or the .B ExcludeClass variable), then all the text up to the tag that ends the element will not be indexed. .IP "" For an element that has an optional end tag, ``the tag that ends the element'' is either the element's end tag or a tag of another element that implicitly ends it; for an element that does not have an end tag, ``the tag that ends the element'' is the element's start tag. (See the EXAMPLES.)
.IP "" All elements from the HTML 4.0 specification (including deprecated elements), Ruby elements, plus common, browser-specific elements are recognized; unrecognized elements are ignored. (See the .B \-H or .B \-\-dump-html option.) .TP 4. If an HTML or XHTML element contains a \f(CWTITLE\f1 attribute, then the words specified as the value of the \f(CWTITLE\f1 attribute are indexed. .TP 5. If an \f(CWAREA\f1, \f(CWIMG\f1, or \f(CWINPUT\f1 element contains an \f(CWALT\f1 attribute, then the words specified as the value of the \f(CWALT\f1 attribute are indexed. .TP 6. If a \f(CWMETA\f1 element contains both a \f(CWNAME\f1 and \f(CWCONTENT\f1 attribute, then the words specified as the value of the \f(CWCONTENT\f1 attribute are indexed associated with the meta name specified as the value of the \f(CWNAME\f1 attribute. .IP "" (However, if either the .B \-A or .B \-\-no-assoc-meta options or the .B AssociateMeta variable is specified, then the words specified as the value of the \f(CWCONTENT\f1 attribute are still indexed, but not associated with the meta name.) .IP "" (See also the .BR \-m , .BR \-\-meta , .BR \-M , and .B \-\-no-meta options or the .B IncludeMeta or .B ExcludeMeta variables.) Meta names can later be queried against specifically using .BR search (1). .TP 7. If a \f(CWTABLE\f1 element contains a \f(CWSUMMARY\f1 attribute, then the words specified as the value of the \f(CWSUMMARY\f1 attribute are indexed. .TP 8. If an \f(CWOBJECT\f1 element contains a \f(CWSTANDBY\f1 attribute, then the words specified as the value of the \f(CWSTANDBY\f1 attribute are indexed. .TP 9. All other HTML or XHTML tags and comments (anything between \f(CW<\f1 and \f(CW>\f1 characters) are discarded. .P In compliance with the HTML specification, any one of no quotes, single quotes, or double quotes may be used to contain attribute values and attributes can appear in any order. Values containing whitespace, however, must be quoted. 
The specification is vague as to whether whitespace surrounding the \f(CW=\f1 is legal, but .B index allows it. .SS ID3 Module ID3 tags are used to store audio meta information for MP3 files (generally). Since audio files contain mostly binary information, only the ID3 tag text fields are indexed. ID3 tag versions 1.x and 2.x (through 2.4) are supported (except for encrypted frames). If a file contains both 1.x and 2.x tags, only the 2.x tag is indexed. The processing done for files containing an ID3 tag is: .TP 4 1. If a title field is found, then the value of the title is stored in the generated index file as the file's title rather than the file's name. (Every non-space whitespace character in the title is converted to a space; leading and trailing spaces are removed.) .TP 2. Words that are the value of fields are indexed associated with the field name as a meta name. (However, if either the .B \-A or .B \-\-no-assoc-meta options or the .B AssociateMeta variable is specified, then the words specified as the value of the field are still indexed, but not associated with the field.) .IP "" (See also the .BR \-m , .BR \-\-meta , .BR \-M , and .B \-\-no-meta options or the .B IncludeMeta or .B ExcludeMeta variables.) Meta names can later be queried against specifically using .BR search (1). .IP "" For ID3v1.x, the recommended fields to be indexed are: .BR album , .BR artist , .BR comments , .BR genre , and .BR title . .IP "" For ID3v2.2, the recommended text fields (with reassignments) to be indexed are: .BR com=comments , .BR tal=album , .BR tcm=composer , .BR tco=genre , .BR tcr=copyright , .BR ten=encoder , .BR txt=lyricist , .BR tt1=content , .BR tt2=title , .BR tt3=subtitle , .BR ipl=musicians , .BR tot=original-title , .BR tol=original-lyricist , .BR toa=original-artist , .BR tp1=artist , .BR tp2=performers , .BR tp3=conductor , .BR tpb=publisher , .BR txx=user , .BR slt=lyrics , and .BR ult=lyrics . 
.IP "" For ID3v2.4, the recommended text fields (with reassignments) to be indexed are: .BR comm=comments , .BR talb=album , .BR tcom=composer , .BR tcon=genre , .BR tcop=copyright , .BR tenc=encoder , .BR text=lyricist , .BR tipl=people , .BR tit1=content , .BR tit2=title , .BR tit3=subtitle , .BR tmcl=musicians , .BR tmoo=mood , .BR toal=original-title , .BR toly=original-lyricist , .BR tope=original-artist , .BR town=owner , .BR tpe1=artist , .BR tpe2=performers , .BR tpe3=conductor , .BR tpub=publisher , .BR tsst=set-subtitle , .BR txxx=user , .BR user=terms , .BR sylt=lyrics , and .BR uslt=lyrics . .IP "" ID3v2.3 is the same as 2.4 except replace .B tmcl=musicians with .BR ipls=musicians . .IP "" All text fields (with reassignments) for all versions of ID3 can (and should) be specified concurrently so it need not be known in advance which version(s) of ID3 MP3 files are encoded with. .TP 3. For ID3v2.x, text fields that are compressed are uncompressed prior to indexing. .TP 4. For ID3v2.x, Unicode text that is encoded in either UTF-8 or UTF-16 (either big- or little-endian) is decoded prior to indexing. .SS LaTeX Module Additional processing is done for LaTeX files. If a .B \\\\title command is found within the first \f(CWTitleLines\f1 lines of the file (default is 12), then the value of the title is stored in the generated index file as the file's title rather than the file's name. (Every non-space whitespace character in the title is converted to a space; leading and trailing spaces are removed.) .SS Mail Module Additional processing is done for mail and news files. The additional processing is: .TP 4 1. If a .B Subject header is found within the first \f(CWTitleLines\f1 lines of the file (default is 12), then the value of the subject is stored in the generated index file as the file's title rather than the file's name. (Every non-space whitespace character in the title is converted to a space; leading and trailing spaces are removed.) .TP 2. 
Words that are the value of a header are indexed associated with the header name as a meta name. (However, if either the .B \-A or .B \-\-no-assoc-meta options or the .B AssociateMeta variable is specified, then the words specified as the value of the header are still indexed, but not associated with the header.) .IP "" (See also the .BR \-m , .BR \-\-meta , .BR \-M , and .B \-\-no-meta options or the .B IncludeMeta or .B ExcludeMeta variables.) Meta names can later be queried against specifically using .BR search (1). .IP "" The recommended headers to be indexed are: .BR Bcc , .BR Cc , .BR Comments , .BR Content-Description , .BR From , .BR Keywords , .BR Newsgroups , .BR Resent-To , .BR Subject , and .BR To . .TP 3. MIME attachments are indexed. .TP 4. Text that is in the text/enriched content type is converted to plain text prior to indexing. .TP 5. Text that is encoded as either quoted-printable or base-64 is decoded prior to indexing. .TP 6. Unicode text that is encoded in either the UTF-7 or UTF-8 character set is decoded prior to indexing. .TP 7. Text in vCards is indexed such that the values of types (fields) are associated with the types as meta names. (However, if either the .B \-A or .B \-\-no-assoc-meta options or the .B AssociateMeta variable is specified, then the words specified as the value of types are still indexed, but not associated with the types.) .IP "" The recommended vCard types (with reassignments) to be indexed are: .BR adr=address , .BR categories , .BR class , .BR label=address , .BR fn=name , .BR nickname , .BR note , .BR org , .BR role , and .BR title . .P Indexing mail and news files is most effective only when there is exactly one message per file. While Usenet news files are usually this way, mail files are not. Mail files, e.g., mailboxes, are usually comprised of multiple messages. 
Such files would need to be split up into files of individual messages prior to indexing since there's no point in indexing a single mailbox: every search result would return a rank of 100 for the same file. Therefore, the .BR splitmail (1) utility is included in the SWISH++ distribution. .SS Manual Module Additional processing is done for Unix manual page files. The additional processing is: .TP 4 1. If a .B NAME section heading macro (\f(CW.SH\f1) is found within the first \f(CWTitleLines\f1 lines of the file (default is 12), then the contents of the next line are stored in the generated index file as the file's title rather than the file's name. (Every non-space whitespace character in the title is converted to a space; leading and trailing spaces as well as backslash sequences, such as \f(CW\\f2\f1, are removed.) .TP 2. Words that are in a section are indexed associated with the name of the section as a meta name. (However, if either the .B \-A or .B \-\-no-assoc-meta options or the .B AssociateMeta variable is specified, then the words in a section are still indexed, but not associated with the section heading.) .IP "" Spaces in multi-word section headings are converted to dashes, e.g., ``see also'' becomes ``see-also'' as a meta name. (See also the .BR \-m , .BR \-\-meta , .BR \-M , and .B \-\-no-meta options or the .B IncludeMeta or .B ExcludeMeta variables.) Meta names can later be queried against specifically using .BR search (1). .IP "" The recommended sections to be indexed are: .BR AUTHOR , .BR BUGS , .BR CAVEATS , .BR DESCRIPTION , .BR DIAGNOSTICS , .BR ENVIRONMENT , .BR ERRORS , .BR EXAMPLES , .BR EXIT-STATUS , .BR FILES , .BR HISTORY , .BR NAME , .BR NOTES , .BR OPTIONS , .BR RETURN-VALUE , .BR SEE-ALSO , .BR SYNOPSIS , and .BR WARNINGS . .SS RTF Module This module simply indexes rich text format files without all formatting commands. .SH OPTIONS Options begin with either a `\f(CW-\f1' for short options or a ``\f(CW--\f1'' for long options. 
Either a `\f(CW-\f1' or ``\f(CW--\f1'' by itself explicitly ends the options; either short or long options may be used. Long option names may be abbreviated so long as the abbreviation is unambiguous. .P For a short option that takes an argument, the argument is either taken to be the remaining characters of the same option, if any, or, if not, is taken from the next option unless said option begins with a `\f(CW-\f1'. .P Short options that take no arguments can be grouped (but the last option in the group can take an argument), e.g., \f(CW-lrv4\fP is equivalent to \f(CW-l -r -v4\fP. .P For a long option that takes an argument, the argument is either taken to be the characters after a `\f(CW=\fP', if any, or, if not, is taken from the next option unless said option begins with a `\f(CW-\fP'. .TP 20 .B \-? .br .ns .TP .B \-\-help Print the usage (``help'') message and exit. .TP .B \-A .br .ns .TP .B \-\-no-assoc-meta Do not associate words with meta names during indexing nor store such associations in the generated index file. This sacrifices meta names for decreased memory usage and index file size. .TP .BI \-c f .br .ns .TP .BI \-\-config-file= f The name of the configuration file, .IR f , to use. (Default is \f(CWswish++.conf\f1 in the current directory.) A configuration file is not required: if none is specified and the default does not exist, none is used; however, if one is specified and it does not exist, then this is an error. .TP .BI \-C c .br .ns .TP .BI \-\-no-class= c For HTML or XHTML files only, a class name, .IR c , of an HTML or XHTML element whose text is not to be indexed. Multiple .B \-C or .B \-\-no-class options may be specified. .TP .BI \-e m : p [, p ...] .br .ns .TP .BI \-\-pattern= m : p [, p ...] A module name, .IR m , and a filename pattern (or set of patterns separated by commas), .IR p , of files to index. Case is irrelevant for the module name, but significant for the patterns. Multiple .B \-e or .B \-\-pattern options may be specified. 
.TP .BI \-E p [, p ...] .br .ns .TP .BI \-\-no-pattern= p [, p ...] A filename pattern (or set of patterns separated by commas), .IR p , of files .I not to index. Case is significant. Multiple .B \-E or .B \-\-no-pattern options may be specified. .TP .BI \-f n .br .ns .TP .BI \-\-word-files= n The maximum number of files, .IR n , a word may occur in before it is discarded as being too frequent. (Default is infinity.) .TP .BI \-F n .br .ns .TP .BI \-\-files-reserve= n Reserve space for this number of files, .IR n , to start. More space will be allocated as necessary, but with a slight performance penalty. (Default is 1000.) .TP .BI \-g n .br .ns .TP .BI \-\-files-grow= n Grow the space for the reserved number of files, .IR n , when incrementally indexing. The number can either be an absolute number of files or a percentage (when followed by a percent sign \f(CW%\f1). Just as with the .B \-F option, more space will be allocated as necessary, but with a slight performance penalty. (Default is 100.) .TP .B \-H .br .ns .TP .B \-\-dump-html Dump the built-in set of recognized HTML and XHTML elements to standard output and exit. .TP .BI \-i f .br .ns .TP .BI \-\-index-file= f The name of the generated index file, .I f (for new indexes; default is \f(CWswish++.index\f1 in the current directory) or the old index file when doing incremental indexing. .TP .B \-I .br .ns .TP .B \-\-incremental Incrementally add the indexed files and words to an existing index. The existing index is not touched; instead, a new index is created having the same pathname as that of the existing index with ``\f(CW.new\f1'' appended. .TP .B \-l .br .ns .TP .B \-\-follow-links Follow symbolic links during indexing. (Default is not to follow them.) This option is not available under Microsoft Windows since it doesn't support symbolic links. .TP .BI \-m m [ =n ] .br .ns .TP .BI \-\-meta= m [ =n ] The value of a meta name, .IR m , for which words are to be associated when indexed. Case is irrelevant.
Multiple .B \-m or .B \-\-meta options may be specified. .IP "" A meta name can be reassigned when followed by a new name, .IR n , meaning that the name .I n and not .I m is stored in the generated index file so that queries would use the new name rather than the original. .IP "" By default, words associated with all meta names are indexed. Specifying at least one meta name via this option changes that so that only the words associated with a member of the set of meta names explicitly specified via one or more .B \-m or .B \-\-meta options are indexed. .TP .BI \-M m .br .ns .TP .BI \-\-no-meta= m The value of a meta name, .IR m , for which words are not to be indexed. Case is irrelevant. Multiple .B \-M or .B \-\-no-meta options may be specified. .TP .BI \-p n .br .ns .TP .BI \-\-word-percent= n The maximum percentage, .IR n , of files a word may occur in before it is discarded as being too frequent. (Default is 100.) If you want to keep all words regardless, specify 101. .TP .B \-P .br .ns .TP .B \-\-no-pos-data Do not store word positions in memory during indexing nor in the generated index file needed to do ``near'' searches later during searching. This sacrifices ``near'' searching for decreased memory usage and index file size (approximately 50%). .TP .B \-r .br .ns .TP .B \-\-no-recurse Do not recursively index the files in subdirectories, that is: when a directory is encountered, all the files in that directory are indexed (modulo the filename patterns specified via either the .BR \-e , .BR \-\-pattern , .BR \-E , or .B \-\-no-pattern options or the .B IncludeFile or .B ExcludeFile variables) but subdirectories encountered are ignored and therefore the files contained in them are not indexed. This option is most useful when specifying the directories and files to index via standard input. (Default is to index the files in subdirectories recursively.) 
.TP .BI \-s f .br .ns .TP .BI \-\-stop-file= f The name of a file, .IR f , containing the set of stop-words to use instead of the built-in set. Whitespace, including blank lines, and characters starting with \f(CW#\f1 and continuing to the end of the line (comments) are ignored. .TP .B \-S .br .ns .TP .B \-\-dump-stop Dump the built-in set of stop-words to standard output and exit. .TP .BI \-t n .br .ns .TP .BI \-\-title-lines= n The maximum number of lines, .IR n , into a file to look at for a file's title. (Default is 12.) Larger numbers slow indexing. .TP .BI \-T d .br .ns .TP .BI \-\-temp-dir= d The path of the directory, .IR d , to use for temporary files. The directory must exist. (Default is \f(CW/tmp\f1 for Unix or \f(CW/temp\f1 for Windows.) .IP "" If your OS mounts swap space on \f(CW/tmp\f1, as indexing progresses and more files get created in \f(CW/tmp\f1, you will have less swap space, indexing will get slower, and you may run out of memory. If this is the case, you should specify a directory on a real filesystem, i.e., one on a physical disk. .TP .BI \-v n .br .ns .TP .BI \-\-verbosity= n The verbosity level, .IR n , for printing additional information to standard output during indexing. The verbosity levels, 0-4, are: .P .RS 20 .PD 0 .TP 4 0 No output is generated except for errors. (This is the default.) .TP 1 Only run statistics (elapsed time, number of files, word count) are printed. .TP 2 Directories are printed as indexing progresses. .TP 3 Directories and files are printed with a word-count for each file. .TP 4 Same as 3 but also prints all files that are not indexed and why. .RE .PD .RE .TP 20 .B \-V .br .ns .TP .B \-\-version Print the version number of .B SWISH++ to standard output and exit. .TP .BI \-W n .br .ns .TP .BI \-\-word-threshold= n The word count past which partial indices are generated and merged since all the words are too big to fit into memory at the same time. 
If you index and your machine begins to swap like mad, lower this value. Only the super-user can specify a value larger than the compiled-in default. .SH CONFIGURATION FILE The following variables can be set in a configuration file. Variables and command-line options can be mixed, the latter taking priority. .P .RS 5 .PD 0 .TP 20 .B AssociateMeta Same as .B \-A or .B \-\-no-assoc-meta .TP .B ExcludeClass Same as .B \-C or .B \-\-no-class .TP .B ExcludeFile Same as .B \-E or .B \-\-no-pattern .TP .B ExcludeMeta Same as .B \-M or .B \-\-no-meta .TP .B FilesGrow Same as .B \-g or .B \-\-files-grow .TP .B FilesReserve Same as .B \-F or .B \-\-files-reserve .TP .B FilterAttachment (See FILTERS in .BR swish++.conf (4).) .TP .B FilterFile (See FILTERS in .BR swish++.conf (4).) .TP .B FollowLinks Same as .B \-l or .B \-\-follow-links .TP .B IncludeFile Same as .B \-e or .B \-\-pattern .TP .B IncludeMeta Same as .B \-m or .B \-\-meta .TP .B Incremental Same as .B \-I or .B \-\-incremental .TP .B IndexFile Same as .B \-i or .B \-\-index-file .TP .B RecurseSubdirs Same as .B \-r or .B \-\-no-recurse .TP .B StopWordFile Same as .B \-s or .B \-\-stop-file .TP .B StoreWordPositions Same as .B \-P or .B \-\-no-pos-data .TP .B TempDirectory Same as .B \-T or .B \-\-temp-dir .TP .B TitleLines Same as .B \-t or .B \-\-title-lines .TP .B Verbosity Same as .B \-v or .B \-\-verbosity .TP .B WordFilesMax Same as .B \-f or .B \-\-word-files .TP .B WordPercentMax Same as .B \-p or .B \-\-word-percent .TP .B WordsNear Same as .B \-n or .B \-\-near .TP .B WordThreshold Same as .B \-W or .B \-\-word-threshold .PD .RE .SH EXAMPLES .SS Unix Command-Lines All these examples assume you change your working directory to your web server's document root prior to indexing. .P To index all HTML and text files on a web server: .cS index -v3 -e 'html:*.*htm*' -e 'text:*.txt' . .cE To index all files not under directories named \f(CWCVS\f1: .cS find .
-name CVS -prune -o -type f -a -print | index -e 'html:*.*htm*' - .cE .SS Windows Command-Lines When using the Windows command interpreter, single quotes around filename patterns don't work; you .I must use double quotes: .cS index -v3 -e "html:*.*htm*" -e "text:*.txt" . .cE This is a problem with Windows, not SWISH++. (Double quotes will also work under Unix.) .SS Using \f(CWCLASS\fP Attributes to Index HTML Selectively In an HTML or XHTML document, there may be sections that should not be indexed. For example, if every page of a web site contains a navigation menu such as: .cS .cE or a common header and footer, then, ordinarily, those words would be indexed for every page and therefore be discarded because they would be too frequent. However, via either the .B \-C or .B \-\-no-class option or the .B ExcludeClass variable, one or more class names can be specified and then HTML or XHTML elements belonging to one of those classes will not have the text up to the tag that ends them indexed. Given a class name of, say, \f(CWno_index\f1, the above menu can be changed to: .cS \f1 tag will not be indexed. .P For an HTML element that has an optional end tag (such as the \f(CW

\f1 element), the text up to the tag that ends it will not be indexed, which is either the element's own end tag or a tag of some other element that implicitly ends it. For example, in: .cS

This was the poem that Alice read:

Jabberwocky
`Twas brillig, and the slithy toves
Did gyre and gimble in the wabe;
All mimsy were the borogoves,
And the mome raths outgrabe.
.cE the \f(CW
\f1 tag implicitly ends the \f(CW

\f1 element (as do all block-level elements) so the only text that is not indexed above is: ``This was the poem that Alice read.'' .P For an HTML or XHTML element that does not have an end tag, only the text within the start tag will not be indexed. For example, in: .cS Home .cE the word ``Home'' will not be indexed even though it ordinarily would have been if the \f(CWCLASS\f1 attribute were not there. .SS Filters (See Filters under EXAMPLES in .BR swish++.conf (4).) .SH EXIT STATUS Exits with one of the values given below: .P .RS 5 .PD 0 .TP 5 0 Success. .TP 1 Error in configuration file. .TP 2 Error in command-line options. .TP 10 Unable to open temporary file. .TP 11 Unable to write index file. .TP 12 Unable to write temporary file. .TP 13 Root-only operation attempted. .TP 30 Unable to read stop-word file. .TP 40 Unable to read index file. .TP 127 Internal error. .PD .RE .SH CAVEATS .TP 4 1. Generated index files are machine-dependent (size of data types and byte order). .TP 2. The word-determination heuristics employed are heavily geared for English. Using SWISH++ as-is to index and search files in non-English languages is not recommended. .TP 3. Unless otherwise noted above, the character encoding always used is ISO 8859-1 (Latin 1). Character encodings that are specified in HTML or XHTML files are ignored. .TP 4. An e-mail message can have both an encoding and a non-ASCII or non-ISO-8859-1 charset simultaneously, e.g., base64-encoded UTF-8. (In practice, this particular case should never happen since UTF-7 should be used instead; but you get the idea.) .IP "" However, handling both an encoding and such a charset simultaneously is problematic; hence, an e-mail message or attachment can have either an encoding or a non-ASCII or a non-ISO-8859-1 character set, but not both. If it does, the encoding takes precedence. 
.SH FILES .PD 0 .TP 18 \f(CWswish++.conf\f1 default configuration file name .TP \f(CWswish++.index\f1 default index file name .PD .SH ENVIRONMENT .TP 10 \f(CWTMPDIR\f1 If set, the default path of the directory to use for temporary files. The directory must exist. This is superseded by either the .B \-T or .B \-\-temp-dir option or the .B TempDirectory variable. .SH SEE ALSO .BR extract (1), .BR find (1), .BR nroff (1), .BR search (1), .BR splitmail (1), .BR swish++.conf (4), .BR glob (7), .BR man (7). .P Tim Berners-Lee. ``The text/enriched MIME Content-type,'' .IR "Request for Comments 1563" , Network Working Group of the Internet Engineering Task Force, January 1994. .P David H. Crocker. ``Standard for the Format of ARPA Internet Text Messages,'' .IR "Request for Comments 822" , Department of Electrical Engineering, University of Delaware, August 1982. .P Frank Dawson and Tim Howes. ``vCard MIME Directory Profile,'' .IR "Request for Comments 2426" , Network Working Group of the Internet Engineering Task Force, September 1998. .P Ned Freed and Nathaniel S. Borenstein. ``Multipurpose Internet Mail Extensions (MIME) Part One: Format of Internet Message Bodies,'' .IR "Request for Comments 2045" , RFC 822 Extensions Working Group of the Internet Engineering Task Force, November 1996. .P David Goldsmith and Mark Davis. ``UTF-7, a mail-safe transformation format of Unicode,'' .IR "Request for Comments 2152" , Network Working Group of the Internet Engineering Task Force, May 1997. .P International Standards Organization. .I ISO 8859-1: Information Processing .I -- 8-bit single-byte coded graphic character sets .I -- Part 1: Latin alphabet No. 1, 1987. .P \-\-. .I ISO 8879: Information Processing .I -- Text and Office Systems .I -- Standard Generalized Markup Language (SGML), 1986. .P \-\-. .I ISO/IEC 9945-2: Information Technology .I -- Portable Operating System Interface (POSIX) .I -- Part 2: Shell and Utilities, 1993. .P Leslie Lamport. 
.IR "LaTeX: A Document Preparation System, 2nd ed." , Addison-Wesley, Reading, MA, 1994. .P Martin Nilsson. .IR "ID3 tag version 2" , March 1998. .P \-\-. .IR "ID3 tag version 2.3.0" , February 1999. .P \-\-. .IR "ID3 tag version 2.4.0 - Main Structure" , November 2002. .P \-\-. .IR "ID3 tag version 2.4.0 - Native Frames" , November 2002. .P Steven Pemberton, et al. .IR "XHTML 1.0: The Extensible HyperText Markup Language" , World Wide Web Consortium, January 2000. .P Dave Raggett, Arnaud Le Hors, and Ian Jacobs. ``On SGML and HTML: SGML constructs used in HTML: Entities,'' .I HTML 4.0 Specification, \(sc3.2.3, World Wide Web Consortium, April 1998. .P \-\-. ``The global structure of an HTML document: The document head: The \f(CWtitle\f1 attribute,'' .I HTML 4.0 Specification, \(sc7.4.3, World Wide Web Consortium, April 1998. .P \-\-. ``The global structure of an HTML document: The document head: Meta data,'' .I HTML 4.0 Specification, \(sc7.4.4, World Wide Web Consortium, April 1998. .P \-\-. ``The global structure of an HTML document: The document body: Element identifiers: the \f(CWid\f1 and \f(CWclass\f1 attributes,'' .I HTML 4.0 Specification, \(sc7.5.2, World Wide Web Consortium, April 1998. .P \-\-. ``Tables: Elements for constructing tables: The \f(CWTABLE\f1 element,'' .I HTML 4.0 Specification, \(sc11.2.1, World Wide Web Consortium, April 1998. .P \-\-. ``Objects, Images, and Applets: Generic inclusion: the \f(CWOBJECT\f1 element,'' .I HTML 4.0 Specification, \(sc13.3, World Wide Web Consortium, April 1998. .P \-\-. ``Objects, Images, and Applets: How to specify alternate text,'' .I HTML 4.0 Specification, \(sc13.8, World Wide Web Consortium, April 1998. .P \-\-. ``Index of Elements,'' .I HTML 4.0 Specification, World Wide Web Consortium, April 1998. .P Marcin Sawicki, et al. .IR "Ruby Annotation" , World Wide Web Consortium, April 2001. .P The Unicode Consortium. ``Encoding Forms,'' .I The Unicode Standard 3.0, \(sc2.3, Addison-Wesley, 2000. 
.P Francois Yergeau. ``UTF-8, a transformation format of ISO 10646,'' .IR "Request for Comments 2279" , Network Working Group of the Internet Engineering Task Force, January 1998. .SH AUTHOR Paul J. Lucas .RI < pauljlucas@mac.com > swish++-6.1.5/man/man1/search.10000644000076500000000000005741210262031444014414 0ustar pjlwheel.\" .\" SWISH++ .\" search.1 .\" .\" Copyright (C) 2003 Paul J. Lucas .\" .\" This program is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by .\" the Free Software Foundation; either version 2 of the License, or .\" (at your option) any later version. .\" .\" This program is distributed in the hope that it will be useful, .\" but WITHOUT ANY WARRANTY; without even the implied warranty of .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the .\" GNU General Public License for more details. .\" .\" You should have received a copy of the GNU General Public License .\" along with this program; if not, write to the Free Software .\" Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. .\" .\" --------------------------------------------------------------------------- .\" define code-start macro .de cS .sp .nf .RS 5 .ft CW .ta .5i 1i 1.5i 2i 2.5i 3i 3.5i 4i 4.5i 5i 5.5i .. .\" define code-end macro .de cE .ft 1 .RE .fi .if !'\\$1'0' .sp .. .\" --------------------------------------------------------------------------- .TH \f3search\fP 1 "June 16, 2005" "SWISH++" .SH NAME search \- SWISH++ searcher .SH SYNOPSIS .B search [ .I options ] .I query .SH DESCRIPTION .B search is the SWISH++ searcher. It searches a previously generated index for the words specified in a query. 
In addition to running from the command-line, it can run as a daemon process functioning as a ``search server.'' .SH QUERY INPUT .SS Query Syntax The formal grammar of a query is: .RS 5 .TP 16 .IR query : .I query relop meta .br .I meta .TP .IR meta : .IR meta_name \ \f(CW=\fP\ primary .br .I primary .TP .IR meta_name : .I word .TP .IR primary : .RI \f(CW(\fP query \f(CW)\fP .br .RI \f(CWnot\fP\ meta .br .I word .br .IR word \f(CW*\fP .TP .IR relop : \f(CWand\fP .br \f(CWnear\fP .br \f(CWnot near\fP .br \f(CWor\fP .br .I (empty) .RE .PP In practice, however, the query is the set of words sought after, possibly restricted to meta data, and possibly combined with the operators ``\f(CWand\fP,'' ``\f(CWor\fP,'' ``\f(CWnear\fP,'' ``\f(CWnot\fP,'' and ``\f(CWnot near\fP.'' The asterisk (\f(CW*\fP) can be used as a wildcard character at the end of words. Note that an asterisk and parentheses are shell meta-characters and as such must either be escaped (backslashed) or quoted when passed to a shell. .PP Although syntactically legal, it is a semantic error to have ``\f(CWnear\fP'' just before ``\f(CWnot\fP'' since such queries are nonsensical, e.g.: .cS mouse near not computer .cE Queries are evaluated in left-to-right order, i.e., ``\f(CWand\fP'' has the same precedence as ``\f(CWor\fP.'' For more about query syntax, see the EXAMPLES. .SS Character Mapping and Word Determination The same character mapping and word determination heuristics used by .BR index (1) are used on queries prior to searching. .SH RESULTS OUTPUT .SS Result Components The results are output either in ``classic'' or XML format. In either case, the components of the results are: .TP 12 .I rank An integer from 1 to 100. .TP .I path-name The relative path to where the file was originally indexed. .TP .I file-size The file's size in bytes. 
.TP .I file-title If the file is of a format that can have titles (HTML, XHTML, LaTeX, mail, or Unix manual pages) and the title was extracted, then .I file-title is its title; otherwise, it is its filename. .SS Classic Results Format The ``classic'' results format is plain text as: .cS .I rank path-name file-size file-title .cE It can be parsed easily in Perl with: .cS ($rank,$path,$size,$title) = split( / /, $_, 4 ); .cE (The separator can be changed via the .B \-R or .B \-\-separator options or the .B ResultSeparator variable.) .P Prior to results lines, comment lines may also appear containing additional information about the query results. Comment lines are in the format of: .cS # \f2comment-key\fP: \f2comment-value\fP .cE The keys and values are: .RS 5 .TP 24 \f3ignored\fP: \f2stop-words\fP The list of stop-words (separated by spaces) ignored in the query. .TP \f3not found\fP: \f2word\fP The word was not found in the index. .TP \f3results\fP: \f2result-count\fP The total number of results. .RE .SS XML Results Format The XML results format is given by the DTD: .cS .cE and by the XML schema located at: .cS http://homepage.mac.com/pauljlucas/software/swish/SearchResults/SearchResults.xsd .cE For example: .cS \f2stop-word\fP \f2\&...\fP 42 \f2rank\fP \f2path-name\fP \f2file-size\fP \f2file-title\fP \f2\&...\fP .cE 0 .SH RUNNING AS A DAEMON PROCESS .SS Description .B search can alternatively run as a daemon process (via either the .B \-b or .B \-\-daemon-type options or the .B SearchDaemon variable) functioning as a ``search server'' by listening to a Unix domain socket (specified by either the .B \-u or .B \-\-socket-file options or the .B SocketFile variable), a TCP socket (specified by either the .B \-a or .B \-\-socket-address options or the .B SocketAddress variable), or both. Unix domain sockets are preferred for both performance and security. 
For search-intensive applications, such as a search engine on a heavily used web site, this can yield a large performance improvement since the start-up cost .RB ( fork (2), .BR exec (2), and initialization) is paid only once. .PP If the process was started with root privileges, it will give them away immediately after initialization and before servicing any requests. .SS Clients and Requests Search clients connect to a daemon via a socket and send a query in the same manner as on the command line (including the first word being ``\f(CWsearch\f1''). The only exception is that shell meta-characters .I "must not" be escaped (backslashed) since no shell is involved. Search results are returned via the same socket. See the EXAMPLES. .SS Multithreading A daemon can serve multiple query requests simultaneously since it is multi-threaded. When started, it ``pre-threads'' meaning that it creates a pool of threads in advance that service an indefinite number of requests as a further performance improvement since a thread is not created and destroyed per request. .PP There is an initial, minimum number of threads in the thread pool. The number of threads grows dynamically when there are more requests than threads, but not more than a specified maximum to prevent the server from thrashing. (See the .BR \-t , .BR \-\-min-threads , .BR \-T , and .B \-\-max-threads options or the .B ThreadsMin or .B ThreadsMax variables.) If the number of threads reaches the maximum, subsequent requests are queued until existing threads become available to service them after completing in-progress requests. (See either the .B \-q or .B \-\-queue-size options or the .B SocketQueueSize variable.) .PP If there are more than the minimum number of threads and some remain idle longer than a specified timeout period (because the number of requests per unit time has dropped), then threads will die off until the pool returns to its original minimum size. 
(See either the .B \-O or .B \-\-thread-timeout options or the .B ThreadTimeout variable.) .SS Restrictions A single daemon can search only a single index. To search multiple indices concurrently, multiple daemons can be run, each searching its own index and using its own socket. An index .I "must not" be modified or deleted while a daemon is using it. .SH OPTIONS Options begin with either a `\f(CW-\f1' for short options or a ``\f(CW--\f1'' for long options. Either a `\f(CW-\f1' or ``\f(CW--\f1'' by itself explicitly ends the options; however, the difference is that `\f(CW-\f1' is returned as the first non-option whereas ``\f(CW--\f1'' is skipped entirely. Either short or long options may be used. Long option names may be abbreviated so long as the abbreviation is unambiguous. .PP For a short option that takes an argument, the argument is either taken to be the remaining characters of the same option, if any, or, if not, is taken from the next option unless said option begins with a `\f(CW-\f1'. .PP Short options that take no arguments can be grouped (but the last option in the group can take an argument), e.g., \f(CW-Bq511\fP is equivalent to \f(CW-B -q 511\fP. .PP For a long option that takes an argument, the argument is either taken to be the characters after a `\f(CW=\fP', if any, or, if not, is taken from the next option unless said option begins with a `\f(CW-\fP'. .TP 20 .B \-? .br .ns .TP .B \-\-help Print the usage (``help'') message and exit. .TP .BI \-a a .br .ns .TP .BI \-\-socket-address= a When running as a daemon, the address, .IR a , to listen to for TCP requests. (Default is all IP addresses and port 1967.) The address argument is of the form: .sp .RS 25 .ft CW \f3[\fP \f2host\fP : \f3]\fP \f2port\fP .ft 1 .RE .TP 20 .B "" that is: an optional host and colon followed by a port number. 
The .I host may be one of a host name, an IP address, or the \f(CW*\f1 character meaning ``any IP address.'' Omitting the .I host and colon also means ``any IP address.'' .TP .BI \-b t .br .ns .TP .BI \-\-daemon-type= t Run as a daemon process. (Default is not to.) The type, .IR t , is one of: .RS 20 .TP 8 \f(CWnone\f1 Same as not specifying the option at all. (This does not purport to be useful, but rather consistent with the types that can be specified to the .B SearchDaemon variable.) .TP \f(CWtcp\f1 Listen on a TCP socket (see the .B \-a option). .TP \f(CWunix\f1 Listen on a Unix domain socket (see the .B \-u option). .TP \f(CWboth\f1 Listen on both. .RE .PD .RE .TP 20 .B "" By default, if executed from the command-line, .B search appears to return immediately; however, it has merely detached from the terminal and put itself into the background. There is no need to follow the command with an `\f(CW&\f1'. .TP .B \-B .br .ns .TP .B \-\-no-background When running as a daemon process, do not detach from the terminal and run in the background. (Default does.) .IP "" The reason not to run in the background is so a wrapper script can see if the process dies for any reason and automatically restart it. .IP "" This option is implied by the .B \-X or .B \-\-launchd options. .TP .BI \-c f .br .ns .TP .BI \-\-config-file= f The name of the configuration file, .IR f , to use. (Default is \f(CWswish++.conf\f1 in the current directory.) A configuration file is not required: if none is specified and the default does not exist, none is used; however, if one is specified and it does not exist, then this is an error. .TP .B \-d .br .ns .TP .B \-\-dump-words Dump the query word indices to standard output and exit. Wildcards are not permitted. .TP .B \-D .br .ns .TP .B \-\-dump-index Dump the entire word index to standard output and exit. .TP .BI \-F f .br .ns .TP .BI \-\-format= f The format, .IR f , search results are output in. 
The format is either \f(CWclassic\fP or \f(CWXML\f1. (Default is \f(CWclassic\f1.) .TP .BI \-G s .br .ns .TP .BI \-\-group= s The group, .IR s , to switch the process to after starting and only if started as root. (Default is \f(CWnobody\f1.) .TP .BI \-i f .br .ns .TP .BI \-\-index-file= f The name of the index file, .IR f , to use. (Default is \f(CWswish++.index\fP in the current directory.) .TP .BI \-m n .br .ns .TP .BI \-\-max-results= n The maximum number of results, .IR n , to return. (Default is 100.) .TP .B \-M .br .ns .TP .B \-\-dump-meta Dump the meta-name index to standard output and exit. .TP .BI \-n n .br .ns .TP .BI \-\-near= n The maximum number of words apart, .IR n , two words can be to be considered ``near'' each other in queries using \f(CWnear\fP. (Default is 10.) .TP .BI \-o s .br .ns .TP .BI \-\-socket-timeout= s The number of seconds, .IR s , a search client has to complete a query request before the socket connection is closed. (Default is 10.) This is to prevent a client from connecting, not completing a request, and causing the thread servicing the request to wait forever. .TP .BI \-O s .br .ns .TP .BI \-\-thread-timeout= s The number of seconds, .IR s , until an idle spare thread dies while running as a daemon. (Default is 30.) .TP .BI \-p n .br .ns .TP .BI \-\-word-percent= n The maximum percentage, .IR n , of files a word may occur in before it is discarded as being too frequent. (Default is 100.) If you want to keep all words regardless, specify 101. .TP .BI \-P f .br .ns .TP .BI \-\-pid-file= f The name of the file to record the process ID of .B search if running as a daemon. (Default is none.) .TP .BI \-q n .br .ns .TP .BI \-\-queue-size= n The maximum number of socket connections to queue. (Default is 511.) .TP .BI \-r n .br .ns .TP .BI \-\-skip-results= n The initial number of results, .IR n , to skip. (Default is 0.) 
Used in conjunction with .B \-m or .BR \-\-max-results , results can be returned in ``pages.'' .TP .BI \-R s .br .ns .TP .BI \-\-separator= s The classic result separator string. (Default is " ".) .TP .B \-s .br .ns .TP .B \-\-stem-words Perform stemming (suffix stripping) on words during the search. Words that end in the wildcard character are not stemmed. (Default is no.) .TP .B \-S .br .ns .TP .B \-\-dump-stop Dump the stop-word index to standard output and exit. .TP .BI \-t n .br .ns .TP .BI \-\-min-threads= n Minimum number of threads to maintain while running as a daemon. .TP .BI \-T n .br .ns .TP .BI \-\-max-threads= n Maximum number of threads to allow while running as a daemon. .TP .BI \-u f .br .ns .TP .BI \-\-socket-file= f The name of the Unix domain socket file to use while running as a daemon. (Default is \f(CW/tmp/search.socket\f1.) .TP .BI \-U s .br .ns .TP .BI \-\-user= s The user, .IR s , to switch the process to after starting and only if started as root. (Default is \f(CWnobody\f1.) .TP .B \-V .br .ns .TP .B \-\-version Print the version number of .B SWISH++ to standard output and exit. .TP .BI \-w n [, c ] .br .ns .TP .BI \-\-window= n [, c ] Dump a ``window'' of at most .I n lines around each query word matching .I c characters. Wildcards are not permitted. (Default for .I c is 0.) Every window ends with a blank line. .TP .B \-X .br .ns .TP .B \-\-launchd If run as a daemon process, cooperate with Mac OS X's .BR launchd (8) by not ``daemonizing'' itself since .BR launchd (8) handles that. This option implies the .B \-B or .B \-\-no-background options. .IP "" This option is available only under Mac OS X, should be used only for version 10.4 (Tiger) or later, and only when .B search will be started via .BR launchd (8). .SH CONFIGURATION FILE The following variables can be set in a configuration file. Variables and command-line options can be mixed, the latter taking priority. 
.PP .RS 5 .PD 0 .TP 20 .B Group Same as .B \-G or .B \-\-group .TP .B IndexFile Same as .B \-i or .B \-\-index-file .TP .B LaunchdCooperation Same as .B \-X or .B \-\-launchd .TP .B PidFile Same as .B \-P or .B \-\-pid-file .TP .B ResultSeparator Same as .B \-R or .B \-\-separator .TP .B ResultsFormat Same as .B \-F or .B \-\-format .TP .B ResultsMax Same as .B \-m or .B \-\-max-results .TP .B SearchBackground Same as .B \-B or .B \-\-no-background .TP .B SearchDaemon Same as .B \-b or .B \-\-daemon-type .TP .B SocketAddress Same as .B \-a or .B \-\-socket-address .TP .B SocketFile Same as .B \-u or .B \-\-socket-file .TP .B SocketQueueSize Same as .B \-q or .B \-\-queue-size .TP .B SocketTimeout Same as .B \-o or .B \-\-socket-timeout .TP .B StemWords Same as .B \-s or .B \-\-stem-words .TP .B ThreadsMax Same as .B \-T or .B \-\-max-threads .TP .B ThreadsMin Same as .B \-t or .B \-\-min-threads .TP .B ThreadTimeout Same as .B \-O or .B \-\-thread-timeout .TP .B User Same as .B \-U or .B \-\-user .TP .B WordFilesMax Same as .B \-f or .B \-\-word-files .TP .B WordPercentMax Same as .B \-p or .B \-\-word-percent .TP .B WordsNear Same as .B \-n or .B \-\-near .PD .RE .SH EXAMPLES .SS Simple Queries The query: .cS computer mouse .cE is the same as and short for: .cS computer and mouse .cE (because ``\f(CWand\fP'' is implicit) and would return only those documents that contain both words. The query: .cS cat or kitten or feline .cE would return only those documents regarding cats. The query: .cS mouse and computer or keyboard .cE is the same as: .cS (mouse and computer) or keyboard .cE (because queries are evaluated left-to-right) in that they will both return only those documents regarding either mice attached to a computer or any kind of keyboard. However, neither of those is the same as: .cS mouse and (computer or keyboard) .cE that would return only those documents regarding mice (including the rodents) and either a computer or a keyboard. 
.SS Queries Using Wildcards The query: .cS comput* .cE would return only those documents that contain words beginning with ``comput'' such as ``computation,'' ``computational,'' ``computer,'' ``computerize,'' ``computing,'' and others. Wildcarded words can be used anywhere ordinary words can be. The query: .cS comput* (medicine or doctor*) .cE would return only those documents that contain something about computer use in medicine or by doctors. .SS Queries Using ``not'' The query: .cS mouse or mice and not computer* .cE would return only those documents regarding mice (the rodents) and not the kind attached to a computer. .SS Queries Using ``near'' Using ``\f(CWnear\fP'' is the same as using ``\f(CWand\fP'' except that it not only requires both words to be in the documents, but that they be .I near each other, i.e., it returns potentially fewer documents than the corresponding ``\f(CWand\fP'' query. The query: .cS computer near mouse .cE would return only those documents where both words are near each other. The query: .cS mouse near (computer or keyboard) .cE is the same as: .cS (mouse near computer) or (mouse near keyboard) .cE i.e., ``near'' gets .I distributed across parenthesized subqueries. .SS Queries Using ``not near'' Using ``\f(CWnot near\fP'' is the same as using ``\f(CWand not\fP'' except that it allows the right-hand side words to be in the documents, just .I "not near" the left-hand side words, i.e., it returns potentially more documents than the corresponding ``\f(CWand not\fP'' query. Of course the word(s) on the right-hand side need not be in the documents at all, i.e., they would be considered ``infinitely far'' apart. The query: .cS mouse or mice not near computer* .cE would return only those documents regarding mice (the rodents) more effectively than the query: .cS mouse or mice and not computer* .cE because the latter would exclude documents about mice (the rodents) where computers just so happened to be mentioned in the same documents.
.SS Queries Using Meta Data The query: .cS author = hawking .cE would return only those documents whose author attribute contains ``hawking.'' The query: .cS author = hawking radiation .cE would return only those documents regarding radiation whose author attribute contains ``hawking.'' The query: .cS author = (stephen hawking) .cE would return only those documents whose author is Stephen Hawking. The query: .cS author = (stephen hawking) or (black near hole*) .cE would return only those documents whose author is Stephen Hawking or that contain the word ``black'' near ``hole'' or ``holes'' regardless of the author. Note that the second set of parentheses is necessary; otherwise the query would have been the same as: .cS (author = (stephen hawking) or black) near hole* .cE that would have additionally required both ``stephen'' and ``hawking'' to be near ``hole'' or ``holes.'' .SS Sending Queries to a Search Daemon To send a query request to a search daemon using Perl, first open the socket and connect to the daemon (see [Wall], pp. 439-440): .cS use Socket; $SocketFile = '/tmp/search.socket'; socket( SEARCH, PF_UNIX, SOCK_STREAM, 0 ) or die "can not open socket: $!\\n"; connect( SEARCH, sockaddr_un( $SocketFile ) ) or die "can not connect to \\"$SocketFile\\": $!\\n"; .cE Autoflush .I must be set for the socket filehandle (see [Wall], p.
781), otherwise the server thread will hang since I/O buffering will wait for the buffer to fill, which will never happen since queries are short: .cS select( (select( SEARCH ), $| = 1)[0] ); .cE Next, send a query request (beginning with the word ``search'' and any options just as with a command-line) to the daemon via the socket filehandle making sure to include a trailing newline since the server reads an entire line of input (so therefore it looks and waits for a newline): .cS $query = 'mouse and computer'; print SEARCH "search $query\\n"; .cE Finally, read the results back and print them: .cS print while <SEARCH>; close( SEARCH ); .cE .SH EXIT STATUS Exits with one of the values given below: .PP .RS 5 .PD 0 .TP 5 0 Success. .TP 1 Error in configuration file. .TP 2 Error in command-line options. .TP 40 Unable to read index file. .TP 50 Malformed query. .TP 51 Attempted ``near'' search without word-position data. .TP 60 Could not write to PID file. .TP 61 Host or IP address is invalid or nonexistent. .TP 62 Could not open a TCP socket. .TP 63 Could not open a Unix domain socket. .TP 64 Could not .BR unlink (2) a Unix domain socket file. .TP 65 Could not .BR bind (3) to a TCP socket. .TP 66 Could not .BR bind (3) to a Unix domain socket. .TP 67 Could not .BR listen (3) to a TCP socket. .TP 68 Could not .BR listen (3) to a Unix domain socket. .TP 69 Could not .BR select (3). .TP 70 Could not .BR accept (3) a socket connection. .TP 71 Could not .BR fork (2) child process. .TP 72 Could not change directory to \f(CW/\f1. .TP 73 Could not create thread. .TP 74 Could not create thread key. .TP 75 Could not detach thread. .TP 76 Could not initialize thread condition. .TP 77 Could not initialize thread mutex. .TP 78 Could not switch to user. .TP 79 Could not switch to group. .PD .RE .SH CAVEATS .TP 4 1. Stemming can be done .B only when searching through an index of files that are in English because the Porter stemming algorithm used only stems English words. .TP 2.
When run as a daemon using a TCP socket, there are no security restrictions on who may connect and search. The code to implement domain and IP address restrictions isn't worth it since such things are better handled by firewalls and routers. .TP 3. XML output can currently only be obtained for actual search results and not word, index, meta-name, or stop-word dumps. .SH FILES .PD 0 .TP 20 \f(CWswish++.conf\f1 default configuration file name .TP \f(CWswish++.index\f1 default index file name .PD .SH SEE ALSO .BR index (1), .BR perlfunc (1), .BR exec (2), .BR fork (2), .BR unlink (2), .BR accept (3), .BR bind (3), .BR listen (3), .BR select (3), .BR swish++.conf (4), .BR launchd (8), .BR searchmonitor (8) .PP Tim Bray, et al. .IR "Extensible Markup Language (XML) 1.0" , February 10, 1998. .PP Bradford Nichols, Dick Buttlar, and Jacqueline Proulx Farrell. .IR "Pthreads Programming" , O'Reilly & Associates, Sebastopol, CA, 1996. .PP M.F. Porter. ``An Algorithm For Suffix Stripping,'' .IR Program , 14(3), July 1980, pp. 130-137. .PP W. Richard Stevens. .IR "Unix Network Programming, Vol 1, 2nd ed." , Prentice-Hall, Upper Saddle River, NJ, 1998. .PP Larry Wall, et al. .IR "Programming Perl, 3rd ed." , O'Reilly & Associates, Inc., Sebastopol, CA, 2000. .SH AUTHOR Paul J. Lucas .RI < pauljlucas@mac.com > swish++-6.1.5/man/man1/splitmail.10000644000076500000000000000363607745117724015166 0ustar pjlwheel.\" .\" SWISH++ .\" splitmail.1 .\" .\" Copyright (C) 2000 Paul J. Lucas .\" .\" This program is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by .\" the Free Software Foundation; either version 2 of the License, or .\" (at your option) any later version. .\" .\" This program is distributed in the hope that it will be useful, .\" but WITHOUT ANY WARRANTY; without even the implied warranty of .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the .\" GNU General Public License for more details. .\" .\" You should have received a copy of the GNU General Public License .\" along with this program; if not, write to the Free Software .\" Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. .\" .\" --------------------------------------------------------------------------- .\" define code-start macro .de cS .sp .nf .RS 5 .ft CW .ta .5i 1i 1.5i 2i 2.5i 3i 3.5i 4i 4.5i 5i 5.5i .. .\" define code-end macro .de cE .ft 1 .RE .fi .if !'\\$1'0' .sp .. .\" --------------------------------------------------------------------------- .TH \f3splitmail\f1 1 "December 13, 2000" "SWISH++" .SH NAME splitmail \- split mailbox files prior to indexing .SH SYNOPSIS .B splitmail -p .I prefix .BI "[ " file " ]" .SH DESCRIPTION .B splitmail is a utility to split a mailbox file (or standard input) comprised of multiple messages into multiple files of individual messages to facilitate indexing with .BR index (1). The generated files have 5-digit increasing numbers appended to a common prefix. .SH OPTIONS .TP 12 .BI \-p prefix Specifies the common prefix. .SH EXAMPLE The command: .cS splitmail -p msg sent_messages .cE splits the mailbox \f(CWsent_messages\f1 into files named \f(CWmsg.00001\f1, \f(CWmsg.00002\f1, and so on. .SH NOTE This utility hasn't been exhaustively tested. .SH SEE ALSO .BR index (1). .SH AUTHOR Paul J. Lucas .RI < pauljlucas@mac.com > swish++-6.1.5/man/man3/0000755000076500000000000000000010746421524013067 5ustar pjlwheelswish++-6.1.5/man/man3/GNUmakefile0000644000076500000000000000163410044357035015141 0ustar pjlwheel## # SWISH++ # man/man3/GNUmakefile # # Copyright (C) 1998 Paul J. Lucas # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ## ########## You shouldn't have to change anything below this line. ############# ROOT:= ../.. SECT:= 3 include $(ROOT)/config/man.mk # vim:set noet sw=8 ts=8: swish++-6.1.5/man/man3/WWW.30000644000076500000000000001155007745117322013644 0ustar pjlwheel.\" .\" World Wide Web Package .\" WWW.3 .\" .\" Copyright (C) 1998 Paul J. Lucas .\" .\" This program is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by .\" the Free Software Foundation; either version 2 of the License, or .\" (at your option) any later version. .\" .\" This program is distributed in the hope that it will be useful, .\" but WITHOUT ANY WARRANTY; without even the implied warranty of .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the .\" GNU General Public License for more details. .\" .\" You should have received a copy of the GNU General Public License .\" along with this program; if not, write to the Free Software .\" Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. .\" .\" --------------------------------------------------------------------------- .\" define code-start macro .de cS .sp .nf .RS 5 .ft CW .ta .5i 1i 1.5i 2i 2.5i 3i 3.5i 4i 4.5i 5i 5.5i .. .\" define code-end macro .de cE .ft 1 .RE .fi .if !'\\$1'0' .sp .. 
.\" --------------------------------------------------------------------------- .tr ~ .TH \f3WWW\f1 3 "February 12, 2000" "WWW" .SH NAME WWW \- World Wide Web Package .SH SYNOPSIS .ft CW .nf extract_description( \f2FILE\fP ) extract_meta( \f2FILE\fP, \f2NAME\fP ) hyperlink( \f2LIST\fP ) .fi .ft 1 .SH DESCRIPTION This package provides utility functions for the World Wide Web to extract descriptions of or meta information from files, and hyperlink text. .SH SUBROUTINES The following Perl subroutines are defined and available: .IP "\f(CWextract_description( \f2FILE\fP )\f1" Extracts a description from an HTML or plain text file given by the .I FILE name; .I FILE should be an absolute path. The first \f(CW$description::chars\f1 (default: 2048) characters are read. If the file ends in one of the extensions \f(CWhtm\f1, \f(CWhtml\f1, or \f(CWshtml\f1, it is presumed to be an HTML file; if the file ends in \f(CWtxt\f1, it is presumed to be a plain text file. Other extensions are not recognized and no description is returned for them. .IP "" For HTML files, first, if a \f(CW<META NAME="description">\f1 or a \f(CW<META NAME="DC.description">\f1 (Dublin Core) element is found, then the words specified as the value of the \f(CWCONTENT\f1 attribute are returned as the description. .IP "" Otherwise, all HTML comments, text between \f(CW


Stem words
swish++-6.1.5/xml_formatter.c0000644000076500000000000001201210300243532014464 0ustar pjlwheel/* ** SWISH++ ** xml_formatter.c ** ** Copyright (C) 2001 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // standard #include /* for strpbrk(3) */ #include // local #include "file_info.h" #include "index_segment.h" #include "platform.h" #include "util.h" #include "xml_formatter.h" using namespace std; #define SWISH_PATH "/pauljlucas/software/swish" #define SWISH_NS_URI "http://www.pauljlucas.org" SWISH_PATH #define SWISH_PHYS_URI "http://homepage.mac.com" SWISH_PATH #define XSI_URI "http://www.w3.org/2001/XMLSchema-instance" #define SEARCH_RESULTS "SearchResults" #define SEARCH_RESULTS_DTD SEARCH_RESULTS ".dtd" #define SEARCH_RESULTS_NS_URI SWISH_NS_URI "/" SEARCH_RESULTS #define SEARCH_RESULTS_PHYS_URI SWISH_PHYS_URI "/" SEARCH_RESULTS #define SEARCH_RESULTS_XSD SEARCH_RESULTS ".xsd" extern index_segment directories; //***************************************************************************** // // SYNOPSIS // static string escape( char const *s ) // // DESCRIPTION // // Escape all '&' and '<' characters in a given string by replacing them // with "&" or "<", respectively. // // PARAMETERS // // s The string to be escaped. // // RETURN VALUE // // Returns a new string. // // SEE ALSO // // Tim Bray, et al. 
"Character Data and Markup," Extensible Markup // Language (XML) 1.0, section 2.4, February 10, 1998. // //***************************************************************************** { string result = s; register string::size_type i; for ( i = 0; (i = result.find( '&', i )) != string::npos; i += 5 ) result.replace( i, 1, "&" ); for ( i = 0; (i = result.find( '<', i )) != string::npos; i += 4 ) result.replace( i, 1, "<" ); return result; } //***************************************************************************** // // SYNOPSIS // xml_formatter::~xml_formatter() // // DESCRIPTION // // Destroy an xml_formatter. // // NOTE // // This is out-of-line only because it's virtual. // //***************************************************************************** { // do nothing } //***************************************************************************** // // SYNOPSIS // void xml_formatter::pre( stop_word_set const &stop_words ) const // // DESCRIPTION // // Output search-result "meta" information before the results themselves: // the set of stop words found in the query (if any) and the number of // results. // // PARAMETERS // // stop_words The set of stop words. // //***************************************************************************** { out_ << "\n" "\n" "\n"; if ( !stop_words.empty() ) { out_ << " \n"; FOR_EACH( stop_word_set, stop_words, word ) out_ << " " << *word << "\n"; out_ << " \n"; } out_ << " " << results_ << "\n"; if ( results_ ) out_ << " \n"; } //***************************************************************************** // // SYNOPSIS // void xml_formatter::result( int rank, file_info const &fi ) const // // DESCRIPTION // // Output an individual search result's information: it's rank, path, // size, and title. // // PARAMETERS // // rank The rank (1-100) of the result. // // fi The search result's file information. 
// //***************************************************************************** { out_ << " \n" " " << rank << "\n" " " << directories[ fi.dir_index() ] << '/' << fi.file_name() << "\n" " " << fi.size() << "\n" " "; if ( ::strpbrk( fi.title(), "&<" ) ) out_ << escape( fi.title() ); else out_ << fi.title(); out_ << "\n" " \n"; } //***************************************************************************** // // SYNOPSIS // void xml_formatter::post() const // // DESCRIPTION // // Output end tags of XML elements. // //***************************************************************************** { if ( results_ ) out_ << " \n"; out_ << "\n"; } /* vim:set noet sw=8 ts=8: */ swish++-6.1.5/xml_formatter.h0000644000076500000000000000276410300243532014506 0ustar pjlwheel/* ** SWISH++ ** xml_formatter.h ** ** Copyright (C) 1998 Paul J. Lucas ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef xml_formatter_H #define xml_formatter_H // local #include "results_formatter.h" //***************************************************************************** // // SYNOPSIS // class xml_formatter : public results_formatter // // DESCRIPTION // // An xml_formatter is-a results_formatter for formatting search results // in XML. 
// //***************************************************************************** { public: xml_formatter( std::ostream &o, int results ) : results_formatter( o, results ) { } virtual ~xml_formatter(); virtual void pre( stop_word_set const& ) const; virtual void result( int rank, file_info const& ) const; virtual void post() const; }; #endif /* xml_formatter_H */ /* vim:set noet sw=8 ts=8: */