pinot-1.10/0000775000175000017500000000000013620041133007560 500000000000000pinot-1.10/textcat32_conf.txt0000664000175000017500000000142513005331666013103 00000000000000# # libexttextcat 3.2 config file # The following languages are supported by Xapian::Stem # Language names MUST be in English # /usr/share/libexttextcat/da.lm danish /usr/share/libexttextcat/nl.lm dutch /usr/share/libexttextcat/en.lm english /usr/share/libexttextcat/fi.lm finnish /usr/share/libexttextcat/fr.lm french /usr/share/libexttextcat/de.lm german /usr/share/libexttextcat/hu.lm hungarian /usr/share/libexttextcat/it.lm italian /usr/share/libexttextcat/nb.lm norwegian-bokmal /usr/share/libexttextcat/nn.lm norwegian-nynorsk /usr/share/libexttextcat/pt.lm portuguese /usr/share/libexttextcat/ro.lm romanian /usr/share/libexttextcat/ru.lm russian /usr/share/libexttextcat/es.lm spanish /usr/share/libexttextcat/sv.lm swedish /usr/share/libexttextcat/tr.lm turkish pinot-1.10/Tokenize/0000775000175000017500000000000013620041132011347 500000000000000pinot-1.10/Tokenize/FilterUtils.h0000664000175000017500000000527013005331666013725 00000000000000/* * Copyright 2007-2012 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #ifndef _FILTER_UTILS_H #define _FILTER_UTILS_H #include #include #include "Document.h" #include "Visibility.h" #include "filters/Filter.h" /// Drives document reduction and takes action on the final document. class PINOT_EXPORT ReducedAction { public: ReducedAction(); ReducedAction(const ReducedAction &other); virtual ~ReducedAction(); ReducedAction &operator=(const ReducedAction &other); virtual bool positionFilter(const Document &doc, Dijon::Filter *pFilter); virtual bool isReduced(const Document &doc); virtual bool takeAction(Document &doc, bool isNested) = 0; }; /// Utility functions for dealing with Dijon filters. class PINOT_EXPORT FilterUtils { public: virtual ~FilterUtils(); /// Returns a Filter that handles the given MIME type, or one of its parents. static Dijon::Filter *getFilter(const std::string &mimeType); /// Indicates whether a MIME type is supported or not. static bool isSupportedType(const std::string &mimeType); /// Feeds a document's data to a filter. static bool feedFilter(const Document &doc, Dijon::Filter *pFilter); /// Populates a document based on metadata extracted by the filter. static bool populateDocument(Document &doc, Dijon::Filter *pFilter); /// Filters a document until reduced to the minimum. static bool filterDocument(const Document &doc, const std::string &originalType, ReducedAction &action); /// Convenient front-end for filterDocument() to reduce documents. static bool reduceDocument(const Document &doc, ReducedAction &action); /// Strips markup from a piece of text. 
static std::string stripMarkup(const std::string &text); protected: static std::set m_types; static std::map m_typeAliases; static std::string m_maxNestedSize; FilterUtils(); private: FilterUtils(const FilterUtils &other); FilterUtils &operator=(const FilterUtils &other); }; #endif // _FILTER_UTILS_H pinot-1.10/Tokenize/Makefile.am0000664000175000017500000000614113005331666013340 00000000000000# Process this file with automake to produce Makefile.in noinst_HEADERS = \ $(top_srcdir)/Tokenize/filters/ArchiveFilter.h \ $(top_srcdir)/Tokenize/filters/ChmFilter.h \ $(top_srcdir)/Tokenize/filters/ExifImageFilter.h \ $(top_srcdir)/Tokenize/filters/Exiv2ImageFilter.h \ $(top_srcdir)/Tokenize/filters/ExternalFilter.h \ $(top_srcdir)/Tokenize/filters/FileOutputFilter.h \ $(top_srcdir)/Tokenize/filters/GMimeMboxFilter.h \ $(top_srcdir)/Tokenize/filters/TagLibMusicFilter.h pkginclude_HEADERS = \ FilterUtils.h \ TextConverter.h nobase_pkginclude_HEADERS = \ filters/Filter.h \ filters/FilterFactory.h \ filters/HtmlFilter.h \ filters/HtmlParser.h \ filters/TextFilter.h \ filters/XmlFilter.h lib_LTLIBRARIES = libexiv2imagefilter.la libexternalfilter.la libmboxfilter.la libtaglibfilter.la if HAVE_LIBARCHIVE lib_LTLIBRARIES += libarchivefilter.la endif if HAVE_CHMLIB lib_LTLIBRARIES += libchmfilter.la endif pkglib_LTLIBRARIES = libFilter.la libTokenize.la libFilter_la_LDFLAGS = \ -static libFilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/Filter.cc \ $(top_srcdir)/Tokenize/filters/FilterFactory.cc \ $(top_srcdir)/Tokenize/filters/HtmlFilter.cc \ $(top_srcdir)/Tokenize/filters/HtmlParser.cc \ $(top_srcdir)/Tokenize/filters/TextFilter.cc \ $(top_srcdir)/Tokenize/filters/XmlFilter.cc if HAVE_LIBARCHIVE libarchivefilter_la_DEPENDENCIES = libFilter.la libarchivefilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/ArchiveFilter.cc libarchivefilter_la_LDFLAGS = -module -avoid-version libarchivefilter_la_LIBADD = -larchive endif if HAVE_CHMLIB libchmfilter_la_DEPENDENCIES = libFilter.la 
libchmfilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/ChmFilter.cc libchmfilter_la_LDFLAGS = -module -avoid-version libchmfilter_la_LIBADD = -lchm endif libexiv2imagefilter_la_DEPENDENCIES = libFilter.la libexiv2imagefilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/Exiv2ImageFilter.cc libexiv2imagefilter_la_LDFLAGS = -module -avoid-version libexiv2imagefilter_la_LIBADD = @EXIV2_LIBS@ libexternalfilter_la_DEPENDENCIES = libFilter.la libexternalfilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/ExternalFilter.cc \ $(top_srcdir)/Tokenize/filters/FileOutputFilter.cc libexternalfilter_la_LDFLAGS = -module -avoid-version libexternalfilter_la_LIBADD = @XML_LIBS@ libmboxfilter_la_DEPENDENCIES = libFilter.la libmboxfilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/GMimeMboxFilter.cc libmboxfilter_la_LDFLAGS = -module -avoid-version libmboxfilter_la_LIBADD = @GMIME_LIBS@ libtaglibfilter_la_DEPENDENCIES = libFilter.la libtaglibfilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/TagLibMusicFilter.cc libtaglibfilter_la_LDFLAGS = -module -avoid-version libtaglibfilter_la_LIBADD = @TAGLIB_LIBS@ libTokenize_la_LDFLAGS = \ -static libTokenize_la_SOURCES = \ FilterUtils.cpp \ TextConverter.cpp \ $(top_srcdir)/IndexSearch/cjkv/CJKVTokenizer.cc AM_CXXFLAGS = \ @MISC_CFLAGS@ \ -I$(top_srcdir)/Utils -Ifilters \ @GMIME_CFLAGS@ @XML_CFLAGS@ @EXIV2_CFLAGS@ @TAGLIB_CFLAGS@ \ -D_DYNAMIC_DIJON_FILTERS \ -D_DIJON_EXTERNALFILTER_CONFFILE=\"$(sysconfdir)/pinot/external-filters.xml\" pinot-1.10/Tokenize/FilterUtils.cpp0000664000175000017500000003542613005331666014266 00000000000000/* * Copyright 2007-2012 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include #include #include "config.h" #include "Memory.h" #include "MIMEScanner.h" #include "StringManip.h" #include "TimeConverter.h" #include "Url.h" #include "TextConverter.h" #include "filters/FilterFactory.h" #include "FilterUtils.h" #define UNSUPPORTED_TYPE "X-Unsupported" #define SIZE_THRESHOLD 5242880 using std::clog; using std::clog; using std::endl; using std::string; using std::set; using std::map; set FilterUtils::m_types; map FilterUtils::m_typeAliases; string FilterUtils::m_maxNestedSize; ReducedAction::ReducedAction() { } ReducedAction::ReducedAction(const ReducedAction &other) { } ReducedAction::~ReducedAction() { } ReducedAction &ReducedAction::operator=(const ReducedAction &other) { return *this; } bool ReducedAction::positionFilter(const Document &doc, Dijon::Filter *pFilter) { return false; } bool ReducedAction::isReduced(const Document &doc) { // Is it reduced to plain text ? if ((doc.getType().length() >= 10) && (doc.getType().substr(0, 10) == "text/plain")) { return true; } return false; } FilterUtils::FilterUtils() { char *pEnvVar = getenv("PINOT_MAXIMUM_NESTED_SIZE"); if ((pEnvVar != NULL) && (strlen(pEnvVar) > 0)) { off_t maxSize = (off_t)atoll(pEnvVar); if (maxSize > 0) { m_maxNestedSize = pEnvVar; } } } FilterUtils::~FilterUtils() { } Dijon::Filter *FilterUtils::getFilter(const string &mimeType) { Dijon::Filter *pFilter = NULL; // Is this type aliased ? 
map::const_iterator aliasIter = m_typeAliases.find(mimeType); if (aliasIter != m_typeAliases.end()) { if (aliasIter->second == UNSUPPORTED_TYPE) { // We already know that none of this type's parents are supported return NULL; } pFilter = Dijon::FilterFactory::getFilter(aliasIter->second); } else { // Is there a filter for this type ? pFilter = Dijon::FilterFactory::getFilter(mimeType); } if (pFilter != NULL) { return pFilter; } if (mimeType.empty() == false) { set parentTypes; if (m_types.empty() == true) { Dijon::FilterFactory::getSupportedTypes(m_types); } // Try that type's parents MIMEScanner::getParentTypes(mimeType, m_types, parentTypes); for (set::const_iterator parentIter = parentTypes.begin(); parentIter != parentTypes.end(); ++parentIter) { pFilter = Dijon::FilterFactory::getFilter(*parentIter); if (pFilter != NULL) { // Add an alias m_typeAliases[mimeType] = *parentIter; return pFilter; } } #ifdef DEBUG clog << "FilterUtils::getFilter: no valid parent for " << mimeType << endl; #endif // This type has no valid parent m_typeAliases[mimeType] = UNSUPPORTED_TYPE; } return NULL; } bool FilterUtils::isSupportedType(const string &mimeType) { // Is this type aliased ? 
map::const_iterator aliasIter = m_typeAliases.find(mimeType); if (aliasIter != m_typeAliases.end()) { if (aliasIter->second == UNSUPPORTED_TYPE) { return false; } // We were able to get a filter for this parent type // or a previous call to isSupportedType() succeeded return true; } if (Dijon::FilterFactory::isSupportedType(mimeType) == true) { return true; } if (m_types.empty() == true) { Dijon::FilterFactory::getSupportedTypes(m_types); } // Try that type's parents set parentTypes; MIMEScanner::getParentTypes(mimeType, m_types, parentTypes); for (set::const_iterator parentIter = parentTypes.begin(); parentIter != parentTypes.end(); ++parentIter) { if (Dijon::FilterFactory::isSupportedType(*parentIter) == true) { // Add an alias m_typeAliases[mimeType] = *parentIter; return true; } } #ifdef DEBUG clog << "FilterUtils::isSupportedType: no valid parent for " << mimeType << endl; #endif // This type has no valid parent m_typeAliases[mimeType] = UNSUPPORTED_TYPE; return false; } bool FilterUtils::feedFilter(const Document &doc, Dijon::Filter *pFilter) { string location(doc.getLocation()); Url urlObj(location); string fileName; off_t dataLength = 0; const char *pData = doc.getData(dataLength); bool fedInput = false; if (pFilter == NULL) { return false; } if ((urlObj.getProtocol() == "file") && (location.length() > 7)) { fileName = location.substr(7); } // Prefer feeding the data if (((dataLength > 0) && (pData != NULL)) && (pFilter->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA) == true)) { fedInput = pFilter->set_document_data(pData, dataLength); } // ... 
to feeding the data through a temporary file if ((fedInput == false) && ((dataLength > 0) && (pData != NULL)) && (pFilter->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME) == true)) { char inTemplate[18] = "/tmp/filterXXXXXX"; #ifdef HAVE_MKSTEMP int inFd = mkstemp(inTemplate); #else int inFd = -1; char *pInFile = mktemp(inTemplate); if (pInFile != NULL) { inFd = open(pInFile, O_RDONLY); } #endif if (inFd != -1) { #ifdef DEBUG clog << "FilterUtils::feedFilter: feeding temporary file " << inTemplate << endl; #endif // Save the data if (write(inFd, (const void*)pData, dataLength) != -1) { fedInput = pFilter->set_document_file(inTemplate, true); if (fedInput == false) { // We might as well delete the file now unlink(inTemplate); } } close(inFd); } } // ... to feeding the file if ((fedInput == false) && (fileName.empty() == false) && (doc.getInternalPath().empty() == true)) { if (pFilter->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME) == true) { #ifdef DEBUG clog << "FilterUtils::feedFilter: feeding file " << fileName << endl; #endif fedInput = pFilter->set_document_file(fileName); } // ...and to feeding the file's contents if ((fedInput == false) && (pFilter->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA) == true)) { Document docCopy(doc); if (docCopy.setDataFromFile(fileName) == false) { clog << "Couldn't load " << fileName << endl; return false; } #ifdef DEBUG clog << "FilterUtils::feedFilter: feeding contents of file " << fileName << endl; #endif pData = docCopy.getData(dataLength); if ((dataLength > 0) && (pData != NULL)) { fedInput = pFilter->set_document_data(pData, dataLength); } // Else, the file may be empty } } if (fedInput == false) { clog << "Couldn't feed filter for " << doc.getLocation(true) << endl; return false; } return true; } bool FilterUtils::populateDocument(Document &doc, Dijon::Filter *pFilter) { string charset, uri, ipath; off_t size = 0; bool checkDataType = false, checkFileType = false; if (pFilter == NULL) { return false; } // Go 
through the whole thing const map &metaData = pFilter->get_meta_data(); for (map::const_iterator metaIter = metaData.begin(); metaIter != metaData.end(); ++metaIter) { if (metaIter->first == "charset") { charset = metaIter->second; } else if (metaIter->first == "date") { doc.setTimestamp(metaIter->second); } else if (metaIter->first == "ipath") { ipath = metaIter->second; } else if (metaIter->first == "language") { doc.setLanguage(metaIter->second); } else if (metaIter->first == "mimetype") { string mimeType(StringManip::toLowerCase(metaIter->second)); if (mimeType == "scan") { checkDataType = true; } else if (mimeType == "scantitle") { checkFileType = true; } else { doc.setType(mimeType); } } else if (metaIter->first == "size") { size = (off_t)atoll(metaIter->second.c_str()); if (size > 0) { doc.setSize(size); } #ifdef DEBUG else clog << "FilterUtils::populateDocument: ignoring size zero" << endl; #endif } else if (metaIter->first == "uri") { uri = metaIter->second; if ((uri.length() >= 18) && (uri.find(":///tmp/filter") != string::npos)) { // We fed the filter a temporary file uri.clear(); } } else { doc.setOther(metaIter->first, metaIter->second); } } if (uri.empty() == false) { doc.setLocation(uri); } if (ipath.empty() == false) { string currentIPath(doc.getInternalPath()); if (currentIPath.empty() == false) { currentIPath += "&next&"; } currentIPath += ipath; doc.setInternalPath(currentIPath); #ifdef DEBUG clog << "FilterUtils::populateDocument: ipath " << currentIPath << endl; #endif } // Content and title may have to be converted TextConverter converter(20); map::const_iterator contentIter = metaData.find("title"); if ((contentIter != metaData.end()) && (contentIter->second.empty() == false)) { dstring nonUTF8Title(contentIter->second.c_str(), contentIter->second.length()); dstring utf8Data(converter.toUTF8(nonUTF8Title, charset)); doc.setTitle(string(utf8Data.c_str(), utf8Data.length())); } const dstring &content = pFilter->get_content(); if 
(content.empty() == false) { // Scan for the MIME type ? if (checkFileType == true) { // Assume the title is actually a file name string mimeType(MIMEScanner::scanFile(doc.getTitle())); if ((mimeType.empty() == true) || (mimeType == "application/octet-stream")) { // Revert to scanning the content checkDataType = true; } else { doc.setType(mimeType); } } if (checkDataType == true) { doc.setType(MIMEScanner::scanData(content.c_str(), content.length())); } if (doc.getType().substr(0, 10) == "text/plain") { dstring utf8Data(converter.toUTF8(content, charset)); if (converter.getErrorsCount() > 0) { clog << doc.getLocation(true) << " may not have been fully converted to UTF-8" << endl; } doc.setData(utf8Data.c_str(), utf8Data.length()); } else { doc.setData(content.c_str(), content.length()); } } // If the document is big'ish, try and reclaim memory int inUse = Memory::getUsage(); if ((size > SIZE_THRESHOLD) || (content.length() > SIZE_THRESHOLD)) { Memory::reclaim(); } return true; } bool FilterUtils::filterDocument(const Document &doc, const string &originalType, ReducedAction &action) { Dijon::Filter *pFilter = FilterUtils::getFilter(doc.getType()); bool fedFilter = false, positionedFilter = false, docSuccess = false, finalSuccess = false; if (pFilter != NULL) { // Limit the size of nested documents ? 
if (m_maxNestedSize.empty() == false) { pFilter->set_property(Dijon::Filter::MAXIMUM_NESTED_SIZE, m_maxNestedSize); } fedFilter = FilterUtils::feedFilter(doc, pFilter); } positionedFilter = action.positionFilter(doc, pFilter); if (fedFilter == false) { Document docCopy(doc); if (docCopy.getTitle().empty() == true) { Url urlObj(doc.getLocation()); // Default to the file name as title docCopy.setTitle(urlObj.getFile()); } // Take the appropriate action now finalSuccess = action.takeAction(docCopy, false); if (pFilter != NULL) { delete pFilter; } return finalSuccess; } // At this point, pFilter cannot be NULL bool hasDocs = pFilter->has_documents(); #ifdef DEBUG clog << "FilterUtils::filterDocument: has documents " << hasDocs << endl; #endif while (hasDocs == true) { string actualType(originalType); bool isNested = false; bool emptyTitle = false; if ((positionedFilter == false) && (pFilter->next_document() == false)) { #ifdef DEBUG clog << "FilterUtils::filterDocument: no more documents in " << doc.getLocation(true) << endl; #endif break; } const DocumentInfo *pInfo = dynamic_cast(&doc); string originalTitle(doc.getTitle()); if (pInfo == NULL) { #ifdef DEBUG clog << "FilterUtils::filterDocument: couldn't duplicate document information" << endl; #endif break; } Document filteredDoc(*pInfo); filteredDoc.setType("text/plain"); docSuccess = false; if (populateDocument(filteredDoc, pFilter) == false) { hasDocs = pFilter->has_documents(); continue; } // Is this a nested document ? if (filteredDoc.getInternalPath().length() > doc.getInternalPath().length()) { actualType = filteredDoc.getType(); #ifdef DEBUG clog << "FilterUtils::filterDocument: nested document of type " << actualType << endl; #endif isNested = true; } else if (originalTitle.empty() == false) { // Preserve the top-level document's title filteredDoc.setTitle(originalTitle); } else if (filteredDoc.getTitle().empty() == true) { emptyTitle = true; } // Pass it down to another filter ? 
if (action.isReduced(filteredDoc) == true) { // Do we need to set a default title ? if (emptyTitle == true) { Url urlObj(doc.getLocation()); // Default to the file name as title filteredDoc.setTitle(urlObj.getFile()); #ifdef DEBUG clog << "FilterUtils::filterDocument: set default title " << urlObj.getFile() << endl; #endif } filteredDoc.setType(actualType); // Take the appropriate action docSuccess = action.takeAction(filteredDoc, isNested); } else { docSuccess = filterDocument(filteredDoc, actualType, action); } // Consider indexing anything a success if (docSuccess == true) { finalSuccess = true; } if (positionedFilter == true) { break; } // Next hasDocs = pFilter->has_documents(); } delete pFilter; #ifdef DEBUG clog << "FilterUtils::filterDocument: done with " << doc.getLocation(true) << " status " << finalSuccess << endl; #endif return finalSuccess; } bool FilterUtils::reduceDocument(const Document &doc, ReducedAction &action) { string originalType(doc.getType()); return filterDocument(doc, originalType, action); } string FilterUtils::stripMarkup(const string &text) { if (text.empty() == true) { return ""; } Dijon::Filter *pFilter = Dijon::FilterFactory::getFilter("text/xml"); if (pFilter == NULL) { return ""; } Document doc; string strippedText; doc.setData(text.c_str(), text.length()); if ((feedFilter(doc, pFilter) == true) && (pFilter->next_document() == true)) { const dstring &content = pFilter->get_content(); if (content.empty() == false) { strippedText = string(content.c_str(), content.length()); } } delete pFilter; return strippedText; } pinot-1.10/Tokenize/Makefile.in0000664000175000017500000013152313620040367013352 00000000000000# Makefile.in generated by automake 1.16.1 from Makefile.am. # @configure_input@ # Copyright (C) 1994-2018 Free Software Foundation, Inc. 
# This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY, to the extent permitted by law; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. @SET_MAKE@ # Process this file with automake to produce Makefile.in VPATH = @srcdir@ am__is_gnu_make = { \ if test -z '$(MAKELEVEL)'; then \ false; \ elif test -n '$(MAKE_HOST)'; then \ true; \ elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ true; \ else \ false; \ fi; \ } am__make_running_with_option = \ case $${target_option-} in \ ?) ;; \ *) echo "am__make_running_with_option: internal error: invalid" \ "target option '$${target_option-}' specified" >&2; \ exit 1;; \ esac; \ has_opt=no; \ sane_makeflags=$$MAKEFLAGS; \ if $(am__is_gnu_make); then \ sane_makeflags=$$MFLAGS; \ else \ case $$MAKEFLAGS in \ *\\[\ \ ]*) \ bs=\\; \ sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ esac; \ fi; \ skip_next=no; \ strip_trailopt () \ { \ flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ }; \ for flg in $$sane_makeflags; do \ test $$skip_next = yes && { skip_next=no; continue; }; \ case $$flg in \ *=*|--*) continue;; \ -*I) strip_trailopt 'I'; skip_next=yes;; \ -*I?*) strip_trailopt 'I';; \ -*O) strip_trailopt 'O'; skip_next=yes;; \ -*O?*) strip_trailopt 'O';; \ -*l) strip_trailopt 'l'; skip_next=yes;; \ -*l?*) strip_trailopt 'l';; \ -[dEDm]) skip_next=yes;; \ -[JT]) skip_next=yes;; \ esac; \ case $$flg in \ *$$target_option*) has_opt=yes; break;; \ esac; \ done; \ test $$has_opt = yes am__make_dryrun = (target_option=n; $(am__make_running_with_option)) am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = 
$(libdir)/@PACKAGE@ pkglibexecdir = $(libexecdir)/@PACKAGE@ am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd install_sh_DATA = $(install_sh) -c -m 644 install_sh_PROGRAM = $(install_sh) -c install_sh_SCRIPT = $(install_sh) -c INSTALL_HEADER = $(INSTALL_DATA) transform = $(program_transform_name) NORMAL_INSTALL = : PRE_INSTALL = : POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ @HAVE_LIBARCHIVE_TRUE@am__append_1 = libarchivefilter.la @HAVE_CHMLIB_TRUE@am__append_2 = libchmfilter.la @HAVE_LIBARCHIVE_FALSE@libarchivefilter_la_DEPENDENCIES = @HAVE_CHMLIB_FALSE@libchmfilter_la_DEPENDENCIES = subdir = Tokenize ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \ $(top_srcdir)/configure.in am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) DIST_COMMON = $(srcdir)/Makefile.am $(nobase_pkginclude_HEADERS) \ $(noinst_HEADERS) $(pkginclude_HEADERS) $(am__DIST_COMMON) mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs CONFIG_HEADER = $(top_builddir)/config.h CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; am__vpath_adj = case $$p in \ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ *) f=$$p;; \ esac; am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; am__install_max = 40 am__nobase_strip_setup = \ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` am__nobase_strip = \ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" am__nobase_list = $(am__nobase_strip_setup); \ for p in $$list; do echo "$$p $$p"; done | \ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ if (++n[$$2] == $(am__install_max)) \ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ END { for (dir in files) print dir, files[dir] }' am__base_list = \ sed 
'$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' am__uninstall_files_from_dir = { \ test -z "$$files" \ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ $(am__cd) "$$dir" && rm -f $$files; }; \ } am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(pkglibdir)" \ "$(DESTDIR)$(pkgincludedir)" "$(DESTDIR)$(pkgincludedir)" LTLIBRARIES = $(lib_LTLIBRARIES) $(pkglib_LTLIBRARIES) libFilter_la_LIBADD = am__dirstamp = $(am__leading_dot)dirstamp am_libFilter_la_OBJECTS = $(top_builddir)/Tokenize/filters/Filter.lo \ $(top_builddir)/Tokenize/filters/FilterFactory.lo \ $(top_builddir)/Tokenize/filters/HtmlFilter.lo \ $(top_builddir)/Tokenize/filters/HtmlParser.lo \ $(top_builddir)/Tokenize/filters/TextFilter.lo \ $(top_builddir)/Tokenize/filters/XmlFilter.lo libFilter_la_OBJECTS = $(am_libFilter_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) am__v_lt_0 = --silent am__v_lt_1 = libFilter_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ $(CXXFLAGS) $(libFilter_la_LDFLAGS) $(LDFLAGS) -o $@ libTokenize_la_LIBADD = am_libTokenize_la_OBJECTS = FilterUtils.lo TextConverter.lo \ $(top_builddir)/IndexSearch/cjkv/CJKVTokenizer.lo libTokenize_la_OBJECTS = $(am_libTokenize_la_OBJECTS) libTokenize_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ $(AM_CXXFLAGS) $(CXXFLAGS) $(libTokenize_la_LDFLAGS) \ $(LDFLAGS) -o $@ am__libarchivefilter_la_SOURCES_DIST = \ $(top_srcdir)/Tokenize/filters/ArchiveFilter.cc @HAVE_LIBARCHIVE_TRUE@am_libarchivefilter_la_OBJECTS = $(top_builddir)/Tokenize/filters/ArchiveFilter.lo libarchivefilter_la_OBJECTS = $(am_libarchivefilter_la_OBJECTS) libarchivefilter_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ $(AM_CXXFLAGS) $(CXXFLAGS) 
$(libarchivefilter_la_LDFLAGS) \ $(LDFLAGS) -o $@ @HAVE_LIBARCHIVE_TRUE@am_libarchivefilter_la_rpath = -rpath $(libdir) am__libchmfilter_la_SOURCES_DIST = \ $(top_srcdir)/Tokenize/filters/ChmFilter.cc @HAVE_CHMLIB_TRUE@am_libchmfilter_la_OBJECTS = $(top_builddir)/Tokenize/filters/ChmFilter.lo libchmfilter_la_OBJECTS = $(am_libchmfilter_la_OBJECTS) libchmfilter_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ $(AM_CXXFLAGS) $(CXXFLAGS) $(libchmfilter_la_LDFLAGS) \ $(LDFLAGS) -o $@ @HAVE_CHMLIB_TRUE@am_libchmfilter_la_rpath = -rpath $(libdir) am_libexiv2imagefilter_la_OBJECTS = \ $(top_builddir)/Tokenize/filters/Exiv2ImageFilter.lo libexiv2imagefilter_la_OBJECTS = $(am_libexiv2imagefilter_la_OBJECTS) libexiv2imagefilter_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ $(AM_CXXFLAGS) $(CXXFLAGS) $(libexiv2imagefilter_la_LDFLAGS) \ $(LDFLAGS) -o $@ am_libexternalfilter_la_OBJECTS = \ $(top_builddir)/Tokenize/filters/ExternalFilter.lo \ $(top_builddir)/Tokenize/filters/FileOutputFilter.lo libexternalfilter_la_OBJECTS = $(am_libexternalfilter_la_OBJECTS) libexternalfilter_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ $(AM_CXXFLAGS) $(CXXFLAGS) $(libexternalfilter_la_LDFLAGS) \ $(LDFLAGS) -o $@ am_libmboxfilter_la_OBJECTS = \ $(top_builddir)/Tokenize/filters/GMimeMboxFilter.lo libmboxfilter_la_OBJECTS = $(am_libmboxfilter_la_OBJECTS) libmboxfilter_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ $(AM_CXXFLAGS) $(CXXFLAGS) $(libmboxfilter_la_LDFLAGS) \ $(LDFLAGS) -o $@ am_libtaglibfilter_la_OBJECTS = \ $(top_builddir)/Tokenize/filters/TagLibMusicFilter.lo libtaglibfilter_la_OBJECTS = $(am_libtaglibfilter_la_OBJECTS) libtaglibfilter_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ $(AM_CXXFLAGS) $(CXXFLAGS) 
$(libtaglibfilter_la_LDFLAGS) \ $(LDFLAGS) -o $@ AM_V_P = $(am__v_P_@AM_V@) am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) am__v_P_0 = false am__v_P_1 = : AM_V_GEN = $(am__v_GEN_@AM_V@) am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) am__v_GEN_0 = @echo " GEN " $@; am__v_GEN_1 = AM_V_at = $(am__v_at_@AM_V@) am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) am__v_at_0 = @ am__v_at_1 = DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) depcomp = $(SHELL) $(top_srcdir)/depcomp am__maybe_remake_depfiles = depfiles am__depfiles_remade = \ $(top_builddir)/IndexSearch/cjkv/$(DEPDIR)/CJKVTokenizer.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/ArchiveFilter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/ChmFilter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/Exiv2ImageFilter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/ExternalFilter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/FileOutputFilter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/Filter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/FilterFactory.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/GMimeMboxFilter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/HtmlFilter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/HtmlParser.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/TagLibMusicFilter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/TextFilter.Plo \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/XmlFilter.Plo \ ./$(DEPDIR)/FilterUtils.Plo ./$(DEPDIR)/TextConverter.Plo am__mv = mv -f CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ $(AM_CXXFLAGS) $(CXXFLAGS) AM_V_CXX = $(am__v_CXX_@AM_V@) am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) am__v_CXX_0 = @echo " CXX " $@; am__v_CXX_1 = CXXLD = $(CXX) CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link 
$(CXXLD) $(AM_CXXFLAGS) \ $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) am__v_CXXLD_0 = @echo " CXXLD " $@; am__v_CXXLD_1 = SOURCES = $(libFilter_la_SOURCES) $(libTokenize_la_SOURCES) \ $(libarchivefilter_la_SOURCES) $(libchmfilter_la_SOURCES) \ $(libexiv2imagefilter_la_SOURCES) \ $(libexternalfilter_la_SOURCES) $(libmboxfilter_la_SOURCES) \ $(libtaglibfilter_la_SOURCES) DIST_SOURCES = $(libFilter_la_SOURCES) $(libTokenize_la_SOURCES) \ $(am__libarchivefilter_la_SOURCES_DIST) \ $(am__libchmfilter_la_SOURCES_DIST) \ $(libexiv2imagefilter_la_SOURCES) \ $(libexternalfilter_la_SOURCES) $(libmboxfilter_la_SOURCES) \ $(libtaglibfilter_la_SOURCES) am__can_run_installinfo = \ case $$AM_UPDATE_INFO_DIR in \ n|no|NO) false;; \ *) (install-info --version) >/dev/null 2>&1;; \ esac HEADERS = $(nobase_pkginclude_HEADERS) $(noinst_HEADERS) \ $(pkginclude_HEADERS) am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) # Read a list of newline-separated strings from the standard input, # and print each of them once, without duplicates. Input order is # *not* preserved. am__uniquify_input = $(AWK) '\ BEGIN { nonempty = 0; } \ { items[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in items) print i; }; } \ ' # Make sure the list of sources is unique. This is necessary because, # e.g., the same source file might be shared among _SOURCES variables # for different programs/libraries. 
am__define_uniq_tagged_files = \ list='$(am__tagged_files)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | $(am__uniquify_input)` ETAGS = etags CTAGS = ctags am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp \ $(top_srcdir)/mkinstalldirs DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ BINDIR = @BINDIR@ CATALOGS = @CATALOGS@ CATOBJEXT = @CATOBJEXT@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CURL_CONFIG = @CURL_CONFIG@ CXX = @CXX@ CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DATADIR = @DATADIR@ DATADIRNAME = @DATADIRNAME@ DBUS_CFLAGS = @DBUS_CFLAGS@ DBUS_LIBS = @DBUS_LIBS@ DBUS_SERVICES_DIR = @DBUS_SERVICES_DIR@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ DESKTOP_INSTALL = @DESKTOP_INSTALL@ DLLTOOL = @DLLTOOL@ DL_LIBS = @DL_LIBS@ DSYMUTIL = @DSYMUTIL@ DUMPBIN = @DUMPBIN@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ EXIV2_CFLAGS = @EXIV2_CFLAGS@ EXIV2_LIBS = @EXIV2_LIBS@ FGREP = @FGREP@ GETTEXT_PACKAGE = @GETTEXT_PACKAGE@ GIO_CFLAGS = @GIO_CFLAGS@ GIO_LIBS = @GIO_LIBS@ GLIBMM_CFLAGS = @GLIBMM_CFLAGS@ GLIBMM_LIBS = @GLIBMM_LIBS@ GMIME_CFLAGS = @GMIME_CFLAGS@ GMIME_LIBS = @GMIME_LIBS@ GMOFILES = @GMOFILES@ GMSGFMT = @GMSGFMT@ GREP = @GREP@ GTHREAD_CFLAGS = @GTHREAD_CFLAGS@ GTHREAD_LIBS = @GTHREAD_LIBS@ GTKMM_CFLAGS = @GTKMM_CFLAGS@ GTKMM_LIBS = @GTKMM_LIBS@ HTTP_CFLAGS = @HTTP_CFLAGS@ HTTP_DOWNLOADER = @HTTP_DOWNLOADER@ HTTP_LIBS = @HTTP_LIBS@ INDEX_CFLAGS = @INDEX_CFLAGS@ INDEX_LIBS = @INDEX_LIBS@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ INSTOBJEXT = @INSTOBJEXT@ 
INTLLIBS = @INTLLIBS@ INTL_MACOSX_LIBS = @INTL_MACOSX_LIBS@ LD = @LD@ LDFLAGS = @LDFLAGS@ LIBDIR = @LIBDIR@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ LIPO = @LIPO@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MIMEINFO_CFLAGS = @MIMEINFO_CFLAGS@ MIMEINFO_LIBS = @MIMEINFO_LIBS@ MIN_HTTP_CFLAGS = @MIN_HTTP_CFLAGS@ MIN_HTTP_LIBS = @MIN_HTTP_LIBS@ MISC_CFLAGS = @MISC_CFLAGS@ MISC_LIBS = @MISC_LIBS@ MKDIR_P = @MKDIR_P@ MKINSTALLDIRS = @MKINSTALLDIRS@ MSGFMT = @MSGFMT@ MSGFMT_OPTS = @MSGFMT_OPTS@ NEON_CFLAGS = @NEON_CFLAGS@ NEON_LIBS = @NEON_LIBS@ NM = @NM@ NMEDIT = @NMEDIT@ NOTIFY_CFLAGS = @NOTIFY_CFLAGS@ NOTIFY_LIBS = @NOTIFY_LIBS@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ OTOOL = @OTOOL@ OTOOL64 = @OTOOL64@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ PACKAGE_STRING = @PACKAGE_STRING@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_URL = @PACKAGE_URL@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ PKG_CONFIG = @PKG_CONFIG@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ POFILES = @POFILES@ POSUB = @POSUB@ PO_IN_DATADIR_FALSE = @PO_IN_DATADIR_FALSE@ PO_IN_DATADIR_TRUE = @PO_IN_DATADIR_TRUE@ PTHREAD_LIBS = @PTHREAD_LIBS@ RANLIB = @RANLIB@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHARED_MIME_INFO_PREFIX = @SHARED_MIME_INFO_PREFIX@ SHELL = @SHELL@ SIGCPP_CFLAGS = @SIGCPP_CFLAGS@ SIGCPP_LIBS = @SIGCPP_LIBS@ SQL_CFLAGS = @SQL_CFLAGS@ SQL_LIBS = @SQL_LIBS@ SSL_CFLAGS = @SSL_CFLAGS@ SSL_LIBS = @SSL_LIBS@ STRIP = @STRIP@ SYSCONFDIR = @SYSCONFDIR@ TAGLIB_CFLAGS = @TAGLIB_CFLAGS@ TAGLIB_LIBS = @TAGLIB_LIBS@ TEXTCAT_CFLAGS = @TEXTCAT_CFLAGS@ TEXTCAT_LIBS = @TEXTCAT_LIBS@ USE_NLS = @USE_NLS@ VERSION = @VERSION@ XAPIAN_CONFIG = @XAPIAN_CONFIG@ XGETTEXT = @XGETTEXT@ XML_CFLAGS = @XML_CFLAGS@ XML_LIBS = @XML_LIBS@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = 
@abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ ac_ct_AR = @ac_ct_AR@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ build = @build@ build_alias = @build_alias@ build_cpu = @build_cpu@ build_os = @build_os@ build_vendor = @build_vendor@ builddir = @builddir@ datadir = @datadir@ datarootdir = @datarootdir@ docdir = @docdir@ dvidir = @dvidir@ exec_prefix = @exec_prefix@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ host_os = @host_os@ host_vendor = @host_vendor@ htmldir = @htmldir@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ libdir = @libdir@ libexecdir = @libexecdir@ localedir = @localedir@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ sysconfdir = @sysconfdir@ target_alias = @target_alias@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ noinst_HEADERS = \ $(top_srcdir)/Tokenize/filters/ArchiveFilter.h \ $(top_srcdir)/Tokenize/filters/ChmFilter.h \ $(top_srcdir)/Tokenize/filters/ExifImageFilter.h \ $(top_srcdir)/Tokenize/filters/Exiv2ImageFilter.h \ $(top_srcdir)/Tokenize/filters/ExternalFilter.h \ $(top_srcdir)/Tokenize/filters/FileOutputFilter.h \ $(top_srcdir)/Tokenize/filters/GMimeMboxFilter.h \ $(top_srcdir)/Tokenize/filters/TagLibMusicFilter.h pkginclude_HEADERS = \ FilterUtils.h \ TextConverter.h nobase_pkginclude_HEADERS = \ filters/Filter.h \ filters/FilterFactory.h \ filters/HtmlFilter.h \ filters/HtmlParser.h \ filters/TextFilter.h \ filters/XmlFilter.h lib_LTLIBRARIES = libexiv2imagefilter.la libexternalfilter.la \ libmboxfilter.la libtaglibfilter.la 
$(am__append_1) \ $(am__append_2) pkglib_LTLIBRARIES = libFilter.la libTokenize.la libFilter_la_LDFLAGS = \ -static libFilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/Filter.cc \ $(top_srcdir)/Tokenize/filters/FilterFactory.cc \ $(top_srcdir)/Tokenize/filters/HtmlFilter.cc \ $(top_srcdir)/Tokenize/filters/HtmlParser.cc \ $(top_srcdir)/Tokenize/filters/TextFilter.cc \ $(top_srcdir)/Tokenize/filters/XmlFilter.cc @HAVE_LIBARCHIVE_TRUE@libarchivefilter_la_DEPENDENCIES = libFilter.la @HAVE_LIBARCHIVE_TRUE@libarchivefilter_la_SOURCES = \ @HAVE_LIBARCHIVE_TRUE@ $(top_srcdir)/Tokenize/filters/ArchiveFilter.cc @HAVE_LIBARCHIVE_TRUE@libarchivefilter_la_LDFLAGS = -module -avoid-version @HAVE_LIBARCHIVE_TRUE@libarchivefilter_la_LIBADD = -larchive @HAVE_CHMLIB_TRUE@libchmfilter_la_DEPENDENCIES = libFilter.la @HAVE_CHMLIB_TRUE@libchmfilter_la_SOURCES = \ @HAVE_CHMLIB_TRUE@ $(top_srcdir)/Tokenize/filters/ChmFilter.cc @HAVE_CHMLIB_TRUE@libchmfilter_la_LDFLAGS = -module -avoid-version @HAVE_CHMLIB_TRUE@libchmfilter_la_LIBADD = -lchm libexiv2imagefilter_la_DEPENDENCIES = libFilter.la libexiv2imagefilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/Exiv2ImageFilter.cc libexiv2imagefilter_la_LDFLAGS = -module -avoid-version libexiv2imagefilter_la_LIBADD = @EXIV2_LIBS@ libexternalfilter_la_DEPENDENCIES = libFilter.la libexternalfilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/ExternalFilter.cc \ $(top_srcdir)/Tokenize/filters/FileOutputFilter.cc libexternalfilter_la_LDFLAGS = -module -avoid-version libexternalfilter_la_LIBADD = @XML_LIBS@ libmboxfilter_la_DEPENDENCIES = libFilter.la libmboxfilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/GMimeMboxFilter.cc libmboxfilter_la_LDFLAGS = -module -avoid-version libmboxfilter_la_LIBADD = @GMIME_LIBS@ libtaglibfilter_la_DEPENDENCIES = libFilter.la libtaglibfilter_la_SOURCES = \ $(top_srcdir)/Tokenize/filters/TagLibMusicFilter.cc libtaglibfilter_la_LDFLAGS = -module -avoid-version libtaglibfilter_la_LIBADD = @TAGLIB_LIBS@ 
libTokenize_la_LDFLAGS = \ -static libTokenize_la_SOURCES = \ FilterUtils.cpp \ TextConverter.cpp \ $(top_srcdir)/IndexSearch/cjkv/CJKVTokenizer.cc AM_CXXFLAGS = \ @MISC_CFLAGS@ \ -I$(top_srcdir)/Utils -Ifilters \ @GMIME_CFLAGS@ @XML_CFLAGS@ @EXIV2_CFLAGS@ @TAGLIB_CFLAGS@ \ -D_DYNAMIC_DIJON_FILTERS \ -D_DIJON_EXTERNALFILTER_CONFFILE=\"$(sysconfdir)/pinot/external-filters.xml\" all: all-am .SUFFIXES: .SUFFIXES: .cc .cpp .lo .o .obj $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ *$$dep*) \ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ && { if test -f $@; then exit 0; else break; fi; }; \ exit 1;; \ esac; \ done; \ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Tokenize/Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --gnu Tokenize/Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' in \ *config.status*) \ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ *) \ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ esac; $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(top_srcdir)/configure: $(am__configure_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(am__aclocal_m4_deps): install-libLTLIBRARIES: $(lib_LTLIBRARIES) @$(NORMAL_INSTALL) @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ list2=; for p in $$list; do \ if test -f $$p; then \ list2="$$list2 $$p"; \ else :; fi; \ done; \ test -z "$$list2" || { \ echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \ $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) 
$(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \ } uninstall-libLTLIBRARIES: @$(NORMAL_UNINSTALL) @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ for p in $$list; do \ $(am__strip_dir) \ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \ done clean-libLTLIBRARIES: -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES) @list='$(lib_LTLIBRARIES)'; \ locs=`for p in $$list; do echo $$p; done | \ sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ sort -u`; \ test -z "$$locs" || { \ echo rm -f $${locs}; \ rm -f $${locs}; \ } install-pkglibLTLIBRARIES: $(pkglib_LTLIBRARIES) @$(NORMAL_INSTALL) @list='$(pkglib_LTLIBRARIES)'; test -n "$(pkglibdir)" || list=; \ list2=; for p in $$list; do \ if test -f $$p; then \ list2="$$list2 $$p"; \ else :; fi; \ done; \ test -z "$$list2" || { \ echo " $(MKDIR_P) '$(DESTDIR)$(pkglibdir)'"; \ $(MKDIR_P) "$(DESTDIR)$(pkglibdir)" || exit 1; \ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(pkglibdir)'"; \ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(pkglibdir)"; \ } uninstall-pkglibLTLIBRARIES: @$(NORMAL_UNINSTALL) @list='$(pkglib_LTLIBRARIES)'; test -n "$(pkglibdir)" || list=; \ for p in $$list; do \ $(am__strip_dir) \ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(pkglibdir)/$$f'"; \ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(pkglibdir)/$$f"; \ done clean-pkglibLTLIBRARIES: -test -z "$(pkglib_LTLIBRARIES)" || rm -f $(pkglib_LTLIBRARIES) @list='$(pkglib_LTLIBRARIES)'; \ locs=`for p in $$list; do echo $$p; done | \ 
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ sort -u`; \ test -z "$$locs" || { \ echo rm -f $${locs}; \ rm -f $${locs}; \ } $(top_builddir)/Tokenize/filters/$(am__dirstamp): @$(MKDIR_P) $(top_builddir)/Tokenize/filters @: > $(top_builddir)/Tokenize/filters/$(am__dirstamp) $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) $(top_builddir)/Tokenize/filters/$(DEPDIR) @: > $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) $(top_builddir)/Tokenize/filters/Filter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) $(top_builddir)/Tokenize/filters/FilterFactory.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) $(top_builddir)/Tokenize/filters/HtmlFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) $(top_builddir)/Tokenize/filters/HtmlParser.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) $(top_builddir)/Tokenize/filters/TextFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) $(top_builddir)/Tokenize/filters/XmlFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) libFilter.la: $(libFilter_la_OBJECTS) $(libFilter_la_DEPENDENCIES) $(EXTRA_libFilter_la_DEPENDENCIES) $(AM_V_CXXLD)$(libFilter_la_LINK) -rpath $(pkglibdir) $(libFilter_la_OBJECTS) $(libFilter_la_LIBADD) $(LIBS) $(top_builddir)/IndexSearch/cjkv/$(am__dirstamp): @$(MKDIR_P) $(top_builddir)/IndexSearch/cjkv @: > $(top_builddir)/IndexSearch/cjkv/$(am__dirstamp) $(top_builddir)/IndexSearch/cjkv/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) $(top_builddir)/IndexSearch/cjkv/$(DEPDIR) @: > $(top_builddir)/IndexSearch/cjkv/$(DEPDIR)/$(am__dirstamp) 
$(top_builddir)/IndexSearch/cjkv/CJKVTokenizer.lo: \ $(top_builddir)/IndexSearch/cjkv/$(am__dirstamp) \ $(top_builddir)/IndexSearch/cjkv/$(DEPDIR)/$(am__dirstamp) libTokenize.la: $(libTokenize_la_OBJECTS) $(libTokenize_la_DEPENDENCIES) $(EXTRA_libTokenize_la_DEPENDENCIES) $(AM_V_CXXLD)$(libTokenize_la_LINK) -rpath $(pkglibdir) $(libTokenize_la_OBJECTS) $(libTokenize_la_LIBADD) $(LIBS) $(top_builddir)/Tokenize/filters/ArchiveFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) libarchivefilter.la: $(libarchivefilter_la_OBJECTS) $(libarchivefilter_la_DEPENDENCIES) $(EXTRA_libarchivefilter_la_DEPENDENCIES) $(AM_V_CXXLD)$(libarchivefilter_la_LINK) $(am_libarchivefilter_la_rpath) $(libarchivefilter_la_OBJECTS) $(libarchivefilter_la_LIBADD) $(LIBS) $(top_builddir)/Tokenize/filters/ChmFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) libchmfilter.la: $(libchmfilter_la_OBJECTS) $(libchmfilter_la_DEPENDENCIES) $(EXTRA_libchmfilter_la_DEPENDENCIES) $(AM_V_CXXLD)$(libchmfilter_la_LINK) $(am_libchmfilter_la_rpath) $(libchmfilter_la_OBJECTS) $(libchmfilter_la_LIBADD) $(LIBS) $(top_builddir)/Tokenize/filters/Exiv2ImageFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) libexiv2imagefilter.la: $(libexiv2imagefilter_la_OBJECTS) $(libexiv2imagefilter_la_DEPENDENCIES) $(EXTRA_libexiv2imagefilter_la_DEPENDENCIES) $(AM_V_CXXLD)$(libexiv2imagefilter_la_LINK) -rpath $(libdir) $(libexiv2imagefilter_la_OBJECTS) $(libexiv2imagefilter_la_LIBADD) $(LIBS) $(top_builddir)/Tokenize/filters/ExternalFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) $(top_builddir)/Tokenize/filters/FileOutputFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) 
libexternalfilter.la: $(libexternalfilter_la_OBJECTS) $(libexternalfilter_la_DEPENDENCIES) $(EXTRA_libexternalfilter_la_DEPENDENCIES) $(AM_V_CXXLD)$(libexternalfilter_la_LINK) -rpath $(libdir) $(libexternalfilter_la_OBJECTS) $(libexternalfilter_la_LIBADD) $(LIBS) $(top_builddir)/Tokenize/filters/GMimeMboxFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) libmboxfilter.la: $(libmboxfilter_la_OBJECTS) $(libmboxfilter_la_DEPENDENCIES) $(EXTRA_libmboxfilter_la_DEPENDENCIES) $(AM_V_CXXLD)$(libmboxfilter_la_LINK) -rpath $(libdir) $(libmboxfilter_la_OBJECTS) $(libmboxfilter_la_LIBADD) $(LIBS) $(top_builddir)/Tokenize/filters/TagLibMusicFilter.lo: \ $(top_builddir)/Tokenize/filters/$(am__dirstamp) \ $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) libtaglibfilter.la: $(libtaglibfilter_la_OBJECTS) $(libtaglibfilter_la_DEPENDENCIES) $(EXTRA_libtaglibfilter_la_DEPENDENCIES) $(AM_V_CXXLD)$(libtaglibfilter_la_LINK) -rpath $(libdir) $(libtaglibfilter_la_OBJECTS) $(libtaglibfilter_la_LIBADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) -rm -f $(top_builddir)/IndexSearch/cjkv/*.$(OBJEXT) -rm -f $(top_builddir)/IndexSearch/cjkv/*.lo -rm -f $(top_builddir)/Tokenize/filters/*.$(OBJEXT) -rm -f $(top_builddir)/Tokenize/filters/*.lo distclean-compile: -rm -f *.tab.c @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/IndexSearch/cjkv/$(DEPDIR)/CJKVTokenizer.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/ArchiveFilter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/ChmFilter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/Exiv2ImageFilter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/ExternalFilter.Plo@am__quote@ # am--include-marker 
@AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/FileOutputFilter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/Filter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/FilterFactory.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/GMimeMboxFilter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/HtmlFilter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/HtmlParser.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/TagLibMusicFilter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/TextFilter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@$(top_builddir)/Tokenize/filters/$(DEPDIR)/XmlFilter.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/FilterUtils.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TextConverter.Plo@am__quote@ # am--include-marker $(am__depfiles_remade): @$(MKDIR_P) $(@D) @echo '# dummy' >$@-t && $(am__mv) $@-t $@ am--depfiles: $(am__depfiles_remade) .cc.o: @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ @am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< .cc.obj: 
@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\ @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\ @am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .cc.lo: @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\ @am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ @am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< .cpp.o: @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ @am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< .cpp.obj: @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\ @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\ @am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) 
$(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .cpp.lo: @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\ @am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ @am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf $(top_builddir)/IndexSearch/cjkv/.libs $(top_builddir)/IndexSearch/cjkv/_libs -rm -rf $(top_builddir)/Tokenize/filters/.libs $(top_builddir)/Tokenize/filters/_libs -rm -rf .libs _libs install-nobase_pkgincludeHEADERS: $(nobase_pkginclude_HEADERS) @$(NORMAL_INSTALL) @list='$(nobase_pkginclude_HEADERS)'; test -n "$(pkgincludedir)" || list=; \ if test -n "$$list"; then \ echo " $(MKDIR_P) '$(DESTDIR)$(pkgincludedir)'"; \ $(MKDIR_P) "$(DESTDIR)$(pkgincludedir)" || exit 1; \ fi; \ $(am__nobase_list) | while read dir files; do \ xfiles=; for file in $$files; do \ if test -f "$$file"; then xfiles="$$xfiles $$file"; \ else xfiles="$$xfiles $(srcdir)/$$file"; fi; done; \ test -z "$$xfiles" || { \ test "x$$dir" = x. 
|| { \ echo " $(MKDIR_P) '$(DESTDIR)$(pkgincludedir)/$$dir'"; \ $(MKDIR_P) "$(DESTDIR)$(pkgincludedir)/$$dir"; }; \ echo " $(INSTALL_HEADER) $$xfiles '$(DESTDIR)$(pkgincludedir)/$$dir'"; \ $(INSTALL_HEADER) $$xfiles "$(DESTDIR)$(pkgincludedir)/$$dir" || exit $$?; }; \ done uninstall-nobase_pkgincludeHEADERS: @$(NORMAL_UNINSTALL) @list='$(nobase_pkginclude_HEADERS)'; test -n "$(pkgincludedir)" || list=; \ $(am__nobase_strip_setup); files=`$(am__nobase_strip)`; \ dir='$(DESTDIR)$(pkgincludedir)'; $(am__uninstall_files_from_dir) install-pkgincludeHEADERS: $(pkginclude_HEADERS) @$(NORMAL_INSTALL) @list='$(pkginclude_HEADERS)'; test -n "$(pkgincludedir)" || list=; \ if test -n "$$list"; then \ echo " $(MKDIR_P) '$(DESTDIR)$(pkgincludedir)'"; \ $(MKDIR_P) "$(DESTDIR)$(pkgincludedir)" || exit 1; \ fi; \ for p in $$list; do \ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ echo "$$d$$p"; \ done | $(am__base_list) | \ while read files; do \ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(pkgincludedir)'"; \ $(INSTALL_HEADER) $$files "$(DESTDIR)$(pkgincludedir)" || exit $$?; \ done uninstall-pkgincludeHEADERS: @$(NORMAL_UNINSTALL) @list='$(pkginclude_HEADERS)'; test -n "$(pkgincludedir)" || list=; \ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ dir='$(DESTDIR)$(pkgincludedir)'; $(am__uninstall_files_from_dir) ID: $(am__tagged_files) $(am__define_uniq_tagged_files); mkid -fID $$unique tags: tags-am TAGS: tags tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) set x; \ here=`pwd`; \ $(am__define_uniq_tagged_files); \ shift; \ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ if test $$# -gt 0; then \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ "$$@" $$unique; \ else \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ $$unique; \ fi; \ fi ctags: ctags-am CTAGS: ctags ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) $(am__define_uniq_tagged_files); \ test -z "$(CTAGS_ARGS)$$unique" \ || 
$(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$unique GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && $(am__cd) $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) "$$here" cscopelist: cscopelist-am cscopelist-am: $(am__tagged_files) list='$(am__tagged_files)'; \ case "$(srcdir)" in \ [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ *) sdir=$(subdir)/$(srcdir) ;; \ esac; \ for i in $$list; do \ if test -f "$$i"; then \ echo "$(subdir)/$$i"; \ else \ echo "$$sdir/$$i"; \ fi; \ done >> $(top_builddir)/cscope.files distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags distdir: $(BUILT_SOURCES) $(MAKE) $(AM_MAKEFLAGS) distdir-am distdir-am: $(DISTFILES) @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ list='$(DISTFILES)'; \ dist_files=`for file in $$list; do echo $$file; done | \ sed -e "s|^$$srcdirstrip/||;t" \ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ case $$dist_files in \ */*) $(MKDIR_P) `echo "$$dist_files" | \ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ sort -u` ;; \ esac; \ for file in $$dist_files; do \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ if test -d $$d/$$file; then \ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ if test -d "$(distdir)/$$file"; then \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ fi; \ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ else \ test -f "$(distdir)/$$file" \ || cp -p $$d/$$file "$(distdir)/$$file" \ || exit 1; \ fi; \ done check-am: all-am check: check-am all-am: Makefile $(LTLIBRARIES) $(HEADERS) installdirs: for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(pkglibdir)" "$(DESTDIR)$(pkgincludedir)" "$(DESTDIR)$(pkgincludedir)"; do \ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ done install: install-am install-exec: install-exec-am install-data: install-data-am uninstall: uninstall-am install-am: all-am @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am installcheck: installcheck-am install-strip: if test -z '$(STRIP)'; then \ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ install; \ else \ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ fi mostlyclean-generic: clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) -test -z "$(top_builddir)/IndexSearch/cjkv/$(DEPDIR)/$(am__dirstamp)" || rm -f $(top_builddir)/IndexSearch/cjkv/$(DEPDIR)/$(am__dirstamp) -test -z "$(top_builddir)/IndexSearch/cjkv/$(am__dirstamp)" || rm -f $(top_builddir)/IndexSearch/cjkv/$(am__dirstamp) -test -z "$(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp)" || rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/$(am__dirstamp) -test -z "$(top_builddir)/Tokenize/filters/$(am__dirstamp)" || rm -f $(top_builddir)/Tokenize/filters/$(am__dirstamp) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." 
clean: clean-am clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \ clean-pkglibLTLIBRARIES mostlyclean-am distclean: distclean-am -rm -f $(top_builddir)/IndexSearch/cjkv/$(DEPDIR)/CJKVTokenizer.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/ArchiveFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/ChmFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/Exiv2ImageFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/ExternalFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/FileOutputFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/Filter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/FilterFactory.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/GMimeMboxFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/HtmlFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/HtmlParser.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/TagLibMusicFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/TextFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/XmlFilter.Plo -rm -f ./$(DEPDIR)/FilterUtils.Plo -rm -f ./$(DEPDIR)/TextConverter.Plo -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags dvi: dvi-am dvi-am: html: html-am html-am: info: info-am info-am: install-data-am: install-nobase_pkgincludeHEADERS \ install-pkgincludeHEADERS install-dvi: install-dvi-am install-dvi-am: install-exec-am: install-libLTLIBRARIES install-pkglibLTLIBRARIES install-html: install-html-am install-html-am: install-info: install-info-am install-info-am: install-man: install-pdf: install-pdf-am install-pdf-am: install-ps: install-ps-am install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-am -rm -f $(top_builddir)/IndexSearch/cjkv/$(DEPDIR)/CJKVTokenizer.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/ArchiveFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/ChmFilter.Plo -rm -f 
$(top_builddir)/Tokenize/filters/$(DEPDIR)/Exiv2ImageFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/ExternalFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/FileOutputFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/Filter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/FilterFactory.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/GMimeMboxFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/HtmlFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/HtmlParser.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/TagLibMusicFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/TextFilter.Plo -rm -f $(top_builddir)/Tokenize/filters/$(DEPDIR)/XmlFilter.Plo -rm -f ./$(DEPDIR)/FilterUtils.Plo -rm -f ./$(DEPDIR)/TextConverter.Plo -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-am mostlyclean-am: mostlyclean-compile mostlyclean-generic \ mostlyclean-libtool pdf: pdf-am pdf-am: ps: ps-am ps-am: uninstall-am: uninstall-libLTLIBRARIES \ uninstall-nobase_pkgincludeHEADERS uninstall-pkgincludeHEADERS \ uninstall-pkglibLTLIBRARIES .MAKE: install-am install-strip .PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ clean-generic clean-libLTLIBRARIES clean-libtool \ clean-pkglibLTLIBRARIES cscopelist-am ctags ctags-am distclean \ distclean-compile distclean-generic distclean-libtool \ distclean-tags distdir dvi dvi-am html html-am info info-am \ install install-am install-data install-data-am install-dvi \ install-dvi-am install-exec install-exec-am install-html \ install-html-am install-info install-info-am \ install-libLTLIBRARIES install-man \ install-nobase_pkgincludeHEADERS install-pdf install-pdf-am \ install-pkgincludeHEADERS install-pkglibLTLIBRARIES install-ps \ install-ps-am install-strip installcheck installcheck-am \ installdirs maintainer-clean maintainer-clean-generic \ mostlyclean mostlyclean-compile mostlyclean-generic \ 
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \ uninstall-am uninstall-libLTLIBRARIES \ uninstall-nobase_pkgincludeHEADERS uninstall-pkgincludeHEADERS \ uninstall-pkglibLTLIBRARIES .PRECIOUS: Makefile # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. .NOEXPORT: pinot-1.10/Tokenize/filters/0000775000175000017500000000000013620041132013017 500000000000000pinot-1.10/Tokenize/filters/ExternalFilter.cc0000664000175000017500000002400413012076222016201 00000000000000/* * Copyright 2007-2016 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #include "config.h" #include #include #include #include #include #include #ifdef HAVE_SOCKETPAIR #ifdef HAVE_FORK #ifdef HAVE_SETRLIMIT #include #include #include #include #include #endif #endif #endif #include #include #include #include #include #include #include "ExternalFilter.h" using std::clog; using std::endl; using std::min; using std::string; using std::set; using std::map; using namespace Dijon; #ifdef _DYNAMIC_DIJON_FILTERS DIJON_FILTER_EXPORT bool get_filter_types(MIMETypes &mime_types) { #ifdef _DIJON_EXTERNALFILTER_CONFFILE ExternalFilter::initialize(_DIJON_EXTERNALFILTER_CONFFILE, mime_types); #else ExternalFilter::initialize("/etc/dijon/external-filters.xml", mime_types); #endif return true; } DIJON_FILTER_EXPORT bool check_filter_data_input(int data_input) { Filter::DataInput input = (Filter::DataInput)data_input; if (input == Filter::DOCUMENT_FILE_NAME) { return true; } return false; } DIJON_FILTER_EXPORT Filter *get_filter(void) { return new ExternalFilter(); } #endif // This function is heavily inspired by Xapian Omega's shell_protect() static string shell_protect(const string &file_name) { string safefile(file_name); string::size_type p = 0; if ((safefile.empty() == false) && (safefile[0] == '-')) { // If the filename starts with a '-', protect it from being treated as // an option by prepending "./". safefile.insert(0, "./"); p = 2; } while (p < safefile.size()) { // Don't escape some safe characters which are common in filenames. 
unsigned char ch = safefile[p]; if ((isalnum(ch) == 0) && (strchr("/._-", ch) == NULL)) { safefile.insert(p, "\\"); ++p; } ++p; } return safefile; } map ExternalFilter::m_commandsByType; map ExternalFilter::m_outputsByType; map ExternalFilter::m_charsetsByType; ExternalFilter::ExternalFilter() : FileOutputFilter(), m_maxSize(0), m_doneWithDocument(false) { } ExternalFilter::~ExternalFilter() { rewind(); } bool ExternalFilter::is_data_input_ok(DataInput input) const { if (input == DOCUMENT_FILE_NAME) { return true; } return false; } bool ExternalFilter::set_property(Properties prop_name, const string &prop_value) { if ((prop_name == MAXIMUM_NESTED_SIZE) && (prop_value.empty() == false)) { m_maxSize = (off_t)atoll(prop_value.c_str()); } return true; } bool ExternalFilter::set_document_data(const char *data_ptr, off_t data_length) { return false; } bool ExternalFilter::set_document_string(const string &data_str) { return false; } bool ExternalFilter::set_document_uri(const string &uri) { return false; } bool ExternalFilter::has_documents(void) const { if ((m_doneWithDocument == false) && (m_filePath.empty() == false)) { return true; } return false; } bool ExternalFilter::next_document(void) { if ((m_doneWithDocument == false) && (m_mimeType.empty() == false) && (m_filePath.empty() == false) && (m_commandsByType.empty() == false)) { string outputType("text/plain"); ssize_t maxSize = 0; m_doneWithDocument = true; // Is this type supported ? Assume text/plain if not specified map::const_iterator commandIter = m_commandsByType.find(m_mimeType); if ((commandIter == m_commandsByType.end()) || (commandIter->second.empty() == true)) { return false; } // What's the output type ? 
map::const_iterator outputIter = m_outputsByType.find(m_mimeType); if (outputIter != m_outputsByType.end()) { outputType = outputIter->second; } if (outputType != "text/plain") { maxSize = m_maxSize; } if (run_command(commandIter->second, maxSize) == true) { // Fill in general details m_metaData["uri"] = "file://" + m_filePath; m_metaData["mimetype"] = outputType; // Is it in a known charset ? map::const_iterator charsetIter = m_charsetsByType.find(m_mimeType); if (charsetIter != m_charsetsByType.end()) { m_metaData["charset"] = charsetIter->second; } return true; } return false; } rewind(); return false; } bool ExternalFilter::skip_to_document(const string &ipath) { if (ipath.empty() == true) { return next_document(); } return false; } string ExternalFilter::get_error(void) const { return ""; } void ExternalFilter::initialize(const std::string &config_file, MIMETypes &types) { xmlDoc *pDoc = NULL; xmlNode *pRootElement = NULL; types.m_mimeTypes.clear(); // Parse the file and get the document #if LIBXML_VERSION < 20600 pDoc = xmlParseFile(config_file.c_str()); #else pDoc = xmlReadFile(config_file.c_str(), NULL, XML_PARSE_NOCDATA); #endif if (pDoc == NULL) { return; } // Iterate through the root element's nodes pRootElement = xmlDocGetRootElement(pDoc); for (xmlNode *pCurrentNode = pRootElement->children; pCurrentNode != NULL; pCurrentNode = pCurrentNode->next) { // What type of tag is it ? 
if (pCurrentNode->type != XML_ELEMENT_NODE) { continue; } // Get all filter elements if (xmlStrncmp(pCurrentNode->name, BAD_CAST"filter", 6) == 0) { string mimeType, charset, command, arguments, output; for (xmlNode *pCurrentCodecNode = pCurrentNode->children; pCurrentCodecNode != NULL; pCurrentCodecNode = pCurrentCodecNode->next) { if (pCurrentCodecNode->type != XML_ELEMENT_NODE) { continue; } char *pChildContent = (char*)xmlNodeGetContent(pCurrentCodecNode); if (pChildContent == NULL) { continue; } // Filters are keyed by their MIME type, "extension" is ignored if (xmlStrncmp(pCurrentCodecNode->name, BAD_CAST"mimetype", 8) == 0) { mimeType = pChildContent; } else if (xmlStrncmp(pCurrentCodecNode->name, BAD_CAST"charset", 7) == 0) { charset = pChildContent; } else if (xmlStrncmp(pCurrentCodecNode->name, BAD_CAST"command", 7) == 0) { command = pChildContent; } if (xmlStrncmp(pCurrentCodecNode->name, BAD_CAST"arguments", 9) == 0) { arguments = pChildContent; } else if (xmlStrncmp(pCurrentCodecNode->name, BAD_CAST"output", 6) == 0) { output = pChildContent; } // Free xmlFree(pChildContent); } if ((mimeType.empty() == false) && (command.empty() == false) && (arguments.empty() == false)) { #ifdef DEBUG clog << "ExternalFilter::initialize: " << mimeType << "=" << command << " " << arguments << endl; #endif // Command to run m_commandsByType[mimeType] = command + " " + arguments; // Output if (output.empty() == false) { m_outputsByType[mimeType] = output; } // Charset if (charset.empty() == false) { m_charsetsByType[mimeType] = charset; } types.m_mimeTypes.insert(mimeType); } } } // Free the document xmlFreeDoc(pDoc); } void ExternalFilter::rewind(void) { Filter::rewind(); m_doneWithDocument = false; } // This function is heavily inspired by Xapian Omega's stdout_to_string() bool ExternalFilter::run_command(const string &command, ssize_t maxSize) { string commandLine(command); int fds[2]; int status = 0; bool replacedParam = false, gotOutput = false; string::size_type 
argPos = commandLine.find("%s"); while (argPos != string::npos) { string quotedFilePath(shell_protect(m_filePath)); commandLine.replace(argPos, 2, quotedFilePath); replacedParam = true; // Next argPos = commandLine.find("%s", argPos + 1); } if (replacedParam == false) { // Append commandLine += " "; commandLine += shell_protect(m_filePath); } // We want to be able to get the exit status of the child process signal(SIGCHLD, SIG_DFL); if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) < 0) { return false; } #ifdef DEBUG clog << "ExternalFilter::run_command: running " << commandLine << endl; #endif // Fork and execute the command pid_t childPid = fork(); if (childPid == 0) { // Child process // Close the parent's side of the socket pair close(fds[0]); // Connect stdout, stderr and stdlog to our side of the socket pair dup2(fds[1], 1); dup2(fds[1], 2); dup2(fds[1], 3); // Limit CPU time for external programs to 300 seconds struct rlimit cpu_limit = { 300, RLIM_INFINITY } ; setrlimit(RLIMIT_CPU, &cpu_limit); execl("/bin/sh", "/bin/sh", "-c", commandLine.c_str(), (void*)NULL); exit(-1); } // Parent process // Close the child's side of the socket pair close(fds[1]); if (childPid == -1) { // The fork failed close(fds[0]); return false; } ssize_t totalSize = 0; gotOutput = read_file(fds[0], maxSize, totalSize); // Close our side of the socket pair close(fds[0]); // Wait until the child terminates pid_t actualChildPid = waitpid(childPid, &status, 0); if ((gotOutput == false) || (actualChildPid == -1)) { return false; } if (status != 0) { if (WIFEXITED(status) && WEXITSTATUS(status) == 127) { #ifdef DEBUG clog << "ExternalFilter::run_command: couldn't run " << command << endl; #endif return false; } } #ifdef SIGXCPU if (WIFSIGNALED(status) && WTERMSIG(status) == SIGXCPU) { #ifdef DEBUG clog << "ExternalFilter::run_command: " << command << " consumed too much CPU" << endl; #endif return false; } #endif return true; } 
pinot-1.10/Tokenize/filters/FilterFactory.cc0000664000175000017500000001720513012116644016036 00000000000000/* * Copyright 2007-2016 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "config.h" #include #include #include #include #include #include #include #ifdef HAVE_DLFCN_H #include #endif #include #include #include "Filter.h" #include "TextFilter.h" #include "FilterFactory.h" #ifdef HAVE_DLFCN_H #ifdef __CYGWIN__ #define DLOPEN_FLAGS RTLD_LAZY #else #define DLOPEN_FLAGS (RTLD_LAZY|RTLD_LOCAL) #endif #endif //#if defined _GLIBCXX_USE_CXX11_ABI && _GLIBCXX_USE_CXX11_ABI #define GETFILTERTYPESFUNC "_Z16get_filter_typesRN5Dijon9MIMETypesE" #define GETFILTERFUNC "_Z10get_filterv" //#endif using std::clog; using std::clog; using std::endl; using std::string; using std::set; using std::map; using std::copy; using namespace Dijon; map FilterFactory::m_types; map FilterFactory::m_handles; FilterFactory::FilterFactory() { } FilterFactory::~FilterFactory() { } unsigned int FilterFactory::loadFilters(const string &dir_name) { unsigned int count = 0; #ifdef HAVE_DLFCN_H struct stat fileStat; if (dir_name.empty() == true) { return 0; } // Is it a directory ? 
if ((stat(dir_name.c_str(), &fileStat) == -1) || (!S_ISDIR(fileStat.st_mode))) { clog << "FilterFactory::loadFilters: " << dir_name << " is not a directory" << endl; return 0; } // Scan it DIR *pDir = opendir(dir_name.c_str()); if (pDir == NULL) { return 0; } // Iterate through this directory's entries struct dirent *pDirEntry = readdir(pDir); while (pDirEntry != NULL) { char *pEntryName = pDirEntry->d_name; if (pEntryName != NULL) { string fileName = pEntryName; string::size_type extPos = fileName.find_last_of("."); if ((extPos == string::npos) || (fileName.substr(extPos) != ".so")) { // Next entry pDirEntry = readdir(pDir); continue; } fileName = dir_name; fileName += "/"; fileName += pEntryName; // Check this entry if ((stat(fileName.c_str(), &fileStat) != 0) || (!S_ISREG(fileStat.st_mode))) { clog << "FilterFactory::loadFilters: couldn't stat " << pEntryName << endl; // Next entry pDirEntry = readdir(pDir); continue; } void *pHandle = dlopen(fileName.c_str(), DLOPEN_FLAGS); if (pHandle == NULL) { clog << "FilterFactory::loadFilters: " << dlerror() << endl; // Next entry pDirEntry = readdir(pDir); continue; } // What type(s) does this support ? 
get_filter_types_func *pTypesFunc = (get_filter_types_func *)dlsym(pHandle, GETFILTERTYPESFUNC); if (pTypesFunc == NULL) { clog << "FilterFactory::loadFilters: couldn't find " << GETFILTERTYPESFUNC << ": " << dlerror() << endl; dlclose(pHandle); // Next entry pDirEntry = readdir(pDir); continue; } MIMETypes types; unsigned int typeCount = 0; bool filterOkay = (*pTypesFunc)(types); if (filterOkay == false) { clog << "FilterFactory::loadFilters: couldn't get types from " << pEntryName << endl; } else for (set::iterator typeIter = types.m_mimeTypes.begin(); typeIter != types.m_mimeTypes.end(); ++typeIter) { string newType(*typeIter); if (m_types.find(newType) == m_types.end()) { // Add a record for this filter m_types[newType] = fileName; ++typeCount; #ifdef DEBUG clog << "FilterFactory::loadFilters: type " << newType << " is supported by " << pEntryName << endl; #endif } } if (typeCount > 0) { m_handles[fileName] = pHandle; } else { #ifdef DEBUG clog << "FilterFactory::loadFilters: no useful types from " << fileName << endl; #endif dlclose(pHandle); } } // Next entry pDirEntry = readdir(pDir); } closedir(pDir); #endif return count; } Filter *FilterFactory::getLibraryFilter(const string &mime_type) { void *pHandle = NULL; if (m_handles.empty() == true) { #ifdef DEBUG clog << "FilterFactory::getLibraryFilter: no libraries" << endl; #endif return NULL; } map::iterator typeIter = m_types.find(mime_type); if (typeIter == m_types.end()) { // We don't know about this type return NULL; } map::iterator handleIter = m_handles.find(typeIter->second); if (handleIter == m_handles.end()) { // We don't know about this library return NULL; } pHandle = handleIter->second; if (pHandle == NULL) { return NULL; } #ifdef HAVE_DLFCN_H // Get a filter object then get_filter_func *pFunc = (get_filter_func *)dlsym(pHandle, GETFILTERFUNC); if (pFunc != NULL) { return (*pFunc)(); } #ifdef DEBUG clog << "FilterFactory::getLibraryFilter: couldn't find " << GETFILTERFUNC << ": " << dlerror() << 
endl; #endif #endif return NULL; } Filter *FilterFactory::getFilter(const string &mime_type) { Filter *pFilter = NULL; string typeOnly(mime_type); string::size_type semiColonPos = mime_type.find(";"); // Remove the charset, if any if (semiColonPos != string::npos) { typeOnly = mime_type.substr(0, semiColonPos); } #ifdef DEBUG clog << "FilterFactory::getFilter: file type is " << typeOnly << endl; #endif if (typeOnly == "text/plain") { pFilter = new TextFilter(); } #ifndef _DYNAMIC_DIJON_HTMLFILTER else if (typeOnly == "text/html") { pFilter = new HtmlFilter(); } #endif #ifndef _DYNAMIC_DIJON_XMLFILTER else if ((typeOnly == "text/xml") || (typeOnly == "application/xml")) { pFilter = new XmlFilter(); } #endif else { pFilter = getLibraryFilter(typeOnly); } if (pFilter != NULL) { pFilter->set_mime_type(typeOnly); } return pFilter; } void FilterFactory::getSupportedTypes(set &mime_types) { mime_types.clear(); // Built-in types mime_types.insert("text/plain"); #ifndef _DYNAMIC_DIJON_HTMLFILTER mime_types.insert("text/html"); #endif #ifndef _DYNAMIC_DIJON_XMLFILTER mime_types.insert("text/xml"); mime_types.insert("application/xml"); #endif // Library-handled types for (map::iterator typeIter = m_types.begin(); typeIter != m_types.end(); ++typeIter) { mime_types.insert(typeIter->first); } } bool FilterFactory::isSupportedType(const string &mime_type) { string typeOnly(mime_type); string::size_type semiColonPos = mime_type.find(";"); // Remove the charset, if any if (semiColonPos != string::npos) { typeOnly = mime_type.substr(0, semiColonPos); } // Is it a built-in type ? 
if ((typeOnly == "text/plain") || #ifndef _DYNAMIC_DIJON_HTMLFILTER (typeOnly == "text/html") || #endif #ifndef _DYNAMIC_DIJON_XMLFILTER (typeOnly == "text/xml") || (typeOnly == "application/xml") || #endif (m_types.find(typeOnly) != m_types.end())) { return true; } return false; } void FilterFactory::unloadFilters(void) { #ifdef HAVE_DLFCN_H for (map::iterator iter = m_handles.begin(); iter != m_handles.end(); ++iter) { if (dlclose(iter->second) != 0) { #ifdef DEBUG clog << "FilterFactory::unloadFilters: failed on " << iter->first << endl; #endif } } #endif m_types.clear(); m_handles.clear(); } pinot-1.10/Tokenize/filters/TextFilter.cc0000664000175000017500000000512013012077204015342 00000000000000/* * Copyright 2007-2009 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #include #include "TextFilter.h" using std::string; using namespace std; using namespace Dijon; TextFilter::TextFilter() : Filter(), m_doneWithDocument(false) { } TextFilter::~TextFilter() { rewind(); } bool TextFilter::is_data_input_ok(DataInput input) const { if ((input == DOCUMENT_DATA) || (input == DOCUMENT_STRING)) { return true; } return false; } bool TextFilter::set_property(Properties prop_name, const string &prop_value) { return true; } bool TextFilter::set_document_data(const char *data_ptr, off_t data_length) { if ((data_ptr == NULL) || (data_length == 0)) { return false; } string text_doc(data_ptr, data_length); return set_document_string(text_doc); } bool TextFilter::set_document_string(const string &data_str) { if (data_str.empty() == true) { return false; } rewind(); #ifdef DEBUG clog << "TextFilter::set_document_string: " << data_str.length() << " bytes of text" << endl; #endif m_content.reserve(data_str.length()); m_content.append(data_str.c_str(), data_str.length()); m_metaData["ipath"] = ""; m_metaData["mimetype"] = "text/plain"; return true; } bool TextFilter::set_document_file(const string &file_path, bool unlink_when_done) { return false; } bool TextFilter::set_document_uri(const string &uri) { return false; } bool TextFilter::has_documents(void) const { if (m_doneWithDocument == false) { return true; } return false; } bool TextFilter::next_document(void) { if (m_doneWithDocument == false) { m_doneWithDocument = true; return true; } rewind(); return false; } bool TextFilter::skip_to_document(const string &ipath) { if (ipath.empty() == true) { return next_document(); } return false; } string TextFilter::get_error(void) const { return ""; } void TextFilter::rewind(void) { Filter::rewind(); m_doneWithDocument = false; } pinot-1.10/Tokenize/filters/ChmFilter.h0000664000175000017500000000744513427323541015013 00000000000000/* * Copyright 2011-2016 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under 
the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#ifndef _DIJON_CHMFILTER_H
#define _DIJON_CHMFILTER_H

// NOTE(review): the header names of the bare #include directives below, and the
// element type of m_units, are missing from this copy - confirm against upstream.
#include
#include
#include

#include "Filter.h"

namespace Dijon
{
    /** Dijon filter declared for CHM content.
      * Its state is a chmFile handle, a collection of units and a flag;
      * the implementation is not part of this header.
      */
    class ChmFilter : public Filter
    {
    public:
	/// Builds an empty filter.
	ChmFilter();
	/// Destroys the filter.
	virtual ~ChmFilter();

	// Information.

	/// Returns what data the filter requires as input.
	virtual bool is_data_input_ok(DataInput input) const;

	// Initialization.

	/** Sets a property, prior to calling set_document_XXX().
	 * Returns false if the property is not supported.
	 */
	virtual bool set_property(Properties prop_name, const std::string &prop_value);

	/** (Re)initializes the filter with the given data.
	 * Caller should ensure the given pointer is valid until the
	 * Filter object is destroyed, as some filters may not need to
	 * do a deep copy of the data.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_data(const char *data_ptr, off_t data_length);

	/** (Re)initializes the filter with the given data.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_string(const std::string &data_str);

	/** (Re)initializes the filter with the given file.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_file(const std::string &file_path,
		bool unlink_when_done = false);

	/** (Re)initializes the filter with the given URI.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_uri(const std::string &uri);

	// Going from one nested document to the next.

	/** Returns true if there are nested documents left to extract.
	 * Returns false if the end of the parent document was reached
	 * or an error occurred.
	 */
	virtual bool has_documents(void) const;

	/** Moves to the next nested document.
	 * Returns false if there are none left.
	 */
	virtual bool next_document(void);

	/** Skips to the nested document with the given ipath.
	 * Returns false if no such document exists.
	 */
	virtual bool skip_to_document(const std::string &ipath);

	// Accessing documents' contents.

	/// Returns the message for the most recent error that has occurred.
	virtual std::string get_error(void) const;

	// Enumeration.

	/// Adds a unit.
	void add_unit(struct chmUnitInfo *pUnitInfo);

    protected:
	// Size limit - presumably set through set_property(); confirm in ChmFilter.cc
	size_t m_maxSize;
	// Handle on the CHM file being filtered
	struct chmFile *m_pHandle;
	// Units collected so far - presumably through add_unit(); confirm in ChmFilter.cc
	std::vector m_units;
	// NOTE(review): looks like this is set once all units were returned - confirm
	bool m_doneAll;

	/// Overrides Filter::rewind(); the implementation is not in this header.
	virtual void rewind(void);

	/// Overload of next_document() that takes an ipath; not part of the public interface.
	bool next_document(const std::string &ipath);

    private:
	/// ChmFilter objects cannot be copied.
	ChmFilter(const ChmFilter &other);
	/// ChmFilter objects cannot be copied.
	ChmFilter& operator=(const ChmFilter& other);

    };
}

#endif // _DIJON_CHMFILTER_H
pinot-1.10/Tokenize/filters/Exiv2ImageFilter.h0000664000175000017500000000717113427323541016240 00000000000000/*
 *  Copyright 2011-2016 Fabrice Colin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
*
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#ifndef _DIJON_EXIV2IMAGEFILTER_H
#define _DIJON_EXIV2IMAGEFILTER_H

// NOTE(review): the header name of the bare #include directive below is
// missing from this copy - confirm against upstream.
#include

#include "Filter.h"

namespace Dijon
{
    /** Dijon filter declared for images, presumably read with the Exiv2
      * library as the name suggests - confirm in Exiv2ImageFilter.cc.
      * The implementation is not part of this header.
      */
    class Exiv2ImageFilter : public Filter
    {
    public:
	/// Builds an empty filter.
	Exiv2ImageFilter();
	/// Destroys the filter.
	virtual ~Exiv2ImageFilter();

	// Information.

	/// Returns what data the filter requires as input.
	virtual bool is_data_input_ok(DataInput input) const;

	// Initialization.

	/** Sets a property, prior to calling set_document_XXX().
	 * Returns false if the property is not supported.
	 */
	virtual bool set_property(Properties prop_name, const std::string &prop_value);

	/** (Re)initializes the filter with the given data.
	 * Caller should ensure the given pointer is valid until the
	 * Filter object is destroyed, as some filters may not need to
	 * do a deep copy of the data.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_data(const char *data_ptr, off_t data_length);

	/** (Re)initializes the filter with the given data.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_string(const std::string &data_str);

	/** (Re)initializes the filter with the given file.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_file(const std::string &file_path,
		bool unlink_when_done = false);

	/** (Re)initializes the filter with the given URI.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_uri(const std::string &uri);

	// Going from one nested document to the next.

	/** Returns true if there are nested documents left to extract.
	 * Returns false if the end of the parent document was reached
	 * or an error occurred.
	 */
	virtual bool has_documents(void) const;

	/** Moves to the next nested document.
	 * Returns false if there are none left.
	 */
	virtual bool next_document(void);

	/** Skips to the nested document with the given ipath.
	 * Returns false if no such document exists.
	 */
	virtual bool skip_to_document(const std::string &ipath);

	// Accessing documents' contents.

	/// Returns the message for the most recent error that has occurred.
	virtual std::string get_error(void) const;

    protected:
	// NOTE(review): presumably true while the document is yet to be parsed - confirm in the .cc file
	bool m_parseDocument;

	/// Overrides Filter::rewind(); the implementation is not in this header.
	virtual void rewind(void);

    private:
	/// Exiv2ImageFilter objects cannot be copied.
	Exiv2ImageFilter(const Exiv2ImageFilter &other);
	/// Exiv2ImageFilter objects cannot be copied.
	Exiv2ImageFilter& operator=(const Exiv2ImageFilter& other);

    };
}

#endif // _DIJON_EXIV2IMAGEFILTER_H
pinot-1.10/Tokenize/filters/ArchiveFilter.h0000664000175000017500000000756613427323541015671 00000000000000/*
 *  Copyright 2009-2016 Fabrice Colin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
*
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#ifndef _DIJON_ARCHIVEFILTER_H
#define _DIJON_ARCHIVEFILTER_H

// NOTE(review): the header names of the bare #include directives below are
// missing from this copy - confirm against upstream.
#include
#include

#include "Filter.h"

namespace Dijon
{
    /** Dijon filter declared for archives.
      * Its state includes a "struct archive" handle, a memory buffer
      * pointer and a file descriptor; the implementation is not part
      * of this header.
      */
    class ArchiveFilter : public Filter
    {
    public:
	/// Builds an empty filter.
	ArchiveFilter();
	/// Destroys the filter.
	virtual ~ArchiveFilter();

	// Information.

	/// Sets the MIME type the filter will handle.
	virtual void set_mime_type(const std::string &mime_type);

	/// Returns what data the filter requires as input.
	virtual bool is_data_input_ok(DataInput input) const;

	// Initialization.

	/** Sets a property, prior to calling set_document_XXX().
	 * Returns false if the property is not supported.
	 */
	virtual bool set_property(Properties prop_name, const std::string &prop_value);

	/** (Re)initializes the filter with the given data.
	 * Caller should ensure the given pointer is valid until the
	 * Filter object is destroyed, as some filters may not need to
	 * do a deep copy of the data.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_data(const char *data_ptr, off_t data_length);

	/** (Re)initializes the filter with the given data.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_string(const std::string &data_str);

	/** (Re)initializes the filter with the given file.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_file(const std::string &file_path,
		bool unlink_when_done = false);

	/** (Re)initializes the filter with the given URI.
	 * Call next_document() to position the filter onto the first document.
	 * Returns false if this input is not supported or an error occurred.
	 */
	virtual bool set_document_uri(const std::string &uri);

	// Going from one nested document to the next.

	/** Returns true if there are nested documents left to extract.
	 * Returns false if the end of the parent document was reached
	 * or an error occurred.
	 */
	virtual bool has_documents(void) const;

	/** Moves to the next nested document.
	 * Returns false if there are none left.
	 */
	virtual bool next_document(void);

	/** Skips to the nested document with the given ipath.
	 * Returns false if no such document exists.
	 */
	virtual bool skip_to_document(const std::string &ipath);

	// Accessing documents' contents.

	/// Returns the message for the most recent error that has occurred.
	virtual std::string get_error(void) const;

    protected:
	// Size limit - presumably set through set_property(); confirm in ArchiveFilter.cc
	off_t m_maxSize;
	// NOTE(review): presumably true while the archive is yet to be parsed - confirm
	bool m_parseDocument;
	// NOTE(review): meaning not visible from this header - confirm in ArchiveFilter.cc
	bool m_isBig;
	// Memory buffer; ownership is not visible from this header
	char *m_pMem;
	// File descriptor; who opens and closes it is not visible from this header
	int m_fd;
	// Handle on the archive being filtered
	struct archive *m_pHandle;

	/// Overrides Filter::rewind(); the implementation is not in this header.
	virtual void rewind(void);

	/// Initialization helper; what it sets up is defined in ArchiveFilter.cc.
	void initialize(void);

	/// Overload of next_document() that takes an ipath; not part of the public interface.
	bool next_document(const std::string &ipath);

    private:
	/// ArchiveFilter objects cannot be copied.
	ArchiveFilter(const ArchiveFilter &other);
	/// ArchiveFilter objects cannot be copied.
	ArchiveFilter& operator=(const ArchiveFilter& other);

    };
}

#endif // _DIJON_ARCHIVEFILTER_H
pinot-1.10/Tokenize/filters/TagLibMusicFilter.cc0000664000175000017500000001077513012076323016576 00000000000000/*
 *  Copyright 2007-2016 Fabrice Colin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include "TagLibMusicFilter.h" using std::string; using std::clog; using std::endl; using namespace Dijon; #ifdef _DYNAMIC_DIJON_FILTERS DIJON_FILTER_EXPORT bool get_filter_types(MIMETypes &mime_types) { mime_types.m_mimeTypes.clear(); mime_types.m_mimeTypes.insert("audio/mpeg"); mime_types.m_mimeTypes.insert("audio/x-mp3"); mime_types.m_mimeTypes.insert("application/ogg"); mime_types.m_mimeTypes.insert("audio/x-flac+ogg"); mime_types.m_mimeTypes.insert("audio/x-flac"); return true; } DIJON_FILTER_EXPORT bool check_filter_data_input(int data_input) { Filter::DataInput input = (Filter::DataInput)data_input; if (input == Filter::DOCUMENT_FILE_NAME) { return true; } return false; } DIJON_FILTER_EXPORT Filter *get_filter(void) { return new TagLibMusicFilter(); } #endif TagLibMusicFilter::TagLibMusicFilter() : Filter(), m_parseDocument(false) { } TagLibMusicFilter::~TagLibMusicFilter() { rewind(); } bool TagLibMusicFilter::is_data_input_ok(DataInput input) const { if (input == DOCUMENT_FILE_NAME) { return true; } return false; } bool TagLibMusicFilter::set_property(Properties prop_name, const string &prop_value) { return false; } bool TagLibMusicFilter::set_document_data(const char *data_ptr, off_t data_length) { return false; } bool TagLibMusicFilter::set_document_string(const string &data_str) { return false; } bool TagLibMusicFilter::set_document_file(const string &file_path, bool unlink_when_done) { if (Filter::set_document_file(file_path, unlink_when_done) == true) { m_parseDocument = true; return true; } return false; } bool TagLibMusicFilter::set_document_uri(const string &uri) { return false; } bool TagLibMusicFilter::has_documents(void) const { return m_parseDocument; } bool 
TagLibMusicFilter::next_document(void) { if (m_parseDocument == true) { m_parseDocument = false; m_content.clear(); m_metaData.clear(); TagLib::FileRef fileRef(m_filePath.c_str(), false); if (fileRef.isNull() == false) { TagLib::Tag *pTag = fileRef.tag(); if ((pTag != NULL) && (pTag->isEmpty() == false)) { char yearStr[64]; string trackTitle(pTag->title().toCString(true)); trackTitle += " "; trackTitle += pTag->artist().toCString(true); #ifdef DEBUG clog << "TagLibMusicFilter::next_document: " << trackTitle.length() << " bytes of text" << endl; #endif m_content.append(trackTitle.c_str(), trackTitle.length()); m_content += " "; m_content += pTag->album().toCString(true); m_content += " "; m_content += pTag->comment().toCString(true); m_content += " "; m_content += pTag->genre().toCString(true); snprintf(yearStr, 64, " %u", pTag->year()); m_content += yearStr; m_metaData["title"] = trackTitle; m_metaData["ipath"] = ""; m_metaData["mimetype"] = "text/plain"; m_metaData["charset"] = "utf-8"; m_metaData["author"] = pTag->artist().toCString(true); } else { // This file doesn't have any tag string::size_type filePos = m_filePath.find_last_of("/"); if ((filePos != string::npos) && (m_filePath.length() - filePos > 1)) { m_metaData["title"] = m_filePath.substr(filePos + 1); } else { m_metaData["title"] = m_filePath; } m_metaData["ipath"] = ""; m_metaData["mimetype"] = "text/plain"; m_metaData["charset"] = "utf-8"; } return true; } } return false; } bool TagLibMusicFilter::skip_to_document(const string &ipath) { if (ipath.empty() == true) { return next_document(); } return false; } string TagLibMusicFilter::get_error(void) const { return ""; } void TagLibMusicFilter::rewind(void) { Filter::rewind(); m_parseDocument = false; } pinot-1.10/Tokenize/filters/ExifImageFilter.h0000664000175000017500000000715513427323541016140 00000000000000/* * Copyright 2008-2016 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU 
General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef _DIJON_EXIFIMAGEFILTER_H #define _DIJON_EXIFIMAGEFILTER_H #include #include "Filter.h" namespace Dijon { /** Filter subclass that overrides the whole Dijon::Filter interface. NOTE(review): the name suggests EXIF metadata extraction from images, but only the declaration is visible here - confirm against the implementation. */ class ExifImageFilter : public Filter { public: /// Builds an empty filter. ExifImageFilter(); /// Destroys the filter. virtual ~ExifImageFilter(); // Information. /// Returns what data the filter requires as input. virtual bool is_data_input_ok(DataInput input) const; // Initialization. /** Sets a property, prior to calling set_document_XXX(). * Returns false if the property is not supported. */ virtual bool set_property(Properties prop_name, const std::string &prop_value); /** (Re)initializes the filter with the given data. * Caller should ensure the given pointer is valid until the * Filter object is destroyed, as some filters may not need to * do a deep copy of the data. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_data(const char *data_ptr, off_t data_length); /** (Re)initializes the filter with the given data. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_string(const std::string &data_str); /** (Re)initializes the filter with the given file. * Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred. */ virtual bool set_document_file(const std::string &file_path, bool unlink_when_done = false); /** (Re)initializes the filter with the given URI. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_uri(const std::string &uri); // Going from one nested document to the next. /** Returns true if there are nested documents left to extract. * Returns false if the end of the parent document was reached * or an error occurred. */ virtual bool has_documents(void) const; /** Moves to the next nested document. * Returns false if there are none left. */ virtual bool next_document(void); /** Skips to the nested document with the given ipath. * Returns false if no such document exists. */ virtual bool skip_to_document(const std::string &ipath); // Accessing documents' contents. /// Returns the message for the most recent error that has occurred. virtual std::string get_error(void) const; protected: /* NOTE(review): presumably true while the input file is still waiting to be parsed by next_document() - confirm against the implementation. */ bool m_parseDocument; virtual void rewind(void); private: /// ExifImageFilter objects cannot be copied. ExifImageFilter(const ExifImageFilter &other); /// ExifImageFilter objects cannot be copied. ExifImageFilter& operator=(const ExifImageFilter& other); }; } #endif // _DIJON_EXIFIMAGEFILTER_H pinot-1.10/Tokenize/filters/Filter.h0000664000175000017500000001722313427323541014356 00000000000000/* * Copyright 2007-2016 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef _DIJON_FILTER_H #define _DIJON_FILTER_H #include #include #include #ifndef DIJON_FILTER_EXPORT #if defined __GNUC__ && (__GNUC__ >= 4) #define DIJON_FILTER_EXPORT __attribute__ ((visibility("default"))) #define DIJON_FILTER_INITIALIZE __attribute__((constructor)) #define DIJON_FILTER_SHUTDOWN __attribute__((destructor)) #else #define DIJON_FILTER_EXPORT #define DIJON_FILTER_INITIALIZE #define DIJON_FILTER_SHUTDOWN #endif #endif #include "Memory.h" namespace Dijon { class Filter; /// MIME types the filter supports. class DIJON_FILTER_EXPORT MIMETypes { public: MIMETypes(); virtual ~MIMETypes(); std::set m_mimeTypes; private: /// MIMETypes objects cannot be copied. MIMETypes(const MIMETypes &other); /// MIMETypes objects cannot be copied. MIMETypes& operator=(const MIMETypes& other); }; /** Provides the list of MIME types supported by the filter(s). * The types are added to the MIMETypes object passed in. * This function is exported by dynamically loaded filter libraries. */ typedef bool (get_filter_types_func)(MIMETypes &); /** Returns what data should be passed to the filter(s). * The argument is a Filter::DataInput value cast to int for convenience. * This function is exported by dynamically loaded filter libraries. * The aim is to let the client application know before-hand whether * it should load documents or not. */ typedef bool (check_filter_data_input_func)(int); /** Returns a Filter that handles the given MIME type. * The Filter object is allocated with new. * This function is exported by dynamically loaded filter libraries * and serves as a factory for Filter objects, so that the client * application doesn't have to know which Filter sub-types handle * which MIME types.
*/ typedef Filter *(get_filter_func)(void); /** Converts text to UTF-8. */ typedef std::string (convert_to_utf8_func)(const char *, off_t, const std::string &); /// Filter interface. class DIJON_FILTER_EXPORT Filter { public: /// Builds an empty filter. Filter(); /// Destroys the filter. virtual ~Filter(); // Enumerations. /** What data a filter supports as input. * It can be the whole document data (as a buffer or as a string), * its file name, or its URI. */ typedef enum { DOCUMENT_DATA = 0, DOCUMENT_STRING, DOCUMENT_FILE_NAME, DOCUMENT_URI } DataInput; /** Input properties supported by the filter. * - PREFERRED_CHARSET is the charset preferred by the client application. * The filter will convert document's content to this charset if possible. * - OPERATING_MODE can be set to either view or index. * - MAXIMUM_NESTED_SIZE is the maximum size in bytes of nested documents. */ typedef enum { PREFERRED_CHARSET = 0, OPERATING_MODE, MAXIMUM_NESTED_SIZE } Properties; // Information. /// Sets the MIME type the filter will handle. virtual void set_mime_type(const std::string &mime_type); /// Returns the MIME type handled by the filter. std::string get_mime_type(void) const; /// Returns what data the filter requires as input. virtual bool is_data_input_ok(DataInput input) const = 0; // Initialization. /** Sets a property, prior to calling set_document_XXX(). * Returns false if the property is not supported. */ virtual bool set_property(Properties prop_name, const std::string &prop_value) = 0; /** (Re)initializes the filter with the given data. * Caller should ensure the given pointer is valid until the * Filter object is destroyed, as some filters may not need to * do a deep copy of the data. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_data(const char *data_ptr, off_t data_length) = 0; /** (Re)initializes the filter with the given data.
* Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_string(const std::string &data_str) = 0; /** (Re)initializes the filter with the given file. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_file(const std::string &file_path, bool unlink_when_done = false); /** (Re)initializes the filter with the given URI. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_uri(const std::string &uri) = 0; // Going from one nested document to the next. /** Returns true if there are nested documents left to extract. * Returns false if the end of the parent document was reached * or an error occurred. */ virtual bool has_documents(void) const = 0; /** Moves to the next nested document. * Returns false if there are none left. */ virtual bool next_document(void) = 0; /** Skips to the nested document with the given ipath. * Returns false if no such document exists. */ virtual bool skip_to_document(const std::string &ipath) = 0; // Accessing documents' contents. /// Returns the message for the most recent error that has occurred. virtual std::string get_error(void) const = 0; /** Returns a dictionary of metadata extracted from the current document. * Metadata fields may include one or more of the following : * title, ipath, mimetype, language, charset, author, creator, * publisher, modificationdate, creationdate, size * Special considerations apply : * - ipath is an internal path to the nested document that can be * later passed to skip_to_document(). It may be empty if the parent * document's type doesn't allow embedding, in which case the filter * should only return one document.
* - mimetype should be text/plain if the document could be handled * internally, empty if unknown. If any other value, it is expected * that the client application can pass the nested document's content * to another filter that supports this particular type. */ const std::map &get_meta_data(void) const; /// Returns content. const dstring &get_content(void) const; protected: /// The MIME type handled by the filter. std::string m_mimeType; /// Metadata dictionary. std::map m_metaData; /// Content. dstring m_content; /// The name of the input file, if any. std::string m_filePath; /// Rewinds the filter. virtual void rewind(void); private: /// Whether the input file should be deleted when done. bool m_deleteInputFile; /// Filter objects cannot be copied. Filter(const Filter &other); /// Filter objects cannot be copied. Filter& operator=(const Filter& other); /// Deletes the input file. void deleteInputFile(void); }; } #endif // _DIJON_FILTER_H pinot-1.10/Tokenize/filters/GMimeMboxFilter.cc0000664000175000017500000006163513552373420016267 00000000000000/* * Copyright 2007-2016 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/ #include "config.h" #include #include #include #include #include #ifdef HAVE_MMAP #include #endif #include #include #include #include #include #include #include #include "GMimeMboxFilter.h" using std::clog; using std::endl; using std::string; using std::max; using std::map; using std::set; using std::pair; using namespace Dijon; #ifdef _DYNAMIC_DIJON_FILTERS DIJON_FILTER_EXPORT bool get_filter_types(MIMETypes &mime_types) { mime_types.m_mimeTypes.clear(); mime_types.m_mimeTypes.insert("application/mbox"); mime_types.m_mimeTypes.insert("text/x-mail"); mime_types.m_mimeTypes.insert("text/x-news"); return true; } DIJON_FILTER_EXPORT bool check_filter_data_input(int data_input) { Filter::DataInput input = (Filter::DataInput)data_input; if ((input == Filter::DOCUMENT_DATA) || (input == Filter::DOCUMENT_FILE_NAME)) { return true; } return false; } DIJON_FILTER_EXPORT Filter *get_filter(void) { return new GMimeMboxFilter(); } DIJON_FILTER_INITIALIZE void initialize_gmime(void) { // Initialize gmime #if GMIME_MAJOR_VERSION >= 3 g_mime_init(); #else g_mime_init(GMIME_ENABLE_RFC2047_WORKAROUNDS); #endif } DIJON_FILTER_SHUTDOWN void shutdown_gmime(void) { // Shutdown gmime g_mime_shutdown(); } #endif static string extractField(const string &str, const string &start, const string &end, string::size_type &endPos, bool anyCharacterOfEnd = false) { string fieldValue; string::size_type startPos = string::npos; if (start.empty() == true) { startPos = 0; } else { startPos = str.find(start, endPos); } if (startPos != string::npos) { startPos += start.length(); if (end.empty() == true) { fieldValue = str.substr(startPos); } else { if (anyCharacterOfEnd == false) { endPos = str.find(end, startPos); } else { endPos = str.find_first_of(end, startPos); } if (endPos != string::npos) { fieldValue = str.substr(startPos, endPos - startPos); } } } return fieldValue; } GMimeMboxFilter::GMimeMboxPart::GMimeMboxPart(const string &subject, dstring &buffer) : m_subject(subject), m_buffer(buffer) 
{ } GMimeMboxFilter::GMimeMboxPart::~GMimeMboxPart() { } GMimeMboxFilter::GMimeMboxFilter() : Filter(), m_returnHeaders(false), m_maxSize(0), m_pData(NULL), m_dataLength(0), m_fd(-1), m_pGMimeMboxStream(NULL), m_pParser(NULL), m_pMimeMessage(NULL), m_partsCount(-1), m_partNum(-1), m_partLevel(-1), m_currentLevel(0), m_messageStart(0), m_foundDocument(false) { } GMimeMboxFilter::~GMimeMboxFilter() { finalize(true); } bool GMimeMboxFilter::is_data_input_ok(DataInput input) const { if ((input == DOCUMENT_DATA) || (input == DOCUMENT_FILE_NAME)) { return true; } return false; } bool GMimeMboxFilter::set_property(Properties prop_name, const string &prop_value) { if (prop_name == PREFERRED_CHARSET) { m_defaultCharset = prop_value; return true; } else if (prop_name == OPERATING_MODE) { if (prop_value == "view") { m_returnHeaders = true; } else { m_returnHeaders = false; } return true; } else if ((prop_name == MAXIMUM_NESTED_SIZE) && (prop_value.empty() == false)) { m_maxSize = (off_t)atoll(prop_value.c_str()); } return false; } bool GMimeMboxFilter::set_document_data(const char *data_ptr, off_t data_length) { // Close/free whatever was opened/allocated on a previous call to set_document() finalize(true); m_partsCount = m_partNum = m_partLevel = -1; m_levels.clear(); m_messageStart = 0; m_messageDate.clear(); m_partCharset.clear(); m_foundDocument = false; m_pData = data_ptr; m_dataLength = data_length; // Assume there are documents if initialization is successful // but don't actually retrieve anything, until next or skip is called if (initializeData() == true) { m_foundDocument = initialize(); } return m_foundDocument; } bool GMimeMboxFilter::set_document_string(const string &data_str) { return false; } bool GMimeMboxFilter::set_document_file(const string &file_path, bool unlink_when_done) { // Close/free whatever was opened/allocated on a previous call to set_document() finalize(true); m_partsCount = m_partNum = m_partLevel = -1; m_levels.clear(); m_messageStart = 0; 
m_messageDate.clear(); m_partCharset.clear(); m_foundDocument = false; Filter::set_document_file(file_path, unlink_when_done); // Assume there are documents if initialization is successful // but don't actually retrieve anything, until next or skip is called if (initializeFile() == true) { m_foundDocument = initialize(); } return m_foundDocument; } bool GMimeMboxFilter::set_document_uri(const string &uri) { return false; } bool GMimeMboxFilter::has_documents(void) const { // As long as a document was found, chances are another one is available return m_foundDocument; } bool GMimeMboxFilter::next_document(void) { string subject; map::const_iterator titleIter = m_metaData.find("title"); if (titleIter != m_metaData.end()) { subject = titleIter->second; } return extractMessage(subject); } bool GMimeMboxFilter::skip_to_document(const string &ipath) { if (ipath.empty() == true) { if (m_messageStart > 0) { // Reset return set_document_file(m_filePath); } return true; } // ipath's format is "o=offset&l=part_levels" if (sscanf(ipath.c_str(), "o=" GMIME_OFFSET_MODIFIER "&l=[", &m_messageStart) != 1) { return false; } finalize(false); m_partsCount = -1; m_levels.clear(); string::size_type levelsPos = ipath.find("l=["); if (levelsPos != string::npos) { string::size_type endPos = 0; string levels(ipath.substr(levelsPos + 2)); string levelInfo(extractField(levels, "[", "]", endPos)); // Parse levels while (levelInfo.empty() == false) { int partLevel = 0, partsCount = 0, partNum = 0; #ifdef DEBUG clog << "GMimeMboxFilter::skip_to_document: level " << levelInfo << endl; #endif if (sscanf(levelInfo.c_str(), "%d,%d,%d", &partLevel, &partsCount, &partNum) == 3) { m_levels[partLevel] = pair(partsCount, partNum); } if (endPos == string::npos) { break; } levelInfo = extractField(levels, "[", "]", endPos); } } m_messageDate.clear(); m_partCharset.clear(); m_foundDocument = false; if (((m_filePath.empty() == false) && (initializeFile() == true)) || (initializeData() == true)) { if 
(initialize() == true) { // Extract the first message at the given offset m_foundDocument = extractMessage(""); } } return m_foundDocument; } string GMimeMboxFilter::get_error(void) const { return ""; } int GMimeMboxFilter::openFile(const string &filePath) { int openFlags = O_RDONLY; #ifdef O_CLOEXEC openFlags |= O_CLOEXEC; #endif // Open the mbox file #ifdef O_NOATIME int fd = open(filePath.c_str(), openFlags|O_NOATIME); #else int fd = open(filePath.c_str(), openFlags); #endif #ifdef O_NOATIME if ((fd < 0) && (errno == EPERM)) { // Try again fd = open(filePath.c_str(), openFlags); } #endif if (fd < 0) { #ifdef DEBUG clog << "GMimeMboxFilter::openFile: couldn't open " << filePath << endl; #endif return false; } #ifndef O_CLOEXEC int fdFlags = fcntl(fd, F_GETFD); fcntl(fd, F_SETFD, fdFlags|FD_CLOEXEC); #endif return fd; } bool GMimeMboxFilter::initializeData(void) { // Create a stream m_pGMimeMboxStream = g_mime_stream_mem_new_with_buffer(m_pData, m_dataLength); if (m_pGMimeMboxStream == NULL) { return false; } ssize_t streamLength = g_mime_stream_length(m_pGMimeMboxStream); if (m_messageStart > 0) { if (m_messageStart > (GMIME_OFFSET_TYPE)streamLength) { // This offset doesn't make sense ! m_messageStart = 0; } #ifdef DEBUG clog << "GMimeMboxFilter::initializeData: from offset " << m_messageStart << " to " << streamLength << endl; #endif g_mime_stream_set_bounds(m_pGMimeMboxStream, m_messageStart, (GMIME_OFFSET_TYPE)streamLength); } return true; } bool GMimeMboxFilter::initializeFile(void) { m_fd = openFile(m_filePath); if (m_fd < 0) { return false; } // Create a stream if (m_messageStart > 0) { ssize_t streamLength = g_mime_stream_length(m_pGMimeMboxStream); if (m_messageStart > (GMIME_OFFSET_TYPE)streamLength) { // This offset doesn't make sense ! 
m_messageStart = 0; } #ifdef DEBUG clog << "GMimeMboxFilter::initializeFile: from offset " << m_messageStart << " to " << streamLength << endl; #endif #ifdef HAVE_MMAP m_pGMimeMboxStream = g_mime_stream_mmap_new_with_bounds(m_fd, PROT_READ, MAP_PRIVATE, m_messageStart, (GMIME_OFFSET_TYPE)streamLength); #else m_pGMimeMboxStream = g_mime_stream_fs_new_with_bounds(m_fd, m_messageStart, (GMIME_OFFSET_TYPE)streamLength); #endif } else { #ifdef HAVE_MMAP m_pGMimeMboxStream = g_mime_stream_mmap_new(m_fd, PROT_READ, MAP_PRIVATE); #else m_pGMimeMboxStream = g_mime_stream_fs_new(m_fd); #endif } return true; } bool GMimeMboxFilter::initialize(void) { if (m_pGMimeMboxStream == NULL) { return false; } // And a parser m_pParser = g_mime_parser_new(); if (m_pParser != NULL) { g_mime_parser_init_with_stream(m_pParser, m_pGMimeMboxStream); g_mime_parser_set_respect_content_length(m_pParser, TRUE); // Scan for mbox From-lines #if GMIME_MAJOR_VERSION >= 3 g_mime_parser_set_format(m_pParser, GMIME_FORMAT_MBOX); #else g_mime_parser_set_scan_from(m_pParser, TRUE); #endif return true; } #ifdef DEBUG clog << "GMimeMboxFilter::initialize: couldn't create new parser" << endl; #endif return false; } void GMimeMboxFilter::finalize(bool fullReset) { if (m_pMimeMessage != NULL) { if (G_IS_OBJECT(m_pMimeMessage)) { g_object_unref(m_pMimeMessage); } m_pMimeMessage = NULL; } if (m_pParser != NULL) { // FIXME: does the parser close the stream ? 
if (G_IS_OBJECT(m_pParser)) { g_object_unref(m_pParser); } m_pParser = NULL; } if (m_pGMimeMboxStream != NULL) { if (G_IS_OBJECT(m_pGMimeMboxStream)) { g_object_unref(m_pGMimeMboxStream); } m_pGMimeMboxStream = NULL; } // initializeFile() will always reopen the file if (m_fd >= 0) { close(m_fd); m_fd = -1; } if (fullReset == true) { // ...but those data fields will only be reinit'ed on a full reset m_pData = NULL; m_dataLength = 0; rewind(); } } bool GMimeMboxFilter::readStream(GMimeStream *pStream, dstring &fileBuffer) { char readBuffer[4096]; ssize_t streamLen = g_mime_stream_length(pStream); ssize_t totalSize = 0, bytesRead = 0; bool gotOutput = true; #ifdef DEBUG clog << "GMimeMboxFilter::readStream: stream is " << streamLen << " bytes long" << endl; #endif do { if ((m_maxSize > 0) && (totalSize >= m_maxSize)) { #ifdef DEBUG clog << "GMimeMboxFilter::readStream: stopping at " << totalSize << endl; #endif break; } bytesRead = g_mime_stream_read(pStream, readBuffer, 4096); if (bytesRead > 0) { fileBuffer.append(readBuffer, bytesRead); totalSize += bytesRead; } else if (bytesRead == -1) { // An error occurred if (errno != EINTR) { gotOutput = false; break; } // Try again bytesRead = 1; } } while (bytesRead > 0); #ifdef DEBUG clog << "GMimeMboxFilter::readStream: read " << totalSize << "/" << fileBuffer.size() << " bytes" << endl; #endif return gotOutput; } bool GMimeMboxFilter::nextPart(const string &subject) { if (m_pMimeMessage != NULL) { // Get the top-level MIME part in the message GMimeObject *pMimePart = g_mime_message_get_mime_part(m_pMimeMessage); if (pMimePart != NULL) { GMimeMboxPart mboxPart(subject, m_content); // Extract the part's text m_content.clear(); if (extractPart(pMimePart, mboxPart) == true) { extractMetaData(mboxPart); return true; } } if (G_IS_OBJECT(m_pMimeMessage)) { g_object_unref(m_pMimeMessage); } m_pMimeMessage = NULL; } // If we get there, no suitable parts were found m_partsCount = m_partNum = m_partLevel = -1; return false; } bool 
GMimeMboxFilter::extractPart(GMimeObject *part, GMimeMboxPart &mboxPart) { if (part == NULL) { return false; } // Message parts may be nested while (GMIME_IS_MESSAGE_PART(part)) { #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: nested message part" << endl; #endif GMimeMessage *partMessage = g_mime_message_part_get_message(GMIME_MESSAGE_PART(part)); part = g_mime_message_get_mime_part(partMessage); } // Is this a multipart ? if (GMIME_IS_MULTIPART(part)) { int partsCount = 0, partNum = 0; bool gotPart = false; m_partsCount = partsCount = g_mime_multipart_get_count(GMIME_MULTIPART(part)); ++m_currentLevel; #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: message has " << m_partsCount << " parts at level " << m_currentLevel << endl; #endif map >::iterator levelIter = m_levels.find(m_currentLevel); if (levelIter != m_levels.end()) { pair partPair = levelIter->second; #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: level " << m_currentLevel << " had " << partPair.first << " parts" << endl; #endif if (partPair.first == m_partsCount) { partNum = partPair.second; #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: restarting level " << m_currentLevel << " at part " << partNum << endl; #endif } } else { partNum = 0; } for (; partNum < m_partsCount; ++partNum) { #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: extracting part " << partNum << endl; #endif m_partNum = partNum; GMimeObject *multiMimePart = g_mime_multipart_get_part(GMIME_MULTIPART(part), partNum); if (multiMimePart == NULL) { continue; } gotPart = extractPart(multiMimePart, mboxPart); if (gotPart == true) { break; } } // Were all parts in the next level parsed ? 
levelIter = m_levels.find(m_currentLevel + 1); if ((levelIter == m_levels.end()) || (levelIter->second.second + 1 > levelIter->second.first)) { // Move to the next part at this level ++partNum; } levelIter = m_levels.find(m_currentLevel); if (levelIter != m_levels.end()) { if (partNum > levelIter->second.second) { levelIter->second.second = partNum; #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: remembering to restart level " << m_currentLevel << " at part " << partNum << endl; #endif } } else { m_levels[m_currentLevel] = pair(partsCount, partNum); #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: remembering to restart level " << m_currentLevel << " at part " << partNum << endl; #endif } --m_currentLevel; if (gotPart == true) { return true; } // None of the parts were suitable m_partsCount = m_partNum = m_partLevel = -1; } if (!GMIME_IS_PART(part)) { #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: not a part" << endl; #endif return false; } GMimePart *mimePart = GMIME_PART(part); // Check the content type GMimeContentType *mimeType = g_mime_object_get_content_type(GMIME_OBJECT(mimePart)); // Set this for caller #if GMIME_MAJOR_VERSION >= 3 char *partType = g_mime_content_type_get_mime_type(mimeType); #else char *partType = g_mime_content_type_to_string(mimeType); #endif if (partType != NULL) { #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: type is " << partType << endl; #endif mboxPart.m_contentType = partType; // Is the body in a local file ? 
if (mboxPart.m_contentType == "message/external-body") { const char *partAccessType = g_mime_content_type_get_parameter(mimeType, "access-type"); if (partAccessType != NULL) { string contentAccessType(partAccessType); #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: part access type is " << contentAccessType << endl; #endif if (contentAccessType == "local-file") { const char *partLocalFile = g_mime_content_type_get_parameter(mimeType, "name"); if (partLocalFile != NULL) { mboxPart.m_contentType = "SCAN"; mboxPart.m_subject = partLocalFile; mboxPart.m_buffer.clear(); #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: local file at " << partLocalFile << endl; #endif // Load the part from file int fd = openFile(partLocalFile); if (fd >= 0) { GMimeStream *fileStream = g_mime_stream_mmap_new(fd, PROT_READ, MAP_PRIVATE); if (fileStream != NULL) { readStream(fileStream, mboxPart.m_buffer); if (G_IS_OBJECT(fileStream)) { g_object_unref(fileStream); } } } } } else { mboxPart.m_contentType = "application/octet-stream"; #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: unknown part access type" << endl; #endif } } } g_free(partType); } // Was the part already loaded ? 
if (mboxPart.m_buffer.empty() == false) { return true; } GMimeContentEncoding encodingType = g_mime_part_get_content_encoding(mimePart); #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: encoding is " << encodingType << endl; #endif g_mime_part_set_content_encoding(mimePart, GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE); const char *fileName = g_mime_part_get_filename(mimePart); if (fileName != NULL) { #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: file name is " << fileName << endl; #endif mboxPart.m_subject = fileName; } // Create a in-memory output stream GMimeStream *memStream = g_mime_stream_mem_new(); if (memStream == NULL) { return false; } const char *charset = g_mime_content_type_get_parameter(mimeType, "charset"); if (charset != NULL) { m_partCharset = charset; #if 0 // Install a charset filter if (strncasecmp(charset, "UTF-8", 5) != 0) { GMimeFilter *charsetFilter = g_mime_filter_charset_new(charset, "UTF-8"); if (charsetFilter != NULL) { #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: converting from charset " << charset << endl; #endif g_mime_stream_filter_add(GMIME_STREAM_FILTER(memStream), charsetFilter); g_object_unref(charsetFilter); } } #endif } // Write the part to the stream #if GMIME_MAJOR_VERSION >= 3 GMimeDataWrapper *dataWrapper = g_mime_part_get_content(mimePart); #else GMimeDataWrapper *dataWrapper = g_mime_part_get_content_object(mimePart); #endif if (dataWrapper != NULL) { ssize_t writeLen = g_mime_data_wrapper_write_to_stream(dataWrapper, memStream); #ifdef DEBUG clog << "GMimeMboxFilter::extractPart: wrote " << writeLen << " bytes" << endl; #endif if (G_IS_OBJECT(dataWrapper)) { g_object_unref(dataWrapper); } } g_mime_stream_flush(memStream); if ((m_returnHeaders == true) && (mboxPart.m_contentType.length() >= 10) && (strncasecmp(mboxPart.m_contentType.c_str(), "text/plain", 10) == 0)) { #if GMIME_MAJOR_VERSION >= 3 char *pHeaders = g_mime_object_get_headers(GMIME_OBJECT(m_pMimeMessage), NULL); #else char *pHeaders = 
g_mime_object_get_headers(GMIME_OBJECT(m_pMimeMessage)); #endif if (pHeaders != NULL) { mboxPart.m_buffer = pHeaders; mboxPart.m_buffer += "\n"; free(pHeaders); } } g_mime_stream_reset(memStream); readStream(memStream, mboxPart.m_buffer); if (G_IS_OBJECT(memStream)) { g_object_unref(memStream); } m_partLevel = m_currentLevel; return true; } bool GMimeMboxFilter::extractDate(const string &header) { const char *pDate = g_mime_object_get_header(GMIME_OBJECT(m_pMimeMessage), header.c_str()); if (pDate == NULL) { return false; } string date(pDate); struct tm timeTm; timeTm.tm_sec = timeTm.tm_min = timeTm.tm_hour = timeTm.tm_mday = 0; timeTm.tm_mon = timeTm.tm_year = timeTm.tm_wday = timeTm.tm_yday = timeTm.tm_isdst = 0; if (date.find(',') != string::npos) { strptime(pDate, "%a, %d %b %Y %H:%M:%S %z", &timeTm); if (timeTm.tm_year <= 0) { strptime(pDate, "%a, %d %b %y %H:%M:%S %z", &timeTm); } } else { strptime(pDate, "%d %b %Y %H:%M:%S %z", &timeTm); if (timeTm.tm_year <= 0) { strptime(pDate, "%d %b %y %H:%M:%S %z", &timeTm); } } // Sanity check if (timeTm.tm_year <= 0) { #ifdef DEBUG clog << "GMimeMboxFilter::extractDate: ignoring bogus year " << timeTm.tm_year << endl; #endif return false; } m_messageDate = mktime(&timeTm); #ifdef DEBUG clog << "GMimeMboxFilter::extractDate: message date is " << pDate << ": " << m_messageDate << endl; #endif return true; } bool GMimeMboxFilter::extractMessage(const string &subject) { string msgSubject(subject); m_currentLevel = 0; while (g_mime_stream_eos(m_pGMimeMboxStream) == FALSE) { // Does the previous message have parts left to parse ? 
if (m_partsCount == -1) { // No, it doesn't if (m_pMimeMessage != NULL) { if (G_IS_OBJECT(m_pMimeMessage)) { g_object_unref(m_pMimeMessage); } m_pMimeMessage = NULL; } // Get the next message #if GMIME_MAJOR_VERSION >= 3 m_pMimeMessage = g_mime_parser_construct_message(m_pParser, NULL); #else m_pMimeMessage = g_mime_parser_construct_message(m_pParser); #endif if (m_pMimeMessage == NULL) { clog << "Couldn't construct new MIME message" << endl; break; } #if GMIME_MAJOR_VERSION >= 3 m_messageStart = g_mime_parser_get_mbox_marker_offset(m_pParser); #else m_messageStart = g_mime_parser_get_from_offset(m_pParser); #endif gint64 messageEnd = g_mime_parser_tell(m_pParser); #ifdef DEBUG clog << "GMimeMboxFilter::extractMessage: message between offsets " << m_messageStart << " and " << messageEnd << endl; #endif if (messageEnd > m_messageStart) { // This only applies to Mozilla const char *pMozStatus = g_mime_object_get_header(GMIME_OBJECT(m_pMimeMessage), "X-Mozilla-Status"); if (pMozStatus != NULL) { long int mozFlags = strtol(pMozStatus, NULL, 16); // Watch out for Mozilla specific flags : // MSG_FLAG_EXPUNGED, MSG_FLAG_EXPIRED // They are defined in mailnews/MailNewsTypes.h and msgbase/nsMsgMessageFlags.h if ((mozFlags & 0x0008) || (mozFlags & 0x0040)) { #ifdef DEBUG clog << "GMimeMboxFilter::extractMessage: flagged by Mozilla" << endl; #endif continue; } } // This only applies to Evolution const char *pEvoStatus = g_mime_object_get_header(GMIME_OBJECT(m_pMimeMessage), "X-Evolution"); if (pEvoStatus != NULL) { string evoStatus(pEvoStatus); string::size_type flagsPos = evoStatus.find('-'); if (flagsPos != string::npos) { long int evoFlags = strtol(evoStatus.substr(flagsPos + 1).c_str(), NULL, 16); // Watch out for Evolution specific flags : // CAMEL_MESSAGE_DELETED // It's defined in camel/camel-folder-summary.h if (evoFlags & 0x0002) { #ifdef DEBUG clog << "GMimeMboxFilter::extractMessage: flagged by Evolution" << endl; #endif continue; } } } // How old is this message ? 
if ((extractDate("Date") == false) && (extractDate("Delivery-Date") == false) && (extractDate("Resent-Date") == false)) { m_messageDate = time(NULL); #ifdef DEBUG clog << "GMimeMboxFilter::extractMessage: message date is today's " << m_messageDate << endl; #endif } // Extract the subject const char *pSubject = g_mime_message_get_subject(m_pMimeMessage); if (pSubject != NULL) { msgSubject = pSubject; } } } #ifdef DEBUG clog << "GMimeMboxFilter::extractMessage: message subject is " << msgSubject << endl; #endif if (nextPart(msgSubject) == true) { return true; } // Try the next message } // The last message may have parts left if (m_partsCount != -1) { return nextPart(msgSubject); } return false; } void GMimeMboxFilter::extractMetaData(GMimeMboxPart &mboxPart) { string ipath; char posStr[128]; // New document m_metaData.clear(); m_metaData["title"] = mboxPart.m_subject; m_metaData["mimetype"] = mboxPart.m_contentType; if (m_messageDate.empty() == false) { m_metaData["date"] = m_messageDate; } m_metaData["charset"] = m_partCharset; snprintf(posStr, 128, "%lu", m_content.length()); m_metaData["size"] = posStr; snprintf(posStr, 128, "o=%ld&l=", m_messageStart); ipath = posStr; for (map >::const_iterator levelIter = m_levels.begin(); levelIter != m_levels.end(); ++levelIter) { int partNum = max(levelIter->second.second - 1, 0); if (levelIter->first == m_partLevel) { partNum = m_partNum; } snprintf(posStr, 128, "[%d,%d,%d]", levelIter->first, levelIter->second.first, partNum); ipath += posStr; } m_metaData["ipath"] = ipath; #ifdef DEBUG clog << "GMimeMboxFilter::extractMetaData: message location is " << ipath << endl; #endif } pinot-1.10/Tokenize/filters/XmlFilter.h0000664000175000017500000000712713427323541015041 00000000000000/* * Copyright 2007-2016 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, 
or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef _DIJON_XMLFILTER_H #define _DIJON_XMLFILTER_H #include #include "Filter.h" namespace Dijon { class XmlFilter : public Filter { public: /// Builds an empty filter. XmlFilter(); /// Destroys the filter. virtual ~XmlFilter(); // Information. /// Returns what data the filter requires as input. virtual bool is_data_input_ok(DataInput input) const; // Initialization. /** Sets a property, prior to calling set_document_XXX(). * Returns false if the property is not supported. */ virtual bool set_property(Properties prop_name, const std::string &prop_value); /** (Re)initializes the filter with the given data. * Caller should ensure the given pointer is valid until the * Filter object is destroyed, as some filters may not need to * do a deep copy of the data. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_data(const char *data_ptr, off_t data_length); /** (Re)initializes the filter with the given data. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_string(const std::string &data_str); /** (Re)initializes the filter with the given file. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. 
*/ virtual bool set_document_file(const std::string &file_path, bool unlink_when_done = false); /** (Re)initializes the filter with the given URI. * Call next_document() to position the filter onto the first document. * Returns false if this input is not supported or an error occurred. */ virtual bool set_document_uri(const std::string &uri); // Going from one nested document to the next. /** Returns true if there are nested documents left to extract. * Returns false if the end of the parent document was reached * or an error occurred. */ virtual bool has_documents(void) const; /** Moves to the next nested document. * Returns false if there are none left. */ virtual bool next_document(void); /** Skips to the nested document with the given ipath. * Returns false if no such document exists. */ virtual bool skip_to_document(const std::string &ipath); // Accessing documents' contents. /// Returns the message for the most recent error that has occurred. virtual std::string get_error(void) const; protected: bool m_doneWithDocument; virtual void rewind(void); bool parse_xml(const std::string &xml_doc); private: /// XmlFilter objects cannot be copied. XmlFilter(const XmlFilter &other); /// XmlFilter objects cannot be copied. XmlFilter& operator=(const XmlFilter& other); }; } #endif // _DIJON_XMLFILTER_H pinot-1.10/Tokenize/filters/Filter.cc0000664000175000017500000000370113012075545014506 00000000000000/* * Copyright 2007-2016 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include "Filter.h" using std::string; using std::set; using std::map; using std::clog; using std::endl; using namespace Dijon; MIMETypes::MIMETypes() { } MIMETypes::~MIMETypes() { } Filter::Filter() : m_deleteInputFile(false) { } Filter::~Filter() { deleteInputFile(); } bool Filter::set_document_file(const string &file_path, bool unlink_when_done) { if (file_path.empty() == true) { return false; } rewind(); m_filePath = file_path; m_deleteInputFile = unlink_when_done; return true; } void Filter::set_mime_type(const string &mime_type) { m_mimeType = mime_type; } string Filter::get_mime_type(void) const { return m_mimeType; } const map &Filter::get_meta_data(void) const { return m_metaData; } const dstring &Filter::get_content(void) const { return m_content; } void Filter::rewind(void) { m_metaData.clear(); m_content.clear(); deleteInputFile(); m_filePath.clear(); m_deleteInputFile = false; } void Filter::deleteInputFile(void) { if ((m_deleteInputFile == true) && (m_filePath.empty() == false)) { unlink(m_filePath.c_str()); } } pinot-1.10/Tokenize/filters/FileOutputFilter.h0000664000175000017500000000225213012077501016363 00000000000000/* * Copyright 2011-2016 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef _DIJON_FILEOUTPUTFILTER_H #define _DIJON_FILEOUTPUTFILTER_H #include "Filter.h" namespace Dijon { class DIJON_FILTER_EXPORT FileOutputFilter : public Filter { public: /// Builds an empty filter. FileOutputFilter(); /// Destroys the filter. virtual ~FileOutputFilter(); protected: bool read_file(int fd, ssize_t maxSize, ssize_t &totalSize); }; } #endif // _DIJON_FILEOUTPUTFILTER_H pinot-1.10/Tokenize/filters/FilterFactory.h0000664000175000017500000000402313005331666015677 00000000000000/* * Copyright 2007 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef _DIJON_FILTERFACTORY_H #define _DIJON_FILTERFACTORY_H #include #include #include #include "Filter.h" #ifndef _DYNAMIC_DIJON_HTMLFILTER #include "HtmlFilter.h" #endif #ifndef _DYNAMIC_DIJON_XMLFILTER #include "XmlFilter.h" #endif namespace Dijon { /// Factory for filters with related utility methods. class FilterFactory { public: virtual ~FilterFactory(); /// Loads the filter libraries found in the given directory. static unsigned int loadFilters(const std::string &dir_name); /// Returns a Filter that handles the given MIME type. 
static Filter *getFilter(const std::string &mime_type); /// Returns all supported MIME types. static void getSupportedTypes(std::set &mime_types); /// Indicates whether a MIME type is supported or not. static bool isSupportedType(const std::string &mime_type); /// Unloads all filter libraries. static void unloadFilters(void); protected: static std::map m_types; static std::map m_handles; FilterFactory(); static Filter *getLibraryFilter(const std::string &mime_type); private: FilterFactory(const FilterFactory &other); FilterFactory& operator=(const FilterFactory& other); }; } #endif // _DIJON_FILTERFACTORY_H pinot-1.10/Tokenize/filters/HtmlParser.h0000664000175000017500000000301513005331666015203 00000000000000/* htmlparse.h: simple HTML parser for omega indexer * * Copyright 1999,2000,2001 BrightStation PLC * Copyright 2002,2006,2008 Olly Betts * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ #ifndef OMEGA_INCLUDED_HTMLPARSE_H #define OMEGA_INCLUDED_HTMLPARSE_H #include #include using std::string; using std::map; class HtmlParser { map parameters; protected: void decode_entities(string &s); bool in_script; string charset; static map named_ents; bool get_parameter(const string & param, string & value); public: virtual void process_text(const string &/*text*/) { } virtual void opening_tag(const string &/*tag*/) { } virtual void closing_tag(const string &/*tag*/) { } virtual void parse_html(const string &text); HtmlParser(); virtual ~HtmlParser() { } }; #endif // OMEGA_INCLUDED_HTMLPARSE_H pinot-1.10/Tokenize/filters/Exiv2ImageFilter.cc0000664000175000017500000002424613515661446016407 00000000000000/* * Copyright 2011-2019 Fabrice Colin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #include "config.h" #include #include #include #include #include #include #include #include #include #ifdef HAVE_EXIV2_XMP_EXIV2_HPP #include #include #else #include #endif #include "config.h" #include "Exiv2ImageFilter.h" using std::string; using std::clog; using std::clog; using std::endl; using namespace Dijon; #ifdef _DYNAMIC_DIJON_FILTERS DIJON_FILTER_EXPORT bool get_filter_types(MIMETypes &mime_types) { mime_types.m_mimeTypes.clear(); // List from http://dev.exiv2.org/wiki/exiv2/Supported_image_formats // without application/rdf+xml mime_types.m_mimeTypes.insert("image/jpeg"); mime_types.m_mimeTypes.insert("image/x-exv"); mime_types.m_mimeTypes.insert("image/x-canon-cr2"); mime_types.m_mimeTypes.insert("image/x-canon-crw"); mime_types.m_mimeTypes.insert("image/x-minolta-mrw"); mime_types.m_mimeTypes.insert("image/tiff"); mime_types.m_mimeTypes.insert("image/x-nikon-nef"); mime_types.m_mimeTypes.insert("image/x-pentax-pef"); mime_types.m_mimeTypes.insert("image/x-panasonic-rw2"); mime_types.m_mimeTypes.insert("image/x-samsung-srw"); mime_types.m_mimeTypes.insert("image/x-olympus-orf"); mime_types.m_mimeTypes.insert("image/png"); mime_types.m_mimeTypes.insert("image/pgf"); mime_types.m_mimeTypes.insert("image/x-fuji-raf"); mime_types.m_mimeTypes.insert("image/x-photoshop"); mime_types.m_mimeTypes.insert("image/targa"); mime_types.m_mimeTypes.insert("image/x-ms-bmp"); mime_types.m_mimeTypes.insert("image/jp2"); return true; } DIJON_FILTER_EXPORT bool check_filter_data_input(int data_input) { Filter::DataInput input = (Filter::DataInput)data_input; if (input == Filter::DOCUMENT_FILE_NAME) { return true; } return false; } DIJON_FILTER_EXPORT Filter *get_filter(void) { return new Exiv2ImageFilter(); } #endif static string iptcDateTime(const string &ccyymmdd, const string &hhmmss) { struct tm timeTm; // Initialize the structure timeTm.tm_sec = timeTm.tm_min = timeTm.tm_hour = timeTm.tm_mday = 0; timeTm.tm_mon = timeTm.tm_year = timeTm.tm_wday = timeTm.tm_yday = 
timeTm.tm_isdst = 0; #ifdef HAVE_STRPTIME if ((strptime(ccyymmdd.c_str(), "%C%Y%m%d", &timeTm) != NULL) && (strptime(hhmmss.c_str(), "%H%M%S", &timeTm) != NULL)) #else timeTm.tm_year = atoi(ccyymmdd.substr(2, 4).c_str()); timeTm.tm_mon = atoi(ccyymmdd.substr(6, 2).c_str()); timeTm.tm_mday = atoi(ccyymmdd.substr(8, 2).c_str()); timeTm.tm_hour = atoi(hhmmss.substr(0, 2).c_str()); timeTm.tm_min = atoi(hhmmss.substr(2, 2).c_str()); timeTm.tm_sec = atoi(hhmmss.substr(4, 2).c_str()); if (timeTm.tm_yday > 0) #endif { char timeStr[64]; if (strftime(timeStr, 64, "%a, %d %b %Y %H:%M:%S", &timeTm) > 0) { #ifdef DEBUG clog << "IPTC " << ccyymmdd << " " << hhmmss << " is " << timeStr << endl; #endif return timeStr; } } return ""; } static string exifDateTime(const string &value) { struct tm timeTm; // Initialize the structure timeTm.tm_sec = timeTm.tm_min = timeTm.tm_hour = timeTm.tm_mday = 0; timeTm.tm_mon = timeTm.tm_year = timeTm.tm_wday = timeTm.tm_yday = timeTm.tm_isdst = 0; #ifdef HAVE_STRPTIME if (strptime(value.c_str(), "%Y:%m:%d %H:%M:%S", &timeTm) != NULL) #else timeTm.tm_year = atoi(value.substr(0, 4).c_str()); timeTm.tm_mon = atoi(value.substr(5, 2).c_str()); timeTm.tm_mday = atoi(value.substr(8, 2).c_str()); timeTm.tm_hour = atoi(value.substr(11, 2).c_str()); timeTm.tm_min = atoi(value.substr(14, 2).c_str()); timeTm.tm_sec = atoi(value.substr(17, 2).c_str()); if (timeTm.tm_mday > 0) #endif { char timeStr[64]; if (strftime(timeStr, 64, "%a, %d %b %Y %H:%M:%S", &timeTm) > 0) { #ifdef DEBUG clog << "EXIF " << value << " is " << timeStr << endl; #endif return timeStr; } } return ""; } Exiv2ImageFilter::Exiv2ImageFilter() : Filter(), m_parseDocument(false) { } Exiv2ImageFilter::~Exiv2ImageFilter() { rewind(); } bool Exiv2ImageFilter::is_data_input_ok(DataInput input) const { if (input == DOCUMENT_FILE_NAME) { return true; } return false; } bool Exiv2ImageFilter::set_property(Properties prop_name, const string &prop_value) { return false; } bool 
Exiv2ImageFilter::set_document_data(const char *data_ptr, off_t data_length) { return false; } bool Exiv2ImageFilter::set_document_string(const string &data_str) { return false; } bool Exiv2ImageFilter::set_document_file(const string &file_path, bool unlink_when_done) { if (Filter::set_document_file(file_path, unlink_when_done) == true) { m_parseDocument = true; return true; } return false; } bool Exiv2ImageFilter::set_document_uri(const string &uri) { return false; } bool Exiv2ImageFilter::has_documents(void) const { return m_parseDocument; } bool Exiv2ImageFilter::next_document(void) { bool foundData = true; if (m_parseDocument == false) { return false; } #ifdef DEBUG clog << "Exiv2ImageFilter::next_document: " << m_filePath << endl; #endif m_parseDocument = false; m_metaData["mimetype"] = "text/plain"; m_metaData["charset"] = "utf-8"; m_metaData["title"] = m_filePath; try { Exiv2::Image::AutoPtr image = Exiv2::ImageFactory::open(m_filePath); if (image.get() == NULL) { clog << m_filePath.c_str() << " is not an image" << endl; return false; } image->readMetadata(); // Tag reference at http://www.exiv2.org/metadata.html Exiv2::XmpData &xmpData = image->xmpData(); if (xmpData.empty() == false) { #ifdef DEBUG clog << "Exiv2ImageFilter::next_document: XMP data in " << m_filePath << endl; #endif for (Exiv2::XmpData::const_iterator tagIter = xmpData.begin(); tagIter != xmpData.end(); ++tagIter) { const char *pTypeName = tagIter->typeName(); if ((pTypeName == NULL) || (strncasecmp(pTypeName, "Text", 4) != 0)) { continue; } const Exiv2::Value &value = tagIter->value(); string key(tagIter->key()); string valueStr(value.toString()); if (valueStr.empty() == false) { m_content += " "; m_content.append(key.c_str(), key.length()); m_content += " "; m_content.append(valueStr.c_str(), valueStr.length()); } #ifdef DEBUG clog << "Exiv2ImageFilter::next_document: " << key << "=" << value << endl; #endif } } Exiv2::IptcData &iptcData = image->iptcData(); if (iptcData.empty() == 
false) { string iptcDate, iptcTime; #ifdef DEBUG clog << "Exiv2ImageFilter::next_document: IPTC data in " << m_filePath << endl; #endif for (Exiv2::IptcData::const_iterator tagIter = iptcData.begin(); tagIter != iptcData.end(); ++tagIter) { const char *pTypeName = tagIter->typeName(); if (pTypeName == NULL) { continue; } const Exiv2::Value &value = tagIter->value(); string key(tagIter->key()); string valueStr(value.toString()); #ifdef DEBUG clog << "Exiv2ImageFilter::next_document: " << key << "=" << value << endl; #endif if ((strncasecmp(pTypeName, "Date", 4) == 0) && (key == "Iptc.Application2.DateCreated")) { iptcDate = valueStr; } else if ((strncasecmp(pTypeName, "Time", 4) == 0) && (key == "Iptc.Application2.TimeCreated")) { iptcTime = valueStr; } else if (strncasecmp(pTypeName, "String", 6) != 0) { continue; } if (key.find(".ObjectName") != string::npos) { m_metaData["title"] = valueStr; } else if (valueStr.empty() == false) { m_content += " "; m_content.append(key.c_str(), key.length()); m_content += " "; m_content.append(valueStr.c_str(), valueStr.length()); } } if ((iptcDate.empty() == false) || (iptcTime.empty() == false)) { m_metaData["date"] = iptcDateTime(iptcDate, iptcTime); } } Exiv2::ExifData &exifData = image->exifData(); if (exifData.empty() == false) { bool foundDate = false; #ifdef DEBUG clog << "Exiv2ImageFilter::next_document: EXIF data in " << m_filePath << endl; #endif for (Exiv2::ExifData::const_iterator tagIter = exifData.begin(); tagIter != exifData.end(); ++tagIter) { const char *pTypeName = tagIter->typeName(); if ((pTypeName == NULL) || (strncasecmp(pTypeName, "Ascii", 5) != 0)) { continue; } const Exiv2::Value &value = tagIter->value(); string key(tagIter->key()); string valueStr(value.toString()); #ifdef DEBUG clog << "Exiv2ImageFilter::next_document: " << key << "=" << value << endl; #endif if (key == "Exif.Image.DocumentName") { m_metaData["title"] = valueStr; } else if (key.find("Date") != string::npos) { if (((key == 
"Exif.Photo.DateTimeOriginal") || (key == "Exif.Image.DateTimeOriginal")) && (foundDate == false)) { m_metaData["date"] = exifDateTime(valueStr); foundDate = true; } } else if (valueStr.empty() == false) { m_content += " "; m_content.append(key.c_str(), key.length()); m_content += " "; m_content.append(valueStr.c_str(), valueStr.length()); } } } } catch (Exiv2::AnyError &e) { clog << "Caught exiv2 exception: " << e << endl; foundData = false; } catch (...) { clog << "Caught unknown exception" << endl; foundData = false; } return foundData; } bool Exiv2ImageFilter::skip_to_document(const string &ipath) { if (ipath.empty() == true) { return next_document(); } return false; } string Exiv2ImageFilter::get_error(void) const { return ""; } void Exiv2ImageFilter::rewind(void) { Filter::rewind(); m_parseDocument = false; } pinot-1.10/Tokenize/filters/HtmlParser.cc0000664000175000017500000003503213005331666015345 00000000000000/* htmlparse.cc: simple HTML parser for omega indexer * * Copyright 1999,2000,2001 BrightStation PLC * Copyright 2001 Ananova Ltd * Copyright 2002,2006,2007,2008 Olly Betts * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ #include #include #include #include #include #include #include "config.h" #include "HtmlParser.h" using namespace std; inline void lowercase_string(string &str) { for (string::iterator i = str.begin(); i != str.end(); ++i) { *i = tolower(static_cast(*i)); } } map HtmlParser::named_ents; inline static bool p_notdigit(char c) { return !isdigit(static_cast(c)); } inline static bool p_notxdigit(char c) { return !isxdigit(static_cast(c)); } inline static bool p_notalnum(char c) { return !isalnum(static_cast(c)); } inline static bool p_notwhitespace(char c) { return !isspace(static_cast(c)); } inline static bool p_nottag(char c) { return !isalnum(static_cast(c)) && c != '.' && c != '-' && c != ':'; // ':' for XML namespaces. } inline static bool p_whitespacegt(char c) { return isspace(static_cast(c)) || c == '>'; } inline static bool p_whitespaceeqgt(char c) { return isspace(static_cast(c)) || c == '=' || c == '>'; } static unsigned nonascii_to_utf8(unsigned ch, char * buf) { // FIXME: use CJKVTokenizer's _unicode_to_char() if (ch < 0x800) { buf[0] = 0xc0 | (ch >> 6); buf[1] = 0x80 | (ch & 0x3f); return 2; } if (ch < 0x10000) { buf[0] = 0xe0 | (ch >> 12); buf[1] = 0x80 | ((ch >> 6) & 0x3f); buf[2] = 0x80 | (ch & 0x3f); return 3; } if (ch < 0x200000) { buf[0] = 0xf0 | (ch >> 18); buf[1] = 0x80 | ((ch >> 12) & 0x3f); buf[2] = 0x80 | ((ch >> 6) & 0x3f); buf[3] = 0x80 | (ch & 0x3f); return 4; } return 0; } bool HtmlParser::get_parameter(const string & param, string & value) { map::const_iterator i = parameters.find(param); if (i == parameters.end()) return false; value = i->second; return true; } HtmlParser::HtmlParser() { static const struct ent { const char *n; unsigned int v; } ents[] = { // Names and values from: "Character entity references in HTML 4" // 
http://www.w3.org/TR/html4/sgml/entities.html { "quot", 34 }, { "amp", 38 }, { "apos", 39 }, // Not in HTML 4 list but used in OpenOffice XML. { "lt", 60 }, { "gt", 62 }, { "nbsp", 160 }, { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 }, { "yen", 165 }, { "brvbar", 166 }, { "sect", 167 }, { "uml", 168 }, { "copy", 169 }, { "ordf", 170 }, { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 }, { "macr", 175 }, { "deg", 176 }, { "plusmn", 177 }, { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 }, { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 }, { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 }, { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 }, { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 }, { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 }, { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 }, { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 }, { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 }, { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 }, { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 }, { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 }, { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 }, { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 }, { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 }, { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 }, { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 }, { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 }, { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 }, { "OElig", 338 }, { "oelig", 339 }, { "Scaron", 352 }, { "scaron", 353 }, { "Yuml", 376 }, { 
"fnof", 402 }, { "circ", 710 }, { "tilde", 732 }, { "Alpha", 913 }, { "Beta", 914 }, { "Gamma", 915 }, { "Delta", 916 }, { "Epsilon", 917 }, { "Zeta", 918 }, { "Eta", 919 }, { "Theta", 920 }, { "Iota", 921 }, { "Kappa", 922 }, { "Lambda", 923 }, { "Mu", 924 }, { "Nu", 925 }, { "Xi", 926 }, { "Omicron", 927 }, { "Pi", 928 }, { "Rho", 929 }, { "Sigma", 931 }, { "Tau", 932 }, { "Upsilon", 933 }, { "Phi", 934 }, { "Chi", 935 }, { "Psi", 936 }, { "Omega", 937 }, { "alpha", 945 }, { "beta", 946 }, { "gamma", 947 }, { "delta", 948 }, { "epsilon", 949 }, { "zeta", 950 }, { "eta", 951 }, { "theta", 952 }, { "iota", 953 }, { "kappa", 954 }, { "lambda", 955 }, { "mu", 956 }, { "nu", 957 }, { "xi", 958 }, { "omicron", 959 }, { "pi", 960 }, { "rho", 961 }, { "sigmaf", 962 }, { "sigma", 963 }, { "tau", 964 }, { "upsilon", 965 }, { "phi", 966 }, { "chi", 967 }, { "psi", 968 }, { "omega", 969 }, { "thetasym", 977 }, { "upsih", 978 }, { "piv", 982 }, { "ensp", 8194 }, { "emsp", 8195 }, { "thinsp", 8201 }, { "zwnj", 8204 }, { "zwj", 8205 }, { "lrm", 8206 }, { "rlm", 8207 }, { "ndash", 8211 }, { "mdash", 8212 }, { "lsquo", 8216 }, { "rsquo", 8217 }, { "sbquo", 8218 }, { "ldquo", 8220 }, { "rdquo", 8221 }, { "bdquo", 8222 }, { "dagger", 8224 }, { "Dagger", 8225 }, { "bull", 8226 }, { "hellip", 8230 }, { "permil", 8240 }, { "prime", 8242 }, { "Prime", 8243 }, { "lsaquo", 8249 }, { "rsaquo", 8250 }, { "oline", 8254 }, { "frasl", 8260 }, { "euro", 8364 }, { "image", 8465 }, { "weierp", 8472 }, { "real", 8476 }, { "trade", 8482 }, { "alefsym", 8501 }, { "larr", 8592 }, { "uarr", 8593 }, { "rarr", 8594 }, { "darr", 8595 }, { "harr", 8596 }, { "crarr", 8629 }, { "lArr", 8656 }, { "uArr", 8657 }, { "rArr", 8658 }, { "dArr", 8659 }, { "hArr", 8660 }, { "forall", 8704 }, { "part", 8706 }, { "exist", 8707 }, { "empty", 8709 }, { "nabla", 8711 }, { "isin", 8712 }, { "notin", 8713 }, { "ni", 8715 }, { "prod", 8719 }, { "sum", 8721 }, { "minus", 8722 }, { "lowast", 8727 }, { "radic", 8730 }, { 
"prop", 8733 }, { "infin", 8734 }, { "ang", 8736 }, { "and", 8743 }, { "or", 8744 }, { "cap", 8745 }, { "cup", 8746 }, { "int", 8747 }, { "there4", 8756 }, { "sim", 8764 }, { "cong", 8773 }, { "asymp", 8776 }, { "ne", 8800 }, { "equiv", 8801 }, { "le", 8804 }, { "ge", 8805 }, { "sub", 8834 }, { "sup", 8835 }, { "nsub", 8836 }, { "sube", 8838 }, { "supe", 8839 }, { "oplus", 8853 }, { "otimes", 8855 }, { "perp", 8869 }, { "sdot", 8901 }, { "lceil", 8968 }, { "rceil", 8969 }, { "lfloor", 8970 }, { "rfloor", 8971 }, { "lang", 9001 }, { "rang", 9002 }, { "loz", 9674 }, { "spades", 9824 }, { "clubs", 9827 }, { "hearts", 9829 }, { "diams", 9830 }, { NULL, 0 } }; if (named_ents.empty()) { const struct ent *i = ents; while (i->n) { named_ents[string(i->n)] = i->v; ++i; } } } void HtmlParser::decode_entities(string &s) { // We need a const_iterator version of s.end() - otherwise the // find() and find_if() templates don't work... string::const_iterator amp = s.begin(), s_end = s.end(); while ((amp = find(amp, s_end, '&')) != s_end) { unsigned int val = 0; string::const_iterator end, p = amp + 1; if (p != s_end && *p == '#') { p++; if (p != s_end && (*p == 'x' || *p == 'X')) { // hex p++; end = find_if(p, s_end, p_notxdigit); sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); } else { // number end = find_if(p, s_end, p_notdigit); val = atoi(s.substr(p - s.begin(), end - p).c_str()); } } else { end = find_if(p, s_end, p_notalnum); string code = s.substr(p - s.begin(), end - p); map::const_iterator i; i = named_ents.find(code); if (i != named_ents.end()) val = i->second; } if (end < s_end && *end == ';') end++; if (val) { string::size_type amp_pos = amp - s.begin(); if (val < 0x80) { s.replace(amp_pos, end - amp, 1u, char(val)); } else { // Convert unicode value val to UTF-8. char seq[4]; unsigned len = nonascii_to_utf8(val, seq); s.replace(amp_pos, end - amp, seq, len); } s_end = s.end(); // We've modified the string, so the iterators are no longer // valid... 
amp = s.begin() + amp_pos + 1; } else { amp = end; } } } void HtmlParser::parse_html(const string &body) { in_script = false; parameters.clear(); string::const_iterator start = body.begin(); while (true) { // Skip through until we find an HTML tag, a comment, or the end of // document. Ignore isolated occurrences of `<' which don't start // a tag or comment. string::const_iterator p = start; while (true) { p = find(p, body.end(), '<'); if (p == body.end()) break; unsigned char ch = *(p + 1); // Tag, closing tag, or comment (or SGML declaration). if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; if (ch == '?') { // PHP code or XML declaration. // XML declaration is only valid at the start of the first line. // FIXME: need to deal with BOMs... if (p != body.begin() || body.size() < 20) break; // XML declaration looks something like this: // if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; if (strchr(" \t\r\n", p[5]) == NULL) break; string::const_iterator decl_end = find(p + 6, body.end(), '?'); if (decl_end == body.end()) break; // Default charset for XML is UTF-8. charset = "UTF-8"; string decl(p + 6, decl_end); size_t enc = decl.find("encoding"); if (enc == string::npos) break; enc = decl.find_first_not_of(" \t\r\n", enc + 8); if (enc == string::npos || enc == decl.size()) break; if (decl[enc] != '=') break; enc = decl.find_first_not_of(" \t\r\n", enc + 1); if (enc == string::npos || enc == decl.size()) break; if (decl[enc] != '"' && decl[enc] != '\'') break; char quote = decl[enc++]; size_t enc_end = decl.find(quote, enc); if (enc != string::npos) charset = decl.substr(enc, enc_end - enc); break; } p++; } // Process text up to start of tag. 
if (p > start) { string text = body.substr(start - body.begin(), p - start); #if 0 convert_to_utf8(text, charset); #endif decode_entities(text); process_text(text); } if (p == body.end()) break; start = p + 1; if (start == body.end()) break; if (*start == '!') { if (++start == body.end()) break; if (++start == body.end()) break; // comment or SGML declaration if (*(start - 1) == '-' && *start == '-') { ++start; string::const_iterator close = find(start, body.end(), '>'); // An unterminated comment swallows rest of document // (like Netscape, but unlike MSIE IIRC) if (close == body.end()) break; p = close; // look for --> while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) p = find(p + 1, body.end(), '>'); if (p != body.end()) { // Check for htdig's "ignore this bit" comments. if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { string::size_type i; i = body.find("", p + 1 - body.begin()); if (i == string::npos) break; start = body.begin() + i + 21; continue; } // If we found --> skip to there. start = p; } else { // Otherwise skip to the first > we found (as Netscape does). 
start = close; } } else { // just an SGML declaration, perhaps giving the DTD - ignore it start = find(start - 1, body.end(), '>'); if (start == body.end()) break; } ++start; } else if (*start == '?') { if (++start == body.end()) break; // PHP - swallow until ?> or EOF start = find(start + 1, body.end(), '>'); // look for ?> while (start != body.end() && *(start - 1) != '?') start = find(start + 1, body.end(), '>'); // unterminated PHP swallows rest of document (rather arbitrarily // but it avoids polluting the database when things go wrong) if (start != body.end()) ++start; } else { // opening or closing tag int closing = 0; if (*start == '/') { closing = 1; start = find_if(start + 1, body.end(), p_notwhitespace); } p = start; start = find_if(start, body.end(), p_nottag); string tag = body.substr(p - body.begin(), start - p); // convert tagname to lowercase lowercase_string(tag); if (closing) { closing_tag(tag); if (in_script && tag == "script") in_script = false; /* ignore any bogus parameters on closing tags */ p = find(start, body.end(), '>'); if (p == body.end()) break; start = p + 1; } else { // FIXME: parse parameters lazily. 
while (start < body.end() && *start != '>') { string name, value; p = find_if(start, body.end(), p_whitespaceeqgt); name.assign(body, start - body.begin(), p - start); p = find_if(p, body.end(), p_notwhitespace); start = p; if (start != body.end() && *start == '=') { start = find_if(start + 1, body.end(), p_notwhitespace); p = body.end(); int quote = *start; if (quote == '"' || quote == '\'') { start++; p = find(start, body.end(), quote); } if (p == body.end()) { // unquoted or no closing quote p = find_if(start, body.end(), p_whitespacegt); } value.assign(body, start - body.begin(), p - start); start = find_if(p, body.end(), p_notwhitespace); if (!name.empty()) { // convert parameter name to lowercase lowercase_string(name); // in case of multiple entries, use the first // (as Netscape does) parameters.insert(make_pair(name, value)); } } } opening_tag(tag); parameters.clear(); // In