pax_global_header00006660000000000000000000000064137660356160014527gustar00rootroot0000000000000052 comment=a4efe7a6508333a2e4f50f66223d212770e0f098 paraglob-0.6.0/000077500000000000000000000000001376603561600133215ustar00rootroot00000000000000paraglob-0.6.0/.gitignore000066400000000000000000000000721376603561600153100ustar00rootroot00000000000000# Ignore build output *.o *.a *.out Makefile .cmake build paraglob-0.6.0/.update-changes.cfg000066400000000000000000000002051376603561600167450ustar00rootroot00000000000000 # Automatically adapt version in files. function new_version_hook { version=$1 replace_version_in_rst README.md $version } paraglob-0.6.0/CHANGES000066400000000000000000000104061376603561600143150ustar00rootroot00000000000000 0.6.0 | 2020-12-07 17:50:29 +0000 * Release 0.6.0. 0.5.0-10 | 2020-12-07 14:16:50 +0000 * Baseline refresh to reflect btest 0.64 (Christian Kreibich, Corelight) 0.5.0-8 | 2020-12-02 11:06:31 -0800 * Update minimum required CMake to 3.5 (Jon Siwek, Corelight) 0.5.0-6 | 2020-12-01 09:54:14 -0800 * Fix compiler warning about copying loop variable. (Robin Sommer, Corelight) 0.5.0-4 | 2020-11-24 15:12:15 -0800 * Rely on GNUInstallDirs for libdir selection (Christian Kreibich, Corelight) This allows libdir overrides to be inherited in Zeek-bundled installs. 0.5.0-2 | 2020-09-21 13:35:55 -0700 * Avoid calling qsort() with null pointer argument (Jon Siwek, Corelight) Likely no ill-effects of doing so since number of elements was always observed to be zero whenever a null pointer was passed, but qsort() may technically be annotated with a `nonnull` attribute, so that triggers an error when using `-fsanitize=nonnull-attribute`. 0.5.0 | 2020-02-18 15:21:21 -0800 * Release 0.5.0. 
0.4-3 | 2020-02-18 15:19:21 -0800 * GH-16: fix undefined references in libparaglob.a (Jon Siwek, Corelight) 0.4-2 | 2020-02-18 14:49:20 -0800 * Remove empty paraglob.h (Jon Siwek, Corelight) 0.4-1 | 2020-02-18 10:18:50 -0800 * Replace non-standard variable-length-array usage in benchmark (Jon Siwek, Corelight) 0.4 | 2020-01-13 11:32:09 +0000 * Release 0.4. 0.3-12 | 2020-01-13 11:31:10 +0000 * Increase max allowed time for benchmark test to increase reliability. (Jon Siwek, Corelight) * Set pattern ID type in AhoCorasickPlus::addPattern to avoid risk of invalid memory accesses. (Jon Siwek, Corelight) 0.3-9 | 2020-01-03 12:03:22 -0700 * Change AhoCorasickPlus::addPattern() to take a string_view (Jon Siwek, Corelight) * Fix use-after-free bug The default assumption of AhoCorasick::addPattern() was that the memory used to store pattern strings is available for the entire lifetime of the process, but Paraglob initialization was passing in temporary std::string objects. (Jon Siwek, Corelight) * Fix btest.cfg PATH (Jon Siwek, Corelight) 0.3-5 | 2019-10-29 09:10:19 -0700 * Update RequireCXX17.cmake to recent Zeek version (Jon Siwek, Corelight) Uses c++1z flag on Clang 4.0 for C++17 0.3-4 | 2019-10-28 18:03:55 -0700 * Update RequireCXX17.cmake to remove CMake < 3.0 compat code (Jon Siwek, Corelight) * Require CMake 3.0+ (Jon Siwek, Corelight) * Move CMake project() after cmake_minimum_required() (Jon Siwek, Corelight) * Switch to require C++17. (Robin Sommer, Corelight) 0.3 | 2019-09-30 21:07:37 -0700 * Re-organize and install all paraglob header files (Jon Siwek, Corelight) Moves all paraglob headers into include/paraglob and installs them. Also shortened some filenames since they'll be included via paraglob subdirectory now, like "paraglob/paraglob.h". Moves ahocorasick headers into src/ since they're not part of the public API. 
Related to https://github.com/zeek/zeek/issues/602 * Forward declare AhoCorasickPlus in paraglob.h (Jon Siwek, Corelight) Such that it becomes an implementation detail and no longer requires installation of ahocorasick headers along with paraglob's. * Remove empty paraglob.h (Jon Siwek, Corelight) 0.2 | 2019-07-09 17:39:28 -0700 * Release v0.2 * Some cleanup for Coverity. (Zeke Medley, Corelight) * Clarify the type of patterns supported. (Zeke Medley) Also updates the syntax to reflect a small change made in the merge. 0.1-19 | 2019-06-06 22:11:05 +0000 * Make release default build configuration. (Zeke Medley, Corelight) 0.1-15 | 2019-06-05 19:29:03 +0000 * Improve building and testing. (Zeke Medley, Corelight) 0.1-12 | 2019-06-04 18:59:59 +0000 * Communicate internal failures by throwing exceptions. (Zeke Medley, Corelight) 0.1-9 | 2019-05-29 18:03:48 +0000 * Make paraglob serializable. (Zeke Medley, Corelight) 0.1-4 | 2019-05-20 11:19:53 -0700 * Build from the current source directory. (Zeke Medley) * Update update-changes config. (Robin Sommer, Corelight) * Fixing README. (Robin Sommer, Corelight) 0.1 | 2019-05-13 23:36:45 +0000 * Port to C++ & update data structures for better performance. (Zeke Medley, Corelight) 0.01 | 2012-11-08 20:19:59 -0800 * Starting CHANGES. 
paraglob-0.6.0/CMakeLists.txt000066400000000000000000000017021376603561600160610ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.5 FATAL_ERROR) project(paraglob) include(GNUInstallDirs) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif(NOT CMAKE_BUILD_TYPE) string(TOUPPER ${CMAKE_BUILD_TYPE} build_affix) # Modifies CXX_FLAGS && CXX_FLAGS_RELEASE include(RequireCXX17.cmake) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") set(CMAKE_CXX_FLAGS_RELEASE "-O3") include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src) add_subdirectory(src) add_subdirectory(tools) install(DIRECTORY include/paraglob DESTINATION include FILES_MATCHING PATTERN "*.h") set(summary "=================| Paraglob Config Summary |===================" "\nBUILD_TYPE: ${build_affix}" "\nCXX_FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${build_affix}}" "\n=================================================================" ) message("\n" ${summary} "\n") paraglob-0.6.0/COPYING000066400000000000000000000033511376603561600143560ustar00rootroot00000000000000Copyright (c) 2019, Corelight Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: (1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. (3) Neither the name of Corelight nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Note that some files in the distribution may carry their own copyright notices. WITH EXPLICIT PERMISSION OF ITS AUTHOR, KAMIAR KANANI, THE MULTIFAST PROJECT (http://multifast.sourceforge.net/) CAN BE USED IN THE ZEEK PROJECT (https://github.com/zeek) UNDER BSD LICENCE. paraglob-0.6.0/README.md000066400000000000000000000045131376603561600146030ustar00rootroot00000000000000# Paraglob 2 #### A fairly quick data structure for matching a string against a large list of patterns. For example, given a list of patterns ``` {*og, do*, ca*, plant} ``` and an input string `dog`, paraglob will return ``` {*og, do*} ``` ## How it works For any pattern, there exist a set of sub-strings that a string must contain in order for it to have any hope of matching against that pattern. We call these meta-words. Here are some examples: ``` *og -> |og| dog*fish -> |dog| |fish| ``` When a pattern is added to a Paraglob the pattern is stored and is split into its meta-words. Those meta words are then added to an Aho-Corasick data structure that can be found in `multifast-ac`. When Paraglob is given a query, it first gets the meta-words contained in the query using `multifast-ac`. 
Then, it builds a set of all patterns associated with those meta-words and runs `fnmatch` on the query and those patterns. It finally returns a vector of all the patterns that match. ## Installation ``` # ./configure && make && make test && make install ``` ## How to use it `paraglob-test` is a small benchmarking script that takes three parameters: the number of patterns to generate, the number of queries to perform, and the percentage of generated patterns that will match. As an example, running `paraglob-test 10000 50 50` will add 10,000 patterns, perform 50 queries on them (of which 50% should match), and then return the results. ## Inside Zeek Paraglob is integrated with Zeek & provides a simple API inside of its scripting language. In Zeek, paraglob is implemented as an `OpaqueType` and its syntax closely follows other similar constructs inside Zeek. A paraglob can only be instantiated once from a vector of patterns and then only supports get operations which return a vector of all patterns matching an input string. These patterns are different from the `pattern` type in Zeek in that they are just strings. The syntax is as follows: ``` local v = vector("*", "d?g", "*og", "d?", "d[!wl]g"); local p = paraglob_init(v); print paraglob_match(p, "dog"); ``` out: ``` [*, *og, d?g, d[!wl]g] ``` ## Notes Paraglob can make queries very quickly, but does not build instantly. It takes about 1.5 seconds to build for 10,000 items, 3 seconds for 20,000, and so on. This is because of the time required to build the Aho-Corasick structure. paraglob-0.6.0/RequireCXX17.cmake000066400000000000000000000044531376603561600165000ustar00rootroot00000000000000# Detect if compiler version is sufficient for supporting C++17. # If it is, CMAKE_CXX_FLAGS are modified appropriately and HAVE_CXX17 # is set to a true value. Else, CMake exits with a fatal error message. # This currently only works for GCC and Clang compilers. 
# In Cmake 3.8+, CMAKE_CXX_STANDARD_REQUIRED should be able to replace # all the logic below. if ( DEFINED HAVE_CXX17 ) return() endif () include(CheckCXXSourceCompiles) set(required_gcc_version 7.0) set(required_clang_version 4.0) set(required_apple_clang_version 6.0) set(cxx17_flag "-std=c++17") macro(cxx17_compile_test) check_cxx_source_compiles(" #include int main() { std::optional a; }" cxx17_works) if (NOT cxx17_works) message(FATAL_ERROR "failed using C++17 for compilation") endif () endmacro() if ( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" ) if ( CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${required_gcc_version} ) message(FATAL_ERROR "GCC version must be at least " "${required_gcc_version} for C++17 support, detected: " "${CMAKE_CXX_COMPILER_VERSION}") endif () elseif ( CMAKE_CXX_COMPILER_ID STREQUAL "Clang" ) if ( CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${required_clang_version} ) message(FATAL_ERROR "Clang version must be at least " "${required_clang_version} for C++17 support, detected: " "${CMAKE_CXX_COMPILER_VERSION}") endif () if ( CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5 ) set(cxx17_flag "-std=c++1z") endif () elseif ( CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) if ( CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${required_apple_clang_version} ) message(FATAL_ERROR "Apple Clang version must be at least " "${required_apple_clang_version} for C++17 support, detected: " "${CMAKE_CXX_COMPILER_VERSION}") endif () else() # Unrecognized compiler: fine to be permissive of other compilers as long # as they are able to support C++17 and can compile the test program, but # we just won't be able to give specific advice on what compiler version a # user needs in the case it actually doesn't support C++17. 
#!/bin/sh
# Convenience wrapper for easily viewing/setting options that
# the project's CMake scripts will recognize
#
# Adapted from Bro's wrapper.

set -e
command="$0 $*"

# check for `cmake` command
type cmake > /dev/null 2>&1 || {
    echo "\
This package requires CMake, please install it first, then you may
use this configure script to access CMake equivalent functionality.\
" >&2;
    exit 1;
}

usage="\
Usage: $0 [OPTION]... [VAR=VALUE]...

  Build Options:
    --builddir=DIR         Place build files in directory [build]
    --enable-debug         Compile in debugging mode
    --generator=GENERATOR  CMake generator to use (see cmake --help)
    --prefix=PATH          Installation prefix [/usr/local]
"

sourcedir="$( cd "$( dirname "$0" )" && pwd )"

# Function to append a CMake cache entry definition to the
# CMakeCacheEntries variable.
#   $1 is the cache entry variable name
#   $2 is the cache entry variable type
#   $3 is the cache entry variable value
append_cache_entry () {
    CMakeCacheEntries="$CMakeCacheEntries -D $1:$2=$3"
}

# Function to remove a CMake cache entry definition from the
# CMakeCacheEntries variable
#   $1 is the cache entry variable name
remove_cache_entry () {
    CMakeCacheEntries="$CMakeCacheEntries -U $1"

    # Even with -U, cmake still warns by default if
    # added previously with -D.
    CMakeCacheEntries="$CMakeCacheEntries --no-warn-unused-cli"
}

# set defaults
builddir=build
prefix=/usr/local
CMakeCacheEntries=""
append_cache_entry CMAKE_INSTALL_PREFIX PATH   "$prefix"
append_cache_entry CMAKE_BUILD_TYPE     STRING Release

# parse arguments
while [ $# -ne 0 ]; do
    # Everything after the first '=' is the option's value.
    case "$1" in
        -*=*) optarg=$(echo "$1" | sed 's/[-_a-zA-Z0-9]*=//') ;;
        *) optarg= ;;
    esac

    case "$1" in
        --help|-h)
            echo "${usage}" 1>&2
            exit 1
            ;;
        --builddir=*)
            builddir=$optarg
            ;;
        --generator=*)
            CMakeGenerator="$optarg"
            ;;
        --prefix=*)
            # A later -D for the same variable overrides the default above.
            append_cache_entry CMAKE_INSTALL_PREFIX PATH "$optarg"
            ;;
        --enable-debug)
            append_cache_entry CMAKE_BUILD_TYPE STRING Debug
            ;;
        *)
            echo "Invalid option '$1'.  Try $0 --help to see available options." >&2
            exit 1
            ;;
    esac
    shift
done

if [ -d "$builddir" ]; then
    # Delete any existing CMake cache so that this configuration
    # is not tainted by a previous one.
    rm -f "$builddir/CMakeCache.txt"
else
    # Create build directory
    mkdir -p "$builddir"
fi

echo "Build Directory : $builddir"
echo "Source Directory: $sourcedir"
cd "$builddir"

# Run cmake directly rather than through an eval'd string: the generator name
# (e.g. "Unix Makefiles") and the source directory may contain spaces and must
# each reach cmake as a single argument.
# $CMakeCacheEntries and $ScanBuild are deliberately left unquoted so they
# split into separate words; cache values containing whitespace are therefore
# still unsupported.
if [ -n "$CMakeGenerator" ]; then
    ${ScanBuild} cmake -G "$CMakeGenerator" $CMakeCacheEntries "$sourcedir"
else
    ${ScanBuild} cmake $CMakeCacheEntries "$sourcedir"
fi

echo "# This is the command used to configure this build" > config.status
echo "$command" >> config.status
chmod u+x config.status
// See the file "COPYING" in the main distribution directory for copyright.
//
// Node class for paraglob. Holds a meta word and its associated patterns.

#ifndef PARAGLOBNODE_H
#define PARAGLOBNODE_H

#include <algorithm> // copy_if
#include <fnmatch.h>
#include <iterator>  // back_inserter
#include <string>
#include <utility>   // move
#include <vector>

namespace paraglob {

/* One meta word together with every pattern that was registered under it.
   Two nodes compare equal when their meta words are equal; the pattern
   lists do not take part in the comparison. */
class ParaglobNode {
public:
  /* Creates a node for meta_word that initially holds init_pattern. */
  ParaglobNode(std::string meta_word, std::string init_pattern)
    : meta_word(std::move(meta_word))
  {
    // Elements of an initializer_list are const, so brace-initialising the
    // vector would copy the pattern; push_back lets it actually be moved.
    this->patterns.push_back(std::move(init_pattern));
  }

  /* Returns a copy of the meta word this node was created for. */
  std::string get_meta_word() const {
    return this->meta_word;
  }

  /* Nodes are identified by their meta word only. */
  bool operator==(const ParaglobNode &other) const {
    // Compare the members directly; going through get_meta_word() would
    // copy the other node's string on every comparison.
    return this->meta_word == other.meta_word;
  }

  /* Registers another pattern under this node's meta word. */
  void add_pattern(std::string pattern) {
    this->patterns.push_back(std::move(pattern));
  }

  /* Appends to target, in insertion order, every pattern of this node for
     which fnmatch(pattern, text, 0) reports a match (returns 0). */
  void merge_matches(std::vector<std::string>& target,
                     const std::string& text) const {
    // text outlives the algorithm call, so capture it by reference instead
    // of copying the whole query string into the closure.
    std::copy_if(this->patterns.begin(), this->patterns.end(),
                 std::back_inserter(target),
                 [&text](const std::string& candidate) {
                   return fnmatch(candidate.c_str(), text.c_str(), 0) == 0;
                 });
  }

  /* Copies all of this node's patterns to the FRONT of target, keeping
     their relative order.
     Note: this could be done more efficiently with a move iterator if we
     wanted this to be destructive. */
  void merge_patterns(std::vector<std::string>& target) const {
    target.insert(target.begin(), this->patterns.begin(), this->patterns.end());
  }

private:
  std::string meta_word;
  std::vector<std::string> patterns;
};

} // namespace paraglob

#endif
&other) const; }; } // namespace paraglob #endif paraglob-0.6.0/include/paraglob/serializer.h000066400000000000000000000022561376603561600210620ustar00rootroot00000000000000// See the file "COPYING" in the main distribution directory for copyright. // Class for performing serialization and deserialization of paraglob. #ifndef PARAGLOB_SERIALIZER_H #define PARAGLOB_SERIALIZER_H #include #include // std::unique_ptr #include "paraglob/paraglob.h" #include "paraglob/exceptions.h" namespace paraglob { class ParaglobSerializer { public: /* Returns serialized version of vector in form: [, , ... ] */ // TODO: When Zeek supports C++17 char should be replaced by std::byte. static std::unique_ptr> serialize (const std::vector &v); /* Loads a serialized vector and returns it. */ static std::vector unserialize (const std::unique_ptr> &vsp); private: /* Divides up and adds a large integer to the input vector. */ static void add_int (uint64_t a, std::vector &target); /* Gets the large integer beginning at the iterator and moves it forward. 
*/ static uint64_t get_int_and_move (std::vector::iterator &start); }; } // namespace paraglob #endif paraglob-0.6.0/src/000077500000000000000000000000001376603561600141105ustar00rootroot00000000000000paraglob-0.6.0/src/CMakeLists.txt000066400000000000000000000006251376603561600166530ustar00rootroot00000000000000 set(AHOCORASICK_SRCS ahocorasick/ahocorasick.c ahocorasick/node.c ahocorasick/mpool.c ahocorasick/replace.c ahocorasick/AhoCorasickPlus.cpp ) add_subdirectory(ahocorasick) add_library(paraglob STATIC paraglob.cpp paraglob_serializer.cpp ${AHOCORASICK_SRCS} ) set_target_properties(paraglob PROPERTIES OUTPUT_NAME paraglob) install(TARGETS paraglob DESTINATION ${CMAKE_INSTALL_LIBDIR}) paraglob-0.6.0/src/ahocorasick/000077500000000000000000000000001376603561600163765ustar00rootroot00000000000000paraglob-0.6.0/src/ahocorasick/AhoCorasickPlus.cpp000066400000000000000000000056511376603561600221430ustar00rootroot00000000000000/* * AhoCorasickPlus.cpp: A sample C++ wrapper for Aho-Corasick C library * * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . 
/**
 * Registers one search pattern with the underlying trie.
 *
 * @param pattern  The bytes to search for. They are handed to the trie as a
 *                 pointer plus an explicit length, so the view does not have
 *                 to be NUL-terminated.
 * @param id       Number stored as the pattern's identifier (the identifier
 *                 type is set to AC_PATTID_TYPE_NUMBER).
 * @param copy     Forwarded to ac_trie_add(). When false the trie does not
 *                 copy the pattern text, so the storage behind `pattern`
 *                 must stay valid for the life-time of the trie.
 * @return The status reported by ac_trie_add(), translated to
 *         EnumReturnStatus; RETURNSTATUS_FAILED for any status that has no
 *         translation.
 */
AhoCorasickPlus::EnumReturnStatus AhoCorasickPlus::addPattern
    (std::string_view pattern, PatternId id, bool copy)
{
    AC_PATTERN_t patt;

    // AC_TEXT_t::astring is already a pointer-to-const, so the view's data
    // can be assigned as-is; no cast (and no loss of constness) is needed.
    patt.ptext.astring = pattern.data();
    patt.ptext.length = pattern.size();

    patt.id.u.number = id;
    patt.id.type = AC_PATTID_TYPE_NUMBER;

    // No replacement text is used.
    patt.rtext.astring = nullptr;
    patt.rtext.length = 0;

    switch (ac_trie_add (m_automata, &patt, copy ? 1 : 0))
    {
        case ACERR_SUCCESS:
            return RETURNSTATUS_SUCCESS;
        case ACERR_DUPLICATE_PATTERN:
            return RETURNSTATUS_DUPLICATE_PATTERN;
        case ACERR_LONG_PATTERN:
            return RETURNSTATUS_LONG_PATTERN;
        case ACERR_ZERO_PATTERN:
            return RETURNSTATUS_ZERO_PATTERN;
        case ACERR_TRIE_CLOSED:
            return RETURNSTATUS_AUTOMATA_CLOSED;
    }

    return RETURNSTATUS_FAILED;
}
C++ wrapper for Aho-Corasick C library * * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . * Modified by Jon Siwek: add "copy" flag to addPattern() methods */ #ifndef AHOCORASICKPPW_H_ #define AHOCORASICKPPW_H_ #include #include #include #include // Forward declarations struct ac_trie; struct ac_text; class AhoCorasickPlus { public: enum EnumReturnStatus { RETURNSTATUS_SUCCESS = 0, // No error occurred RETURNSTATUS_DUPLICATE_PATTERN, // Duplicate patterns RETURNSTATUS_LONG_PATTERN, // Long pattern RETURNSTATUS_ZERO_PATTERN, // Empty pattern (zero length) RETURNSTATUS_AUTOMATA_CLOSED, // Automata is closed RETURNSTATUS_FAILED, // General unknown failure }; typedef unsigned int PatternId; struct Match { unsigned int position; PatternId id; }; public: AhoCorasickPlus(); ~AhoCorasickPlus(); EnumReturnStatus addPattern (std::string_view pattern, PatternId id, bool copy = false); void finalize (); void search (const std::string &text, bool keep); std::vector findAll (const std::string& text, bool keep); private: struct ac_trie *m_automata; struct ac_text *m_acText; }; #endif /* AHOCORASICKPPW_H_ */ paraglob-0.6.0/src/ahocorasick/CMakeLists.txt000066400000000000000000000001341376603561600211340ustar00rootroot00000000000000 add_library(ahocorasick STATIC ahocorasick.c node.c mpool.c replace.c AhoCorasickPlus.cpp) 
paraglob-0.6.0/src/ahocorasick/README000066400000000000000000000012771376603561600172650ustar00rootroot00000000000000Aho-Corasick Library 2.0.0 -------------------------- This folder contains the implementation of the Aho-Corasick library. The Aho–Corasick algorithm is a string searching algorithm invented by Alfred V. Aho and Margaret J. Corasick. It is a kind of dictionary-matching algorithm that locates elements of a finite set of strings (the "dictionary") within an input text. It matches all patterns simultaneously. COMPILE ------- $ cd ahocorasick $ make HOW TO USE ---------- You can find guides on the multifast website: http://multifast.sourceforge.net/ It could be helpful to look at the implementation files: actypes.h and ahocorasick.h AUTHOR ------ Kamiar Kanani paraglob-0.6.0/src/ahocorasick/actypes.h000066400000000000000000000125271376603561600202260ustar00rootroot00000000000000/* * actypes.h: Defines basic data types of the trie * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see <http://www.gnu.org/licenses/>. */ #ifndef _AC_TYPES_H_ #define _AC_TYPES_H_ #include #ifdef __cplusplus extern "C" { #endif /** * @brief The alphabet type * * Actually defining AC_ALPHABET_t as a char works for many use cases, but * sometimes we deal with streams of other basic types e.g. integers or * enumerators. 
Although they consists of string of bytes (chars), but using * their specific types as AC_ALPHABET_t will lead to a better performance. * So instead of working with strings of chars, we assume that we are working * with strings of AC_ALPHABET_t and leave it optional for users to define * their own alphabets. */ typedef char AC_ALPHABET_t; /** * The text (strings of alphabets) type that is used for input/output when * dealing with the A.C. Trie. The text can contain zero value alphabets. */ typedef struct ac_text { const AC_ALPHABET_t *astring; /**< String of alphabets */ size_t length; /**< String length */ } AC_TEXT_t; /** * Pattern ID type * @see struct ac_pattid */ enum ac_pattid_type { AC_PATTID_TYPE_DEFAULT = 0, AC_PATTID_TYPE_NUMBER, AC_PATTID_TYPE_STRING }; /** * Provides a more readable representative for the pattern. Because patterns * themselves are not always suitable for displaying (e.g. patterns containing * special characters), we offer this type to improve intelligibility of the * output. Sometimes it can be also useful, when you are retrieving patterns * from a database, to maintain their identifiers in the trie for further * reference. We provisioned two possible types as a union. you can add your * type here. */ typedef struct ac_pattid { union { const char *stringy; /**< Null-terminated string */ long number; /**< Item indicator */ } u; enum ac_pattid_type type; /**< Shows the type of id */ } AC_PATTID_t; /** * This is the pattern type that the trie must be fed by. */ typedef struct ac_pattern { AC_TEXT_t ptext; /**< The search string */ AC_TEXT_t rtext; /**< The replace string */ AC_PATTID_t id; /**< Pattern identifier */ } AC_PATTERN_t; /** * @brief Provides the structure for reporting a match in the text. * * A match occurs when the trie reaches a final node. Any final * node can match one or more patterns at a position in the input text. * the 'patterns' field holds these matched patterns. 
Obviously these * matched patterns have same end-position in the text. There is a relationship * between matched patterns: the shorter one is a factor (tail) of the longer * one. The 'position' maintains the end position of matched patterns. */ typedef struct ac_match { AC_PATTERN_t *patterns; /**< Array of matched pattern(s) */ size_t size; /**< Number of matched pattern(s) */ size_t position; /**< The end position of the matching pattern(s) in * the input text */ } AC_MATCH_t; /** * The return status of various A.C. Trie functions */ typedef enum ac_status { ACERR_SUCCESS = 0, /**< No error occurred */ ACERR_DUPLICATE_PATTERN, /**< Duplicate patterns */ ACERR_LONG_PATTERN, /**< Pattern length is too long */ ACERR_ZERO_PATTERN, /**< Empty pattern (zero length) */ ACERR_TRIE_CLOSED /**< Trie is closed. */ } AC_STATUS_t; /** * @ brief The call-back function to report the matched patterns back to the * caller. * * When a match is found, the trie will reach the caller using this * function. You can send parameters to the call-back function when you call * _search() or _replace() functions. The call-back function receives those * parameters as the second parameter determined by void * in bellow. If you * return 0 from call-back function, it will tell trie to continue * searching, otherwise it will return from the trie function. */ typedef int (*AC_MATCH_CALBACK_f)(AC_MATCH_t *, void *); /** * @brief Call-back function to receive the replacement text (chunk by chunk). 
*/ typedef void (*MF_REPLACE_CALBACK_f)(AC_TEXT_t *, void *); /** * Maximum accepted length of search/replace pattern */ #define AC_PATTRN_MAX_LENGTH 1024 /** * Replacement buffer size */ #define MF_REPLACEMENT_BUFFER_SIZE 2048 #if (MF_REPLACEMENT_BUFFER_SIZE <= AC_PATTRN_MAX_LENGTH) #error "REPLACEMENT_BUFFER_SIZE must be bigger than AC_PATTRN_MAX_LENGTH" #endif typedef enum act_working_mode { AC_WORKING_MODE_SEARCH = 0, /* Default */ AC_WORKING_MODE_FINDNEXT, AC_WORKING_MODE_REPLACE /* Not used */ } ACT_WORKING_MODE_t; #ifdef __cplusplus } #endif #endif paraglob-0.6.0/src/ahocorasick/ahocorasick.c000066400000000000000000000307651376603561600210430ustar00rootroot00000000000000/* * ahocorasick.c: Implements the A. C. Trie functionalities * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . 
*/ #include #include #include #include "node.h" #include "ahocorasick.h" #include "mpool.h" /* Privates */ static void ac_trie_set_failure (ACT_NODE_t *node, AC_ALPHABET_t *alphas); static void ac_trie_traverse_setfailure (ACT_NODE_t *node, AC_ALPHABET_t *prefix); static void ac_trie_traverse_action (ACT_NODE_t *node, void(*func)(ACT_NODE_t *), int top_down); static void ac_trie_reset (AC_TRIE_t *thiz); static int ac_trie_match_handler (AC_MATCH_t * matchp, void * param); /* Friends */ extern void mf_repdata_init (AC_TRIE_t *thiz); extern void mf_repdata_reset (MF_REPLACEMENT_DATA_t *rd); extern void mf_repdata_release (MF_REPLACEMENT_DATA_t *rd); extern void mf_repdata_allocbuf (MF_REPLACEMENT_DATA_t *rd); /** * @brief Initializes the trie; allocates memories and sets initial values * * @return *****************************************************************************/ AC_TRIE_t *ac_trie_create (void) { AC_TRIE_t *thiz = (AC_TRIE_t *) malloc (sizeof(AC_TRIE_t)); thiz->mp = mpool_create(0); thiz->root = node_create (thiz); thiz->patterns_count = 0; mf_repdata_init (thiz); ac_trie_reset (thiz); thiz->text = NULL; thiz->position = 0; thiz->wm = AC_WORKING_MODE_SEARCH; thiz->trie_open = 1; return thiz; } /** * @brief Adds pattern to the trie. * * @param Thiz pointer to the trie * @param Patt pointer to the pattern * @param copy should trie make a copy of patten strings or not, if not, * then user must keep the strings valid for the life-time of the trie. If * the pattern are available in the user program then call the function with * copy = 0 and do not waste memory. 
*
 * @return The return value indicates the success or failure of adding action
 *****************************************************************************/
AC_STATUS_t ac_trie_add (AC_TRIE_t *thiz, AC_PATTERN_t *patt, int copy)
{
    ACT_NODE_t *cursor = thiz->root;
    size_t idx;

    /* Reject the request before touching the trie at all. */
    if (!thiz->trie_open)
        return ACERR_TRIE_CLOSED;
    if (patt->ptext.length == 0)
        return ACERR_ZERO_PATTERN;
    if (patt->ptext.length > AC_PATTRN_MAX_LENGTH)
        return ACERR_LONG_PATTERN;

    /* Walk the pattern symbol by symbol, creating missing nodes on the way. */
    for (idx = 0; idx < patt->ptext.length; idx++)
    {
        const AC_ALPHABET_t symbol = patt->ptext.astring[idx];
        ACT_NODE_t *child = node_find_next (cursor, symbol);

        if (child == NULL)
        {
            child = node_create_next (cursor, symbol);
            child->depth = cursor->depth + 1;
        }
        cursor = child;
    }

    /* A node that is already final means the same text was added before. */
    if (cursor->final)
        return ACERR_DUPLICATE_PATTERN;

    cursor->final = 1;
    node_accept_pattern (cursor, patt, copy);
    thiz->patterns_count++;

    return ACERR_SUCCESS;
}

/**
 * @brief Closes the pre-processing stage and makes the trie searchable.
 *
 * Computes the failure node of every node, then visits all nodes top-down
 * with node_collect_matches() (which also sorts each node's outgoing edges
 * so that binary search can be used on them), allocates the replacement
 * buffer and finally marks the trie as closed: no pattern can be added
 * afterwards.
 *
 * @param thiz pointer to the trie
 *****************************************************************************/
void ac_trie_finalize (AC_TRIE_t *thiz)
{
    /* One scratch buffer for the whole recursive walk; it lives here so the
     * recursion in ac_trie_traverse_setfailure() does not re-allocate it. */
    AC_ALPHABET_t path[AC_PATTRN_MAX_LENGTH];
    ACT_NODE_t *const root = thiz->root;

    ac_trie_traverse_setfailure (root, path);
    ac_trie_traverse_action (root, node_collect_matches, 1);
    mf_repdata_allocbuf (&thiz->repdata);

    thiz->trie_open = 0; /* Do not accept patterns any more */
}

/**
 * @brief Search in the input text using the given trie.
* * @param thiz pointer to the trie * @param text input text to be searched * @param keep indicated that if the input text the successive chunk of the * previous given text or not * @param callback when a match occurs this function will be called. The * call-back function in turn after doing its job, will return an integer * value, 0 means continue search, and non-0 value means stop search and return * to the caller. * @param user this parameter will be send to the call-back function * * @return * -1: failed; trie is not finalized * 0: success; input text was searched to the end * 1: success; input text was searched partially. (callback broke the loop) *****************************************************************************/ int ac_trie_search (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep, AC_MATCH_CALBACK_f callback, void *user) { size_t position; ACT_NODE_t *current; ACT_NODE_t *next; AC_MATCH_t match; if (thiz->trie_open) return -1; /* Trie must be finalized first. */ if (thiz->wm == AC_WORKING_MODE_FINDNEXT) position = thiz->position; else position = 0; current = thiz->last_node; if (!keep) ac_trie_reset (thiz); /* This is the main search loop. * It must be kept as lightweight as possible. */ while (position < text->length) { if (!(next = node_find_next_bs (current, text->astring[position]))) { if(current->failure_node /* We are not in the root node */) current = current->failure_node; else position++; } else { current = next; position++; } if (current->final && next) /* We check 'next' to find out if we have come here after a alphabet * transition or due to a fail transition. in second case we should not * report match, because it has already been reported */ { /* Found a match! 
*/ match.position = position + thiz->base_position; match.size = current->matched_size; match.patterns = current->matched; /* Do call-back */ if (callback(&match, user)) { if (thiz->wm == AC_WORKING_MODE_FINDNEXT) { thiz->position = position; thiz->last_node = current; } return 1; } } } /* Save status variables */ thiz->last_node = current; thiz->base_position += position; return 0; } /** * @brief sets the input text to be searched by a function call to _findnext() * * @param thiz The pointer to the trie * @param text The text to be searched. The owner of the text is the * calling program and no local copy is made, so it must be valid until you * have done with it. * @param keep Indicates that if the given text is the sequel of the previous * one or not; 1: it is, 0: it is not *****************************************************************************/ void ac_trie_settext (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep) { if (!keep) ac_trie_reset (thiz); thiz->text = text; thiz->position = 0; } /** * @brief finds the next match in the input text which is set by _settext() * * @param thiz The pointer to the trie * @return A pointer to the matched structure *****************************************************************************/ AC_MATCH_t ac_trie_findnext (AC_TRIE_t *thiz) { AC_MATCH_t match; thiz->wm = AC_WORKING_MODE_FINDNEXT; match.size = 0; ac_trie_search (thiz, thiz->text, 1, ac_trie_match_handler, (void *)&match); thiz->wm = AC_WORKING_MODE_SEARCH; return match; } /** * @brief Release all allocated memories to the trie * * @param thiz pointer to the trie *****************************************************************************/ void ac_trie_release (AC_TRIE_t *thiz) { /* It must be called with a 0 top-down parameter */ ac_trie_traverse_action (thiz->root, node_release_vectors, 0); mf_repdata_release (&thiz->repdata); mpool_free(thiz->mp); free(thiz); } /** * @brief Prints the trie to output in human readable form. It is useful * for debugging purpose. 
* * @param thiz pointer to the trie *****************************************************************************/ void ac_trie_display (AC_TRIE_t *thiz) { ac_trie_traverse_action (thiz->root, node_display, 1); } /** * @brief the match handler function used in _findnext function * * @param matchp * @param param * @return *****************************************************************************/ static int ac_trie_match_handler (AC_MATCH_t * matchp, void * param) { AC_MATCH_t * mp = (AC_MATCH_t *)param; mp->position = matchp->position; mp->patterns = matchp->patterns; mp->size = matchp->size; return 1; } /** * @brief reset the trie and make it ready for doing new search * * @param thiz pointer to the trie *****************************************************************************/ static void ac_trie_reset (AC_TRIE_t *thiz) { thiz->last_node = thiz->root; thiz->base_position = 0; mf_repdata_reset (&thiz->repdata); } /** * @brief Finds and bookmarks the failure transition for the given node. * * @param node the node pointer * @param prefix The array that contain the prefix that leads the path from * root the the node. *****************************************************************************/ static void ac_trie_set_failure (ACT_NODE_t *node, AC_ALPHABET_t *prefix) { size_t i, j; ACT_NODE_t *n; ACT_NODE_t *root = node->trie->root; if (node == root) return; /* Failure transition is not defined for the root */ for (i = 1; i < node->depth; i++) { n = root; for (j = i; j < node->depth && n; j++) n = node_find_next (n, prefix[j]); if (n) { node->failure_node = n; break; } } if (!node->failure_node) node->failure_node = root; } /** * @brief Sets the failure transition node for all nodes * * Traverse all trie nodes using DFS (Depth First Search), meanwhile it set * the failure node for every node it passes through. this function is called * after adding last pattern to trie. 
* * @param node The pointer to the root node * @param prefix The array that contain the prefix that leads the path from * root the the node *****************************************************************************/ static void ac_trie_traverse_setfailure (ACT_NODE_t *node, AC_ALPHABET_t *prefix) { size_t i; /* In each node, look for its failure node */ ac_trie_set_failure (node, prefix); for (i = 0; i < node->outgoing_size; i++) { prefix[node->depth] = node->outgoing[i].alpha; /* Make the prefix */ /* Recursively call itself to traverse all nodes */ ac_trie_traverse_setfailure (node->outgoing[i].next, prefix); } } /** * @brief Traverses the trie using DFS method and applies the * given @param func on all nodes. At top level it should be called by * sending the the root node. * * @param node Pointer to trie root node * @param func The function that must be applied to all nodes * @param top_down Indicates that if the action should be applied to the note * itself and then to its children or vise versa. *****************************************************************************/ static void ac_trie_traverse_action (ACT_NODE_t *node, void(*func)(ACT_NODE_t *), int top_down) { size_t i; if (top_down) func (node); for (i = 0; i < node->outgoing_size; i++) /* Recursively call itself to traverse all nodes */ ac_trie_traverse_action (node->outgoing[i].next, func, top_down); if (!top_down) func (node); } paraglob-0.6.0/src/ahocorasick/ahocorasick.h000066400000000000000000000057161376603561600210460ustar00rootroot00000000000000/* * ahocorasick.h: The main ahocorasick header file. * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast. If not, see .
*/

#ifndef _AHOCORASICK_H_
#define _AHOCORASICK_H_

#include "replace.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Forward declaration */
struct act_node;
struct mpool;

/*
 * The A.C. Trie data structure
 */
typedef struct ac_trie
{
    struct act_node *root;      /**< The root node of the trie */

    size_t patterns_count;      /**< Total patterns in the trie */

    short trie_open; /**< This flag indicates whether the trie is still open
                      * (non-0) or finalized (0). After finalizing the trie
                      * you can not add patterns to it anymore. */

    struct mpool *mp;   /**< Memory pool */

    /* ******************* Thread specific part ******************** */

    /* It is possible to search a long input chunk by chunk. In order to
     * connect these chunks and make a continuous view of the input, we need
     * the following variables.
     */
    struct act_node *last_node; /**< Last node we stopped at */
    size_t base_position; /**< Represents the position of the current chunk,
                           * related to whole input text */

    AC_TEXT_t *text;    /**< A helper variable to hold the input chunk */
    size_t position;    /**< A helper variable to hold the relative current
                         * position in the given text */

    MF_REPLACEMENT_DATA_t repdata;    /**< Replacement data structure */

    ACT_WORKING_MODE_t wm; /**< Working mode */

} AC_TRIE_t;

/*
 * The API functions
 */

AC_TRIE_t *ac_trie_create (void);
AC_STATUS_t ac_trie_add (AC_TRIE_t *thiz, AC_PATTERN_t *patt, int copy);
void ac_trie_finalize (AC_TRIE_t *thiz);
void ac_trie_release (AC_TRIE_t *thiz);
void ac_trie_display (AC_TRIE_t *thiz);

int  ac_trie_search (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep,
        AC_MATCH_CALBACK_f callback, void *param);

void ac_trie_settext (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep);
AC_MATCH_t ac_trie_findnext (AC_TRIE_t *thiz);

int  multifast_replace (AC_TRIE_t *thiz, AC_TEXT_t *text,
        MF_REPLACE_MODE_t mode, MF_REPLACE_CALBACK_f callback, void *param);
void multifast_rep_flush (AC_TRIE_t *thiz, int keep);

#ifdef __cplusplus
}
#endif

#endif
paraglob-0.6.0/src/ahocorasick/mpool.c000066400000000000000000000107301376603561600176710ustar00rootroot00000000000000/*
 * mpool.c memory pool management
 * This file is part of multifast.
 *
    Copyright 2010-2015 Kamiar Kanani

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast. If not, see .
*/ #include #include #include #include "mpool.h" #define MPOOL_BLOCK_SIZE (24*1024) #if (MPOOL_BLOCK_SIZE % 16 > 0) #error "MPOOL_BLOCK_SIZE must be multiple 16" #endif #if (MPOOL_BLOCK_SIZE <= AC_PATTRN_MAX_LENGTH) #error "MPOOL_BLOCK_SIZE must be bigger than AC_PATTRN_MAX_LENGTH" #endif struct mpool_block { size_t size; unsigned char *bp; /* Block pointer */ unsigned char *free; /* Free area; End of allocated section */ struct mpool_block *next; /* Next block */ }; struct mpool { struct mpool_block *block; }; /** * @brief Allocate a new block to the pool * * @param size * @return ******************************************************************************/ static struct mpool_block *mpool_new_block (size_t size) { struct mpool_block *block; if (!size) size = MPOOL_BLOCK_SIZE; block = (struct mpool_block *) malloc (sizeof(struct mpool_block)); block->bp = block->free = malloc(size); block->size = size; block->next = NULL; return block; } /** * @brief Creates a new pool * * @param size * @return ******************************************************************************/ struct mpool *mpool_create (size_t size) { struct mpool *ret; ret = malloc (sizeof(struct mpool)); ret->block = mpool_new_block(size); return ret; } /** * @brief Free a pool * * @param pool ******************************************************************************/ void mpool_free (struct mpool *pool) { struct mpool_block *p, *p_next; if (!pool) return; if (!pool->block) { free(pool); return; } p = pool->block; while (p) { p_next = p->next; free(p->bp); free(p); p = p_next; } free(pool); } /** * @brief Allocate from a pool * * @param pool * @param size * @return ******************************************************************************/ void *mpool_malloc (struct mpool *pool, size_t size) { void *ret = NULL; struct mpool_block *block, *new_block; size_t remain, block_size; if(!pool || !pool->block || !size) return NULL; size = (size + 15) & ~0xF; /* This is to align memory allocation 
on * multiple 16 boundary */ block = pool->block; remain = block->size - ((size_t)block->free - (size_t)block->bp); if (remain < size) { /* Allocate a new block */ block_size = ((size > block->size) ? size : block->size); new_block = mpool_new_block (block_size); new_block->next = block; block = pool->block = new_block; } ret = block->free; block->free = block->bp + (block->free - block->bp + size); return ret; } /** * @brief Makes a copy of a string with known size * * @param pool * @param str * @param n * @return *****************************************************************************/ void *mpool_strndup (struct mpool *pool, const char *str, size_t n) { void *ret; if (!str) return NULL; if ((ret = mpool_malloc(pool, n+1))) { strncpy((char *)ret, str, n); ((char *)ret)[n] = '\0'; } return ret; } /** * @brief Makes a copy of zero terminated string * * @param pool * @param str * @return ******************************************************************************/ void *mpool_strdup (struct mpool *pool, const char *str) { size_t len; if (!str) return NULL; len = strlen(str); return mpool_strndup (pool, str, len); } paraglob-0.6.0/src/ahocorasick/mpool.h000066400000000000000000000024071376603561600177000ustar00rootroot00000000000000/* * mpool.c memory pool management * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . 
*/ #ifndef _MPOOL_H_ #define _MPOOL_H_ #ifdef __cplusplus extern "C" { #endif /* Forward declaration */ struct mpool; struct mpool *mpool_create (size_t size); void mpool_free (struct mpool *pool); void *mpool_malloc (struct mpool *pool, size_t size); void *mpool_strdup (struct mpool *pool, const char *str); void *mpool_strndup (struct mpool *pool, const char *str, size_t n); #ifdef __cplusplus } #endif #endif /* _MPOOL_H_ */ paraglob-0.6.0/src/ahocorasick/node.c000066400000000000000000000326131376603561600174740ustar00rootroot00000000000000/* * node.c: Implements the A.C. Trie node * This file is part of multifast. * Copyright 2010-2015 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . 
*/ #include #include #include #include #include "node.h" #include "mpool.h" #include "ahocorasick.h" /* Privates */ static void node_init (ACT_NODE_t *thiz); static int node_edge_compare (const void *l, const void *r); static int node_has_pattern (ACT_NODE_t *thiz, AC_PATTERN_t *patt); static void node_grow_outgoing_vector (ACT_NODE_t *thiz); static void node_grow_matched_vector (ACT_NODE_t *thiz); static void node_copy_pattern (ACT_NODE_t *thiz, AC_PATTERN_t *to, AC_PATTERN_t *from); /** * @brief Creates the node * * @return ******************************************************************************/ struct act_node * node_create (struct ac_trie *trie) { ACT_NODE_t *node; node = (ACT_NODE_t *) mpool_malloc (trie->mp, sizeof(ACT_NODE_t)); node_init (node); node->trie = trie; return node; } /** * @brief Initializes the node * * @param thiz *****************************************************************************/ static void node_init (ACT_NODE_t *thiz) { node_assign_id (thiz); thiz->final = 0; thiz->failure_node = NULL; thiz->depth = 0; thiz->matched = NULL; thiz->matched_capacity = 0; thiz->matched_size = 0; thiz->outgoing = NULL; thiz->outgoing_capacity = 0; thiz->outgoing_size = 0; thiz->to_be_replaced = NULL; } /** * @brief Releases the node memories * * @param thiz *****************************************************************************/ void node_release_vectors(ACT_NODE_t *nod) { free(nod->matched); free(nod->outgoing); } /** * @brief Finds out the next node for a given alpha. this function is used in * the pre-processing stage in which edge array is not sorted. so it uses * linear search. 
* * @param thiz * @param alpha * @return *****************************************************************************/ ACT_NODE_t * node_find_next(ACT_NODE_t *nod, AC_ALPHABET_t alpha) { size_t i; for (i=0; i < nod->outgoing_size; i++) { if(nod->outgoing[i].alpha == alpha) return (nod->outgoing[i].next); } return NULL; } /** * @brief Finds out the next node for a given alpha. this function is used * after the pre-processing stage in which we sort edges. so it uses Binary * Search. * * @param thiz * @param alpha * @return *****************************************************************************/ ACT_NODE_t *node_find_next_bs (ACT_NODE_t *nod, AC_ALPHABET_t alpha) { size_t mid; int min, max; AC_ALPHABET_t amid; min = 0; max = nod->outgoing_size - 1; while (min <= max) { mid = (min + max) >> 1; amid = nod->outgoing[mid].alpha; if (alpha > amid) min = mid + 1; else if (alpha < amid) max = mid - 1; else return (nod->outgoing[mid].next); } return NULL; } /** * @brief Determines if a final node contains a pattern in its accepted pattern * list or not. * * @param thiz * @param newstr * @return 1: has the pattern, 0: doesn't have it *****************************************************************************/ static int node_has_pattern (ACT_NODE_t *thiz, AC_PATTERN_t *patt) { size_t i, j; AC_TEXT_t *txt; AC_TEXT_t *new_txt = &patt->ptext; for (i = 0; i < thiz->matched_size; i++) { txt = &thiz->matched[i].ptext; if (txt->length != new_txt->length) continue; /* The following loop is futile! Because the input pattern always come * from a failure node, and if they have the same length, then they are * equal. But for the sake of functional integrity we leave it here. */ for (j = 0; j < txt->length; j++) if (txt->astring[j] != new_txt->astring[j]) break; if (j == txt->length) return 1; } return 0; } /** * @brief Create the next node for the given alpha. 
* * @param thiz * @param alpha * @return *****************************************************************************/ ACT_NODE_t *node_create_next (ACT_NODE_t *nod, AC_ALPHABET_t alpha) { ACT_NODE_t *next; if (node_find_next (nod, alpha) != NULL) /* The edge already exists */ return NULL; next = node_create (nod->trie); node_add_edge (nod, next, alpha); return next; } /** * @brief Adds the pattern to the list of accepted pattern. * * @param thiz * @param str * @param copy *****************************************************************************/ void node_accept_pattern (ACT_NODE_t *nod, AC_PATTERN_t *new_patt, int copy) { AC_PATTERN_t *patt; /* Check if the new pattern already exists in the node list */ if (node_has_pattern(nod, new_patt)) return; /* Manage memory */ if (nod->matched_size == nod->matched_capacity) node_grow_matched_vector (nod); patt = &nod->matched[nod->matched_size++]; if (copy) { /* Deep copy */ node_copy_pattern (nod, patt, new_patt); } else { /* Shallow copy */ *patt = *new_patt; } } /** * @brief Makes a deep copy of the pattern * * @param thiz pointer to the owner node * @param from * @param to *****************************************************************************/ static void node_copy_pattern (ACT_NODE_t *thiz, AC_PATTERN_t *to, AC_PATTERN_t *from) { struct mpool *mp = thiz->trie->mp; to->ptext.astring = (AC_ALPHABET_t *) mpool_strndup (mp, (const char *) from->ptext.astring, from->ptext.length * sizeof(AC_ALPHABET_t)); to->ptext.length = from->ptext.length; to->rtext.astring = (AC_ALPHABET_t *) mpool_strndup (mp, (const char *) from->rtext.astring, from->rtext.length * sizeof(AC_ALPHABET_t)); to->rtext.length = from->rtext.length; if (from->id.type == AC_PATTID_TYPE_STRING) to->id.u.stringy = (const char *) mpool_strdup (mp, (const char *) from->id.u.stringy); else to->id.u.number = from->id.u.number; to->id.type = from->id.type; } /** * @brief Establish an edge between two nodes * * @param thiz * @param next * @param alpha 
*****************************************************************************/ void node_add_edge (ACT_NODE_t *nod, ACT_NODE_t *next, AC_ALPHABET_t alpha) { struct act_edge *oe; /* Outgoing edge */ if(nod->outgoing_size == nod->outgoing_capacity) node_grow_outgoing_vector (nod); oe = &nod->outgoing[nod->outgoing_size]; oe->alpha = alpha; oe->next = next; nod->outgoing_size++; } /** * @brief Assigns a unique ID to the node (used for debugging purpose) * * @param thiz *****************************************************************************/ void node_assign_id (ACT_NODE_t *nod) { static int unique_id = 1; nod->id = unique_id++; } /** * @brief Comparison function for qsort. see man qsort. * * @param l left side * @param r right side * @return According to the man page: The comparison function must return an * integer less than, equal to, or greater than zero if the first argument is * considered to be respectively less than, equal to, or greater than the * second. if two members compare as equal, their order in the sorted array is * undefined. *****************************************************************************/ static int node_edge_compare (const void *l, const void *r) { /* * NOTE: Because edge alphabets are unique in every node we ignore * equivalence case. */ if (((struct act_edge *)l)->alpha >= ((struct act_edge *)r)->alpha) return 1; else return -1; } /** * @brief Sorts edges alphabets. * * @param thiz *****************************************************************************/ void node_sort_edges (ACT_NODE_t *nod) { if ( ! nod->outgoing ) return; qsort ((void *)nod->outgoing, nod->outgoing_size, sizeof(struct act_edge), node_edge_compare); } /** * @brief Bookmarks the to-be-replaced patterns * * If there was more than one pattern accepted in a node then only one of them * must be replaced: The longest pattern that has a requested replacement. 
* * @param node * @return 1 if there was any replacement, 0 otherwise *****************************************************************************/ int node_book_replacement (ACT_NODE_t *nod) { size_t j; AC_PATTERN_t *pattern; AC_PATTERN_t *longest = NULL; if(!nod->final) return 0; for (j=0; j < nod->matched_size; j++) { pattern = &nod->matched[j]; if (pattern->rtext.astring != NULL) { if (!longest) longest = pattern; else if (pattern->ptext.length > longest->ptext.length) longest = pattern; } } nod->to_be_replaced = longest; return longest ? 1 : 0; } /** * @brief Grows the size of outgoing edges vector * * @param thiz *****************************************************************************/ static void node_grow_outgoing_vector (ACT_NODE_t *thiz) { const size_t grow_factor = (8 / (thiz->depth + 1)) + 1; /* The outgoing edges of nodes grow with different pace in different * depths; the shallower nodes the bigger outgoing number of nodes. * So for efficiency (speed & memory usage), we apply a measure to * manage different growth rate. */ if (thiz->outgoing_capacity == 0) { thiz->outgoing_capacity = grow_factor; thiz->outgoing = (struct act_edge *) malloc (thiz->outgoing_capacity * sizeof(struct act_edge)); } else { thiz->outgoing_capacity += grow_factor; thiz->outgoing = (struct act_edge *) realloc ( thiz->outgoing, thiz->outgoing_capacity * sizeof(struct act_edge)); } } /** * @brief Grows the size of matched patterns vector * * @param thiz *****************************************************************************/ static void node_grow_matched_vector (ACT_NODE_t *thiz) { if (thiz->matched_capacity == 0) { thiz->matched_capacity = 1; thiz->matched = (AC_PATTERN_t *) malloc (thiz->matched_capacity * sizeof(AC_PATTERN_t)); } else { thiz->matched_capacity += 2; thiz->matched = (AC_PATTERN_t *) realloc ( thiz->matched, thiz->matched_capacity * sizeof(AC_PATTERN_t)); } } /** * @brief Collect accepted patterns of the node. 
* * The accepted patterns consist of the node's own accepted pattern plus * accepted patterns of its failure node. * * @param node *****************************************************************************/ void node_collect_matches (ACT_NODE_t *nod) { size_t i; ACT_NODE_t *n = nod; while ((n = n->failure_node)) { for (i = 0; i < n->matched_size; i++) /* Always call with copy parameter 0 */ node_accept_pattern (nod, &(n->matched[i]), 0); if (n->final) nod->final = 1; } node_sort_edges (nod); /* Sort matched patterns? Is that necessary? I don't think so. */ } /** * @brief Displays all nodes recursively * * @param n * @param repcast *****************************************************************************/ void node_display (ACT_NODE_t *nod) { size_t j; struct act_edge *e; AC_PATTERN_t patt; printf("NODE(%3d)/....fail....> ", nod->id); if (nod->failure_node) printf("NODE(%3d)\n", nod->failure_node->id); else printf ("N.A.\n"); for (j = 0; j < nod->outgoing_size; j++) { e = &nod->outgoing[j]; printf(" |----("); if(isgraph(e->alpha)) printf("%c)---", e->alpha); else printf("0x%x)", e->alpha); printf("--> NODE(%3d)\n", e->next->id); } if (nod->matched_size) { printf("Accepts: {"); for (j = 0; j < nod->matched_size; j++) { patt = nod->matched[j]; if(j) printf(", "); switch (patt.id.type) { case AC_PATTID_TYPE_DEFAULT: case AC_PATTID_TYPE_NUMBER: printf("%ld", patt.id.u.number); break; case AC_PATTID_TYPE_STRING: printf("%s", patt.id.u.stringy); break; } printf(": %.*s", (int)patt.ptext.length, patt.ptext.astring); } printf("}\n"); } printf("\n"); } paraglob-0.6.0/src/ahocorasick/node.h000066400000000000000000000055141376603561600175010ustar00rootroot00000000000000/* * node.h: Defines the trie node and interface functions * This file is part of multifast. 
*
    Copyright 2010-2015 Kamiar Kanani

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast. If not, see .
*/

#ifndef _NODE_H_
#define _NODE_H_

#include "actypes.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Forward Declaration */
struct act_edge;
struct ac_trie;

/**
 * Aho-Corasick Trie node
 */
typedef struct act_node
{
    int id;     /**< Node identifier: used for debugging purpose */

    int final;      /**< A final node accepts pattern; 0: not, 1: is final */
    size_t depth;   /**< Distance between this node and the root */
    struct act_node *failure_node;  /**< The failure transition node */

    struct act_edge *outgoing;  /**< Outgoing edges array */
    size_t outgoing_capacity;   /**< Max capacity of outgoing edges */
    size_t outgoing_size;       /**< Number of outgoing edges */

    AC_PATTERN_t *matched;      /**< Matched patterns array */
    size_t matched_capacity;    /**< Max capacity of the matched patterns */
    size_t matched_size;        /**< Number of matched patterns in this node */

    AC_PATTERN_t *to_be_replaced;   /**< Pointer to the pattern that must be
                                     * replaced */

    struct ac_trie *trie;    /**< The trie that this node belongs to */

} ACT_NODE_t;

/**
 * Edge of the node
 */
struct act_edge
{
    AC_ALPHABET_t alpha;    /**< Transition alpha */
    ACT_NODE_t *next;       /**< Target of the edge */
};

/*
 * Node interface functions
 */

ACT_NODE_t *node_create (struct ac_trie *trie);
ACT_NODE_t *node_create_next (ACT_NODE_t *nod, AC_ALPHABET_t alpha);
/* Linear search; usable before the edges are sorted */
ACT_NODE_t *node_find_next (ACT_NODE_t *nod, AC_ALPHABET_t alpha);
/* Binary search; requires sorted edges */
ACT_NODE_t *node_find_next_bs (ACT_NODE_t *nod, AC_ALPHABET_t alpha);

void node_assign_id (ACT_NODE_t *nod);
void node_add_edge (ACT_NODE_t *nod, ACT_NODE_t *next, AC_ALPHABET_t alpha);
void node_sort_edges (ACT_NODE_t *nod);
void node_accept_pattern (ACT_NODE_t *nod, AC_PATTERN_t *new_patt, int copy);
void node_collect_matches (ACT_NODE_t *nod);
void node_release_vectors (ACT_NODE_t *nod);
int  node_book_replacement (ACT_NODE_t *nod);
void node_display (ACT_NODE_t *nod);

#ifdef __cplusplus
}
#endif

#endif
paraglob-0.6.0/src/ahocorasick/replace.c000066400000000000000000000415731376603561600201670ustar00rootroot00000000000000/*
 * replace.c: Implements the replacement functionality
 *
 * This file is part of multifast.
 *
    Copyright 2010-2015 Kamiar Kanani

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast. If not, see .
*/ #include #include "node.h" #include "ahocorasick.h" /* Privates */ static void mf_repdata_do_replace (MF_REPLACEMENT_DATA_t *rd, size_t to_position); static void mf_repdata_booknominee (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom); static void mf_repdata_push_nominee (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom); static void mf_repdata_grow_noms_array (MF_REPLACEMENT_DATA_t *rd); static void mf_repdata_appendtext (MF_REPLACEMENT_DATA_t *rd, AC_TEXT_t *text); static void mf_repdata_appendfactor (MF_REPLACEMENT_DATA_t *rd, size_t from, size_t to); static void mf_repdata_savetobacklog (MF_REPLACEMENT_DATA_t *rd, size_t to_position_r); static void mf_repdata_flush (MF_REPLACEMENT_DATA_t *rd); static unsigned int mf_repdata_bookreplacements (ACT_NODE_t *node); /* Publics */ void mf_repdata_init (AC_TRIE_t *trie); void mf_repdata_reset (MF_REPLACEMENT_DATA_t *rd); void mf_repdata_release (MF_REPLACEMENT_DATA_t *rd); void mf_repdata_allocbuf (MF_REPLACEMENT_DATA_t *rd); /** * @brief Initializes the replacement data part of the trie * * @param trie *****************************************************************************/ void mf_repdata_init (AC_TRIE_t *trie) { MF_REPLACEMENT_DATA_t *rd = &trie->repdata; rd->buffer.astring = NULL; rd->buffer.length = 0; rd->backlog.astring = NULL; rd->backlog.length = 0; rd->has_replacement = 0; rd->curser = 0; rd->noms = NULL; rd->noms_capacity = 0; rd->noms_size = 0; rd->replace_mode = MF_REPLACE_MODE_DEFAULT; rd->trie = trie; } /** * @brief Performs finalization tasks on replacement data. 
* Must be called when finalizing the trie itself
 *
 * If either buffer cannot be allocated, both are released and
 * has_replacement is cleared.
 *
 * @param rd
 *****************************************************************************/
void mf_repdata_allocbuf (MF_REPLACEMENT_DATA_t *rd)
{
    /* Bookmark replacement pattern for faster retrieval */
    rd->has_replacement = mf_repdata_bookreplacements (rd->trie->root);

    if (rd->has_replacement)
    {
        rd->buffer.astring = (AC_ALPHABET_t *) malloc
                (MF_REPLACEMENT_BUFFER_SIZE * sizeof(AC_ALPHABET_t));

        rd->backlog.astring = (AC_ALPHABET_t *) malloc
                (AC_PATTRN_MAX_LENGTH * sizeof(AC_ALPHABET_t));

        /* Backlog length is not bigger than the max pattern length */

        if (rd->buffer.astring == NULL || rd->backlog.astring == NULL)
        {
            /* Out of memory: undo the partial allocation and disable
             * replacement so nothing writes through a NULL buffer. */
            free((AC_ALPHABET_t *)rd->buffer.astring);
            free((AC_ALPHABET_t *)rd->backlog.astring);
            rd->buffer.astring = NULL;
            rd->backlog.astring = NULL;
            rd->has_replacement = 0;
        }
    }
}

/**
 * @brief Bookmarks the to-be-replaced patterns for all nodes
 *
 * Visits @p node and, recursively, every node reachable through its
 * outgoing edges.
 *
 * @param node root of the sub-trie to process
 * @return sum of the node_book_replacement() results over all visited nodes
 *****************************************************************************/
static unsigned int mf_repdata_bookreplacements (ACT_NODE_t *node)
{
    size_t i;
    unsigned int ret;

    ret = node_book_replacement (node);

    for (i = 0; i < node->outgoing_size; i++)
    {
        /* Recursively call itself to traverse all nodes */
        ret += mf_repdata_bookreplacements (node->outgoing[i].next);
    }

    return ret;
}

/**
 * @brief Resets the replacement data and prepares it for a new operation
 *
 * Only the lengths and the cursor are cleared; allocated buffers are kept.
 *
 * @param rd
 *****************************************************************************/
void mf_repdata_reset (MF_REPLACEMENT_DATA_t *rd)
{
    rd->buffer.length = 0;
    rd->backlog.length = 0;
    rd->curser = 0;
    rd->noms_size = 0;
}

/**
 * @brief Release the allocated resources to the replacement data
 *
 * Pointers, lengths and the nominee capacity are cleared afterwards, so a
 * second release is harmless and the nominee array can be grown again from
 * scratch.
 *
 * @param rd
 *****************************************************************************/
void mf_repdata_release (MF_REPLACEMENT_DATA_t *rd)
{
    free((AC_ALPHABET_t *)rd->buffer.astring);
    rd->buffer.astring = NULL;
    rd->buffer.length = 0;

    free((AC_ALPHABET_t *)rd->backlog.astring);
    rd->backlog.astring = NULL;
    rd->backlog.length = 0;

    free(rd->noms);
    rd->noms = NULL;
    rd->noms_capacity = 0;
    rd->noms_size = 0;
}

/**
 * @brief Flushes out all the available stuff in the buffer to the user
 *
 * The callback is invoked even when the buffer is empty. When no callback
 * has been set (NULL), the buffered data is discarded.
 *
 * @param rd
 *****************************************************************************/
static void mf_repdata_flush (MF_REPLACEMENT_DATA_t *rd)
{
    if (rd->cbf)
        rd->cbf(&rd->buffer, rd->user);
    rd->buffer.length = 0;
}
/** * @brief Extends the nominees array * * @param rd *****************************************************************************/ static void mf_repdata_grow_noms_array (MF_REPLACEMENT_DATA_t *rd) { const size_t grow_factor = 128; if (rd->noms_capacity == 0) { rd->noms_capacity = grow_factor; rd->noms = (struct mf_replacement_nominee *) malloc (rd->noms_capacity * sizeof(struct mf_replacement_nominee)); rd->noms_size = 0; } else { rd->noms_capacity += grow_factor; rd->noms = (struct mf_replacement_nominee *) realloc (rd->noms, rd->noms_capacity * sizeof(struct mf_replacement_nominee)); } } /** * @brief Adds the nominee to the end of the nominee list * * @param rd * @param new_nom *****************************************************************************/ static void mf_repdata_push_nominee (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom) { struct mf_replacement_nominee *nomp; /* Extend the vector if needed */ if (rd->noms_size == rd->noms_capacity) mf_repdata_grow_noms_array (rd); /* Add the new nominee to the end */ nomp = &rd->noms[rd->noms_size]; nomp->pattern = new_nom->pattern; nomp->position = new_nom->position; rd->noms_size ++; } /** * @brief Tries to add the nominee to the end of the nominee list * * @param rd * @param new_nom *****************************************************************************/ static void mf_repdata_booknominee (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom) { struct mf_replacement_nominee *prev_nom; size_t prev_start_pos, prev_end_pos, new_start_pos; if (new_nom->pattern == NULL) return; /* This is not a to-be-replaced pattern; ignore it. 
*/ new_start_pos = new_nom->position - new_nom->pattern->ptext.length; switch (rd->replace_mode) { case MF_REPLACE_MODE_LAZY: if (new_start_pos < rd->curser) return; /* Ignore the new nominee, because it overlaps with the * previous replacement */ if (rd->noms_size > 0) { prev_nom = &rd->noms[rd->noms_size - 1]; prev_end_pos = prev_nom->position; if (new_start_pos < prev_end_pos) return; } break; case MF_REPLACE_MODE_DEFAULT: case MF_REPLACE_MODE_NORMAL: default: while (rd->noms_size > 0) { prev_nom = &rd->noms[rd->noms_size - 1]; prev_start_pos = prev_nom->position - prev_nom->pattern->ptext.length; prev_end_pos = prev_nom->position; if (new_start_pos <= prev_start_pos) rd->noms_size--; /* Remove that nominee, because it is a * factor of the new nominee */ else break; /* Get out the loop and add the new nominee */ } break; } mf_repdata_push_nominee(rd, new_nom); } /** * @brief Append the given text to the output buffer * * @param rd * @param text *****************************************************************************/ static void mf_repdata_appendtext (MF_REPLACEMENT_DATA_t *rd, AC_TEXT_t *text) { size_t remaining_bufspace = 0; size_t remaining_text = 0; size_t copy_len = 0; size_t copy_index = 0; while (copy_index < text->length) { remaining_bufspace = MF_REPLACEMENT_BUFFER_SIZE - rd->buffer.length; remaining_text = text->length - copy_index; copy_len = (remaining_bufspace >= remaining_text)? 
remaining_text : remaining_bufspace; memcpy((void *)&rd->buffer.astring[rd->buffer.length], (void *)&text->astring[copy_index], copy_len * sizeof(AC_ALPHABET_t)); rd->buffer.length += copy_len; copy_index += copy_len; if (rd->buffer.length == MF_REPLACEMENT_BUFFER_SIZE) mf_repdata_flush(rd); } } /** * @brief Append a factor of the current text to the output buffer * * @param rd * @param from * @param to *****************************************************************************/ static void mf_repdata_appendfactor (MF_REPLACEMENT_DATA_t *rd, size_t from, size_t to) { AC_TEXT_t *instr = rd->trie->text; AC_TEXT_t factor; size_t backlog_base_pos; size_t base_position = rd->trie->base_position; if (to < from) return; if (base_position <= from) { /* The backlog located in the input text part */ factor.astring = &instr->astring[from - base_position]; factor.length = to - from; mf_repdata_appendtext(rd, &factor); } else { backlog_base_pos = base_position - rd->backlog.length; if (from < backlog_base_pos) return; /* shouldn't come here */ if (to < base_position) { /* The backlog located in the backlog part */ factor.astring = &rd->backlog.astring[from - backlog_base_pos]; factor.length = to - from; mf_repdata_appendtext (rd, &factor); } else { /* The factor is divided between backlog and input text */ /* The backlog part */ factor.astring = &rd->backlog.astring[from - backlog_base_pos]; factor.length = rd->backlog.length - from + backlog_base_pos; mf_repdata_appendtext (rd, &factor); /* The input text part */ factor.astring = instr->astring; factor.length = to - base_position; mf_repdata_appendtext (rd, &factor); } } } /** * @brief Saves the backlog part of the current text to the backlog buffer. 
The
 * backlog part is the part after @p bg_pos
 *
 * @param rd
 * @param bg_pos backlog position
 *****************************************************************************/
static void mf_repdata_savetobacklog (MF_REPLACEMENT_DATA_t *rd, size_t bg_pos)
{
    size_t bg_pos_r; /* relative backlog position */
    size_t tail_len; /* number of symbols that go to the backlog */
    AC_TEXT_t *instr = rd->trie->text;
    size_t base_position = rd->trie->base_position;

    if (base_position < bg_pos)
        bg_pos_r = bg_pos - base_position;
    else
        bg_pos_r = 0; /* the whole input text must go to backlog */

    if (instr->length == bg_pos_r)
        return; /* Nothing left for the backlog */

    if (instr->length < bg_pos_r)
        return; /* unexpected : assert (instr->length >= bg_pos_r) */

    tail_len = instr->length - bg_pos_r;

    /* Copy the part after bg_pos_r to the backlog buffer. memcpy() takes a
     * byte count, so scale by the symbol size as mf_repdata_appendtext()
     * does. */
    memcpy( (AC_ALPHABET_t *)
            &rd->backlog.astring[rd->backlog.length],
            &instr->astring[bg_pos_r],
            tail_len * sizeof(AC_ALPHABET_t) );

    rd->backlog.length += tail_len;
}

/**
 * @brief Perform replacement operations on the non-backlog part of the current
 * text. In-range nominees will be replaced the original pattern and the result
 * will be pushed to the output buffer.
* * @param rd * @param to_position *****************************************************************************/ static void mf_repdata_do_replace (MF_REPLACEMENT_DATA_t *rd, size_t to_position) { unsigned int index; struct mf_replacement_nominee *nom; size_t base_position = rd->trie->base_position; if (to_position < base_position) return; /* Replace the candidate patterns */ if (rd->noms_size > 0) { for (index = 0; index < rd->noms_size; index++) { nom = &rd->noms[index]; if (to_position <= (nom->position - nom->pattern->ptext.length)) break; /* Append the space before pattern */ mf_repdata_appendfactor (rd, rd->curser, /* from */ nom->position - nom->pattern->ptext.length /* to */); /* Append the replacement instead of the pattern */ mf_repdata_appendtext(rd, &nom->pattern->rtext); rd->curser = nom->position; } rd->noms_size -= index; /* Shift the array to the left to eliminate the consumed nominees */ if (rd->noms_size && index) { memcpy (&rd->noms[0], &rd->noms[index], rd->noms_size * sizeof(struct mf_replacement_nominee)); /* TODO: implement a circular queue */ } } /* Append the chunk between the last pattern and to_position */ if (to_position > rd->curser) { mf_repdata_appendfactor (rd, rd->curser, to_position); rd->curser = to_position; } if (base_position <= rd->curser) { /* we consume the whole backlog or none of it */ rd->backlog.length = 0; } } /** * @brief Replaces the patterns in the given text with their correspondence * replacement in the A.C. 
Trie
 *
 * The input may be fed in several chunks; the automaton state and the
 * absolute position are stored in the trie between calls.
 *
 * @param thiz the trie
 * @param instr the current input chunk
 * @param mode replace mode, stored in the replacement data
 * @param callback stored and later invoked with the output buffer
 * @param param user pointer handed to the callback
 * @return 0 on success, -1 if the trie is still open (not finalized),
 * -2 if the trie has no to-be-replaced pattern
 *****************************************************************************/
int multifast_replace (AC_TRIE_t *thiz, AC_TEXT_t *instr,
        MF_REPLACE_MODE_t mode, MF_REPLACE_CALBACK_f callback, void *param)
{
    ACT_NODE_t *current;
    ACT_NODE_t *next;
    struct mf_replacement_nominee nom;
    MF_REPLACEMENT_DATA_t *rd = &thiz->repdata;

    size_t position_r = 0;  /* Relative current position in the input string */
    size_t backlog_pos = 0; /* Relative backlog position in the input string */

    if (thiz->trie_open)
        return -1; /* _finalize() must be called first */

    if (!rd->has_replacement)
        return -2; /* Trie doesn't have any to-be-replaced pattern */

    rd->cbf = callback;
    rd->user = param;
    rd->replace_mode = mode;

    thiz->text = instr; /* Save the input string in a helper variable
                         * for convenience */

    /* Resume from the state reached at the end of the previous chunk */
    current = thiz->last_node;

    /* Main replace loop:
     * Find patterns and bookmark them */
    while (position_r < instr->length)
    {
        if (!(next = node_find_next_bs(current, instr->astring[position_r])))
        {
            /* Failed to follow a pattern */
            if(current->failure_node)
                current = current->failure_node;
            else
                position_r++;
        }
        else
        {
            current = next;
            position_r++;
        }

        /* Only book a nominee when a real transition was taken (next is
         * non-NULL), not after a fall-back along a failure link */
        if (current->final && next)
        {
            /* Bookmark nominee patterns for replacement */
            nom.pattern = current->to_be_replaced;
            nom.position = thiz->base_position + position_r;

            mf_repdata_booknominee (rd, &nom);
        }
    }

    /*
     * At the end of input chunk, if the tail of the chunk is a prefix of a
     * pattern, then we must keep it in the backlog buffer and wait for the
     * next chunk to decide about it.
     */

    backlog_pos = thiz->base_position + instr->length - current->depth;

    /* Now replace the patterns up to the backlog_pos point */
    mf_repdata_do_replace (rd, backlog_pos);

    /* Save the remaining to the backlog buffer */
    mf_repdata_savetobacklog (rd, backlog_pos);

    /* Save status variables */
    thiz->last_node = current;
    thiz->base_position += position_r;

    return 0;
}

/**
 * @brief Flushes the remaining data back to the user and ends the replacement
 * operation.
 *
 * @param thiz
 * @param keep Indicates the continuity of the chunks. 0 means that the last
 * chunk has been fed in, and we want to end the replacement and receive the
 * final result.
 *****************************************************************************/
void multifast_rep_flush (AC_TRIE_t *thiz, int keep)
{
    if (!keep)
    {
        /* Emit everything that is still pending up to the current position */
        mf_repdata_do_replace (&thiz->repdata, thiz->base_position);
    }

    mf_repdata_flush (&thiz->repdata);

    if (!keep)
    {
        /* Return to the initial state so a new text can be processed */
        mf_repdata_reset (&thiz->repdata);
        thiz->last_node = thiz->root;
        thiz->base_position = 0;
    }
}
paraglob-0.6.0/src/ahocorasick/replace.h000066400000000000000000000066761376603561600202010ustar00rootroot00000000000000/*
 * replace.h: Defines replacement related data structures
 *
 * This file is part of multifast.
 *
    Copyright 2010-2015 Kamiar Kanani

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast.  If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef _MF_REPLACE_H_
#define _MF_REPLACE_H_

#include "actypes.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Different replace modes
 */
typedef enum mf_replace_mode
{
    MF_REPLACE_MODE_DEFAULT = 0,
    MF_REPLACE_MODE_NORMAL, /**< Normal replace mode: Short factors are
                             * swallowed by the big one; All other patterns
                             * are replaced even if they have overlap.
                             */
    MF_REPLACE_MODE_LAZY    /**< Lazy replace mode: every pattern which comes
                             * first is replaced; the overlapping patterns are
                             * nullified by the previous patterns;
                             * consequently, factor patterns nullify the big
                             * patterns.
                             */
} MF_REPLACE_MODE_t;


/**
 * Before we replace any pattern we encounter, we should be patient
 * because it may be a factor of another longer pattern. So we maintain a record
 * of each recognized pattern until we make sure that it is not a sub-pattern
 * and can be replaced by its substitute. To keep a record of recognized
 * patterns we use the following structure.
 */
struct mf_replacement_nominee
{
    AC_PATTERN_t *pattern;  /**< The recognized pattern */
    size_t position;        /**< Absolute position just past the match */
};


/**
 * Contains replacement related data
 */
typedef struct mf_replacement_date
{
    AC_TEXT_t buffer;   /**< replacement buffer: maintains the result
                         * of replacement */

    AC_TEXT_t backlog;  /**< replacement backlog: if a pattern is divided
                         * between two or more different chunks, then at the
                         * end of the first chunk we need to keep it here until
                         * the next chunk comes and we decide if it is a
                         * pattern or just a pattern prefix. */

    unsigned int has_replacement; /**< total number of to-be-replaced patterns
                                   */

    struct mf_replacement_nominee *noms; /**< Replacement nominee array */
    size_t noms_capacity; /**< Max capacity of the array */
    size_t noms_size;  /**< Number of nominees in the array */

    size_t curser; /**< the position in the input text before which all
                    * patterns are replaced and the result is saved to the
                    * buffer.
*/ MF_REPLACE_MODE_t replace_mode; /**< Replace mode */ MF_REPLACE_CALBACK_f cbf; /**< Callback function */ void *user; /**< User parameters sent to the callback function */ struct ac_trie *trie; /**< Pointer to the trie */ } MF_REPLACEMENT_DATA_t; #ifdef __cplusplus } #endif #endif /* REPLACE_H */ paraglob-0.6.0/src/paraglob.cpp000066400000000000000000000132621376603561600164070ustar00rootroot00000000000000// See the file "COPYING" in the main distribution directory for copyright. #include "paraglob/paraglob.h" #include "ahocorasick/AhoCorasickPlus.h" paraglob::Paraglob::Paraglob() : my_ac(new AhoCorasickPlus) {} paraglob::Paraglob::Paraglob(const std::vector& patterns) : my_ac(new AhoCorasickPlus) { for (const std::string& pattern : patterns) { if ( !(this->add(pattern)) ) { throw paraglob::add_error("Failed to add pattern: " + pattern); } } this->compile(); } paraglob::Paraglob::Paraglob(std::unique_ptr> serialized) : Paraglob(paraglob::ParaglobSerializer::unserialize(std::move(serialized))) {} paraglob::Paraglob::~Paraglob() = default; bool paraglob::Paraglob::add(const std::string& pattern) { AhoCorasickPlus::EnumReturnStatus status; for (const std::string& meta_word : this->get_meta_words(pattern)) { AhoCorasickPlus::PatternId patId = this->meta_words.size(); status = this->my_ac->addPattern(meta_word, patId, true); if (status == AhoCorasickPlus::RETURNSTATUS_SUCCESS) { this->meta_words.push_back(meta_word); // Build the new paraglobNode in place. 
this->meta_to_node_map.emplace( std::piecewise_construct, std::forward_as_tuple(meta_word), std::forward_as_tuple(meta_word, pattern) ); } else if (status == AhoCorasickPlus::RETURNSTATUS_DUPLICATE_PATTERN) { this->meta_to_node_map.at(meta_word).add_pattern(pattern); } else { // Failed to add return false; } } return true; } void paraglob::Paraglob::compile() { this->my_ac->finalize(); } std::vector paraglob::Paraglob::get(const std::string& text) { // Narrow to the meta-word matches std::vector patterns; for (int id : this->my_ac->findAll(text, false)) this->meta_to_node_map.at(this->meta_words.at(id)).merge_matches(patterns, text); // Single wildcards always need to be checked if (this->single_wildcards.size() > 0) patterns.insert(patterns.end(), this->single_wildcards.begin(), this->single_wildcards.end()); // Remove duplicates std::sort(patterns.begin(), patterns.end()); patterns.erase(unique(patterns.begin(), patterns.end()), patterns.end()); return patterns; } std::vector paraglob::Paraglob::split_on_brackets(const std::string &in) const { std::vector out; size_t pos; size_t prev = 0; while ((pos = in.find_first_of('[', prev)) != std::string::npos) { size_t end_bracket = in.find_first_of(']', pos); if (end_bracket != std::string::npos) { out.push_back(in.substr(prev, pos-prev)); prev = end_bracket + 1; } else { break; } } // There are no more opening / closing brackets // Append the rest of the string out.push_back(in.substr(prev, in.length()-prev)); return out; } std::vector paraglob::Paraglob::get_meta_words(const std::string &pattern) { std::vector meta_words; // Split the pattern by brackets for (std::string word : split_on_brackets(pattern)) { // Parse each bracket section std::size_t prev = 0, pos; while ((pos = word.find_first_of("*?", prev)) != std::string::npos) { if (pos > prev) { meta_words.push_back(word.substr(prev, pos-prev)); } prev = pos+1; } if (prev < word.length()) { meta_words.push_back(word.substr(prev, std::string::npos)); } } if 
(meta_words.size() == 0 && pattern != "") { this->single_wildcards.push_back(pattern); } return meta_words; } std::vector paraglob::Paraglob::get_patterns() const { std::vector patterns; // Merge in all of the nodes patterns for (auto it : this->meta_to_node_map) { it.second.merge_patterns(patterns); } if (this->single_wildcards.size() > 0) patterns.insert(patterns.end(), this->single_wildcards.begin(), this->single_wildcards.end()); // Remove the duplicate patterns. Duplicates don't effect the state. std::sort(patterns.begin(), patterns.end()); patterns.erase(unique(patterns.begin(), patterns.end()), patterns.end()); return patterns; } // Returns a string representation of the paraglob that it can rebuild // itself from. A paraglobs state is completely defined by the vector of patterns // that it contains. // // NOTE: Ideally, we'd like to serialize a paraglob in such a way that it can be // unserialized without having to compile itself, but this proves to be very // non-trivial. While its surely possible, the multifast data structure // maintains a complex system of nodes, pointers to nodes, and doesn't store // itself in memory contiguously. Without a pressing use case for this // functionality, right now we're choosing not to do this. Instead, paraglob // serializes its vector of patterns, and rebuilds itself when unserialized. 
std::unique_ptr> paraglob::Paraglob::serialize() const { return paraglob::ParaglobSerializer::serialize(this->get_patterns()); } std::string paraglob::Paraglob::str() const { std::stringstream ss; auto add_string = [&ss](const std::string& p){ ss << p << " "; }; auto pretty_add = [add_string](const std::vector& v) { add_string("["); if (v.size() > 6) { std::for_each(v.begin(), v.begin() + 3, add_string); add_string("..."); std::for_each(v.rbegin(), v.rbegin() + 3, add_string); } else { std::for_each(v.begin(), v.end(), add_string); } add_string("]\n"); }; add_string("paraglob:\nmeta words: "); pretty_add(this->meta_words); add_string("patterns:"); pretty_add(this->get_patterns()); return ss.str(); } bool paraglob::Paraglob::operator==(const Paraglob &other) const { return (this->meta_to_node_map == other.meta_to_node_map); } paraglob-0.6.0/src/paraglob_serializer.cpp000066400000000000000000000042731376603561600206420ustar00rootroot00000000000000// See the file "COPYING" in the main distribution directory for copyright. #include "paraglob/serializer.h" std::unique_ptr> paraglob::ParaglobSerializer::serialize(const std::vector& v) { std::unique_ptr> ret (new std::vector); add_int(v.size(), *ret); for (const std::string &s: v) { add_int(s.length(), *ret); for (uint8_t c : s) { // copy here because of type change ret->push_back(c); } } return ret; } // ret -> [, , ... ] std::vector paraglob::ParaglobSerializer::unserialize (const std::unique_ptr>& vsp) { std::vector ret; std::vector::iterator vsp_it = vsp->begin(); uint64_t n_strings = get_int_and_move(vsp_it); // If n_strings is zero vsp_it will equal vsp->end if (vsp_it > vsp->end()){ throw paraglob::underflow_error("Serialization data ended unexpectedly."); } // Reserve space ahead of time rather than resizing in loop. 
ret.reserve(n_strings); while (vsp_it < vsp->end()) { uint64_t l = get_int_and_move(vsp_it); ret.emplace_back(vsp_it, vsp_it + l); std::advance(vsp_it, l); } // If the read was successful, we have advanced our iterator exactly to the // end, and we have read exactly n_strings. if (vsp_it > vsp->end()) { throw paraglob::underflow_error("Serialization data ended unexpectedly."); } else if (ret.size() > n_strings) { throw paraglob::overflow_error("Read more patterns than expected."); } else if (ret.size() < n_strings) { throw paraglob::underflow_error("Read fewer patterns than expected."); } return ret; } inline void paraglob::ParaglobSerializer::add_int (uint64_t a, std::vector &target) { uint8_t* chars = reinterpret_cast(&a); target.insert(target.end(), chars, chars + sizeof(uint64_t)); } inline uint64_t paraglob::ParaglobSerializer::get_int_and_move (std::vector::iterator &start) { uint64_t ret = static_cast(*start); std::advance(start, sizeof(uint64_t)); return ret; } paraglob-0.6.0/testing/000077500000000000000000000000001376603561600147765ustar00rootroot00000000000000paraglob-0.6.0/testing/.gitignore000066400000000000000000000000401376603561600167600ustar00rootroot00000000000000diag.log .tmp .btest.failed.dat paraglob-0.6.0/testing/Baseline/000077500000000000000000000000001376603561600165205ustar00rootroot00000000000000paraglob-0.6.0/testing/Baseline/driver.basic_matches/000077500000000000000000000000001376603561600225775ustar00rootroot00000000000000paraglob-0.6.0/testing/Baseline/driver.basic_matches/out000066400000000000000000000002741376603561600233340ustar00rootroot00000000000000### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. 4 paraglob: meta words: [ d g og ] patterns: [ * *og d? 
d?g d[!wl]g ] paraglob-0.6.0/testing/Baseline/driver.empty_patterns/000077500000000000000000000000001376603561600230705ustar00rootroot00000000000000paraglob-0.6.0/testing/Baseline/driver.empty_patterns/out000066400000000000000000000003341376603561600236220ustar00rootroot00000000000000### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. 1 paraglob: meta words: [ cat dog fish horse frog lion ] patterns: [ * cat dog? ... lion horse frog ] paraglob-0.6.0/testing/Baseline/driver.serial/000077500000000000000000000000001376603561600212715ustar00rootroot00000000000000paraglob-0.6.0/testing/Baseline/driver.serial/out000066400000000000000000000001721376603561600220230ustar00rootroot00000000000000### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. passed paraglob-0.6.0/testing/Baseline/driver.serial/out2000066400000000000000000000001721376603561600221050ustar00rootroot00000000000000### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. passed paraglob-0.6.0/testing/Baseline/driver.time/000077500000000000000000000000001376603561600207505ustar00rootroot00000000000000paraglob-0.6.0/testing/Baseline/driver.time/out000066400000000000000000000001731376603561600215030ustar00rootroot00000000000000### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. Passed. 
paraglob-0.6.0/testing/btest.cfg000066400000000000000000000003341376603561600166000ustar00rootroot00000000000000 [btest] TestDirs = driver TmpDir = %(testbase)s/.tmp BaselineDir = %(testbase)s/Baseline IgnoreDirs = .svn CVS .tmp IgnoreFiles = *.tmp *.swp #* [environment] PATH=%(testbase)s/../build/tools:%(default_path)s paraglob-0.6.0/testing/driver/000077500000000000000000000000001376603561600162715ustar00rootroot00000000000000paraglob-0.6.0/testing/driver/basic_matches000066400000000000000000000001351376603561600210000ustar00rootroot00000000000000# @TEST-EXEC: paraglob-test -n dog "*" d?g *og d? d[!wl]g > out # @TEST-EXEC: btest-diff out paraglob-0.6.0/testing/driver/empty_patterns000066400000000000000000000001471376603561600212740ustar00rootroot00000000000000# @TEST-EXEC: paraglob-test -n dog "*" cat dog? fish horse frog lion> out # @TEST-EXEC: btest-diff out paraglob-0.6.0/testing/driver/serial000066400000000000000000000003151376603561600174720ustar00rootroot00000000000000# @TEST-EXEC: paraglob-test -s dog "*" d?g *og d? d[!wl]g > out # @TEST-EXEC: paraglob-test -s dog dog dog dog dog dog dog dog dog dog dog > out2 # @TEST-EXEC: btest-diff out # @TEST-EXEC: btest-diff out2 paraglob-0.6.0/testing/driver/time000066400000000000000000000001231376603561600171460ustar00rootroot00000000000000# @TEST-EXEC: paraglob-test -b 10000 20000 10 3 > out # @TEST-EXEC: btest-diff out paraglob-0.6.0/tools/000077500000000000000000000000001376603561600144615ustar00rootroot00000000000000paraglob-0.6.0/tools/CMakeLists.txt000066400000000000000000000002571376603561600172250ustar00rootroot00000000000000 set(SOURCES driver.cpp benchmark.cpp) add_executable(paraglob-test ${SOURCES}) target_link_libraries(paraglob-test paraglob) install(TARGETS paraglob-test DESTINATION bin) paraglob-0.6.0/tools/benchmark.cpp000066400000000000000000000103101376603561600171120ustar00rootroot00000000000000// See the file "COPYING" in the main distribution directory for copyright. 
#include "benchmark.h"

// NOTE(review): the names of the two system headers included here were lost
// when this text was extracted; these two are chosen from what the file
// uses (atol/RAND_MAX and the random number engine).
#include <cstdlib>
#include <random>

std::random_device dev;
std::mt19937 rng(dev());
std::uniform_int_distribution<std::mt19937::result_type> dist(0,RAND_MAX);

// Returns a random integer in [0, RAND_MAX].
int rand_int() {
  return dist(rng);
}

static const char* benchmark_pattern_words[] = {
    "aaaaaa", "bb", "cccccccccccccccc", "ddddd", "eeeeeeeee", "fffffffffffff", "gggg"
};

// Returns one of the fixed benchmark words, chosen at random.
const char* random_pattern_word(){
    int idx = rand_int() % (sizeof(benchmark_pattern_words) / sizeof(const char*));
    return benchmark_pattern_words[idx];
}

// Returns a random lowercase word of 5 to 29 letters. The result lives in a
// static buffer that the next call overwrites.
const char* random_word() {
    static char buffer[1024];

    int j;
    int rounds = (rand_int() % 25) + 5;
    for ( j = 0; j < rounds; j++ ) {
      buffer[j] = (char)((rand_int() % 26) + 'a');
    }

    buffer[rounds] = '\0';

    return buffer;
}

// Same as benchmark_n(), with the three numbers given as decimal strings.
double benchmark(char* a, char* b, char* c, bool silent) {
  long num_patterns = atol(a);
  long num_queries = atol(b);
  long match_prob = atol(c);

  return benchmark_n(num_patterns, num_queries, match_prob, silent);
}

// Builds a random workload, then times building a paraglob from the
// patterns and running all queries against it.
//
// num_patterns: number of patterns to generate
// num_queries:  number of query strings to generate
// match_prob:   percentage of queries built from pattern words
// silent:       suppress the progress and timing output
//
// Returns the build time plus the search time, in seconds.
double benchmark_n(long num_patterns, long num_queries, long match_prob, bool silent) {
  if (!silent) {
    std::cout << "creating workload:\n";
    std::cout << "\t# patterns: " << num_patterns << "\n";
    std::cout << "\t# queries: " << num_queries << "\n";
    std::cout << "\t% matches: " << match_prob << "\n";
  }

  // Create the patterns: 2 to 11 words joined by '*'.
  std::vector<std::string> patterns;
  if ( num_patterns > 0 )
    patterns.reserve(num_patterns);

  for ( long i = 0; i < num_patterns; i++ ) {
    std::string pattern;
    int rounds = (rand_int() % 10) + 2;
    for ( int j = 0; j < rounds; j++ ) {
      if ( j != 0 )
        pattern += "*";

      if ( (rand_int() % 10) == 0 ) {
        pattern += random_pattern_word();
      } else {
        pattern += random_word();
      }
    }
    patterns.push_back(std::move(pattern));
  }

  // Create the queries.
  std::vector<std::string> queries;
  if ( num_queries > 0 )
    queries.reserve(num_queries);

  for ( long i = 0; i < num_queries; i++ ) {
    std::string query;

    // NOTE(review): "<=" makes a query a match candidate in match_prob + 1
    // out of 100 cases; kept as is so workloads stay comparable.
    if ( (rand_int() % 100) <= match_prob ) {
      // Create a likely match candidate.
      int rounds = (rand_int() % 5) + 1;
      for ( int j = 0; j < rounds; j++ ) {
        query += random_pattern_word();
      }
    } else {
      // Create a mismatch.
      int rounds = (rand_int() % 50) + 5;
      for ( int j = 0; j < rounds; j++ ) {
        query += (char)((rand_int() % 26) + 'a');
      }
    }

    // The string owns its characters; no strdup() copy that is never freed.
    queries.push_back(std::move(query));
  }

  if (!silent) {
    std::cout << "creating paraglob \n";
  }
  auto build_start = std::chrono::high_resolution_clock::now();
  paraglob::Paraglob myGlob;
  for ( const std::string& p : patterns ) {
    myGlob.add(p);
  }
  myGlob.compile();
  auto build_finish = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> build_time = build_finish - build_start;

  auto start = std::chrono::high_resolution_clock::now();
  if (!silent) {
    std::cout << "making queries \n";
  }
  for ( const std::string& q : queries ) {
    myGlob.get(q);
  }
  auto finish = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> elapsed = finish - start;

  if (!silent) {
    std::cout << "Build time: " << build_time.count() << "s\n";
    std::cout << "Search time: " << elapsed.count() << " s\n";
    std::cout << "Queries/second: " << num_queries/elapsed.count() << "\n";
  }

  return elapsed.count() + build_time.count();
}

void makeGraphData() {
  /*
  prints data to the console for generation of 3d plot of paraglob performance.
  x axis is number of patterns
  y axis is number of queries
  z axis is the time taken to build and run the queries
  */
  for(long patterns = 500; patterns <= 10000; patterns += 500) {
    std::cout << "{ ";
    for(long queries = 1000; queries <= 20000; queries += 1000) {
      std::cout << benchmark_n(patterns, queries, 10, true);
      if (queries != 20000) {
        std::cout << ", ";
      }
    }
    std::cout << "},\n";
  }
}
paraglob-0.6.0/tools/benchmark.h000066400000000000000000000007211376603561600165640ustar00rootroot00000000000000// See the file "COPYING" in the main distribution directory for copyright.

#ifndef BENCHMARK_H
#define BENCHMARK_H

// NOTE(review): the names of the three system headers included here were
// lost when this text was extracted; these are chosen from what
// benchmark.cpp uses.
#include <chrono>
#include <iostream>
#include <memory>

#include "paraglob/paraglob.h"

/* A set of benchmark functions exercising paraglob with different workloads.
*/ double benchmark(char* a, char* b, char* c, bool silent); double benchmark_n(long num_patterns, long num_queries, long match_prob, bool silent); void makeGraphData(); #endif paraglob-0.6.0/tools/driver.cpp000066400000000000000000000053151376603561600164640ustar00rootroot00000000000000// See the file "COPYING" in the main distribution directory for copyright. /* A simple driver for testing paraglob's performance and functionality. Supports the following arguments: -b