libcolumbus-1.1.0+15.10.20150806/0000755000015300001610000000000012560622775016400 5ustar pbuserpbgroup00000000000000libcolumbus-1.1.0+15.10.20150806/readme.txt0000644000015300001610000000132412560622644020371 0ustar pbuserpbgroup00000000000000Columbus - a library for fast error-tolerant searching (C) 2012 Canonical ltd Compiling Columbus uses CMake. It also enforces separate build and source directories. To compile it, simply extract the source, cd into it and run the following commands: mkdir build cd build cmake -DCMAKE_BUILD_TYPE=debug .. make Change "debug" to "release" or "relwithdebinfo" depending on your needs. Testing Columbus comes with a test suite. It is not enabled by default. To enable it, simply run 'ccmake .' in your build directory and enable tests in the GUI. There are additional scalability tests that need to be enabled separately, since they take quite a lot of time to run. They are enabled in the same way as regular tests. libcolumbus-1.1.0+15.10.20150806/include/0000755000015300001610000000000012560622775020023 5ustar pbuserpbgroup00000000000000libcolumbus-1.1.0+15.10.20150806/include/libcolumbus.pc.in0000644000015300001610000000033412560622644023267 0ustar pbuserpbgroup00000000000000prefix=@prefix@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ Name: @pkg-name@ Description: Error tolerant matcher Version: @SO_VERSION@ Libs: -L${libdir} -l@COL_LIB_BASENAME@ Cflags: -I${includedir}libcolumbus-1.1.0+15.10.20150806/include/IndexWeights.hh0000644000015300001610000000225312560622644022743 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. 
* * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #ifndef INDEXWEIGHTS_HH_ #define INDEXWEIGHTS_HH_ #include "ColumbusCore.hh" COL_NAMESPACE_START struct IndexWeightsPrivate; class Word; class COL_PUBLIC IndexWeights final { IndexWeightsPrivate *p; public: IndexWeights(); ~IndexWeights(); const IndexWeights & operator=(const IndexWeights &other) = delete; void setWeight(const Word &w, double weigth); double getWeight(const Word &w) const; }; COL_NAMESPACE_END #endif /* INDEXWEIGHTS_HH_ */ libcolumbus-1.1.0+15.10.20150806/include/LevenshteinIndex.hh0000644000015300001610000000405512560622644023617 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/
#ifndef LEVENSHTEININDEX_HH
#define LEVENSHTEININDEX_HH

#include "ColumbusCore.hh"
#include "IndexMatches.hh"

COL_NAMESPACE_START

struct LevenshteinIndexPrivate;
struct TrieNode;
class ErrorMatrix;
class Word;
class ErrorValues;

/**
 * Index of words that can be queried with an error budget.
 *
 * Words are inserted together with a WordID. findWords() takes the query,
 * an ErrorValues object and a maximum error, and reports through the
 * caller-supplied IndexMatches reference. Copy construction and copy
 * assignment are both deleted.
 *
 * NOTE(review): the class name and the ErrorValues/ErrorMatrix parameters
 * suggest weighted edit-distance matching; the algorithm itself is in the
 * implementation file, which this header does not show.
 */
class COL_PUBLIC LevenshteinIndex final {
private:
    LevenshteinIndexPrivate *p; // Opaque implementation state (forward-declared only).

    // Private search helpers; presumably driven by findWords() -- confirm in the .cc file.
    void searchRecursive(const Word &query, TrieOffset node, const ErrorValues &e,
            const Letter letter, const Letter previousLetter, const size_t depth,
            ErrorMatrix &em, IndexMatches &matches, const int max_error) const;
    int findOptimalError(const Letter letter, const Letter previousLetter, const Word &query,
            const size_t i, const size_t depth, const ErrorMatrix &em, const ErrorValues &e) const;

public:
    LevenshteinIndex();
    ~LevenshteinIndex();
    LevenshteinIndex(const LevenshteinIndex &other) = delete;
    const LevenshteinIndex & operator=(const LevenshteinIndex &other) = delete;

    static int getDefaultError();

    /** Adds word to the index under the identifier wordID. */
    void insertWord(const Word &word, const WordID wordID);
    bool hasWord(const Word &word) const;

    /** Looks up query using the costs in e and the limit maxError; output goes to matches. */
    void findWords(const Word &query, const ErrorValues &e, const int maxError, IndexMatches &matches) const;

    // Counters; the precise meaning of each is defined by the implementation.
    size_t wordCount(const WordID queryID) const;
    size_t maxCount() const;
    size_t numNodes() const;
    size_t numWords() const;
};

COL_NAMESPACE_END

#endif
libcolumbus-1.1.0+15.10.20150806/include/Trie.hh0000644000015300001610000000372112560622644021245 0ustar pbuserpbgroup00000000000000
/* * Copyright (C) 2013 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */
#ifndef TRIE_HH
#define TRIE_HH

#include "ColumbusCore.hh"

COL_NAMESPACE_START

struct TriePrivate;
class Word;

/**
 * Trie of words addressed by TrieOffset values instead of pointers.
 *
 * Nodes and sibling entries are both referred to by TrieOffset (a 32-bit
 * unsigned integer, see ColumbusCore.hh). The class cannot be copied.
 * Unlike most classes in this directory it is not marked COL_PUBLIC.
 *
 * NOTE(review): the storage layout behind the offsets is defined by
 * TriePrivate and the implementation file, neither visible here.
 */
class Trie final {
private:
    TriePrivate *p; // Opaque implementation state.

    // Internal storage and node management helpers.
    void expand();
    TrieOffset append(const char *data, const int size);
    TrieOffset addNewSibling(const TrieOffset node, const TrieOffset sibling, Letter l);
    TrieOffset addNewNode(const TrieOffset parent);

public:
    Trie();
    ~Trie();
    Trie(const Trie &other) = delete;
    const Trie & operator=(const Trie &other) = delete;

    bool hasWord(const Word &word) const;
    TrieOffset findWord(const Word &word) const;
    /** Inserts word with the identifier wordID and returns a TrieOffset for it. */
    TrieOffset insertWord(const Word &word, const WordID wordID);

    // Navigation primitives; arguments named "node" and "sibling" are both TrieOffset values.
    TrieOffset getRoot() const;
    TrieOffset getSiblingList(TrieOffset node) const;
    TrieOffset getNextSibling(TrieOffset sibling) const;
    Letter getLetter(TrieOffset sibling) const;
    TrieOffset getChild(TrieOffset sibling) const;
    WordID getWordID(TrieOffset node) const;
    bool hasSibling(TrieOffset sibling) const;
    TrieOffset getParent(TrieOffset node) const;
    TrieOffset getSiblingTo(const TrieOffset node, const TrieOffset child) const;

    size_t numWords() const;
    size_t numNodes() const;

    /** Builds a Word from the position startNode. */
    Word getWord(const TrieOffset startNode) const;
};

COL_NAMESPACE_END

#endif /* TRIE_HH */
libcolumbus-1.1.0+15.10.20150806/include/Word.hh0000644000015300001610000000424512560622644021257 0ustar pbuserpbgroup00000000000000
/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #ifndef WORD_HH_ #define WORD_HH_ #include "ColumbusCore.hh" #include COL_NAMESPACE_START /** * A word encapsulates a single word. That is, * there is no whitespace in it. * * A word's contents are immutable. */ class COL_PUBLIC Word final { private: Letter *text; // Change this to a shared pointer to save memory. unsigned int len; bool hasWhitespace(); void duplicateFrom(const Word &w); void convertString(const char *utf8Word); public: Word(); Word(const Word &w); Word(Word &&w); Word(const std::string &w); explicit Word(const char *utf8Word); explicit Word(Letter *letters, size_t length); ~Word(); unsigned int length() const { return len;} void toUtf8(char *buf, unsigned int bufSize) const; std::string asUtf8() const; Word join(const Word &w) const; Letter operator[](unsigned int i) const; bool operator==(const Word &w) const; bool operator==(const std::string &utf8Str) const; bool operator==(const char *utf8Word) const; bool operator!=(const Word &w) const; bool operator!=(const std::string &utf8Str) const; bool operator!=(const char *utf8Word) const; bool operator<(const Word &w) const; Word& operator=(const Word &w); Word& operator=(Word &&w); Word& operator=(const char *utf8Word); Word& operator=(const std::string &utf8Str); size_t hash() const; friend class LevenshteinIndex; }; COL_NAMESPACE_END #endif /* WORD_HH_ */ libcolumbus-1.1.0+15.10.20150806/include/Document.hh0000644000015300001610000000316312560622644022120 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. 
* * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #ifndef DOCUMENT_HH_ #define DOCUMENT_HH_ #include "ColumbusCore.hh" #include COL_NAMESPACE_START class Word; class WordList; struct DocumentPrivate; class COL_PUBLIC Document final { private: DocumentPrivate *p; public: Document(DocumentID id); Document(const Document &d); ~Document(); const Document& operator=(const Document &d); void addText(const Word &field, const WordList &words); void addText(const Word &field, const char *textAsUtf8); void addText(const Word &field, const std::string &textAsUtf8); const WordList& getText(const Word &field) const; size_t fieldCount() const; DocumentID getID() const; void getFieldNames(WordList &list) const; size_t wordCount(const Word &w, const Word field) const; size_t totalWordCount(const Word &w) const; // Needs iterators to browse through text names. }; COL_NAMESPACE_END #endif /* DOCUMENT_HH_ */ libcolumbus-1.1.0+15.10.20150806/include/MatchResults.hh0000644000015300001610000000301212560622644022751 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/
#ifndef MATCHRESULTS_HH_
#define MATCHRESULTS_HH_

#include "ColumbusCore.hh"

COL_NAMESPACE_START

struct MatchResultsPrivate;
class Word;

/**
 * A list of (DocumentID, relevancy) pairs addressed by index.
 *
 * Copy and move construction and assignment are all declared, so results
 * can be passed around by value.
 *
 * NOTE(review): sortIfRequired() is a const private helper, which suggests
 * the getters sort lazily; the ordering rule is in the implementation file.
 */
class COL_PUBLIC MatchResults final {
    MatchResultsPrivate *p; // Opaque implementation state.

    void sortIfRequired() const;

public:
    MatchResults();
    ~MatchResults();
    MatchResults(const MatchResults &other);
    MatchResults(MatchResults &&other);
    const MatchResults& operator=(MatchResults &&other);
    const MatchResults& operator=(const MatchResults &other);

    /** Adds one document with its relevancy. */
    void addResult(DocumentID docID, double relevancy);
    /** Adds the contents of r. */
    void addResults(const MatchResults &r);
    /** Copies entry i of other into this object. */
    void copyResult(const MatchResults &other, const size_t i);

    size_t size() const;
    DocumentID getDocumentID(size_t i) const;
    double getRelevancy(size_t i) const;
};

COL_NAMESPACE_END

#endif /* MATCHRESULTS_HH_ */
libcolumbus-1.1.0+15.10.20150806/include/ColumbusCore.hh.in0000644000015300001610000000462512560622644023355 0ustar pbuserpbgroup00000000000000
/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 3. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */
/* * This file contains the most basic definitions in Columbus. * Almost every single source file includes this, so it must be fast. * It must work as such when included either from C or C++. * */
#ifndef COLUMBUSCORE_H
#define COLUMBUSCORE_H
/* Macros that set symbol visibilities in shared libraries properly. 
* Adapted from http://gcc.gnu.org/wiki/Visibility */ #if defined _WIN32 || defined __CYGWIN__ #ifdef BUILDING_COLUMBUS #define COL_PUBLIC __declspec(dllexport) #else #define COL_PUBLIC __declspec(dllimport) #endif #else #if defined __GNUC__ #define COL_PUBLIC __attribute__ ((visibility("default"))) #else #pragma message ("Compiler does not support symbol visibility.") #define COL_PUBLIC #endif #endif #define UNUSED_VAR __attribute__ ((unused)) #ifdef __cplusplus #include #include #else #include #include #endif #ifdef __cplusplus #define COL_NAMESPACE_START namespace Columbus { #define COL_NAMESPACE_END } #endif #ifdef DEBUG_MESSAGES #ifdef __cplusplus #include #else #include #endif #define debugMessage(...) printf(__VA_ARGS__); #else #define debugMessage(...) #endif #define COLUMBUS_VERSION_STRING "${SO_VERSION}" #define COLUMBUS_ABI_VERSION ${ABI_VERSION} #define COLUMBUS_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" #define COLUMBUS_DATADIR COLUMBUS_INSTALL_PREFIX "/share/${COL_LIB_BASENAME}${SO_VERSION_MAJOR}/" typedef ${LETTER_TYPE} Letter; #define INTERNAL_ENCODING "${INTERNAL_ENCODING}" typedef uint32_t WordID; #define INVALID_WORDID ((WordID)-1) typedef uintptr_t DocumentID; #define INVALID_DOCID ((DocumentID)-1) typedef uint32_t TrieOffset; #cmakedefine HAS_SPARSE_HASH #endif libcolumbus-1.1.0+15.10.20150806/include/ErrorValues.hh0000644000015300001610000000712612560622644022616 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */
#ifndef ERRORVALUES_HH_
#define ERRORVALUES_HH_

#include "ColumbusCore.hh"

COL_NAMESPACE_START

/** Accent sets accepted by ErrorValues::addAccents(). */
enum accentGroups {
    latinAccentGroup,
    greekAccentGroup,
};

struct ErrorValuesPrivate;
class Word;

/**
 * Holds the integer error costs for the individual edit operations.
 *
 * The per-operation costs are plain int members with inline getters and
 * setters. Letter-pair specific substitution costs are configured through
 * setError(), setGroupError() and the add*() functions and queried with
 * getSubstituteError(); their storage is behind the opaque pointer p.
 *
 * NOTE(review): copy assignment is deleted but the copy constructor is
 * not, so a copy would duplicate the raw pointer p -- confirm intent.
 */
class COL_PUBLIC ErrorValues final {
private:
    // Default costs exposed through the static getters below.
    static const int DEFAULT_ERROR = 100;
    static const int DEFAULT_GROUP_ERROR = 30;
    static const int DEFAULT_TYPO_ERROR = 30;
    static const int DEFAULT_SUBSTRING_END_DELETION_ERROR = 15;
    static const int DEFAULT_SUBSTRING_START_INSERTION_ERROR = 15;
    static const size_t DEFAULT_SUBSTRING_START_LENGTH = 3;

    int insertionError;
    int deletionError;
    int endDeletionError;
    int startInsertionError;
    int substituteError; // No inline accessor in this header; used by the implementation.
    int transposeError;
    size_t substringStartLimit; // Query length threshold read by getStartInsertionError().
    ErrorValuesPrivate *p; // Opaque implementation state.

    // Lookup-table helpers (LUT); implemented outside this header.
    void clearLUT();
    void addToLUT(Letter l1, Letter l2, int value);
    void addGroupErrorToLUT(const Word &groupLetters, const int error);
    int substituteErrorSlow(Letter l1, Letter l2) const;
    void setPadError(const Letter number, const char letters[4], int letterCount, int error);

public:
    ErrorValues();
    ~ErrorValues();
    const ErrorValues& operator=(const ErrorValues &other) = delete;

    int getInsertionError() const { return insertionError; }
    int getDeletionError() const { return deletionError; }
    int getEndDeletionError() const { return endDeletionError; }
    /**
     * Returns startInsertionError when queryTermLength is at least
     * substringStartLimit; shorter query terms get the ordinary
     * insertionError instead.
     */
    int getStartInsertionError(const size_t queryTermLength) const { return queryTermLength >= substringStartLimit ? startInsertionError : insertionError; }
    int getTransposeError() const { return transposeError; }

    void setInsertionError(const int e) { insertionError = e; }
    void setDeletionError(const int e) { deletionError = e; }
    void setEndDeletionError(const int e) { endDeletionError = e; }
    void setStartInsertionError(const int e) { startInsertionError = e; }
    void setTransposeError(const int e) { transposeError = e; }
    void setSubstringStartLimit(const size_t e) { substringStartLimit = e; }

    /** Cost of substituting l1 with l2. */
    int getSubstituteError(Letter l1, Letter l2) const;

    // Accessors for the compile-time defaults declared above.
    static int getDefaultError() { return ErrorValues::DEFAULT_ERROR; }
    static int getDefaultGroupError() { return ErrorValues::DEFAULT_GROUP_ERROR; }
    static int getDefaultTypoError() { return ErrorValues::DEFAULT_TYPO_ERROR; }
    static int getSubstringDefaultEndDeletionError() { return ErrorValues::DEFAULT_SUBSTRING_END_DELETION_ERROR; }
    static int getSubstringDefaultStartInsertionError() { return ErrorValues::DEFAULT_SUBSTRING_START_INSERTION_ERROR; }

    /** Sets the cost for the letter pair (l1, l2). */
    void setError(Letter l1, Letter l2, const int error);
    /** Sets the cost for the letters listed in groupLetters. */
    void setGroupError(const Word &groupLetters, const int error);

    // Bulk configuration; the concrete tables are in the implementation file.
    void addAccents(accentGroups group);
    void addKeyboardErrors();
    void addNumberpadErrors();
    void addStandardErrors();

    bool isInGroup(Letter l);
    void clearErrors();
    void setSubstringMode();
};

COL_NAMESPACE_END

#endif /* ERRORVALUES_HH_ */
libcolumbus-1.1.0+15.10.20150806/include/WordList.hh0000644000015300001610000000273612560622644022116 0ustar pbuserpbgroup00000000000000
/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */
#ifndef WORDLIST_HH_
#define WORDLIST_HH_

#include "ColumbusCore.hh"

COL_NAMESPACE_START

struct WordListPrivate;
class Word;

/**
 * An indexable sequence of Word objects.
 *
 * Supports copy and move construction and assignment, equality
 * comparison, read-only access by index, and adding a word.
 */
class COL_PUBLIC WordList final {
private:
    WordListPrivate *p; // Opaque implementation state.

public:
    WordList();
    WordList(const WordList &wl);
    WordList(WordList &&wl);
    ~WordList();

    size_t size() const;
    /** Read-only access to element i. NOTE(review): bounds handling is not visible here. */
    const Word& operator[](const size_t i) const;
    const WordList& operator=(const WordList &l);
    const WordList& operator=(WordList &&wl);
    bool operator==(const WordList &l) const;
    bool operator!=(const WordList &l) const;

    void addWord(const Word &w); // This is more of an implementation detail and should not be exposed in a base class or interface.

    // Add proper iterators here.
};

COL_NAMESPACE_END

#endif /* WORDLIST_HH_ */
libcolumbus-1.1.0+15.10.20150806/include/WordStore.hh0000644000015300001610000000306512560622644022273 0ustar pbuserpbgroup00000000000000
/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */
#ifndef WORDSTORE_HH_
#define WORDSTORE_HH_

#include "ColumbusCore.hh"

COL_NAMESPACE_START

/* * We will get words multiple times. This class assigns each word a unique * ID. This deduplicates data and ensures we only keep one copy * of each word in memory. 
* * This is, roughly, a simpler version of * http://mailinator.blogspot.fi/2012/02/how-mailinator-compresses-email-by-90.html */

struct WordStorePrivate;
class Word;

/**
 * Maps Word values to WordID values and back.
 *
 * The store cannot be copied: copy construction and copy assignment are
 * both deleted.
 */
class COL_PUBLIC WordStore final {
private:
    WordStorePrivate *p; // Opaque implementation state.

public:
    WordStore();
    ~WordStore();
    WordStore(const WordStore &other) = delete;
    const WordStore & operator=(const WordStore &other) = delete;

    /**
     * Returns the ID of w. Non-const, so the call may modify the store;
     * presumably an unseen word is registered on first use -- verify in
     * the implementation.
     */
    WordID getID(const Word &w);
    bool hasWord(const Word &w) const;
    /** Returns the word stored under id. */
    Word getWord(const WordID id) const;
    bool hasWord(const WordID id) const;
};

COL_NAMESPACE_END

#endif /* WORDSTORE_HH_ */
libcolumbus-1.1.0+15.10.20150806/include/ResultFilter.hh0000644000015300001610000000270012560622644022762 0ustar pbuserpbgroup00000000000000
/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/
#ifndef RESULTFILTER_HH_
#define RESULTFILTER_HH_

#include "ColumbusCore.hh"

COL_NAMESPACE_START

struct ResultFilterPrivate;
class Word;

/**
 * A two-level filter description: a list of terms, each holding a list of
 * sub-terms, where a sub-term is a (field, word) pair.
 *
 * NOTE(review): how terms and sub-terms are combined when filtering (for
 * example AND across terms, OR within one) is not stated in this header --
 * check the code that consumes the filter.
 */
class COL_PUBLIC ResultFilter final {
private:
    ResultFilterPrivate *p; // Opaque implementation state.

public:
    ResultFilter();
    ~ResultFilter();
    ResultFilter(const ResultFilter &rf) = delete;
    const ResultFilter & operator=(const ResultFilter &other) = delete;

    /** Starts a new term. */
    void addNewTerm();
    /** Adds a (field, word) sub-term; presumably to the most recently added term -- confirm. */
    void addNewSubTerm(const Word &field, const Word &word);

    size_t numTerms() const;
    size_t numSubTerms(const size_t term) const;
    const Word& getField(const size_t term, const size_t subTerm) const;
    const Word& getWord(const size_t term, const size_t subTerm) const;
};

COL_NAMESPACE_END

#endif /* RESULTFILTER_HH_ */
libcolumbus-1.1.0+15.10.20150806/include/MatcherStatistics.hh0000644000015300001610000000231712560622644024000 0ustar pbuserpbgroup00000000000000
/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #ifndef MATCHERSTATISTICS_HH_ #define MATCHERSTATISTICS_HH_ #include "ColumbusCore.hh" COL_NAMESPACE_START struct MatcherStatisticsPrivate; class Word; class MatcherStatistics final { private: MatcherStatisticsPrivate *p; public: MatcherStatistics(); ~MatcherStatistics(); void wordProcessed(const WordID w); size_t getTotalWordCount(const WordID w) const; void addedWordToIndex(const WordID word, const Word &fieldName); }; COL_NAMESPACE_END #endif /* MATCHERSTATISTICS_HH_ */ libcolumbus-1.1.0+15.10.20150806/include/ColumbusSlow.hh0000644000015300001610000000217312560622644023000 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This file contains helper functions that for some reason or * another must include libc++ stuff. Whenever this file is included * compilation times slow down. * * If a function can be removed from here, it should. */ #ifndef COLUMBUSSLOW_HH #define COLUMBUSSLOW_HH #include "ColumbusCore.hh" #include COL_NAMESPACE_START std::string findDataFile(const std::string &baseName); COL_NAMESPACE_END #endif libcolumbus-1.1.0+15.10.20150806/include/SearchParameters.hh0000644000015300001610000000266712560622644023603 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2013 Canonical, Ltd. 
* * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */
#ifndef SEARCHPARAMETERS_H_
#define SEARCHPARAMETERS_H_

#include "ColumbusCore.hh"

COL_NAMESPACE_START

struct SearchParametersPrivate;
class Word;
class ResultFilter;

/**
 * Per-search settings: a "dynamic" flag, a per-word error lookup, an
 * embedded ResultFilter and a set of non-searching fields.
 *
 * NOTE(review): copy assignment is deleted but the copy constructor is
 * not, although the class holds a raw pointer p -- confirm intent.
 */
class COL_PUBLIC SearchParameters final {
private:
    SearchParametersPrivate *p; // Opaque implementation state.

public:
    SearchParameters();
    ~SearchParameters();
    SearchParameters & operator=(const SearchParameters &other) = delete;

    bool isDynamic() const;
    void setDynamic(bool dyn);
    /** Error value for the word w. NOTE(review): relation to the dynamic flag is not visible here. */
    int getDynamicError(const Word &w) const;

    // Mutable and read-only access to the filter.
    ResultFilter& getResultFilter();
    const ResultFilter& getResultFilter() const;

    void addNonsearchingField(const Word &w);
    bool isNonsearchingField(const Word &w) const;

    int looseningIterations() const;
};

COL_NAMESPACE_END

#endif
libcolumbus-1.1.0+15.10.20150806/include/IndexMatches.hh0000644000015300001610000000310212560622644022707 0ustar pbuserpbgroup00000000000000
/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */
#ifndef INDEXMATCHES_H_
#define INDEXMATCHES_H_

// NOTE(review): the include directive below has no target in this copy of
// the file. Nothing declared in this header identifies which header was
// meant, so it is left untouched -- restore it from the upstream source.
#include
#include "ColumbusCore.hh"

COL_NAMESPACE_START

class LevenshteinIndex;
struct IndexMatchesPrivate;
class Word;

/**
 * A class that contains a list of index matches
 * in growing error order.
 *
 * Entries are added only by LevenshteinIndex (a friend) through the
 * private addMatch(); other code reads them by position.
 */
class COL_PUBLIC IndexMatches final {
    friend class LevenshteinIndex;

private:
    IndexMatchesPrivate *p; // Opaque implementation state.

    /** Records that queryWord matched matchedWord with the given error. */
    void addMatch(const Word &queryWord, const WordID matchedWord, int error);
    void sort();

public:
    IndexMatches();
    ~IndexMatches();
    IndexMatches(const IndexMatches &other) = delete;
    const IndexMatches & operator=(const IndexMatches &other) = delete;

    size_t size() const;
    // Accessors for entry number num.
    const WordID& getMatch(size_t num) const;
    const Word& getQuery(size_t num) const;
    int getMatchError(size_t num) const;
    void clear();
};

COL_NAMESPACE_END

#endif /* INDEXMATCHES_H_ */
libcolumbus-1.1.0+15.10.20150806/include/ColumbusHelpers.hh0000644000015300001610000000241412560622644023454 0ustar pbuserpbgroup00000000000000
/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #ifndef COLUMBUSHELPERS_H_ #define COLUMBUSHELPERS_H_ #include "ColumbusCore.hh" COL_NAMESPACE_START class Word; class WordList; Letter* utf8ToInternal(const char *utf8Text, unsigned int &resultStringSize); void internalToUtf8(const Letter *source, unsigned int characters, char *buf, unsigned int bufsize); COL_PUBLIC COL_PUBLIC double hiresTimestamp(); COL_PUBLIC WordList splitToWords(const char *utf8Text); COL_PUBLIC WordList split(const char *utf8Text, const Letter *splitChars, int numChars); COL_PUBLIC bool isWhitespace(Letter l); COL_NAMESPACE_END #endif libcolumbus-1.1.0+15.10.20150806/include/columbus.h0000644000015300001610000000574112560622644022027 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This file defines the C API of Columbus. If you can, it is strongly * recommended to use the C++ API of columbus.hh instead. */ #ifndef COLUMBUS_H_ #define COLUMBUS_H_ #ifdef COLUMBUS_HH_ #error "Mixing C and C++ public header includes. You can only use one or the other." 
#endif #include "ColumbusCore.hh" #ifdef __cplusplus extern "C" { #endif typedef void* ColWord; typedef void* ColDocument; typedef void* ColMatcher; typedef void* ColMatchResults; typedef void* ColCorpus; typedef void* ColErrorValues; typedef void* ColIndexWeights; COL_PUBLIC ColWord col_word_new(const char *utf8_word); COL_PUBLIC void col_word_delete(ColWord w); COL_PUBLIC size_t col_word_length(ColWord w); COL_PUBLIC void col_word_as_utf8(ColWord w, char *buf, unsigned int bufSize); COL_PUBLIC ColDocument col_document_new(DocumentID id); COL_PUBLIC void col_document_delete(ColDocument doc); COL_PUBLIC DocumentID col_document_get_id(ColDocument doc); COL_PUBLIC void col_document_add_text(ColDocument doc, ColWord field_name, const char *text_as_utf8); COL_PUBLIC ColMatcher col_matcher_new(); COL_PUBLIC void col_matcher_delete(ColMatcher m); COL_PUBLIC void col_matcher_index(ColMatcher m, ColCorpus c); COL_PUBLIC ColMatchResults col_matcher_match(ColMatcher m, const char *query_as_utf8); COL_PUBLIC ColErrorValues col_matcher_get_error_values(ColMatcher m); COL_PUBLIC ColIndexWeights col_matcher_get_index_weights(ColMatcher m); COL_PUBLIC ColMatchResults col_match_results_new(); COL_PUBLIC void col_match_results_delete(ColMatchResults mr); COL_PUBLIC size_t col_match_results_size(ColMatchResults mr); COL_PUBLIC DocumentID col_match_results_get_id(ColMatchResults mr, size_t i); COL_PUBLIC double col_match_results_get_relevancy(ColMatchResults mr, size_t i); COL_PUBLIC ColCorpus col_corpus_new(); COL_PUBLIC void col_corpus_delete(ColCorpus c); COL_PUBLIC void col_corpus_add_document(ColCorpus c, ColDocument d); COL_PUBLIC void col_index_weights_set_weight(ColIndexWeights weights, const ColWord field, const double new_weight); COL_PUBLIC double col_index_weights_get_weight(ColIndexWeights weights, const ColWord field); COL_PUBLIC void col_error_values_add_standard_errors(ColErrorValues ev); COL_PUBLIC void col_error_values_set_substring_mode(ColErrorValues ev); #ifdef 
__cplusplus } #endif #endif libcolumbus-1.1.0+15.10.20150806/include/columbus.hh0000644000015300001610000000246012560622644022172 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * Columbus is an error tolerant matcher system. * * Include this file to use the C++ API to Columbus. * * If you need a C api, include columbus.h instead. * It is not guaranteed to have all functionality, though. */ #ifndef COLUMBUS_HH_ #define COLUMBUS_HH_ #ifdef COLUMBUS_H_ #error "Mixing C and C++ public header includes. You can only use one or the other." #endif #include #include #include #include #include #include #include #include #include #endif libcolumbus-1.1.0+15.10.20150806/include/ErrorMatrix.hh0000644000015300001610000000333512560622644022621 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #ifndef ERRORMATRIX_HH_ #define ERRORMATRIX_HH_ #include"ColumbusCore.hh" /* * A helper class for LevenshteinIndex to keep track of the * error values. * * This class only works because LevenshteinIndex does depth * first search. Breadth first search will break it completely. * * So don't use that then. */ COL_NAMESPACE_START class ErrorMatrix final { size_t rows, columns; int **m; public: ErrorMatrix(const size_t rows_, const size_t columns_, const int insertError, const int deletionError); ~ErrorMatrix(); ErrorMatrix(const ErrorMatrix &em) = delete; const ErrorMatrix & operator=(const ErrorMatrix &other) = delete; void set(const size_t rowNum, const size_t colNum, const int error); // No bounds checking because this is in the hot path. inline int get(const size_t rowNum, const size_t colNum) const { return m[rowNum][colNum]; } int totalError(const size_t rowNum) const; int minError(const size_t rowNum) const; }; COL_NAMESPACE_END #endif /* ERRORMATRIX_HH_ */ libcolumbus-1.1.0+15.10.20150806/include/CMakeLists.txt0000644000015300001610000000130212560622644022552 0ustar pbuserpbgroup00000000000000configure_file(ColumbusCore.hh.in ColumbusCore.hh) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ColumbusCore.hh Matcher.hh MatchResults.hh columbus.h columbus.hh Word.hh WordList.hh Corpus.hh ErrorValues.hh Document.hh ColumbusHelpers.hh IndexWeights.hh DESTINATION include/${COL_LIB_BASENAME}${SO_VERSION_MAJOR}) # Build and install a pkg-config file set(prefix ${CMAKE_INSTALL_PREFIX}) set(exec_prefix ${prefix}/bin) set(libdir ${prefix}/${LIBDIR}) set(includedir ${prefix}/include/${COL_LIB_BASENAME}${SO_VERSION_MAJOR}) set(pkg-name "lib${COL_LIB_BASENAME}") configure_file(libcolumbus.pc.in libcolumbus.pc @ONLY) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libcolumbus.pc DESTINATION ${LIBDIR}/pkgconfig) 
libcolumbus-1.1.0+15.10.20150806/include/Corpus.hh0000644000015300001610000000237312560622644021617 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #ifndef CORPUS_HH_ #define CORPUS_HH_ #include "ColumbusCore.hh" COL_NAMESPACE_START struct CorpusPrivate; class Document; class COL_PUBLIC Corpus final { private: CorpusPrivate *p; public: Corpus(); ~Corpus(); Corpus(const Corpus &c) = delete; const Corpus& operator=(const Corpus &c) = delete; void addDocument(const Document &d); size_t size() const; const Document& getDocument(size_t i) const; // Add iterators here. This class should really only expose them. }; COL_NAMESPACE_END #endif /* CORPUS_HH_ */ libcolumbus-1.1.0+15.10.20150806/include/Matcher.hh0000644000015300001610000000477112560622644021733 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #ifndef MATCHER_HH_ #define MATCHER_HH_ #include "ColumbusCore.hh" #include COL_NAMESPACE_START class Corpus; struct MatcherPrivate; class Word; class Document; class WordList; class MatchResults; class ErrorValues; class IndexWeights; class ResultFilter; class SearchParameters; class COL_PUBLIC Matcher final { private: MatcherPrivate *p; void buildIndexes(const Corpus &c); void addToIndex(const Word &word, const WordID wordID, const WordID indexID); void relevancyMatch(const WordList &query, const SearchParameters ¶ms, const int extraError, MatchResults &matchedDocuments); public: Matcher(); ~Matcher(); Matcher& operator=(const Matcher &m) = delete; // The simple API MatchResults match(const char *queryAsUtf8); MatchResults match(const WordList &query); MatchResults match(const std::string &queryAsUtf8); // When you want to specify search parameters exactly. MatchResults match(const char *queryAsUtf8, const SearchParameters ¶ms); MatchResults match(const WordList &query, const SearchParameters ¶ms); void index(const Corpus &c); ErrorValues& getErrorValues(); IndexWeights& getIndexWeights(); /* * This function is optimized for online matches, that is, queries * that are live updated during typing. It uses slightly different * search heuristics to ensure results that "feel good" to humans. * * The second argument is the field that should be the primary focus. * Usually it means having the text that will be shown to the user. * As an example, in the HUD, this field would contain the command * (and nothing else) that will be executed. 
*/ MatchResults onlineMatch(const WordList &query, const Word &primaryIndex); }; COL_NAMESPACE_END #endif /* MATCHER_HH_ */ libcolumbus-1.1.0+15.10.20150806/python/0000755000015300001610000000000012560622775017721 5ustar pbuserpbgroup00000000000000libcolumbus-1.1.0+15.10.20150806/python/pch/0000755000015300001610000000000012560622775020473 5ustar pbuserpbgroup00000000000000libcolumbus-1.1.0+15.10.20150806/python/pch/colpython_pch.hh0000644000015300001610000000157512560622644023670 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This is the precompiled header file for Python bindings. * It contains only system headers. 
It must NOT have any internal * headers */ #include libcolumbus-1.1.0+15.10.20150806/python/CMakeLists.txt0000644000015300001610000000103012560622644022446 0ustar pbuserpbgroup00000000000000include_directories(${Boost_INCLUDE_DIR}) include_directories(${PYTHONLIBS_INCLUDE_DIRS}) if(use_python2) set(python_lib_name "columbus") else() set(python_lib_name "columbus.${pysoabi}") endif() add_library(columbus_ext SHARED columbus.cc) target_link_libraries(columbus_ext ${COL_LIB_BASENAME} ${BOOST_PYTHON_HACK} ${PYTHONLIBS_LIBRARIES}) set_target_properties(columbus_ext PROPERTIES OUTPUT_NAME ${python_lib_name} PREFIX "") add_pch(pch/colpython_pch.hh columbus_ext) install(TARGETS columbus_ext DESTINATION ${PYTHONDIR}) libcolumbus-1.1.0+15.10.20150806/python/columbus.cc0000644000015300001610000000712412560622644022060 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include #include "columbus.hh" using namespace boost::python; using namespace Columbus; void (Document::*addAdaptor) (const Word &, const std::string &) = &Document::addText; MatchResults (Matcher::*queryAdaptor) (const std::string &) = &Matcher::match; BOOST_PYTHON_MODULE(columbus) { class_("Corpus", init<>()) .def("size", &Corpus::size) .def("add_document", &Corpus::addDocument) .def("__len__", &Corpus::size) ; class_("Word", init()) .def("get_string", &Word::asUtf8) .def("__len__", &Word::length) ; class_("WordList", init<>()) .def(init()) .def("__len__", &WordList::size) .def("__getitem__", &WordList::operator[], return_internal_reference<>()) .def("add_word", &WordList::addWord) ; def("split_to_words", splitToWords); class_("Document", init()) .def(init()) .def("field_count", &Document::fieldCount) .def("get_id", &Document::getID) .def("get_text", &Document::getText, return_internal_reference<>()) .def("add_text", addAdaptor) ; class_("MatchResults") .def("add_results", &MatchResults::addResults) .def("get_document_id", &MatchResults::getDocumentID) .def("get_relevancy", &MatchResults::getRelevancy) .def("__len__", &MatchResults::size) ; class_("Matcher") .def("index", &Matcher::index) .def("match", queryAdaptor) .def("get_errorvalues", &Matcher::getErrorValues, return_internal_reference<>()) .def("get_indexweights", &Matcher::getIndexWeights, return_internal_reference<>()) ; class_("ErrorValues", init<>()) .def("add_standard_errors", &ErrorValues::addStandardErrors) .def("set_substring_mode", &ErrorValues::setSubstringMode) .def("set_end_deletion_error", &ErrorValues::setEndDeletionError) .def("set_error", &ErrorValues::setError) .def("get_substitute_error", &ErrorValues::getSubstituteError) .def("get_default_error", &ErrorValues::getDefaultError) .staticmethod("get_default_error") .def("get_substring_default_end_deletion_error", &ErrorValues::getSubstringDefaultEndDeletionError) .staticmethod("get_substring_default_end_deletion_error") 
.def("clear_errors", &ErrorValues::clearErrors) ; class_("IndexWeights") .def("set_weight", &IndexWeights::setWeight) .def("get_weight", &IndexWeights::getWeight) ; } libcolumbus-1.1.0+15.10.20150806/hacking.txt0000644000015300001610000000425112560622644020542 0ustar pbuserpbgroup00000000000000A quick overview of the internals of Columbus Data model The Columbus library has a very simple view of the world. The basic unit of data it deals with is the Corpus. A corpus is a collection of Documents. A document is a named collection of texts and an ID. Columbus indexes these texts and allows the user to search through them efficiently. To make thing more clear, let's examine a simple music database. In it every song is a document. A corpus with three songs could look like this: song0 Author: Britney Spears Name: Toxic Album: In the Zone song1 Author: Micheal Jackson Name: Billie Jean Album: Thriller song2 Author: The Beatles Name: Lucy in the Sky with Diamonds Album: Yellow Submarine Soundtrack Did you notice the typo? That's intentional. Very, very few real world data sources are clean. Errors like this happen all the time, and it is the job of the search engine to deal with them. Error tolerant word matching Columbus is built around a data structure that efficiently computes the Damerau-Levenshtein distance for a set of words. For details see http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance The matcher allows for custom errors, so for example the replacement error a->ä could be less than the standard replacement error. Additionally, the replacement error d->c could be set lower because the two letters are close to each other on the keyboard. Multiple letter replacements, e.g. the german ß -> ss are not supported yet. The actual search algorithm is quite straightforward. First the query term is split into search words. Then all words in the data that are within a certain error from the search terms is found. 
We then look at these terms and see which documents contain them. For each match we increment the relevancy of the given document. The more "important" the word the more we increment the relevancy. The smaller the word match error was, the bigger the increase in relevancy. The user can specify relative weights for different fields. In the music example, adding the weight of the song name is probably a good idea. Once all words have been processed, we just sort the document by relevancy and we have our result set. libcolumbus-1.1.0+15.10.20150806/share/0000755000015300001610000000000012560622775017502 5ustar pbuserpbgroup00000000000000libcolumbus-1.1.0+15.10.20150806/share/greekAccentedLetterGroups.txt0000644000015300001610000000005312560622644025340 0ustar pbuserpbgroup00000000000000αά εέ ηή ιίϊΐ οό υύϋΰ ωώ libcolumbus-1.1.0+15.10.20150806/share/latinAccentedLetterGroups.txt0000644000015300001610000000134612560622644025360 0ustar pbuserpbgroup00000000000000aáàăắằẵẳâấầẫẩǎåǻäǟãȧǡąāảȁȃạặậḁⱥᶏ bḃḅḇƀ cćĉčċçḉȼƈ dďḋḑḍḓḏđɖ eéèĕêếềễểěëẽėȩḝęēḗḕẻȅȇẹệḙḛɇ gǵğĝǧġģḡǥɠ hĥȟḧḣḩḥḫẖ iíìĭîǐïḯĩįīỉȉȋịḭɨᵻᶖİi jĵɉǰȷ kḱǩķḳḵƙⱪ lĺľļḷḹḽḻł mḿṁṃ nńǹňñṅņṇṋṉn oóòŏôốồỗổǒöȫőõṍṏȭȯo͘ȱøǿǫǭōṓṑỏȍȏơớờỡởợọộɵ pṕṗᵽƥp̃ rŕřṙŗȑȓṛṝṟɍ sśṥŝšṧṡşṣṩșs̩ tťṫţṭțṱṯŧⱦƭʈẗ uúùŭûǔůüǘǜǚǖűũṹųūṻủȕȗưứừữửựụṳṷṵ vṽṿ wẃẁŵẅẇẉẘ xẍẋ ýỳŷẙÿỹẏȳỷỵɏƴ zźẑžżẓẕƶ libcolumbus-1.1.0+15.10.20150806/share/CMakeLists.txt0000644000015300001610000000020412560622644022231 0ustar pbuserpbgroup00000000000000install(FILES latinAccentedLetterGroups.txt greekAccentedLetterGroups.txt DESTINATION share/${COL_LIB_BASENAME}${SO_VERSION_MAJOR}) libcolumbus-1.1.0+15.10.20150806/tools/0000755000015300001610000000000012560622775017540 5ustar pbuserpbgroup00000000000000libcolumbus-1.1.0+15.10.20150806/tools/sctest.cc0000644000015300001610000002237112560622644021354 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. 
* * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * A simple GUI application to search application list data. */ #include "columbus.hh" #include "WordList.hh" #include #include #include #include #include #include using namespace Columbus; using namespace std; const char *queryTime = "Query time: "; const char *updateTime = "GUI update time: "; const char *resultCount = "Total results: "; const int DEFAULT_ERROR = 200; const Word nameField("name"); const Word genericField("genericname"); const Word commentField("comment"); struct app_data { Matcher *m; GtkWidget *window; GtkWidget *entry; GtkListStore *matchStore; GtkWidget *matchView; GtkWidget *queryTimeLabel; GtkWidget *resultCountLabel; GtkWidget *updateTimeLabel; vector names; }; static gboolean delete_event(GtkWidget */*widget*/, GdkEvent */*event*/, gpointer /*data*/) { gtk_main_quit(); return TRUE; } static void destroy(GtkWidget */*widget*/, gpointer /*data*/) { gtk_main_quit (); } void updateModel(app_data *app, MatchResults &matches) { GtkTreeIter iter; gtk_widget_freeze_child_notify(app->matchView); gtk_tree_view_set_model(GTK_TREE_VIEW(app->matchView), 0); gtk_list_store_clear(app->matchStore); for(size_t i=0; imatchStore, &iter); gtk_list_store_set(app->matchStore, &iter, 0, app->names[id].c_str(), 1, matches.getRelevancy(i), -1); } gtk_tree_view_set_model(GTK_TREE_VIEW(app->matchView), GTK_TREE_MODEL(app->matchStore)); 
gtk_widget_thaw_child_notify(app->matchView); } static void doSearch(GtkWidget */*widget*/, gpointer data) { app_data *app = (app_data*) data; MatchResults matches; double updateStart, updateEnd; double queryStart, queryEnd; try { queryStart = hiresTimestamp(); matches = app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry))); queryEnd = hiresTimestamp(); } catch(exception &e) { printf("Matching failed: %s\n", e.what()); gtk_list_store_clear(app->matchStore); gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), queryTime); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), resultCount); gtk_label_set_text(GTK_LABEL(app->updateTimeLabel), updateTime); gtk_entry_set_text(GTK_ENTRY(app->entry), ""); return; } updateStart = hiresTimestamp(); updateModel(app, matches); updateEnd = hiresTimestamp(); char buf[1024]; sprintf(buf, "%s%.2f", queryTime, queryEnd - queryStart); gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), buf); sprintf(buf, "%s%.2f", updateTime, updateEnd - updateStart); gtk_label_set_text(GTK_LABEL(app->updateTimeLabel), buf); sprintf(buf, "%s%lu", resultCount, (unsigned long) matches.size()); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), buf); } void build_gui(app_data &app) { GtkWidget *vbox; GtkWidget *scroller; GtkWidget *quitButton; GtkTreeViewColumn *textColumn; GtkTreeViewColumn *relevancyColumn; app.window = gtk_window_new(GTK_WINDOW_TOPLEVEL); g_signal_connect (app.window, "delete-event", G_CALLBACK (delete_event), NULL); g_signal_connect (app.window, "destroy", G_CALLBACK (destroy), NULL); gtk_window_set_default_size(GTK_WINDOW(app.window), 600, 700); gtk_window_set_title(GTK_WINDOW(app.window), "Software list browser tool"); vbox = gtk_box_new(GTK_ORIENTATION_VERTICAL, 0); gtk_container_add(GTK_CONTAINER(app.window), vbox); app.entry = gtk_entry_new(); gtk_widget_set_tooltip_text(app.entry, "Word to search, must not contain whitespace."); g_signal_connect(app.entry, "changed", G_CALLBACK(doSearch), &app); app.matchStore = 
gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_DOUBLE); app.matchView = gtk_tree_view_new_with_model(GTK_TREE_MODEL(app.matchStore)); textColumn = gtk_tree_view_column_new_with_attributes("Match", gtk_cell_renderer_text_new(), "text", 0, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), textColumn); relevancyColumn = gtk_tree_view_column_new_with_attributes("Relevancy", gtk_cell_renderer_text_new(), "text", 1, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), relevancyColumn); scroller = gtk_scrolled_window_new(NULL, NULL); gtk_container_add(GTK_CONTAINER(scroller), app.matchView); app.queryTimeLabel = gtk_label_new(queryTime); gtk_label_set_justify(GTK_LABEL(app.queryTimeLabel), GTK_JUSTIFY_LEFT); app.resultCountLabel = gtk_label_new(resultCount); gtk_label_set_justify(GTK_LABEL(app.resultCountLabel), GTK_JUSTIFY_LEFT); app.updateTimeLabel = gtk_label_new(updateTime); gtk_label_set_justify(GTK_LABEL(app.updateTimeLabel), GTK_JUSTIFY_LEFT); quitButton = gtk_button_new_with_label("Quit"); g_signal_connect(quitButton, "clicked", G_CALLBACK(destroy), NULL); gtk_box_pack_start(GTK_BOX(vbox), app.entry, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), scroller, TRUE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.queryTimeLabel, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.updateTimeLabel, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.resultCountLabel, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), quitButton, FALSE, TRUE, 0); gtk_widget_show_all(app.window); } void processFile(string &fname, WordList &name, WordList &genericName, WordList &comment) { ifstream ifile(fname.c_str()); Word f("GenericName"); if(ifile.fail()) { printf("Could not open file %s.\n", fname.c_str()); exit(1); } string line; while(getline(ifile, line)) { WordList vals; Word n; size_t equalsLoc = line.find('=', 0); if(equalsLoc < line.length()) { vals = splitToWords(line.c_str() + equalsLoc + 1); line[equalsLoc] = '\0'; try { n = line.c_str(); } 
catch (invalid_argument &e) { continue; } } else { continue; } if(vals.size() > 0) { if(n == "name") name = vals; if(n == "genericname") genericName = vals; if(n == "comment") comment = vals; } } } void buildCorpus(Corpus &c, app_data &app) { string dataDir = "/usr/share/app-install/desktop"; DIR *dp; struct dirent *dirp; dp = opendir(dataDir.c_str()); if(!dp) { throw runtime_error("Could not open data dir."); } while((dirp = readdir(dp))) { WordList name, genericName, comment; if(dirp->d_name[0] == '.') continue; string fullPath = dataDir; fullPath += "/"; fullPath += dirp->d_name; processFile(fullPath, name, genericName, comment); if(name.size() > 0) { Document d(c.size()); d.addText(nameField, name); if(genericName.size() > 0) d.addText(genericField, genericName); if(comment.size() > 0) d.addText(commentField, comment); c.addDocument(d); app.names.push_back(dirp->d_name); } } printf("Read in %lu documents.\n", (unsigned long)c.size()); closedir(dp); } void build_matcher(app_data &app) { Corpus c; Word field("name"); size_t i=0; double dataReadStart, dataReadEnd; app.m = new Matcher(); // Build Corpus. 
dataReadStart = hiresTimestamp(); buildCorpus(c, app); dataReadEnd = hiresTimestamp(); printf("Read in %lu documents in %.2f seconds.\n", (unsigned long) i, dataReadEnd - dataReadStart); app.m->index(c); app.m->getIndexWeights().setWeight(genericField, 0.6); app.m->getIndexWeights().setWeight(commentField, 0.3); app.m->getErrorValues().addStandardErrors(); app.m->getErrorValues().setSubstringMode(); } void delete_matcher(app_data &app) { delete app.m; app.m = 0; } int main(int argc, char **argv) { app_data app; double buildStart, buildEnd; gtk_init(&argc, &argv); try { build_gui(app); buildStart = hiresTimestamp(); build_matcher(app); buildEnd = hiresTimestamp(); printf("Building the matcher took %.2f seconds.\n", buildEnd - buildStart); gtk_main(); delete_matcher(app); } catch(std::exception &e) { fprintf(stderr, "Failed with exception: %s\n", e.what()); return 99; } return 0; } libcolumbus-1.1.0+15.10.20150806/tools/queryapp.cc0000644000015300001610000001646012560622644021717 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * A simple GUI application to search the database. */ #include "columbus.hh" // This app should only need public API from Columbus. 
#include #include #include #include using namespace Columbus; using namespace std; const char *queryTime = "Query time: "; const char *resultCount = "Total results: "; const int DEFAULT_ERROR = 200; struct app_data { Matcher *m; GtkWidget *window; GtkWidget *entry; GtkListStore *matchStore; GtkWidget *matchView; GtkWidget *queryTimeLabel; GtkWidget *resultCountLabel; vector source; }; static gboolean delete_event(GtkWidget */*widget*/, GdkEvent */*event*/, gpointer /*data*/) { gtk_main_quit(); return TRUE; } static void destroy(GtkWidget */*widget*/, gpointer /*data*/) { gtk_main_quit (); } static void doSearch(GtkWidget */*widget*/, gpointer data) { app_data *app = (app_data*) data; MatchResults matches; GtkTreeIter iter; double queryStart, queryEnd; try { queryStart = hiresTimestamp(); matches = app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry))); queryEnd = hiresTimestamp(); } catch(exception &e) { printf("Matching failed: %s\n", e.what()); gtk_list_store_clear(app->matchStore); gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), queryTime); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), resultCount); gtk_entry_set_text(GTK_ENTRY(app->entry), ""); return; } gtk_list_store_clear(app->matchStore); for(size_t i=0; imatchStore, &iter); gtk_list_store_set(app->matchStore, &iter, 0, app->source[id].c_str(), 1, matches.getRelevancy(i), -1); } char buf[1024]; sprintf(buf, "%s%.2f", queryTime, queryEnd - queryStart); gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), buf); sprintf(buf, "%s%lu", resultCount, (unsigned long) matches.size()); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), buf); } void build_gui(app_data &app) { GtkWidget *vbox; GtkWidget *hbox; GtkWidget *scroller; GtkWidget *quitButton; GtkWidget *searchButton; GtkTreeViewColumn *textColumn; GtkTreeViewColumn *relevancyColumn; app.window = gtk_window_new(GTK_WINDOW_TOPLEVEL); g_signal_connect (app.window, "delete-event", G_CALLBACK (delete_event), NULL); g_signal_connect (app.window, 
"destroy", G_CALLBACK (destroy), NULL); gtk_window_set_default_size(GTK_WINDOW(app.window), 600, 700); gtk_window_set_title(GTK_WINDOW(app.window), "Columbus query tool"); vbox = gtk_box_new(GTK_ORIENTATION_VERTICAL, 0); gtk_container_add(GTK_CONTAINER(app.window), vbox); hbox = gtk_box_new(GTK_ORIENTATION_HORIZONTAL, 0); app.entry = gtk_entry_new(); gtk_widget_set_tooltip_text(app.entry, "Word to search, must not contain whitespace."); searchButton = gtk_button_new_with_label("Search"); g_signal_connect(searchButton, "clicked", G_CALLBACK(doSearch), &app); g_signal_connect(app.entry, "activate", G_CALLBACK(doSearch), &app); // GTK+ docs say not to connect to "activate" but it seems to work. app.matchStore = gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_DOUBLE); app.matchView = gtk_tree_view_new_with_model(GTK_TREE_MODEL(app.matchStore)); textColumn = gtk_tree_view_column_new_with_attributes("Match", gtk_cell_renderer_text_new(), "text", 0, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), textColumn); relevancyColumn = gtk_tree_view_column_new_with_attributes("Relevancy", gtk_cell_renderer_text_new(), "text", 1, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), relevancyColumn); scroller = gtk_scrolled_window_new(NULL, NULL); gtk_container_add(GTK_CONTAINER(scroller), app.matchView); app.queryTimeLabel = gtk_label_new(queryTime); gtk_label_set_justify(GTK_LABEL(app.queryTimeLabel), GTK_JUSTIFY_LEFT); app.resultCountLabel = gtk_label_new(resultCount); gtk_label_set_justify(GTK_LABEL(app.resultCountLabel), GTK_JUSTIFY_LEFT); quitButton = gtk_button_new_with_label("Quit"); g_signal_connect(quitButton, "clicked", G_CALLBACK(destroy), NULL); gtk_box_pack_start(GTK_BOX(hbox), app.entry, TRUE, TRUE, 0); gtk_box_pack_start(GTK_BOX(hbox), searchButton, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), hbox, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), scroller, TRUE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.queryTimeLabel, FALSE, 
TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.resultCountLabel, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), quitButton, FALSE, TRUE, 0); gtk_widget_show_all(app.window); } void build_matcher(app_data &app, const char *dataFile) { Corpus *c = new Corpus(); Word field("name"); const size_t batchSize = 100000; size_t i=0; double dataReadStart, dataReadEnd; ifstream ifile(dataFile); if(ifile.fail()) { printf("Could not open file %s.\n", dataFile); exit(1); } string line; app.m = new Matcher(); // Build Corpus. dataReadStart = hiresTimestamp(); while(getline(ifile, line)) { if(line.size() == 0) continue; // Remove possible DOS line ending garbage. if(line[line.size()-2] == '\r') line[line.size()-2] = '\0'; Document d(app.source.size()); d.addText(field, line.c_str()); c->addDocument(d); app.source.push_back(line); i++; if(i % batchSize == 0) { app.m->index(*c); delete c; c = new Corpus(); } } app.m->index(*c); delete c; dataReadEnd = hiresTimestamp(); printf("Read in %lu documents in %.2f seconds.\n", (unsigned long) i, dataReadEnd - dataReadStart); } void delete_matcher(app_data &app) { delete app.m; app.m = 0; } int main(int argc, char **argv) { app_data app; double buildStart, buildEnd; gtk_init(&argc, &argv); if(argc < 2) { printf("%s input_data_file.txt\n", argv[0]); return 0; } try { build_gui(app); buildStart = hiresTimestamp(); build_matcher(app, argv[1]); buildEnd = hiresTimestamp(); printf("Building the matcher took %.2f seconds.\n", buildEnd - buildStart); gtk_main(); delete_matcher(app); } catch(std::exception &e) { fprintf(stderr, "Failed with exception: %s\n", e.what()); return 99; } return 0; } libcolumbus-1.1.0+15.10.20150806/tools/queryindex.cc0000644000015300001610000000570412560622644022245 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. 
* * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include "LevenshteinIndex.hh" #include "Word.hh" #include "IndexMatches.hh" #include "ErrorValues.hh" #include "WordStore.hh" #include #include #include #include #include #include using namespace Columbus; void load_data(LevenshteinIndex &ind, WordStore &s, char *file) { FILE *f = fopen(file, "r"); char buffer[1024]; if(!f) { printf("Could not open file %s.\n", file); exit(1); } while(fgets(buffer, 1024, f) != NULL) { unsigned int slen = strlen(buffer); assert(buffer[slen-1] == '\n'); buffer[slen-1] = '\0'; // Chop the \n. Word w(buffer); ind.insertWord(w, s.getID(w)); } fclose(f); } void queryAndPrint(LevenshteinIndex &ind, WordStore &s, Word &query, int maxError) { IndexMatches matches; ErrorValues e; ind.findWords(query, e, maxError, matches); if(matches.size() == 0) { printf("No matches.\n"); return; } for(size_t i=0; i * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ /* * A simple GUI application to search from a list of single words. */ #include "ColumbusHelpers.hh" #include "LevenshteinIndex.hh" #include "Word.hh" #include "ErrorValues.hh" #include "WordStore.hh" #include #include #include #include #include using namespace Columbus; using namespace std; const char *queryTime = "Query time: "; const char *resultCount = "Total results: "; const int DEFAULT_ERROR = 200; struct app_data { LevenshteinIndex ind; WordStore store; ErrorValues e; GtkWidget *window; GtkWidget *entry; GtkListStore *matchStore; GtkWidget *matchView; GtkWidget *queryTimeLabel; GtkWidget *resultCountLabel; GtkWidget *errorSpinner; }; static gboolean delete_event(GtkWidget */*widget*/, GdkEvent */*event*/, gpointer /*data*/) { gtk_main_quit(); return TRUE; } static void destroy(GtkWidget */*widget*/, gpointer /*data*/) { gtk_main_quit (); } static void doSearch(GtkWidget */*widget*/, gpointer data) { app_data *app = (app_data*) data; IndexMatches matches; GtkTreeIter iter; int maxError = gtk_spin_button_get_value_as_int(GTK_SPIN_BUTTON(app->errorSpinner)); double queryStart, queryEnd; try { Word query(gtk_entry_get_text(GTK_ENTRY(app->entry))); if(query.length() == 0) return; queryStart = hiresTimestamp(); app->ind.findWords(query, app->e, maxError, matches); queryEnd = hiresTimestamp(); } catch(exception &e) { printf("Matching failed: %s\n", e.what()); gtk_list_store_clear(app->matchStore); gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), queryTime); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), resultCount); gtk_entry_set_text(GTK_ENTRY(app->entry), ""); return; } gtk_list_store_clear(app->matchStore); for(size_t i=0; istore.getWord(matches.getMatch(i)).toUtf8(buf, 1024); gtk_list_store_append(app->matchStore, &iter); gtk_list_store_set(app->matchStore, &iter, 0, buf, 1, (int)matches.getMatchError(i), -1); } char buf[1024]; sprintf(buf, "%s%.2f", queryTime, queryEnd - queryStart); gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), buf); 
sprintf(buf, "%s%lu", resultCount, (unsigned long) matches.size()); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), buf); } void build_gui(app_data &app) { GtkWidget *vbox; GtkWidget *hbox; GtkWidget *scroller; GtkWidget *quitButton; GtkTreeViewColumn *textColumn; GtkTreeViewColumn *errorColumn; app.window = gtk_window_new(GTK_WINDOW_TOPLEVEL); g_signal_connect (app.window, "delete-event", G_CALLBACK (delete_event), NULL); g_signal_connect (app.window, "destroy", G_CALLBACK (destroy), NULL); gtk_window_set_default_size(GTK_WINDOW(app.window), 600, 700); gtk_window_set_title(GTK_WINDOW(app.window), "Columbus single word search test tool"); vbox = gtk_box_new(GTK_ORIENTATION_VERTICAL, 0); gtk_container_add(GTK_CONTAINER(app.window), vbox); hbox = gtk_box_new(GTK_ORIENTATION_HORIZONTAL, 0); app.entry = gtk_entry_new(); gtk_widget_set_tooltip_text(app.entry, "Word to search, must not contain whitespace."); app.errorSpinner = gtk_spin_button_new_with_range(100, 1000, 100); gtk_spin_button_set_value(GTK_SPIN_BUTTON(app.errorSpinner), DEFAULT_ERROR); gtk_widget_set_tooltip_text(app.errorSpinner, "Maximum error, 100 corresponds to one wrong letter."); g_signal_connect(app.entry, "changed", G_CALLBACK(doSearch), &app); g_signal_connect(app.errorSpinner, "value-changed", G_CALLBACK(doSearch), &app); app.matchStore = gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_INT); app.matchView = gtk_tree_view_new_with_model(GTK_TREE_MODEL(app.matchStore)); textColumn = gtk_tree_view_column_new_with_attributes("Match", gtk_cell_renderer_text_new(), "text", 0, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), textColumn); errorColumn = gtk_tree_view_column_new_with_attributes("Error", gtk_cell_renderer_text_new(), "text", 1, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), errorColumn); scroller = gtk_scrolled_window_new(NULL, NULL); gtk_container_add(GTK_CONTAINER(scroller), app.matchView); app.queryTimeLabel = gtk_label_new(queryTime); 
gtk_label_set_justify(GTK_LABEL(app.queryTimeLabel), GTK_JUSTIFY_LEFT); app.resultCountLabel = gtk_label_new(resultCount); gtk_label_set_justify(GTK_LABEL(app.resultCountLabel), GTK_JUSTIFY_LEFT); quitButton = gtk_button_new_with_label("Quit"); g_signal_connect(quitButton, "clicked", G_CALLBACK(destroy), NULL); gtk_box_pack_start(GTK_BOX(hbox), app.entry, TRUE, TRUE, 0); gtk_box_pack_start(GTK_BOX(hbox), app.errorSpinner, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), hbox, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), scroller, TRUE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.queryTimeLabel, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.resultCountLabel, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), quitButton, FALSE, TRUE, 0); gtk_widget_show_all(app.window); } /* * Replace with library data read function once it is finished. */ static void readData(vector &a, const char *ifilename) { FILE *f = fopen(ifilename, "r"); char buffer[1024]; if(!f) { printf("Could not open dictionary file. Skipping performance test.\n"); exit(0); } while(fgets(buffer, 1024, f) != NULL) { unsigned int slen = strlen(buffer); assert(buffer[slen-1] == '\n'); buffer[slen-1] = '\0'; // Chop the \n. Word s(buffer); a.push_back(s); } fclose(f); } int main(int argc, char **argv) { app_data app; vector words; gtk_init(&argc, &argv); if(argc < 2) { printf("%s input_data_file.txt\n", argv[0]); return 0; } try { build_gui(app); readData(words, argv[1]); for(size_t i=0; i * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * A simple GUI application to test number pad error correction. */ #include "columbus.hh" // This app should only need public API from Columbus. #include #include #include #include using namespace Columbus; using namespace std; const char *queryTime = "Query time: "; const char *resultCount = "Total results: "; const int DEFAULT_ERROR = 200; struct app_data { Matcher *m; GtkWidget *window; GtkWidget *entry; GtkListStore *matchStore; GtkWidget *matchView; GtkWidget *queryTimeLabel; GtkWidget *resultCountLabel; vector source; const char *filename; }; static gboolean delete_event(GtkWidget */*widget*/, GdkEvent */*event*/, gpointer /*data*/) { gtk_main_quit(); return TRUE; } static void destroy(GtkWidget */*widget*/, gpointer /*data*/) { gtk_main_quit (); } static void doExactMatch(const char *query, const char *fname) { string regex("^"); string command("grep "); for(int i=0; query[i] != 0; i++) { switch(query[i]) { case '0' : regex += "0"; break; case '1' : regex += "1"; break; case '2' : regex += "[2abc]"; break; case '3' : regex += "[3def]"; break; case '4' : regex += "[4ghi]"; break; case '5' : regex += "[5jkl]"; break; case '6' : regex += "[6mno]"; break; case '7' : regex += "[7pqrs]"; break; case '8' : regex += "[8tuv]"; break; case '9' : regex += "[9wxyz]"; break; default : regex += query[i]; break; } } command += "'"; command += regex; command += "' "; command += fname; printf("\n-------\n"); system(command.c_str()); printf("\n\n"); } static void doSearch(GtkWidget */*widget*/, gpointer data) { app_data *app = (app_data*) data; MatchResults matches; GtkTreeIter iter; double queryStart, queryEnd; try { queryStart = hiresTimestamp(); matches = app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry))); queryEnd = hiresTimestamp(); } catch(exception &e) { printf("Matching failed: %s\n", e.what()); gtk_list_store_clear(app->matchStore); 
gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), queryTime); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), resultCount); gtk_entry_set_text(GTK_ENTRY(app->entry), ""); return; } gtk_list_store_clear(app->matchStore); for(size_t i=0; imatchStore, &iter); gtk_list_store_set(app->matchStore, &iter, 0, app->source[id].c_str(), 1, matches.getRelevancy(i), -1); } char buf[1024]; sprintf(buf, "%s%.2f", queryTime, queryEnd - queryStart); gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), buf); sprintf(buf, "%s%lu", resultCount, (unsigned long) matches.size()); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), buf); doExactMatch(gtk_entry_get_text(GTK_ENTRY(app->entry)), app->filename); } static void padPress(GtkWidget *widget, gpointer data) { app_data *app = (app_data*) data; char txt[2]; txt[0] = gtk_button_get_label(GTK_BUTTON(widget))[0]; txt[1] = 0; gtk_entry_buffer_insert_text(gtk_entry_get_buffer(GTK_ENTRY(app->entry)), 1000, txt, 1); doSearch(NULL, app); } GtkWidget* build_numberpad(app_data *app) { GtkWidget *padBox = gtk_box_new(GTK_ORIENTATION_VERTICAL, 0); GtkWidget *row; GtkWidget *b; row = gtk_box_new(GTK_ORIENTATION_HORIZONTAL, 0); b = gtk_button_new_with_label("1"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); b = gtk_button_new_with_label("2 (abc)"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); b = gtk_button_new_with_label("3 (def)"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(padBox), row, FALSE, TRUE, 0); row = gtk_box_new(GTK_ORIENTATION_HORIZONTAL, 0); b = gtk_button_new_with_label("4 (ghi)"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); b = gtk_button_new_with_label("5 (jkl)"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); 
gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); b = gtk_button_new_with_label("6 (mno)"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(padBox), row, FALSE, TRUE, 0); row = gtk_box_new(GTK_ORIENTATION_HORIZONTAL, 0); b = gtk_button_new_with_label("7 (pqrs)"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); b = gtk_button_new_with_label("8 (tuv)"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); b = gtk_button_new_with_label("9 (wxyz)"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(padBox), row, FALSE, TRUE, 0); row = gtk_box_new(GTK_ORIENTATION_HORIZONTAL, 0); b = gtk_button_new_with_label("*"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); b = gtk_button_new_with_label("0"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); b = gtk_button_new_with_label("#"); g_signal_connect(b, "clicked", G_CALLBACK(padPress), app); gtk_box_pack_start(GTK_BOX(row), b, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(padBox), row, FALSE, TRUE, 0); return padBox; } void build_gui(app_data &app) { GtkWidget *vbox; GtkWidget *hbox; GtkWidget *padBox; GtkWidget *scroller; GtkWidget *quitButton; GtkWidget *searchButton; GtkTreeViewColumn *textColumn; GtkTreeViewColumn *relevancyColumn; app.window = gtk_window_new(GTK_WINDOW_TOPLEVEL); g_signal_connect (app.window, "delete-event", G_CALLBACK (delete_event), NULL); g_signal_connect (app.window, "destroy", G_CALLBACK (destroy), NULL); gtk_window_set_default_size(GTK_WINDOW(app.window), 600, 700); gtk_window_set_title(GTK_WINDOW(app.window), "Number pad search test"); vbox = 
gtk_box_new(GTK_ORIENTATION_VERTICAL, 0); gtk_container_add(GTK_CONTAINER(app.window), vbox); hbox = gtk_box_new(GTK_ORIENTATION_HORIZONTAL, 0); app.entry = gtk_entry_new(); gtk_widget_set_tooltip_text(app.entry, "Word to search, must not contain whitespace."); searchButton = gtk_button_new_with_label("Search"); g_signal_connect(searchButton, "clicked", G_CALLBACK(doSearch), &app); g_signal_connect(app.entry, "activate", G_CALLBACK(doSearch), &app); // GTK+ docs say not to connect to "activate" but it seems to work. app.matchStore = gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_DOUBLE); app.matchView = gtk_tree_view_new_with_model(GTK_TREE_MODEL(app.matchStore)); textColumn = gtk_tree_view_column_new_with_attributes("Match", gtk_cell_renderer_text_new(), "text", 0, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), textColumn); relevancyColumn = gtk_tree_view_column_new_with_attributes("Relevancy", gtk_cell_renderer_text_new(), "text", 1, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), relevancyColumn); scroller = gtk_scrolled_window_new(NULL, NULL); gtk_container_add(GTK_CONTAINER(scroller), app.matchView); app.queryTimeLabel = gtk_label_new(queryTime); gtk_label_set_justify(GTK_LABEL(app.queryTimeLabel), GTK_JUSTIFY_LEFT); app.resultCountLabel = gtk_label_new(resultCount); gtk_label_set_justify(GTK_LABEL(app.resultCountLabel), GTK_JUSTIFY_LEFT); quitButton = gtk_button_new_with_label("Quit"); g_signal_connect(quitButton, "clicked", G_CALLBACK(destroy), NULL); gtk_box_pack_start(GTK_BOX(hbox), app.entry, TRUE, TRUE, 0); gtk_box_pack_start(GTK_BOX(hbox), searchButton, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), hbox, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), scroller, TRUE, TRUE, 0); padBox = build_numberpad(&app); gtk_box_pack_start(GTK_BOX(vbox), padBox, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.queryTimeLabel, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.resultCountLabel, FALSE, TRUE, 0); 
gtk_box_pack_start(GTK_BOX(vbox), quitButton, FALSE, TRUE, 0); gtk_widget_show_all(app.window); } void build_matcher(app_data &app, const char *dataFile) { Corpus *c = new Corpus(); Word field("name"); const size_t batchSize = 100000; size_t i=0; double dataReadStart, dataReadEnd; app.filename = dataFile; ifstream ifile(dataFile); if(ifile.fail()) { printf("Could not open file %s.\n", dataFile); exit(1); } string line; app.m = new Matcher(); app.m->getErrorValues().addNumberpadErrors(); app.m->getErrorValues().setSubstringMode(); // Build Corpus. dataReadStart = hiresTimestamp(); while(getline(ifile, line)) { if(line.size() == 0) continue; // Remove possible DOS line ending garbage. if(line[line.size()-2] == '\r') line[line.size()-2] = '\0'; Document d(app.source.size()); d.addText(field, line.c_str()); c->addDocument(d); app.source.push_back(line); i++; if(i % batchSize == 0) { app.m->index(*c); delete c; c = new Corpus(); } } app.m->index(*c); delete c; dataReadEnd = hiresTimestamp(); printf("Read in %lu documents in %.2f seconds.\n", (unsigned long) i, dataReadEnd - dataReadStart); } void delete_matcher(app_data &app) { delete app.m; app.m = 0; } int main(int argc, char **argv) { app_data app; double buildStart, buildEnd; gtk_init(&argc, &argv); if(argc < 2) { printf("%s input_data_file.txt\n", argv[0]); return 0; } try { build_gui(app); buildStart = hiresTimestamp(); build_matcher(app, argv[1]); buildEnd = hiresTimestamp(); printf("Building the matcher took %.2f seconds.\n", buildEnd - buildStart); gtk_main(); delete_matcher(app); } catch(std::exception &e) { fprintf(stderr, "Failed with exception: %s\n", e.what()); return 99; } return 0; } libcolumbus-1.1.0+15.10.20150806/tools/hudtest.cc0000644000015300001610000002051412560622644021524 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. 
* * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * A simple GUI application to test HUD matching. */ #include "columbus.hh" #include "WordList.hh" #include #include #include #include #include #include using namespace Columbus; using namespace std; const char *queryTime = "Query time: "; const char *resultCount = "Total results: "; const int DEFAULT_ERROR = 200; struct app_data { Matcher *m; GtkWidget *window; GtkWidget *entry; GtkListStore *matchStore; GtkWidget *matchView; GtkWidget *queryTimeLabel; GtkWidget *resultCountLabel; vector pathSource, commandSource; }; static gboolean delete_event(GtkWidget */*widget*/, GdkEvent */*event*/, gpointer /*data*/) { gtk_main_quit(); return TRUE; } static void destroy(GtkWidget */*widget*/, gpointer /*data*/) { gtk_main_quit (); } static void doSearch(GtkWidget */*widget*/, gpointer data) { app_data *app = (app_data*) data; MatchResults matches; GtkTreeIter iter; double queryStart, queryEnd; try { queryStart = hiresTimestamp(); matches = app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry))); queryEnd = hiresTimestamp(); } catch(exception &e) { printf("Matching failed: %s\n", e.what()); gtk_list_store_clear(app->matchStore); gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), queryTime); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), resultCount); gtk_entry_set_text(GTK_ENTRY(app->entry), ""); return; } gtk_list_store_clear(app->matchStore); for(size_t i=0; imatchStore, &iter); 
gtk_list_store_set(app->matchStore, &iter, 0, app->pathSource[id].c_str(), 1, app->commandSource[id].c_str(), 2, matches.getRelevancy(i), -1); } char buf[1024]; sprintf(buf, "%s%.2f", queryTime, queryEnd - queryStart); gtk_label_set_text(GTK_LABEL(app->queryTimeLabel), buf); sprintf(buf, "%s%lu", resultCount, (unsigned long) matches.size()); gtk_label_set_text(GTK_LABEL(app->resultCountLabel), buf); } void build_gui(app_data &app) { GtkWidget *vbox; GtkWidget *scroller; GtkWidget *quitButton; GtkTreeViewColumn *pathColumn; GtkTreeViewColumn *commandColumn; GtkTreeViewColumn *relevancyColumn; app.window = gtk_window_new(GTK_WINDOW_TOPLEVEL); g_signal_connect (app.window, "delete-event", G_CALLBACK (delete_event), NULL); g_signal_connect (app.window, "destroy", G_CALLBACK (destroy), NULL); gtk_window_set_default_size(GTK_WINDOW(app.window), 600, 700); gtk_window_set_title(GTK_WINDOW(app.window), "Columbus query tool"); vbox = gtk_box_new(GTK_ORIENTATION_VERTICAL, 0); gtk_container_add(GTK_CONTAINER(app.window), vbox); app.entry = gtk_entry_new(); gtk_widget_set_tooltip_text(app.entry, "You type your search phrase in, you blank your search phrase out. 
That's what it's all about."); g_signal_connect(app.entry, "changed", G_CALLBACK(doSearch), &app); app.matchStore = gtk_list_store_new(3, G_TYPE_STRING, G_TYPE_STRING, G_TYPE_DOUBLE); app.matchView = gtk_tree_view_new_with_model(GTK_TREE_MODEL(app.matchStore)); pathColumn = gtk_tree_view_column_new_with_attributes("Path", gtk_cell_renderer_text_new(), "text", 0, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), pathColumn); commandColumn = gtk_tree_view_column_new_with_attributes("Command", gtk_cell_renderer_text_new(), "text", 1, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), commandColumn); relevancyColumn = gtk_tree_view_column_new_with_attributes("Relevancy", gtk_cell_renderer_text_new(), "text", 2, NULL); gtk_tree_view_append_column(GTK_TREE_VIEW(app.matchView), relevancyColumn); scroller = gtk_scrolled_window_new(NULL, NULL); gtk_container_add(GTK_CONTAINER(scroller), app.matchView); app.queryTimeLabel = gtk_label_new(queryTime); gtk_label_set_justify(GTK_LABEL(app.queryTimeLabel), GTK_JUSTIFY_LEFT); app.resultCountLabel = gtk_label_new(resultCount); gtk_label_set_justify(GTK_LABEL(app.resultCountLabel), GTK_JUSTIFY_LEFT); quitButton = gtk_button_new_with_label("Quit"); g_signal_connect(quitButton, "clicked", G_CALLBACK(destroy), NULL); gtk_box_pack_start(GTK_BOX(vbox), app.entry, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), scroller, TRUE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.queryTimeLabel, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), app.resultCountLabel, FALSE, TRUE, 0); gtk_box_pack_start(GTK_BOX(vbox), quitButton, FALSE, TRUE, 0); gtk_widget_show_all(app.window); } void splitShowableParts(const string &line, string &pathText, string &commandText) { size_t tokenLoc = line.find('>', 0); pathText.assign(line, 0, tokenLoc); commandText.assign(line, tokenLoc+1, line.length()); } void build_matcher(app_data &app, const char *dataFile) { Corpus *c = new Corpus(); Word pathField("path"); Word 
commandField("command"); Word aliasField("alias"); const size_t batchSize = 100000; size_t i=0; const double pathWeight = 0.3; const double aliasWeight = 0.8; double dataReadStart, dataReadEnd; ifstream ifile(dataFile); if(ifile.fail()) { printf("Could not open file %s.\n", dataFile); exit(1); } string line; app.m = new Matcher(); app.m->getErrorValues().setSubstringMode(); app.m->getErrorValues().addStandardErrors(); // Build Corpus. dataReadStart = hiresTimestamp(); while(getline(ifile, line)) { WordList path, command; string pathText, commandText; if(line.size() == 0) continue; // Remove possible DOS line ending garbage. if(line[line.size()-2] == '\r') line[line.size()-2] = '\0'; splitShowableParts(line, pathText, commandText); path = splitToWords(pathText.c_str()); command = splitToWords(commandText.c_str()); if(command.size() == 0) continue; Document d(app.pathSource.size()); d.addText(pathField, path); d.addText(commandField, command); if(commandText.find("Fuzzy Select") != (size_t)-1) { d.addText(aliasField, "magnetic lasso"); } c->addDocument(d); app.pathSource.push_back(pathText); app.commandSource.push_back(commandText); i++; if(i % batchSize == 0) { app.m->index(*c); delete c; c = new Corpus(); } } app.m->index(*c); delete c; app.m->getIndexWeights().setWeight(pathField, pathWeight); app.m->getIndexWeights().setWeight(aliasField, aliasWeight); dataReadEnd = hiresTimestamp(); printf("Read in %lu documents in %.2f seconds.\n", (unsigned long) i, dataReadEnd - dataReadStart); } void delete_matcher(app_data &app) { delete app.m; app.m = 0; } int main(int argc, char **argv) { app_data app; double buildStart, buildEnd; gtk_init(&argc, &argv); if(argc < 2) { printf("%s input_data_file.txt\n", argv[0]); return 0; } try { build_gui(app); buildStart = hiresTimestamp(); build_matcher(app, argv[1]); buildEnd = hiresTimestamp(); printf("Building the matcher took %.2f seconds.\n", buildEnd - buildStart); gtk_main(); delete_matcher(app); } catch(std::exception &e) { 
fprintf(stderr, "Failed with exception: %s\n", e.what());
        return 99;
    }
    return 0;
}
libcolumbus-1.1.0+15.10.20150806/tools/CMakeLists.txt0000644000015300001610000000132512560622644022274 0ustar pbuserpbgroup00000000000000
# Command-line index query tool; links only against the Columbus library.
# COL_LIB_BASENAME is not set in this file and is expected to come from an
# enclosing CMakeLists.txt.
add_executable(queryindex queryindex.cc)
target_link_libraries(queryindex PRIVATE ${COL_LIB_BASENAME})

# GTK+ 3 demo tools. GTK3_FOUND, GTK3_INCLUDE_DIRS and GTK3_LIBRARIES are not
# set in this file and are expected to come from an enclosing CMakeLists.txt.
# Include paths and link dependencies are attached per target so the GTK
# headers are not added to every target in this directory.
if(GTK3_FOUND)
  foreach(gui_tool singleword queryapp hudtest sctest numberpad)
    add_executable(${gui_tool} ${gui_tool}.cc)
    target_include_directories(${gui_tool} PRIVATE ${GTK3_INCLUDE_DIRS})
    target_link_libraries(${gui_tool} PRIVATE ${COL_LIB_BASENAME} ${GTK3_LIBRARIES})
  endforeach()
endif()
libcolumbus-1.1.0+15.10.20150806/coding style.txt0000644000015300001610000000064712560622644021527 0ustar pbuserpbgroup00000000000000Code style of the Columbus project

- functions must be short, anything bigger than one screen must be split
- indentation is 4 spaces, tabs are forbidden
- opening brace always on the same line
- class header files must be minimal
  - no STL #includes because they slow down compilation massively,
    the only exception is string, which is necessary for interoperation
  - forward declarations instead of #includes
libcolumbus-1.1.0+15.10.20150806/test/0000755000015300001610000000000012560622775017357 5ustar pbuserpbgroup00000000000000libcolumbus-1.1.0+15.10.20150806/test/MatchResultsTest.cc0000644000015300001610000000351412560622644023142 0ustar pbuserpbgroup00000000000000/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 *    Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
* * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include "MatchResults.hh" #include "Word.hh" #include using namespace Columbus; void testMatchResult() { MatchResults r; DocumentID w1 = 0; double r1 = 1; DocumentID w2 = 1; double r2 = 2; DocumentID w3 = 2; double r3 = 0.5; assert(r.size() == 0); r.addResult(w1, r1); assert(r.size() == 1); assert(r.getRelevancy(0) == r1); r.addResult(w2, r2); assert(r.size() == 2); assert(r.getRelevancy(0) == r2); r.addResult(w3, r3); assert(r.size() == 3); assert(r.getRelevancy(0) == r2); } MatchResults gimme() { MatchResults m; m.addResult(1, 1); m.addResult(2, 2); return m; } /* * For great Valgrind justice. */ void testAssignments() { MatchResults m1, m2; m1.addResult(3, 4); m2 = m1; MatchResults m3(m1); MatchResults m4(m3); MatchResults m5(gimme()); MatchResults m6 = gimme(); } int main(int /*argc*/, char **/*argv*/) { try { testMatchResult(); testAssignments(); } catch(const std::exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } return 0; } libcolumbus-1.1.0+15.10.20150806/test/LevScalabilityTest.cc0000644000015300001610000000755712560622644023446 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This file tests the performance using the given wordlist. * It does not check search results, only that the program * does not crash. Run under Valgrind to check for memory leaks. */ #include "LevenshteinIndex.hh" #include "Word.hh" #include "ColumbusHelpers.hh" #include "ErrorValues.hh" #include "WordStore.hh" #include #include #include #include #include using namespace Columbus; using namespace std; void readData(vector &a, const char *ifilename) { FILE *f = fopen(ifilename, "r"); char buffer[1024]; if(!f) { printf("Could not open dictionary file. Skipping performance test.\n"); exit(0); } while(fgets(buffer, 1024, f) != NULL) { unsigned int slen = strlen(buffer); assert(buffer[slen-1] == '\n'); buffer[slen-1] = '\0'; // Chop the \n. Word s(buffer); a.push_back(s); } fclose(f); } /* * Separate function so it stands out in Callgrind. */ void runQueries(int query_size, const int defaultError, vector &a, ErrorValues &e, LevenshteinIndex &ind, IndexMatches &matches) { for(size_t i=0; i < (size_t)query_size; i++) { ind.findWords(a[i], e, 2*defaultError, matches); matches.clear(); } } void runTest(vector &a, int querySize) { double buildStart, buildEnd, queryStart, queryEnd; double plainQueryTime, fullErrorQueryTime, buildTime; LevenshteinIndex ind; WordStore store; IndexMatches matches; ErrorValues e; const int defaultError = LevenshteinIndex::getDefaultError(); buildStart = hiresTimestamp(); for(size_t i=0; i < a.size(); i++) ind.insertWord(a[i], store.getID(a[i])); buildEnd = hiresTimestamp(); buildTime = buildEnd - buildStart; printf("Index built in %.3f seconds. Words per second %.2f.\n", buildTime, a.size()/buildTime); queryStart = hiresTimestamp(); runQueries(querySize, defaultError, a, e, ind, matches); queryEnd = hiresTimestamp(); plainQueryTime = queryEnd - queryStart; printf("Simple queries done in %.3f seconds. 
Queries per second %.2f.\n", plainQueryTime, querySize/plainQueryTime); e.addStandardErrors(); queryStart = hiresTimestamp(); runQueries(querySize, defaultError, a, e, ind, matches); queryEnd = hiresTimestamp(); fullErrorQueryTime = queryEnd - queryStart; printf("Heavy queries done in %.3f seconds. Queries per second %.2f.\n", fullErrorQueryTime, querySize/fullErrorQueryTime); } int runtest(int argc, char **argv) { vector a; int querySize; const char *ifile; if(argc == 1) { printf("%s input_file_name.txt \n", argv[0]); return 1; } ifile = argv[1]; readData(a, ifile); printf("Read in %lu words.\n", (unsigned long) a.size()); if(argc > 2) querySize = atoi(argv[2]); else querySize = a.size(); printf("Querying %d elements.\n", querySize); runTest(a, querySize); return 0; } int main(int argc, char **argv) { try{ return runtest(argc, argv); } catch(const std::exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } } libcolumbus-1.1.0+15.10.20150806/test/CAPITest.c0000644000015300001610000000702212560622644021073 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include "columbus.h" #include #include void testWord() { const int bufSize = 10; char buf[bufSize]; ColWord w = col_word_new("abc"); assert(w); assert(col_word_length(w) == 3); col_word_as_utf8(w, buf, bufSize); assert(strcmp("abc", buf) == 0); col_word_delete(w); } void testDocument() { ColDocument d = col_document_new(55); ColWord w = col_word_new("abc"); assert(d); assert(col_document_get_id(d) == 55); col_document_add_text(d, w, "this is just some text"); col_word_delete(w); col_document_delete(d); } void testMatcher() { ColMatcher m = col_matcher_new(); assert(m); col_matcher_delete(m); } void testMatchResults() { ColMatchResults mr = col_match_results_new(); assert(mr); col_match_results_delete(mr); } void testCorpus() { ColCorpus c = col_corpus_new(); ColDocument d = col_document_new(42); ColWord w = col_word_new("abc"); col_document_add_text(d, w, "this is just some text"); col_word_delete(w); assert(c); col_corpus_add_document(c, d); col_document_delete(d); col_corpus_delete(c); } ColCorpus buildCorpus() { ColCorpus c = col_corpus_new(); DocumentID name1 = 0; DocumentID name2 = 10; DocumentID name3 = 1000; ColWord textName = col_word_new("title"); ColDocument d1, d2, dFar; d1 = col_document_new(name1); col_document_add_text(d1, textName, "abc def"); d2 = col_document_new(name2); col_document_add_text(d2, textName, "abe test"); dFar = col_document_new(name3); col_document_add_text(dFar, textName, "faraway donotmatchme"); col_corpus_add_document(c, d1); col_corpus_add_document(c, d2); col_corpus_add_document(c, dFar); col_word_delete(textName); col_document_delete(d1); col_document_delete(d2); col_document_delete(dFar); return c; } void testMatching() { ColCorpus c = buildCorpus(); ColMatcher m = col_matcher_new(); ColMatchResults matches; DocumentID dFarName = 1000; DocumentID name1 = 0; DocumentID name2 = 10; col_matcher_index(m, c); col_corpus_delete(c); matches = col_matcher_match(m, "abe"); assert(col_match_results_size(matches) == 2); 
assert(col_match_results_get_id(matches, 0) != dFarName); assert(col_match_results_get_id(matches, 1) != dFarName); assert(col_match_results_get_id(matches, 0) == name1 || col_match_results_get_id(matches, 1) == name1); assert(col_match_results_get_id(matches, 0) == name2 || col_match_results_get_id(matches, 1) == name2); col_match_results_delete(matches); col_matcher_delete(m); } int main(int argc UNUSED_VAR, char **argv UNUSED_VAR) { testWord(); testDocument(); testMatcher(); testMatchResults(); testCorpus(); testMatching(); return 0; } libcolumbus-1.1.0+15.10.20150806/test/WordListTest.cc0000644000015300001610000000435612560622644022300 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include "WordList.hh" #include "Word.hh" #include #include using namespace Columbus; void testList() { WordList l; bool gotException; Word w1("abc"); Word w2("def"); Word *w2Shadow = new Word(w2); assert(l.size() == 0); try { Word wTmp = l[1]; gotException = false; } catch(std::out_of_range &e) { gotException = true; } assert(gotException); l.addWord(w1); assert(l.size() == 1); assert(l[0] == w1); l.addWord(*w2Shadow); delete w2Shadow; assert(l.size() == 2); assert(l[1] == w2); } void testAssignment() { WordList *l1 = new WordList(); WordList l2; Word w("abc"); l1->addWord(w); assert(l1->size() == 1); l2 = *l1; assert(l2.size() == 1); assert(l2[0] == w); delete l1; assert(l2.size() == 1); assert(l2[0] == w); } void testEquality() { WordList l1, l2; Word w1("abc"); Word w2("def"); Word w3("ghi"); assert(l1 == l2); assert(!(l1 != l2)); l1.addWord(w1); assert(!(l1 == l2)); assert(l1 != l2); l2.addWord(w1); assert(l1 == l2); assert(!(l1 != l2)); l2.addWord(w2); assert(!(l1 == l2)); assert(l1 != l2); l1.addWord(w3); assert(!(l1 == l2)); assert(l1 != l2); } int main(int /*argc*/, char **/*argv*/) { try { testList(); testAssignment(); testEquality(); } catch(const std::exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } return 0; } libcolumbus-1.1.0+15.10.20150806/test/LevIndexTest.cc0000644000015300001610000002104012560622644022234 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This file tests the error tolerant matching of the Levenshtein index. */ #include #include "LevenshteinIndex.hh" #include "Word.hh" #include "ErrorValues.hh" using namespace Columbus; using namespace std; void testTrivial() { LevenshteinIndex ind; IndexMatches matches; ErrorValues e; Word w("a"); ind.findWords(w, e, 100*LevenshteinIndex::getDefaultError(), matches); assert(matches.size() == 0); } void testSimple() { LevenshteinIndex ind; IndexMatches matches; ErrorValues e; Word w1("abc"); Word w2("def"); WordID w1ID = 1; WordID w2ID = 2; ind.insertWord(w1, w1ID); ind.insertWord(w2, w2ID); ind.findWords(w1, e, LevenshteinIndex::getDefaultError(), matches); assert(matches.size() == 1); assert(w1ID == matches.getMatch(0)); assert(matches.getMatchError(0) == 0); matches.clear(); ind.findWords(w2, e, LevenshteinIndex::getDefaultError(), matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w2ID); assert(matches.getMatchError(0) == 0); } void testOrder() { LevenshteinIndex ind; IndexMatches matches; ErrorValues e; const int defaultError = LevenshteinIndex::getDefaultError(); Word w1("abcde"); Word w2("abxye"); Word w3("abche"); Word w4("abxhe"); WordID w1ID = 1; WordID w2ID = 2; //WordID w3ID = 3; //WordID w4ID = 4; Word veryFarFromEveryOtherString("supercalifragilisticexpialidocious"); WordID veryFarID = 100; ind.insertWord(w1, w1ID); ind.insertWord(w2, w2ID); ind.insertWord(veryFarFromEveryOtherString, veryFarID); ind.findWords(w3, e, defaultError, matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w1ID); assert(matches.getMatchError(0) == defaultError); matches.clear(); ind.findWords(w4, e, defaultError, matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w2ID); assert(matches.getMatchError(0) == defaultError); matches.clear(); ind.findWords(w3, e, 2*defaultError, matches); 
assert(matches.size() == 2); assert(matches.getMatch(0) == w1ID); assert(matches.getMatchError(0) == defaultError); assert(matches.getMatchError(1) == 2*defaultError); matches.clear(); ind.findWords(w4, e, 2*defaultError, matches); assert(matches.size() == 2); assert(matches.getMatch(0) == w2ID); assert(matches.getMatchError(0) == defaultError); assert(matches.getMatchError(1) == 2*defaultError); } void testEdges() { LevenshteinIndex ind; IndexMatches matches; ErrorValues e; const int defaultError = LevenshteinIndex::getDefaultError(); const int bigError = 100*defaultError; Word w1("abc"); Word w2("bbc"); Word w3("acc"); Word w4("abb"); WordID w1ID = 1; ind.insertWord(w1, w1ID); ind.findWords(w2, e, bigError, matches); assert(matches.size() == 1); assert(matches.getMatchError(0) == defaultError); matches.clear(); ind.findWords(w3, e, bigError, matches); assert(matches.size() == 1); assert(matches.getMatchError(0) == defaultError); matches.clear(); ind.findWords(w4, e, bigError, matches); assert(matches.size() == 1); assert(matches.getMatchError(0) == defaultError); matches.clear(); } void testEmptyQuery() { LevenshteinIndex ind; IndexMatches matches; ErrorValues e; const int defaultError = LevenshteinIndex::getDefaultError(); Word w1("a"); Word w2("b"); Word w3("abc"); Word empty(""); WordID w1ID = 1; WordID w2ID = 2; WordID w3ID = 3; ind.insertWord(w1, w1ID); ind.insertWord(w2, w2ID); ind.insertWord(w3, w3ID); ind.findWords(empty, e, defaultError, matches); assert(matches.size() == 2); assert(matches.getMatchError(0) == defaultError); assert(matches.getMatchError(1) == defaultError); } void testExact() { LevenshteinIndex ind; IndexMatches matches; ErrorValues e; Word w1("abcd"); Word w2("abce"); WordID w1ID = 1; ind.insertWord(w1, w1ID); ind.findWords(w2, e, 0, matches); assert(matches.size() == 0); ind.findWords(w1, e, 0, matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w1ID); assert(matches.getMatchError(0) == 0); } void testTranspose() { 
LevenshteinIndex ind; IndexMatches matches; ErrorValues e; const int defaultError = LevenshteinIndex::getDefaultError(); Word w1("abcd"); Word w2("acbd"); Word w3("bacd"); Word w4("abdc"); WordID w1ID = 1; ind.insertWord(w1, w1ID); ind.findWords(w2, e, defaultError, matches); assert(matches.size() == 1); assert(matches.getMatchError(0) == defaultError); matches.clear(); ind.findWords(w3, e, defaultError, matches); assert(matches.size() == 1); assert(matches.getMatchError(0) == defaultError); matches.clear(); ind.findWords(w4, e, defaultError, matches); assert(matches.size() == 1); assert(matches.getMatchError(0) == defaultError); matches.clear(); } void testEndError() { LevenshteinIndex trie; ErrorValues e; IndexMatches matches; const int endError = ErrorValues::getSubstringDefaultEndDeletionError(); const int defaultError = ErrorValues::getDefaultError(); Word w1("abcdef"); Word w2("abcdefghijkl"); // Should never be matched in these tests. WordID w1ID = 1; WordID w2ID = 2; Word query1("abcde"); Word query2("bcdef"); Word query3("abdef"); Word query4("abcd"); e.setEndDeletionError(endError); assert(2*endError < defaultError); trie.insertWord(w1, w1ID); trie.insertWord(w2, w2ID); assert(endError < defaultError); trie.findWords(query1, e, defaultError, matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w1ID); matches.clear(); trie.findWords(query1, e, endError, matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w1ID); matches.clear(); trie.findWords(query2, e, endError, matches); assert(matches.size() == 0); trie.findWords(query3, e, endError, matches); assert(matches.size() == 0); trie.findWords(query4, e, 2*endError, matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w1ID); } void testStartError() { LevenshteinIndex trie; ErrorValues e; IndexMatches matches; const int startError = ErrorValues::getSubstringDefaultEndDeletionError(); const int defaultError = ErrorValues::getDefaultError(); Word w1("abcdef"); Word 
w2("ghijklabcdef"); // Should never be matched in these tests. WordID w1ID = 1; WordID w2ID = 2; Word query1("bcdef"); Word query2("abcdefe"); Word query3("abdef"); Word query4("cdef"); assert(2*startError < defaultError); e.setStartInsertionError(startError); trie.insertWord(w1, w1ID); trie.insertWord(w2, w2ID); assert(startError < defaultError); trie.findWords(query1, e, defaultError, matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w1ID); matches.clear(); trie.findWords(query1, e, startError, matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w1ID); matches.clear(); trie.findWords(query2, e, startError, matches); assert(matches.size() == 0); trie.findWords(query3, e, startError, matches); assert(matches.size() == 0); trie.findWords(query4, e, 2*startError, matches); assert(matches.size() == 1); assert(matches.getMatch(0) == w1ID); } int main(int /*argc*/, char **/*argv*/) { try { testTrivial(); testSimple(); testOrder(); testEdges(); testEmptyQuery(); testExact(); testTranspose(); testEndError(); testStartError(); } catch(const std::exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } return 0; } libcolumbus-1.1.0+15.10.20150806/test/ResultFilterTest.cc0000644000015300001610000000552712560622644023156 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include "SearchParameters.hh" #include "ResultFilter.hh" #include "Word.hh" #include "Document.hh" #include "Corpus.hh" #include "MatchResults.hh" #include "Matcher.hh" #include #include using namespace Columbus; using namespace std; void testFiltering() { Word textField("text"); const char *txt = "something"; Word filterField1("field1"); Word filterField2("field2"); const char *val1str = "one"; const char *val2str = "two"; const char *val3str = "three"; Word val1(val1str); Word val2(val2str); Word val3(val3str); Document d1(1); Document d2(2); Corpus c; Matcher m; SearchParameters emptyFilter; SearchParameters onlyTakeFirst, onlyTakeSecond, orTest, andTest; d1.addText(textField, txt); d1.addText(filterField1, val1str); d1.addText(filterField2, val3str); c.addDocument(d1); d2.addText(textField, txt); d2.addText(filterField1, val2str); d2.addText(filterField2, val3str); c.addDocument(d2); m.index(c); MatchResults r1 = m.match(txt, emptyFilter); assert(r1.size() == 2); onlyTakeFirst.getResultFilter().addNewSubTerm(filterField1, val1); MatchResults r2 = m.match(txt, onlyTakeFirst); assert(r2.size() == 1); assert(r2.getDocumentID(0) == 1); onlyTakeSecond.getResultFilter().addNewSubTerm(filterField1, val2); MatchResults r3 = m.match(txt, onlyTakeSecond); assert(r3.size() == 1); assert(r3.getDocumentID(0) == 2); orTest.getResultFilter().addNewSubTerm(filterField1, val1); orTest.getResultFilter().addNewTerm(); orTest.getResultFilter().addNewSubTerm(filterField1, val2); MatchResults orResults = m.match(txt, orTest); assert(orResults.size() == 2); andTest.getResultFilter().addNewSubTerm(filterField2, val2); andTest.getResultFilter().addNewSubTerm(filterField1, val1); MatchResults andResults = m.match(txt, andTest); assert(andResults.size() == 0); } int main(int /*argc*/, char **/*argv*/) { try { testFiltering(); } catch(const std::exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } return 0; } 
libcolumbus-1.1.0+15.10.20150806/test/MatcherTest.cc0000644000015300001610000001756612560622644022123 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include "Matcher.hh" #include "Corpus.hh" #include "Word.hh" #include "WordList.hh" #include "Document.hh" #include "MatchResults.hh" #include "ColumbusHelpers.hh" #include using namespace Columbus; using namespace std; Corpus * testCorpus() { Corpus *c = new Corpus(); Word w1("abc"); Word w2("def"); Word w3("abe"); Word w4("test"); Word w5("faraway"); Word w6("donotmatchme"); DocumentID name1 = 0; DocumentID name2 = 10; DocumentID name3 = 1000; Word textName("title"); WordList wl1, wl2, wlFar; wl1.addWord(w1); wl1.addWord(w2); wl2.addWord(w3); wl2.addWord(w4); wlFar.addWord(w5); wlFar.addWord(w6); Document d1(name1); d1.addText(textName, wl1); Document d2(name2); d2.addText(textName, wl2); Document dFar(name3); dFar.addText(textName, wlFar); c->addDocument(d1); c->addDocument(d2); c->addDocument(dFar); return c; } void testMatcher() { Corpus *c = testCorpus(); Matcher m; MatchResults matches; WordList queryList; Word w1("abc"); DocumentID dFarName = 1000; DocumentID name1 = 0; DocumentID name2 = 10; m.index(*c); delete(c); queryList.addWord(w1); matches = m.match(queryList); assert(matches.size() == 2); assert(matches.getDocumentID(0) != dFarName); assert(matches.getDocumentID(1) != dFarName); 
assert(matches.getDocumentID(0) == name1 || matches.getDocumentID(1) == name1); assert(matches.getDocumentID(0) == name2 || matches.getDocumentID(1) == name2); } void testRelevancy() { Corpus *c = testCorpus(); Matcher m; MatchResults matches; WordList queryList; Word w1("abc"); Word dFarName("distantdoc"); DocumentID name1 = 0; m.index(*c); delete c; queryList.addWord(w1); matches = m.match(queryList); assert(matches.size() == 2); // Document doc1 has an exact match, so it should be the best match. assert(matches.getRelevancy(0) > matches.getRelevancy(1)); assert(matches.getDocumentID(0) == name1); } void testMultiWord() { Corpus c; DocumentID correct = 1; DocumentID wrong = 0; Document d1(correct); Document d2(wrong); Word fieldName("name"); Matcher m; MatchResults matches; d1.addText(fieldName, "Sarah Michelle Gellar"); d2.addText(fieldName, "Sara Giller"); c.addDocument(d1); c.addDocument(d2); m.index(c); matches = m.match("Sari Michell Geller"); assert(matches.getDocumentID(0) == correct); } void testSentence() { Corpus c; DocumentID correct = 1; DocumentID wrong = 0; Document d1(correct); Document d2(wrong); Word fieldName("name"); Word secondName("context"); Matcher m; MatchResults matches; d1.addText(fieldName, "Fit Canvas to Layers"); d1.addText(secondName, "View Zoom (100%)"); d2.addText(fieldName, "Fit image in Window"); d2.addText(secondName, "Image"); c.addDocument(d1); c.addDocument(d2); m.index(c); matches = m.match("fit canvas to layers"); assert(matches.getDocumentID(0) == correct); } void testExactOrder() { Corpus c; DocumentID correct = 1; DocumentID wrong = 0; DocumentID moreWrong = 100; Document d1(correct); Document d2(wrong); Document d3(moreWrong); Word fieldName("name"); Word secondName("context"); Matcher m; MatchResults matches; WordList q = splitToWords("fit canvas to layers"); d1.addText(fieldName, "Fit Canvas to Layers"); d1.addText(secondName, "View Zoom (100%)"); d2.addText(fieldName, "Fit image in Window"); d2.addText(secondName, 
"Image"); d3.addText(fieldName, "Not matching."); d3.addText(secondName, "fit canvas to layers"); c.addDocument(d1); c.addDocument(d2); c.addDocument(d3); m.index(c); matches = m.onlineMatch(q, fieldName); assert(matches.size() >= 1); assert(matches.getDocumentID(0) == correct); } void testSmallestMatch() { Corpus c; DocumentID correct = 1; DocumentID wrong = 0; Document d1(correct); Document d2(wrong); Word fieldName("name"); Word field2("dummy"); Matcher m; MatchResults matches; WordList q = splitToWords("save"); d1.addText(fieldName, "save"); d1.addText(field2, "lots of text to ensure statistics of this field are ignored"); d2.addText(fieldName, "save as"); c.addDocument(d1); c.addDocument(d2); m.index(c); matches = m.onlineMatch(q, fieldName); assert(matches.size() == 2); assert(matches.getDocumentID(0) == correct); } void noCommonMatch() { Corpus c; DocumentID correct = 1; Document d1(correct); Word fieldName("name"); Word field2("dummy"); Matcher m; MatchResults matches; WordList q = splitToWords("fit canvas to selection"); d1.addText(fieldName, "Preparing your Images for the Web"); d1.addText(fieldName, "Help user manual"); c.addDocument(d1); m.index(c); matches = m.onlineMatch(q, fieldName); assert(matches.size() == 0); } void emptyMatch() { Corpus c; DocumentID correct = 1; Document d1(correct); Word fieldName("name"); Word field2("dummy"); Matcher m; MatchResults matches; WordList q; d1.addText(fieldName, "Preparing your Images for the Web"); d1.addText(fieldName, "Help user manual"); c.addDocument(d1); m.index(c); matches = m.onlineMatch(q, fieldName); assert(matches.size() == 0); } void testMatchCount() { Corpus c; DocumentID correct = 1; DocumentID wrong = 0; Document d1(correct); Document d2(wrong); Word fieldName("name"); Word secondName("context"); Matcher m; MatchResults matches; WordList q = splitToWords("fit canvas to selection"); d1.addText(fieldName, "Fit Canvas to Layers"); d1.addText(secondName, "View Zoom (100%)"); d2.addText(fieldName, 
"Selection editor"); d2.addText(secondName, "Windows dockable dialogs"); c.addDocument(d1); c.addDocument(d2); m.index(c); matches = m.onlineMatch(q, fieldName); assert(matches.size() == 2); assert(matches.getDocumentID(0) == correct); } void testPerfect() { Corpus c; DocumentID correct = 0; Document d1(1); Document d2(correct); Document d3(2); Document d4(3); Word fieldName("name"); Matcher m; MatchResults matches; WordList q = splitToWords("save"); d1.addText(fieldName, "Save as"); d2.addText(fieldName, "Save"); d3.addText(fieldName, "Save yourself"); d4.addText(fieldName, "Save the whales"); c.addDocument(d1); c.addDocument(d2); c.addDocument(d3); c.addDocument(d4); m.index(c); matches = m.onlineMatch(q, fieldName); assert(matches.size() >= 1); assert(matches.getDocumentID(0) == correct); } int main(int /*argc*/, char **/*argv*/) { try { testMatcher(); testRelevancy(); testMultiWord(); testSentence(); testExactOrder(); testSmallestMatch(); noCommonMatch(); emptyMatch(); testMatchCount(); testPerfect(); } catch(const std::exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } return 0; } libcolumbus-1.1.0+15.10.20150806/test/SearchParametersTest.cc0000644000015300001610000000412712560622644023756 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2013 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include"SearchParameters.hh" #include"Word.hh" #include"Matcher.hh" #include"Document.hh" #include"Corpus.hh" #include"MatchResults.hh" #include using namespace Columbus; void testDynamic() { SearchParameters sp; assert(sp.isDynamic()); sp.setDynamic(false); assert(!sp.isDynamic()); sp.setDynamic(true); assert(sp.isDynamic()); } void testNosearch() { SearchParameters sp; Word w1("abc"); Word w2("def"); assert(!sp.isNonsearchingField(w1)); assert(!sp.isNonsearchingField(w2)); sp.addNonsearchingField(w1); assert(sp.isNonsearchingField(w1)); assert(!sp.isNonsearchingField(w2)); sp.addNonsearchingField(w2); assert(sp.isNonsearchingField(w1)); assert(sp.isNonsearchingField(w2)); } void testNosearchMatching() { Word textField("text"); Word search("field1"); Word nonSearch("field2"); const char *val1str = "one"; Corpus c; Matcher m; SearchParameters sp; MatchResults r; Document d1(1); Document d2(2); sp.addNonsearchingField(nonSearch); d1.addText(search, val1str); d2.addText(nonSearch, val1str); c.addDocument(d1); c.addDocument(d2); m.index(c); r = m.match(val1str, sp); assert(r.size() == 1); assert(r.getDocumentID(0) == 1); } int main(int /*argc*/, char **/*argv*/) { testDynamic(); testNosearch(); testNosearchMatching(); } libcolumbus-1.1.0+15.10.20150806/test/CustomErrorTest.cc0000644000015300001610000000420512560622644023006 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This file tests the error tolerant matching of the Levenshtein index. */ #include #include "LevenshteinIndex.hh" #include "ErrorValues.hh" #include "Word.hh" using namespace std; using namespace Columbus; void testCustomError() { LevenshteinIndex ind; IndexMatches matches; ErrorValues e; WordID wordID = 17; const int defaultError = ErrorValues::getDefaultError(); const int smallError = 1; const int biggerError = 2; assert(smallError < defaultError); assert(biggerError < defaultError); Word w1("abc"); Word w2("adc"); ind.insertWord(w1, wordID); ind.findWords(w2, e, defaultError, matches); assert(matches.getMatchError(0) == defaultError); matches.clear(); e.setError(Letter('b'), Letter('d'), smallError); ind.findWords(w2, e, defaultError, matches); assert(matches.getMatchError(0) == smallError); matches.clear(); e.setError(Letter('d'), Letter('b'), biggerError); ind.findWords(w2, e, defaultError, matches); assert(matches.getMatchError(0) == biggerError); matches.clear(); e.clearErrors(); ind.findWords(w2, e, defaultError, matches); assert(matches.getMatchError(0) == defaultError); matches.clear(); } int main(int /*argc*/, char **/*argv*/) { try { testCustomError(); } catch(const std::exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } return 0; } libcolumbus-1.1.0+15.10.20150806/test/ErrorValuesTest.cc0000644000015300001610000001073512560622644023000 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. 
*
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see .
 */

#include "ErrorValues.hh"
#include "Word.hh"
#include

using namespace Columbus;

// A custom substitution error must be returned for the pair in either order,
// must be local to the ErrorValues instance it was set on, and must revert to
// the default after clearErrors().
void testError() {
    int smallError = 1;
    int defaultError = ErrorValues::getDefaultError();
    Letter l1 = 16;
    Letter l2 = 17;
    ErrorValues ev;
    ErrorValues ev2;

    assert(ev.getSubstituteError(l1, l2) == defaultError);
    ev.setError(l1, l2, smallError);
    assert(ev.getSubstituteError(l1, l2) == smallError);
    assert(ev.getSubstituteError(l2, l1) == smallError);

    // ev2 is untouched by the change made to ev.
    assert(ev2.getSubstituteError(l2, l1) == defaultError);
    ev2.setError(l2, l1, smallError);
    assert(ev2.getSubstituteError(l1, l2) == smallError);
    assert(ev2.getSubstituteError(l2, l1) == smallError);

    ev.clearErrors();
    assert(ev.getSubstituteError(l1, l2) == defaultError);
}

// After addAccents(latinAccentGroup), letters that differ only by accent
// (e/é/ĕ, a/á/ă) substitute at the group error, while letters from different
// base groups (a vs e, é vs á) still cost the default error.
void testGroupError() {
    ErrorValues ev;
    Letter e = 'e'; // These must be in lower case.
    Letter eacute = 0xe9;
    Letter ebreve = 0x115;
    Letter a = 'a';
    Letter aacute = 0xe1;
    Letter abreve = 0x103;
    const int defaultError = ErrorValues::getDefaultError();
    const int defaultGroupError = ErrorValues::getDefaultGroupError();
    assert(defaultError != defaultGroupError);

    assert(ev.getSubstituteError(e, eacute) == defaultError);
    assert(ev.getSubstituteError(a, aacute) == defaultError);
    assert(ev.getSubstituteError(e, aacute) == defaultError);

    ev.addAccents(latinAccentGroup);
    assert(ev.isInGroup(e));
    assert(ev.isInGroup(eacute));
    assert(ev.isInGroup(ebreve));
    assert(ev.isInGroup(a));
    assert(ev.isInGroup(aacute));
    assert(ev.isInGroup(abreve));

    assert(ev.getSubstituteError(e, eacute) == defaultGroupError);
    assert(ev.getSubstituteError(eacute, e) == defaultGroupError);
    assert(ev.getSubstituteError(eacute, ebreve) == defaultGroupError);
    assert(ev.getSubstituteError(e, ebreve) == defaultGroupError);

    assert(ev.getSubstituteError(a, e) == defaultError);
    assert(ev.getSubstituteError(a, aacute) == defaultGroupError);
    assert(ev.getSubstituteError(abreve, aacute) == defaultGroupError);
    assert(ev.getSubstituteError(eacute, aacute) == defaultError);

    ev.clearErrors();
    assert(ev.getSubstituteError(e, eacute) == defaultError);
}

// addKeyboardErrors() must lower the q/a and w/a substitution cost from the
// default error to the typo error.
void testKeyboardErrors() {
    ErrorValues ev;
    const int defaultError = ErrorValues::getDefaultError();
    const int typoError = ErrorValues::getDefaultTypoError();

    assert(ev.getSubstituteError('q', 'a') == defaultError);
    ev.addKeyboardErrors();
    assert(ev.getSubstituteError('q', 'a') == typoError);
    assert(ev.getSubstituteError('w', 'a') == typoError);
}

// addNumberpadErrors() must make '2'/'a' free and make '5'/'m' and 'j'/'6'
// cheaper than the default error.
void testNumberpadErrors() {
    ErrorValues ev;

    ev.addNumberpadErrors();
    assert(ev.getSubstituteError('2', 'a') == 0);
    assert(ev.getSubstituteError('5', 'm') < ErrorValues::getDefaultError());
    assert(ev.getSubstituteError('j', '6') < ErrorValues::getDefaultError());
}

// Same contract as testError(), exercised with large letter values.
void testBigError() {
    ErrorValues ev;
    Letter l1 = 1000; // Big values, so they are guaranteed to be outside of the LUT.
    Letter l2 = 10000;
    int smallError = 1;
    assert(smallError < ErrorValues::getDefaultError());

    assert(ev.getSubstituteError(l1, l2) == ErrorValues::getDefaultError());
    assert(ev.getSubstituteError(l2, l1) == ErrorValues::getDefaultError());
    // Substituting a letter with itself is always free.
    assert(ev.getSubstituteError(l2, l2) == 0);

    ev.setError(l1, l2, smallError);
    assert(ev.getSubstituteError(l1, l2) == smallError);
    assert(ev.getSubstituteError(l2, l1) == smallError);
    assert(ev.getSubstituteError(l2, l2) == 0);
}

// Runs every test and reports any std::exception on stderr with a non-zero
// exit status.
int main(int /*argc*/, char **/*argv*/) {
    try {
        testError();
        testGroupError();
        testKeyboardErrors();
        testNumberpadErrors();
        testBigError();
    } catch(const std::exception &e) {
        fprintf(stderr, "Fail: %s\n", e.what());
        return 666;
    }
    return 0;
}
libcolumbus-1.1.0+15.10.20150806/test/LevTrieTest.cc0000644000015300001610000001015012560622644022070 0ustar pbuserpbgroup00000000000000
/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 * Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see .
 */

/*
 * This file tests the trie portion of the Levenshtein index.
 * That is, exact matches.
*/

#include
#include
#include "LevenshteinIndex.hh"
#include "Word.hh"

using namespace Columbus;

// hasWord() must be false before insertion and true afterwards, and each
// word must be tracked independently.
void basicTest() {
    LevenshteinIndex ind;
    Word word1("word");
    Word word2("another");
    WordID w1ID = 1;
    WordID w2ID = 2;

    assert(!ind.hasWord(word1));
    assert(!ind.hasWord(word2));

    ind.insertWord(word1, w1ID);
    assert(ind.hasWord(word1));
    assert(!ind.hasWord(word2));

    ind.insertWord(word2, w2ID);
    assert(ind.hasWord(word1));
    assert(ind.hasWord(word2));
}

// A single-letter word can be stored and found.
void shortTest() {
    LevenshteinIndex ind;
    Word word("a");

    assert(!ind.hasWord(word));

    ind.insertWord(word, 1);
    assert(ind.hasWord(word));
}

// Inserting "ab" must not make its prefix "a" count as a stored word until
// the prefix is inserted itself.
void prefixTest() {
    LevenshteinIndex ind;
    Word word("ab");
    Word prefix("a");

    assert(!ind.hasWord(word));
    assert(!ind.hasWord(prefix));

    ind.insertWord(word, 1);
    assert(ind.hasWord(word));
    assert(!ind.hasWord(prefix));

    ind.insertWord(prefix, 2);
    assert(ind.hasWord(word));
    assert(ind.hasWord(prefix));
}

// Inserting "abc" must not make the longer "abcd" appear present.
void suffixTest() {
    LevenshteinIndex ind;
    Word word("abc");
    Word word2("abcd");

    assert(!ind.hasWord(word));
    assert(!ind.hasWord(word2));

    ind.insertWord(word, 1);
    assert(ind.hasWord(word));
    assert(!ind.hasWord(word2));

    ind.insertWord(word2, 2);
    assert(ind.hasWord(word));
    assert(ind.hasWord(word2));
}

// Two words that share the prefix "abc" and differ in the last letter are
// tracked separately.
void branchTest() {
    LevenshteinIndex ind;
    Word word("abc");
    Word word2("abcd");
    Word word3("abce");

    assert(!ind.hasWord(word));
    assert(!ind.hasWord(word2));
    assert(!ind.hasWord(word3));

    ind.insertWord(word, 1);
    assert(ind.hasWord(word));
    assert(!ind.hasWord(word2));
    assert(!ind.hasWord(word3));

    ind.insertWord(word2, 2);
    assert(ind.hasWord(word));
    assert(ind.hasWord(word2));
    assert(!ind.hasWord(word3));

    ind.insertWord(word3, 3);
    assert(ind.hasWord(word));
    assert(ind.hasWord(word2));
    assert(ind.hasWord(word3));
}

// wordCount() counts insertions per WordID, maxCount() is the largest such
// count, and numWords() counts distinct words: re-inserting w1 raises its
// count and maxCount to 2 but leaves numWords at 2.
void countTest() {
    LevenshteinIndex ind;
    Word w1("abc");
    Word w2("def");
    Word w3("abce");  // never inserted; only its ID is queried
    WordID w1ID = 1;
    WordID w2ID = 2;
    WordID w3ID = 3;

    assert(ind.wordCount(w1ID) == 0);
    assert(ind.wordCount(w2ID) == 0);
    assert(ind.wordCount(w3ID) == 0);
    assert(ind.maxCount() == 0);
    assert(ind.numWords() == 0);

    ind.insertWord(w1, w1ID);
    assert(ind.wordCount(w1ID) == 1);
    assert(ind.wordCount(w2ID) == 0);
    assert(ind.wordCount(w3ID) == 0);
    assert(ind.maxCount() == 1);
    assert(ind.numWords() == 1);

    ind.insertWord(w2, w2ID);
    assert(ind.wordCount(w1ID) == 1);
    assert(ind.wordCount(w2ID) == 1);
    assert(ind.wordCount(w3ID) == 0);
    assert(ind.maxCount() == 1);
    assert(ind.numWords() == 2);

    ind.insertWord(w1, w1ID);
    assert(ind.wordCount(w1ID) == 2);
    assert(ind.wordCount(w2ID) == 1);
    assert(ind.wordCount(w3ID) == 0);
    assert(ind.maxCount() == 2);
    assert(ind.numWords() == 2);
}

// Refuses to run when NDEBUG is defined, because assert() would then compile
// to nothing and every test would trivially pass.
int main(int /*argc*/, char **/*argv*/) {
#ifdef NDEBUG
    fprintf(stderr, "NDEBUG is defined, tests will not work!\n");
    return 1;
#else
    try {
        basicTest();
        shortTest();
        prefixTest();
        suffixTest();
        branchTest();
        countTest();
    } catch(const std::exception &e) {
        fprintf(stderr, "Fail: %s\n", e.what());
        return 666;
    }
    return 0;
#endif
}
libcolumbus-1.1.0+15.10.20150806/test/WordTest.cc0000644000015300001610000001462712560622644021446 0ustar pbuserpbgroup00000000000000
/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 * Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see .
 */

/*
 * Tests the custom word class. Valgrind strongly recommended.
*/ #include #include #include #include #include "Word.hh" using namespace Columbus; using namespace std; void testEmpty() { Word w1; assert(w1.length() == 0); } void testIndexing() { Word w("abc"); bool gotException = false; assert(w[0] == 'a'); assert(w[1] == 'b'); assert(w[2] == 'c'); try { w[3]; } catch(std::out_of_range &e) { gotException = true; } assert(gotException); } void shouldThrow(const char *str) { bool gotException; try { Word w(str); gotException = false; } catch(std::invalid_argument &e) { gotException = true; } assert(gotException); } void testWhitespace() { shouldThrow(" "); shouldThrow(" a"); shouldThrow("a "); shouldThrow("a a"); shouldThrow("\t"); shouldThrow("a\t"); shouldThrow("\ta"); shouldThrow("a\ta"); shouldThrow("\n"); shouldThrow("a\n"); shouldThrow("\na"); shouldThrow("a\na"); shouldThrow("\r"); shouldThrow("a\r"); shouldThrow("\ra"); shouldThrow("a\ra"); } void testCreation() { Word w1("abc"); Word w2(w1); Word empty1; Word empty2(empty1); string s("xyz"); Word strW(s); assert(empty1.length() == 0); assert(empty2.length() == 0); assert(w2.length() == 3); assert(w2[0] == 'a'); assert(w2[1] == 'b'); assert(w2[2] == 'c'); assert(strW.length() == 3); assert(strW[0] == 'x'); assert(strW[1] == 'y'); assert(strW[2] == 'z'); Word *w3 = new Word(w1); assert(w3->length() == 3); assert((*w3)[0] == 'a'); assert((*w3)[1] == 'b'); assert((*w3)[2] == 'c'); delete w3; // Check that w1 did not get destroyed along with w3. 
assert(w1.length() == 3); assert(w1[0] == 'a'); assert(w1[1] == 'b'); assert(w1[2] == 'c'); } void testComparison() { Word w1a; Word w1b; Word w2a("abc"); Word w2b("abc"); Word w2c("abd"); Word different("different"); assert(w1a == w1a); assert(w2a == w2a); assert(w2b == w2b); assert(!(w1a != w1b)); assert(!(w2a != w2a)); assert(!(w2b != w2b)); assert(w1a != w2a); assert(w2a != w2c); assert(w1a != different); assert(w2a != different); assert(!(w1a == w2a)); assert(!(w2a == w2c)); assert(!(w1a == different)); assert(!(w2a == different)); } void testEncoding() { const unsigned char txt[5] = {0x61, 0xc3, 0xa4, 0x63, 0}; // "aäc" in UTF-8. char *text = (char*)txt; char returned[5]; Word w1(text); assert(w1.length() == 3); assert(w1[0] == 'a'); assert(w1[1] == 0xe4); assert(w1[2] == 'c'); w1.toUtf8(returned, 5); assert(strcmp(text, returned) == 0); assert(strcmp(text, w1.asUtf8().c_str()) == 0); Word wAss("abc"); assert(strcmp("abc", wAss.asUtf8().c_str()) == 0); // Make it allocate its internal things to check that they are released appropriately wAss = w1; assert(strcmp(text, w1.asUtf8().c_str()) == 0); assert(strcmp(text, wAss.asUtf8().c_str()) == 0); Word wInit(w1); assert(strcmp(text, w1.asUtf8().c_str()) == 0); assert(strcmp(text, wInit.asUtf8().c_str()) == 0); } void testLessThan() { Word w1("a"); Word w2("b"); Word w3("aa"); Word w4("ab"); assert(w1 < w2); assert(w1 < w3); assert(w3 < w2); assert(w3 < w4); assert(w4 < w2); assert(w1 < w4); } void testAutoLower() { const unsigned char txtUpper[4] = {0x41, 0xc3, 0x84, 0}; // "AÄ" in UTF-8. const unsigned char txtLower[4] = {0x61, 0xc3, 0xa4, 0}; // "aä" in UTF-8. 
const char *tu = (const char*) txtUpper; const char *tl = (const char*) txtLower; Word wUpper(tu); Word wLower(tl); assert(wUpper == wLower); assert(strcmp(wLower.asUtf8().c_str(), tl) == 0); assert(strcmp(wUpper.asUtf8().c_str(), tl) == 0); } void testJoin() { Word w1("abc"); Word w2("def"); Word r1("abcdef"); Word r2("defabc"); Word empty; Word result; result = w1.join(w2); assert(result == r1); // Test that it is properly null terminated. assert(strcmp(result.asUtf8().c_str(), "abcdef") == 0); result = w2.join(w1); assert(result == r2); result = empty.join(w1); assert(result == w1); result = w2.join(empty); assert(result == w2); result = empty.join(empty); assert(result == empty); } void testAssignment() { Word w; const char *txt = "abc"; const char *txt2 = "defg"; string txt3 = "xyz"; string txt4 = "lmn"; const char *txtError = "h h"; bool gotAssertion; w = txt; assert(w == txt); assert(w != txt2); assert(strcmp(txt, w.asUtf8().c_str()) == 0); assert(w.length() == 3); w = txt2; assert(w != txt); assert(w == txt2); assert(strcmp(txt2, w.asUtf8().c_str()) == 0); assert(w.length() == 4); w = txt3; assert(w == txt3); assert(w != txt4); assert(txt3 == w.asUtf8()); assert(w.length() == 3); w = txt4; assert(w != txt3); assert(w == txt4); assert(w == txt4); assert(w.length() == 3); try { w = txtError; gotAssertion = false; } catch(std::invalid_argument &e) { gotAssertion = true; } assert(gotAssertion); } int main(int /*argc*/, char **/* argv*/) { try { testEmpty(); testIndexing(); testWhitespace(); testCreation(); testComparison(); testEncoding(); testLessThan(); testAutoLower(); testJoin(); testAssignment(); } catch(const exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } return 0; } libcolumbus-1.1.0+15.10.20150806/test/DocumentTest.cc0000644000015300001610000000757412560622644022314 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. 
*
 * Authors:
 * Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see .
 */

#include "Document.hh"
#include "Word.hh"
#include "WordList.hh"
#include
#include

using namespace Columbus;

// Text added to a Document must be readable back in order, and must still be
// readable after the WordList that was passed to addText() is deleted.
void testDoc() {
    DocumentID docId = 42;
    Document d(docId);

    Word w1("abc");
    Word w2("def");
    Word textName("title");
    WordList *l = new WordList();
    l->addWord(w1);
    l->addWord(w2);

    d.addText(textName, *l);
    const WordList &l2 = d.getText(textName);
    assert(l2.size() == 2);
    assert(l2[0] == w1);
    assert(l2[1] == w2);

    // The document must not depend on the caller's list staying alive.
    delete l;
    const WordList &l3 = d.getText(textName);
    assert(l3.size() == 2);
    assert(l3[0] == w1);
    assert(l3[1] == w2);
}

// NOTE(review): this function is cut off in this extract at the loop header
// below; the rest of DocumentTest.cc and the start of the following archive
// member are missing, so its contract cannot be described from here.
void testIndexNames() {
    DocumentID docId = 102;
    Document d(docId);
    Word w1("abc");
    Word w2("def");
    Word text1Name("text1");
    Word text2Name("text2");
    WordList wl1;
    WordList wl2;
    WordList textNames;

    wl1.addWord(w1);
    wl2.addWord(w2);
    d.addText(text1Name, wl1);
    d.addText(text2Name, wl2);

    d.getFieldNames(textNames);
    for(size_t i=0; i
# This library is free software; you can redistribute it and/or modify it under
# the terms of version 3 of the GNU Lesser General Public License as published
# by the Free Software Foundation.
# This library is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
# You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . import unittest import columbus class TestWord(unittest.TestCase): def test_init(self): d = columbus.Word("hello") def test_encoding(self): str1 = "hello" str2 = 'abcåäö' w1 = columbus.Word(str1) self.assertEqual(str1, w1.get_string(), "plain ASCII string did not survive round trip") w2 = columbus.Word(str2) self.assertEqual(str2, w2.get_string(), 'non-ASCII string did not survive round trip') def test_exception(self): str1 = 'two words' with self.assertRaises(ValueError): w1 = columbus.Word(str1) def test_length(self): str1 = "hello" str2 = 'abcåäö' w1 = columbus.Word(str1) self.assertEqual(len(str1), len(w1), "plain ASCII Word has incorrect size") w2 = columbus.Word(str2) self.assertEqual(len(str2), len(w2), 'non-ASCII Word has incorrect size') class TestWordList(unittest.TestCase): def test_init(self): l = columbus.WordList() def test_size(self): l = columbus.WordList() self.assertEqual(0, len(l), 'Incorrect size for empty list') w1 = columbus.Word('abc') w2 = columbus.Word('defg') l.add_word(w1) self.assertEqual(1, len(l)) l.add_word(w2) self.assertEqual(2, len(l)) l.add_word(w1) self.assertEqual(3, len(l)) def test_split(self): l = columbus.split_to_words('this is my text') self.assertEqual(4, len(l), 'text splitting fails') def test_indexing(self): l = columbus.split_to_words('this is my text') self.assertEqual("this", l[0].get_string()) self.assertNotEqual("is", l[0].get_string()) self.assertEqual("is", l[1].get_string()) self.assertEqual("my", l[2].get_string()) self.assertEqual("text", l[3].get_string()) class TestDocument(unittest.TestCase): def test_init(self): d = columbus.Document(1) def test_doc(self): docid = 435 field = columbus.Word('fieldname') text = 'ye olde butcherede englishe' d = columbus.Document(docid) self.assertEqual(d.get_id(), docid, 'Document ID got mangled.') self.assertEqual(d.field_count(), 0) d.add_text(field, text) 
self.assertEqual(d.field_count(), 1, 'field count did not increase') self.assertGreater(len(text), 0) self.assertEqual(len(d.get_text(field)), len(text.split()), 'stored text got mangled') class TestCorpus(unittest.TestCase): def test_init(self): c = columbus.Corpus() def test_insertion(self): c = columbus.Corpus() d = columbus.Document(55) self.assertEqual(0, len(c)) c.add_document(d) self.assertEqual(1, len(c)) class TestMatchResults(): def test_init(self): mr = columbus.MatchResults() def test_basic(self): docid = 9 relevancy = 1.3 mr = columbus.MatchResults() self.assertEqual(len(mr), 0) mr.add_match(docid, relevancy) self.assertEqual(len(mr), 1) self.assertEqual(mr.get_id(0), docid) self.assertAlmostEqual(mr.get_relevancy(0), relevancy, 0.01) class TestMatcher(unittest.TestCase): def test_init(self): m = columbus.Matcher() def test_simple_match(self): c = columbus.Corpus() m = columbus.Matcher() name1 = 0; name2 = 10; name3 = 1000; textName = columbus.Word("title") d1 = columbus.Document(name1) d1.add_text(textName, "abc def") d2 = columbus.Document(name2) d2.add_text(textName, "abe test") dFar = columbus.Document(name3) dFar.add_text(textName, "faraway donotmatchme") c.add_document(d1) c.add_document(d2) c.add_document(dFar) m.index(c) matches = m.match("abe") self.assertEqual(len(matches), 2) self.assertNotEqual(matches.get_document_id(0), name3); self.assertNotEqual(matches.get_document_id(1), name3); self.assertTrue(matches.get_document_id(0) == name1 or matches.get_document_id(1) == name1) self.assertTrue(matches.get_document_id(0) == name2 or matches.get_document_id(1) == name2) def test_errorvalues(self): m = columbus.Matcher() ev = m.get_errorvalues() ev.add_standard_errors() def test_indexweights(self): m = columbus.Matcher() iw = m.get_indexweights() field = columbus.Word("abc") self.assertAlmostEqual(iw.get_weight(field), 1.0, 0.0001) class TestErrorValues(unittest.TestCase): def test_init(self): ev = columbus.ErrorValues() def test_values(self): 
small_error = 1; l1 = 16; l2 = 17; ev = columbus.ErrorValues() default_error = columbus.ErrorValues.get_default_error() self.assertLess(small_error, default_error) self.assertEqual(ev.get_substitute_error(l1, l2), default_error); ev.set_error(l1, l2, small_error); self.assertEqual(ev.get_substitute_error(l1, l2), small_error); self.assertEqual(ev.get_substitute_error(l2, l1), small_error); ev.clear_errors(); self.assertEqual(ev.get_substitute_error(l1, l2), default_error); class TestIndexWeights(unittest.TestCase): def test_init(self): w = columbus.IndexWeights() def test_weights(self): w = columbus.IndexWeights() original_weight = 1.0 new_weight = 2.0 accuracy = 4 field = columbus.Word("abc") self.assertNotAlmostEqual(original_weight, new_weight, accuracy) self.assertAlmostEqual(w.get_weight(field), original_weight, accuracy) w.set_weight(field, 2.0) self.assertAlmostEqual(w.get_weight(field), new_weight, accuracy) if __name__ == '__main__': unittest.main() libcolumbus-1.1.0+15.10.20150806/test/CreatePerformanceTest.cc0000644000015300001610000000471612560622644024116 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This exe is meant to be used to measure speed and * memory consumption of Index building. */ #include "columbus.hh" // This app should only need public API from Columbus. 
#include #include #include using namespace std; using namespace Columbus; Matcher* build_matcher(const char *dataFile, int maxLines) { Matcher *m = 0; Corpus *c = new Corpus(); const int batchSize = 100000; Word field("name"); double dataReadStart, dataReadEnd; int i = 0; size_t totalDocs = 0; ifstream ifile(dataFile); if(ifile.fail()) { printf("Could not open file %s.\n", dataFile); exit(1); } string line; m = new Matcher(); // Build Corpus. dataReadStart = hiresTimestamp(); while(getline(ifile, line)) { if(line.size() == 0) continue; totalDocs++; Document d(totalDocs); d.addText(field, line.c_str()); c->addDocument(d); i++; if(i % batchSize == 0) { m->index(*c); delete c; c = new Corpus(); } if(i >= maxLines) break; } m->index(*c); delete c; dataReadEnd = hiresTimestamp(); printf("Read in %lu documents in %.2f seconds.\n", (unsigned long)totalDocs, dataReadEnd - dataReadStart); return m; } int main(int argc, char **argv) { int maxLines = INT_MAX; Matcher *m; if(argc == 1) { printf("%s datafile.txt [num of lines]\n", argv[0]); return 1; } if(argc > 2) maxLines = atoi(argv[2]); try { m = build_matcher(argv[1], maxLines); delete m; } catch(const std::exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } return 0; } libcolumbus-1.1.0+15.10.20150806/test/HelpersTest.cc0000644000015300001610000000432512560622644022127 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
*
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see .
 */

#include "ColumbusHelpers.hh"
#include "Word.hh"
#include "WordList.hh"
#include

using namespace Columbus;

// Returns true when splitToWords(txt) yields exactly the list l.
bool splitCorrectly(const char *txt, const WordList &l) {
    WordList result = splitToWords(txt);
    return result == l;
}

// splitToWords() must split on space, tab, newline and carriage return,
// ignore leading and trailing whitespace, and return an empty list for
// empty or whitespace-only input.
void testSplitter() {
    Word w1("abc");
    Word w2("def");
    WordList l1;
    l1.addWord(w1);
    l1.addWord(w2);

    assert(splitCorrectly("abc def", l1));
    assert(splitCorrectly("abc\tdef", l1));
    assert(splitCorrectly("abc\ndef", l1));
    assert(splitCorrectly("abc\rdef", l1));
    assert(splitCorrectly(" abc def", l1));
    assert(splitCorrectly("abc def ", l1));
    assert(splitCorrectly(" abc def ", l1));

    WordList empty;
    assert(splitCorrectly("", empty));
    assert(splitCorrectly(" ", empty));
    assert(splitCorrectly("\t", empty));
    assert(splitCorrectly("\n", empty));
    assert(splitCorrectly("\r", empty));
    assert(splitCorrectly(" \t\n\r\n\t ", empty));
}

// A byte sequence with multi-byte UTF-8 characters, two spaces and a trailing
// CR LF must split into exactly three words.
void testWeirdWord() {
    const unsigned char txt[] = {0x42, 0x6c, 0x75, 0x65, 0x73, 0x20, 0xe2, 0x80,
                                 0x9a, 0xc3, 0x84, 0xc3, 0xb2, 0x6e, 0xe2, 0x80,
                                 0x9a, 0xc3, 0x84, 0xc3, 0xb4, 0x20, 0x54, 0x72,
                                 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0d, 0x0a, 0};
    WordList l = splitToWords((const char*)txt);
    assert(l.size() == 3);
}

// Runs every test and reports any std::exception on stderr with a non-zero
// exit status.
int main(int /*argc*/, char **/*argv*/) {
    try {
        testSplitter();
        testWeirdWord();
    } catch(const std::exception &e) {
        fprintf(stderr, "Fail: %s\n", e.what());
        return 666;
    }
    return 0;
}
libcolumbus-1.1.0+15.10.20150806/test/CMakeLists.txt0000644000015300001610000000273212560622644022116 0ustar pbuserpbgroup00000000000000
# coltest(<test-name> <source-file>)
#
# Builds <source-file> into an executable called <test-name>, links it against
# the Columbus library and registers it with CTest under the same name.
# A function rather than a macro: nothing has to leak into the caller's scope.
function(coltest tname tfilename)
  add_executable(${tname} ${tfilename})
  target_link_libraries(${tname} ${COL_LIB_BASENAME})
  # NAME/COMMAND signature so CTest resolves the target to its built location.
  add_test(NAME ${tname} COMMAND ${tname})
endfunction()

# Trie is an internal class whose symbols are hidden
# so we need to add the source manually.
add_executable(trie TrieTest.cc ../src/Trie.cc)
target_link_libraries(trie ${COL_LIB_BASENAME})
add_test(NAME trie COMMAND trie)

coltest(levtrie LevTrieTest.cc)
coltest(levindex LevIndexTest.cc)
coltest(custom_error CustomErrorTest.cc)
coltest(error_values ErrorValuesTest.cc)
coltest(word WordTest.cc)
coltest(wordlist WordListTest.cc)
coltest(document DocumentTest.cc)
coltest(corpus CorpusTest.cc)
coltest(matcher MatcherTest.cc)
coltest(matchresults MatchResultsTest.cc)
coltest(helpers HelpersTest.cc)
coltest(indexweights IndexWeightsTest.cc)
coltest(wordstore WordStoreTest.cc)
coltest(filtering ResultFilterTest.cc)
coltest(searchparameters SearchParametersTest.cc)
coltest(capi CAPITest.c)

# The scalability test is always built but only registered on request,
# because it takes a long time to run.
add_executable(lev_scalability LevScalabilityTest.cc)
target_link_libraries(lev_scalability ${COL_LIB_BASENAME})
# Test the variable by name; if(${var}) would dereference its value again.
if(enable_scalability_tests)
  add_test(NAME lev_scalability COMMAND lev_scalability /usr/share/dict/words)
endif()

if(build_python)
  add_test(python ${CMAKE_CURRENT_SOURCE_DIR}/pythontest.py)
  # Make both the source and the built Python module directories importable.
  set_tests_properties(python PROPERTIES ENVIRONMENT
    "PYTHONPATH=${CMAKE_SOURCE_DIR}/python:${CMAKE_BINARY_DIR}/python")
endif()

# Benchmark helper; built but not registered as a test.
add_executable(create_performance CreatePerformanceTest.cc)
target_link_libraries(create_performance ${COL_LIB_BASENAME})
libcolumbus-1.1.0+15.10.20150806/test/IndexWeightsTest.cc0000644000015300001610000000223512560622644023125 0ustar pbuserpbgroup00000000000000
/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 * Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.
If not, see .
 */

#include "IndexWeights.hh"
#include "Word.hh"
#include

using namespace Columbus;

// A field that was never configured has weight 1.0; setWeight() replaces it.
void testWeights() {
    IndexWeights w;
    Word w1("abc");

    assert(w.getWeight(w1) == 1.0);
    w.setWeight(w1, 2.0);
    assert(w.getWeight(w1) == 2.0);
}

// Runs the test and reports any std::exception on stderr with a non-zero
// exit status.
int main(int /*argc*/, char **/*argv*/) {
    try {
        testWeights();
    } catch(const std::exception &e) {
        fprintf(stderr, "Fail: %s\n", e.what());
        return 666;
    }
    return 0;
}
libcolumbus-1.1.0+15.10.20150806/test/CorpusTest.cc0000644000015300001610000000371712560622644022004 0ustar pbuserpbgroup00000000000000
/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 * Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see .
*/ #include "Corpus.hh" #include "Word.hh" #include "Document.hh" #include "WordList.hh" #include #include using namespace Columbus; void testCorpus() { Corpus c; Word w1("abc"); Word w2("def"); Word w3("test1"); Word w4("test2"); DocumentID name1 = 0; DocumentID name2 = 0; Word textName("title"); WordList wl1, wl2; wl1.addWord(w1); wl1.addWord(w2); wl2.addWord(w3); wl2.addWord(w4); Document d1(name1); d1.addText(textName, wl1); Document *d2 = new Document(name2); d2->addText(textName, wl2); assert(c.size() == 0); c.addDocument(d1); assert(c.size() == 1); c.addDocument(*d2); assert(c.size() == 2); assert(c.getDocument(0).getID() == name1); const Document &dNew = c.getDocument(1); assert(dNew.getID() == name2); delete d2; assert(c.size() == 2); const Document &dNew2 = c.getDocument(1); assert(dNew2.getID() == name2); const WordList &lNew = dNew.getText(textName); assert(lNew[0] == w3); assert(lNew[1] == w4); } int main(int /*argc*/, char **/*argv*/) { try { testCorpus(); } catch(const std::exception &e) { fprintf(stderr, "Fail: %s\n", e.what()); return 666; } return 0; } libcolumbus-1.1.0+15.10.20150806/test/WordStoreTest.cc0000644000015300001610000000320312560622644022447 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/

#include "WordStore.hh"
#include "Word.hh"
#include
#include

using namespace Columbus;

// getID() must return the same ID for equal words and different IDs for
// different words; getWord() must map an ID back to its word and throw
// std::out_of_range for an ID that was never handed out.
void testStore() {
    WordStore s;
    Word w1("abc");
    Word w1Copy("abc");
    Word w2("def");
    WordID w1ID, w1CopyID, w2ID;
    WordID nonexisting = 42;
    bool gotException;

    w1ID = s.getID(w1);
    w1CopyID = s.getID(w1Copy);
    w2ID = s.getID(w2);
    assert(w1ID == w1CopyID);
    assert(w2ID != w1ID);

    // Guard against 42 accidentally being a real ID, which would make the
    // exception check below meaningless.
    assert(nonexisting != w1ID);
    assert(nonexisting != w2ID);

    assert(s.getWord(w1ID) == w1);
    assert(s.getWord(w2ID) == w2);

    try {
        s.getWord(nonexisting);
        gotException = false;
    } catch(std::out_of_range &e) {
        gotException = true;
    }
    assert(gotException);
}

// Runs the test and reports any std::exception on stderr with a non-zero
// exit status.
int main(int /*argc*/, char **/*argv*/) {
    try {
        testStore();
    } catch(const std::exception &e) {
        fprintf(stderr, "Fail: %s\n", e.what());
        return 666;
    }
    return 0;
}
libcolumbus-1.1.0+15.10.20150806/test/TrieTest.cc0000644000015300001610000000325412560622644021430 0ustar pbuserpbgroup00000000000000
/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 * Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see .
*/

#include "Word.hh"
#include "Trie.hh"
#include

using namespace Columbus;

// insertWord() returns an offset from which getWord() can rebuild the
// inserted word, and numWords() grows by one per distinct word.
void testWordBuilding() {
    Trie t;
    Word w1("abc");
    Word w2("abd");
    Word result;
    TrieOffset node1, node2;
    WordID i1 = 1;
    WordID i2 = 2;

    assert(t.numWords() == 0);
    node1 = t.insertWord(w1, i1);
    assert(t.numWords() == 1);
    node2 = t.insertWord(w2, i2);
    assert(t.numWords() == 2);

    result = t.getWord(node1);
    assert(result == w1);
    result = t.getWord(node2);
    assert(result == w2);
}

// hasWord() is true only for the exact word inserted: not for a sibling
// ("abd"), a prefix ("a") or an unrelated word ("x").
void testHas() {
    Trie t;
    Word w1("abc");
    Word w2("abd");
    Word w3("a");
    Word w4("x");
    Word result;  // unused
    WordID i1 = 1;

    assert(t.numWords() == 0);
    t.insertWord(w1, i1);
    assert(t.hasWord(w1));
    assert(!t.hasWord(w2));
    assert(!t.hasWord(w3));
    assert(!t.hasWord(w4));
}

// Runs both tests; unlike the sibling test programs this one does not catch
// exceptions.
int main(int /*argc*/, char **/*argv*/) {
    // Move basic tests from levtrietest here.
    testWordBuilding();
    testHas();
    return 0;
}
libcolumbus-1.1.0+15.10.20150806/cmake/0000755000015300001610000000000012560622775017460 5ustar pbuserpbgroup00000000000000
libcolumbus-1.1.0+15.10.20150806/cmake/python.cmake0000644000015300001610000000233112560622644021775 0ustar pbuserpbgroup00000000000000
set(build_python FALSE)

# CMake's Boost.Python detector is completely and utterly
# broken. We have to do this manually.
#
# Upstream bug:
# http://public.kitware.com/Bug/view.php?id=12955

find_file(BP_HEADER boost/python.hpp)

if(use_python2)
  pkg_search_module(PYTHONLIBS python)
else()
  pkg_search_module(PYTHONLIBS python3)
endif()

if(NOT BP_HEADER)
  message(STATUS "Boost.Python not found, not building Python bindings.")
else()
  if(NOT PYTHONLIBS_FOUND)
    message(STATUS "Python dev libraries not found, not building Python bindings.")
  else()
    # Split "<major>.<minor>..." with a regex instead of fixed-width
    # substrings so that multi-digit components (e.g. 3.10) are read whole.
    string(REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" PYTHON_MAJOR "${PYTHONLIBS_VERSION}")
    string(REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\2" PYTHON_MINOR "${PYTHONLIBS_VERSION}")
    message(STATUS "Found Python version ${PYTHON_MAJOR}.${PYTHON_MINOR}.")
    if(NOT use_python2)
      # Ask the Python 3 interpreter for its extension-module ABI tag.
      execute_process(COMMAND ${CMAKE_SOURCE_DIR}/cmake/pysoabi.py
        OUTPUT_VARIABLE pysoabi
        OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()
    find_library(BOOST_PYTHON_HACK boost_python-py${PYTHON_MAJOR}${PYTHON_MINOR})
    if(NOT BOOST_PYTHON_HACK)
      message(STATUS "Boost.Python hack library not found, not building Python bindings")
    else()
      set(build_python TRUE)
      message(STATUS "Building Python bindings.")
    endif()
  endif()
endif()
libcolumbus-1.1.0+15.10.20150806/cmake/coverage.cmake0000644000015300001610000000323012560622644022246 0ustar pbuserpbgroup00000000000000
# Coverage support: when the build type matches "coverage", compile and
# link with --coverage and, if the tools are installed, add the
# coverage-xml (gcovr) and coverage-html (lcov + genhtml) targets.
if (CMAKE_BUILD_TYPE MATCHES coverage)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --coverage")
  set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} --coverage")
  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --coverage")

  find_program(GCOVR_EXECUTABLE gcovr HINTS ${GCOVR_ROOT} "${GCOVR_ROOT}/bin")
  if (NOT GCOVR_EXECUTABLE)
    message(STATUS "Gcovr binary was not found, can not generate XML coverage info.")
  else ()
    message(STATUS "Gcovr found, can generate XML coverage info.")
    add_custom_target (coverage-xml
      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
      COMMAND "${GCOVR_EXECUTABLE}" --exclude="test.*" -x -r "${CMAKE_SOURCE_DIR}"
      --object-directory=${CMAKE_BINARY_DIR} -o coverage.xml)
  endif()

  # The bin/ hint must be derived from LCOV_ROOT, not GCOVR_ROOT.
  find_program(LCOV_EXECUTABLE lcov HINTS ${LCOV_ROOT} "${LCOV_ROOT}/bin")
  find_program(GENHTML_EXECUTABLE genhtml HINTS ${GENHTML_ROOT})
  if (NOT LCOV_EXECUTABLE)
    message(STATUS "Lcov binary was not found, can not generate HTML coverage info.")
  else ()
    if(NOT GENHTML_EXECUTABLE)
      message(STATUS "Genhtml binary not found, can not generate HTML coverage info.")
    else()
      message(STATUS "Lcov and genhtml found, can generate HTML coverage info.")
      add_custom_target (coverage-html
        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
        COMMAND "${LCOV_EXECUTABLE}" --directory ${CMAKE_BINARY_DIR} --capture --output-file coverage.info --no-checksum
        COMMAND "${GENHTML_EXECUTABLE}" --prefix ${CMAKE_BINARY_DIR} --output-directory coveragereport --title "Code Coverage" --legend --show-details coverage.info
      )
    endif()
  endif()
endif()
libcolumbus-1.1.0+15.10.20150806/cmake/pch.cmake0000644000015300001610000000775712560622644021247 0ustar pbuserpbgroup00000000000000
# get_gcc_flags(<target_name>)
#
# Collects the C++ compiler arguments that apply to <target_name>:
# compiler ARG1/ARG2, CMAKE_CXX_FLAGS and the per-build-type flags, the
# directory's include directories and compile definitions, the PIC
# option for shared libraries, and the target's own definitions and
# COMPILE_FLAGS. The result is returned to the caller as the list
# variable compile_args (set with PARENT_SCOPE).
function(get_gcc_flags target_name)
  # CMake does not provide an easy way to get all compiler switches,
  # so this function is a fishing expedition to get them.
  # http://public.kitware.com/Bug/view.php?id=1260
  if(CMAKE_CXX_COMPILER_ARG1)
    set(compile_args ${CMAKE_CXX_COMPILER_ARG1})
  else()
    set(compile_args "")
  endif()
  if(CMAKE_CXX_COMPILER_ARG2)
    list(APPEND compile_args ${CMAKE_CXX_COMPILER_ARG2})
  endif()
  list(APPEND compile_args ${CMAKE_CXX_FLAGS})
  string(TOUPPER "${CMAKE_BUILD_TYPE}" buildtype_name)
  if(CMAKE_CXX_FLAGS_${buildtype_name})
    list(APPEND compile_args ${CMAKE_CXX_FLAGS_${buildtype_name}})
  endif()

  get_directory_property(dir_inc INCLUDE_DIRECTORIES)
  foreach(item ${dir_inc})
    list(APPEND compile_args "-I" ${item})
  endforeach()

  get_directory_property(dir_defs COMPILE_DEFINITIONS)
  foreach(item ${dir_defs})
    list(APPEND compile_args -D${item})
  endforeach()

  # Per-build-type directory definitions. This property is read exactly
  # once so that each definition ends up on the command line only once.
  get_directory_property(dir_buildtype_defs COMPILE_DEFINITIONS_${buildtype_name})
  foreach(item ${dir_buildtype_defs})
    list(APPEND compile_args -D${item})
  endforeach()

  get_target_property(target_type ${target_name} TYPE)
  if("${target_type}" STREQUAL "SHARED_LIBRARY")
    list(APPEND compile_args ${CMAKE_CXX_COMPILE_OPTIONS_PIC})
  endif()

  # get_target_property() yields a "-NOTFOUND" value (false in if())
  # for unset properties, hence the guards below.
  get_target_property(target_defs ${target_name} COMPILE_DEFINITIONS)
  if(target_defs)
    foreach(item ${target_defs})
      list(APPEND compile_args -D${item})
    endforeach()
  endif()

  get_target_property(target_buildtype_defs ${target_name} COMPILE_DEFINITIONS_${buildtype_name})
  if(target_buildtype_defs)
    foreach(item ${target_buildtype_defs})
      list(APPEND compile_args -D${item})
    endforeach()
  endif()

  get_target_property(target_flags ${target_name} COMPILE_FLAGS)
  if(target_flags)
    list(APPEND compile_args ${target_flags})
  endif()

  set(compile_args ${compile_args} PARENT_SCOPE)
  #message(STATUS ${compile_args})
endfunction()

# add_pch_linux(<header_filename> <target_name> <pch_suffix>)
#
# Precompiles <header_filename> (relative to the current source dir) into
# <current binary dir>/<header basename>.<pch_suffix> using the flags
# gathered by get_gcc_flags(), makes <target_name> depend on that file,
# and force-includes the header into every source of the target.
function(add_pch_linux header_filename target_name pch_suffix)
  set(gch_target_name "${target_name}_pch")
  get_filename_component(header_basename ${header_filename} NAME)
  set(gch_filename "${CMAKE_CURRENT_BINARY_DIR}/${header_basename}.${pch_suffix}")
  get_gcc_flags(${target_name}) # Sets compile_args in this scope. It's even better than Intercal's COME FROM!
  #message(STATUS ${compile_args})
  list(APPEND compile_args -c ${CMAKE_CURRENT_SOURCE_DIR}/${header_filename} -o ${gch_filename})
  # CMAKE_CXX_FLAGS and friends are space-separated strings; split them
  # into individual arguments before handing them to the command.
  separate_arguments(compile_args)
  add_custom_command(OUTPUT ${gch_filename}
    COMMAND ${CMAKE_CXX_COMPILER} ${compile_args}
    DEPENDS ${header_filename}
    VERBATIM)
  add_custom_target(${gch_target_name} DEPENDS ${gch_filename})
  add_dependencies(${target_name} ${gch_target_name})

  # Add the PCH to every source file's include list.
  # This is the only way that is supported by both GCC and Clang.
  set_property(TARGET ${target_name} APPEND_STRING PROPERTY COMPILE_FLAGS " -include ${header_basename}")
  set_property(TARGET ${target_name} APPEND_STRING PROPERTY COMPILE_FLAGS " -Winvalid-pch")
  set_property(TARGET ${target_name} APPEND PROPERTY INCLUDE_DIRECTORIES ${CMAKE_CURRENT_BINARY_DIR})
endfunction()

include(CheckCXXSourceCompiles)

# IS_CLANG is true when the C++ compiler defines __clang__.
check_cxx_source_compiles("#ifdef __clang__\n#else\n#error \"Not clang.\"\n#endif\nint main(int argc, char **argv) { return 0; }" IS_CLANG)

# Precompiled headers are only offered on non-Apple Unix platforms.
if(UNIX AND NOT APPLE)
  option(use_pch "Use precompiled headers." TRUE)
endif()

# add_pch(<header_filename> <target_name>)
#
# Front end used by the rest of the build: precompiles the header for
# the target when use_pch is on, and is a no-op otherwise.
if(use_pch)
  message(STATUS "Using precompiled headers.")
  if(IS_CLANG)
    set(precompiled_header_extension pch)
  else()
    set(precompiled_header_extension gch)
  endif()
  macro(add_pch _header_filename _target_name)
    add_pch_linux(${_header_filename} ${_target_name} ${precompiled_header_extension})
  endmacro()
else()
  message(STATUS "Not using precompiled headers.")
  macro(add_pch _header_filename _target_name)
  endmacro()
endif()
libcolumbus-1.1.0+15.10.20150806/cmake/pysoabi.py0000755000015300001610000000144012560622644021475 0ustar pbuserpbgroup00000000000000
#!/usr/bin/python3 -tt
# -*- coding: utf-8 -*-

# Copyright (C) 2012 Canonical, Ltd.
# Authors: # Jussi Pakkanen # This library is free software; you can redistribute it and/or modify it under # the terms of version 3 of the GNU Lesser General Public License as published # by the Free Software Foundation. # This library is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more # details. # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . import sysconfig print(sysconfig.get_config_var("SOABI")) libcolumbus-1.1.0+15.10.20150806/COPYING0000644000015300001610000001672712560622644017443 0ustar pbuserpbgroup00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions. As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". 
The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. 
b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. 
(If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. 
If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. libcolumbus-1.1.0+15.10.20150806/src/0000755000015300001610000000000012560622775017167 5ustar pbuserpbgroup00000000000000libcolumbus-1.1.0+15.10.20150806/src/IndexMatches.cc0000644000015300001610000000522712560622644022053 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include "IndexMatches.hh" #include "Word.hh" #include #include #include #include COL_NAMESPACE_START using namespace std; struct MatchData { Word queryWord; WordID matchedWord; int error; bool operator<(const MatchData &other) const { return error < other.error; } }; struct IndexMatchesPrivate { vector matches; }; IndexMatches::IndexMatches() { p = new IndexMatchesPrivate(); } IndexMatches::~IndexMatches() { delete p; } void IndexMatches::addMatch(const Word &/*queryWord*/, const WordID matchedWord, int error) { MatchData m; m.matchedWord = matchedWord; m.error = error; p->matches.push_back(m); } size_t IndexMatches::size() const { return p->matches.size(); } const WordID& IndexMatches::getMatch(size_t num) const { if(num >= p->matches.size()) { std::string msg("Attempt to access match "); msg += num; msg += " out of bounds (array size "; msg += p->matches.size(); msg += ")."; throw out_of_range(msg); } return p->matches[num].matchedWord; } const Word& IndexMatches::getQuery(size_t num) const { if(num >= p->matches.size()) { std::string msg("Attempt to access query term "); msg += num; msg += " out of bounds (array size "; msg += p->matches.size(); msg += ")."; throw out_of_range(msg); } return p->matches[num].queryWord; } int IndexMatches::getMatchError(size_t num) const { if(num >= p->matches.size()) { std::string msg("Attempt to access match error "); msg += num; msg += " out of bounds (array size "; msg += p->matches.size(); msg += ")."; throw out_of_range(msg); } return p->matches[num].error; } void IndexMatches::clear() { p->matches.clear(); } void IndexMatches::sort() { std::sort(p->matches.begin(), p->matches.end()); } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/MatchResults.cc0000644000015300001610000000616212560622644022114 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. 
* * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include "MatchResults.hh" #include "Word.hh" #include #include #include COL_NAMESPACE_START using namespace std; struct MatchResultsPrivate { vector > results; bool sorted; }; MatchResults::MatchResults() { p = new MatchResultsPrivate(); p->sorted = true;; } MatchResults::MatchResults(const MatchResults &other) { p = new MatchResultsPrivate(); *p = *other.p; } MatchResults::MatchResults(MatchResults &&other) { p = other.p; other.p = nullptr; } MatchResults::~MatchResults() { delete p; } const MatchResults& MatchResults::operator=(MatchResults &&other) { if(this != &other) { delete p; p = other.p; other.p = nullptr; } return *this; } const MatchResults& MatchResults::operator=(const MatchResults &other) { if(this != &other) { *p = *other.p; } return *this; } void MatchResults::addResult(DocumentID id, double relevancy) { pair n; n.first = relevancy; n.second = id; p->results.push_back(n); p->sorted = false; } void MatchResults::addResults(const MatchResults &r) { p->results.insert(p->results.end(), r.p->results.begin(), r.p->results.end()); p->sorted = false; } size_t MatchResults::size() const { return p->results.size(); } void MatchResults::copyResult(const MatchResults &other, const size_t i) { if(i >= other.p->results.size()) { throw out_of_range("Tried to copy an out-of-range result."); } p->results.push_back(other.p->results[i]); p->sorted = false; } void 
MatchResults::sortIfRequired() const { if(p->sorted) return; MatchResults *me = const_cast(this); stable_sort(me->p->results.rbegin(), me->p->results.rend(), [](const pair &a, const pair &b) -> bool{ return a.first < b.first; }); me->p->sorted = true; } DocumentID MatchResults::getDocumentID(size_t i) const { if(i>=p->results.size()) { throw out_of_range("Access out of bounds in MatchResults::getDocumentID."); } sortIfRequired(); return p->results[i].second; } double MatchResults::getRelevancy(size_t i) const { if(i>=p->results.size()) { throw out_of_range("Access out of bounds in MatchResults::getDocumentID."); } sortIfRequired(); return p->results[i].first; } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/MatcherStatistics.cc0000644000015300001610000000342512560622644023133 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include "Word.hh" #include "MatcherStatistics.hh" #ifdef HAS_SPARSE_HASH #include using google::sparse_hash_map; #define hashmap sparse_hash_map #else #include #define hashmap unordered_map #endif COL_NAMESPACE_START using namespace std; struct MatcherStatisticsPrivate { hashmap totalWordCounts; }; MatcherStatistics::MatcherStatistics() { p = new MatcherStatisticsPrivate(); } MatcherStatistics::~MatcherStatistics() { delete p; } void MatcherStatistics::wordProcessed(const WordID w) { auto it = p->totalWordCounts.find(w); if(it == p->totalWordCounts.end()) { p->totalWordCounts[w] = 1; } else { it->second++; } } size_t MatcherStatistics::getTotalWordCount(const WordID w) const { auto it = p->totalWordCounts.find(w); if(it == p->totalWordCounts.end()) { return 0; } else { return it->second; } } void MatcherStatistics::addedWordToIndex(const WordID /*word*/, const Word &/*fieldName*/) { // Doesn't do anything yet. } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/IndexWeights.cc0000644000015300001610000000243212560622644022074 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include "IndexWeights.hh" #include "Word.hh" #include COL_NAMESPACE_START using namespace std; struct IndexWeightsPrivate { map weigths; }; IndexWeights::IndexWeights() { p = new IndexWeightsPrivate(); } IndexWeights::~IndexWeights() { delete p; } void IndexWeights::setWeight(const Word &w, double weigth) { p->weigths[w] = weigth; } double IndexWeights::getWeight(const Word &w) const { map::iterator it = p->weigths.find(w); if(it == p->weigths.end()) return 1.0; return it->second; } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/WordStore.cc0000644000015300001610000000400412560622644021417 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include "WordStore.hh" #include "Word.hh" #include "Trie.hh" #include #include COL_NAMESPACE_START using namespace std; /* * Wordstore turned out to be too slow when used with * sparse_maps or unordered_maps. */ struct hasher : std::unary_function { size_t operator() ( const Word &obj) const { return obj.hash(); } }; struct WordStorePrivate { Trie words; vector wordIndex; // The Word object is duplicated here. It should be fixed. 
}; WordStore::WordStore() { p = new WordStorePrivate(); } WordStore::~WordStore() { delete p; } WordID WordStore::getID(const Word &w) { if(p->words.hasWord(w)) { return p->words.getWordID(p->words.findWord(w)); } TrieOffset node = p->words.insertWord(w, p->wordIndex.size()); p->wordIndex.push_back(node); WordID result = p->wordIndex.size()-1; return result; } bool WordStore::hasWord(const Word &w) const { return p->words.hasWord(w); } Word WordStore::getWord(const WordID id) const { if(!hasWord(id)) { throw out_of_range("Tried to access non-existing WordID in WordStore."); } return p->words.getWord(p->wordIndex[id]); } bool WordStore::hasWord(const WordID id) const { return id < p->wordIndex.size(); } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/Corpus.cc0000644000015300001610000000247312560622644020752 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include "Corpus.hh" #include "Document.hh" #include #include COL_NAMESPACE_START using namespace std; struct CorpusPrivate { vector documents; }; Corpus::Corpus() { p = new CorpusPrivate(); } Corpus::~Corpus() { delete p; } void Corpus::addDocument(const Document &d) { p->documents.push_back(d); } size_t Corpus::size() const { return p->documents.size(); } const Document& Corpus::getDocument(size_t i) const { if(i >= p->documents.size()) throw out_of_range("Out of bounds access in Document."); return p->documents[i]; } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/WordList.cc0000644000015300001610000000370612560622644021246 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include "WordList.hh" #include "Word.hh" #include #include COL_NAMESPACE_START using namespace std; struct WordListPrivate { vector words; }; WordList::WordList() { p = new WordListPrivate(); } WordList::WordList(const WordList &wl) { p = new WordListPrivate(); p->words = wl.p->words; } WordList::WordList(WordList &&wl) { p = wl.p; wl.p = nullptr; } WordList::~WordList() { delete p; } size_t WordList::size() const { return p->words.size(); } const Word& WordList::operator[](const size_t i) const { if(i >= p->words.size()) throw out_of_range("Out of bounds access in WordList."); return p->words[i]; } void WordList::addWord(const Word &w) { p->words.push_back(w); } const WordList& WordList::operator=(const WordList &l) { if(this == &l) return *this; p->words = l.p->words; return *this; } const WordList& WordList::operator=(WordList &&wl) { if(this != &wl) { delete p; p = wl.p; wl.p = nullptr; } return *this; } bool WordList::operator==(const WordList &l) const { return p->words == l.p->words; } bool WordList::operator!=(const WordList &l) const { return p->words != l.p->words; } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/SearchParameters.cc0000644000015300001610000000416312560622644022726 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2013 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include"SearchParameters.hh" #include"Word.hh" #include"LevenshteinIndex.hh" #include"ResultFilter.hh" #include COL_NAMESPACE_START using namespace std; struct SearchParametersPrivate { bool dynamic; ResultFilter filter; set nosearchFields; }; SearchParameters::SearchParameters() { p = new SearchParametersPrivate(); p->dynamic = true; } SearchParameters::~SearchParameters() { delete p; } bool SearchParameters::isDynamic() const { return p->dynamic; } void SearchParameters::setDynamic(bool dyn) { p->dynamic = dyn; } /* * Long words should allow for more error than short ones. * This is a simple function which is meant to be strict * so there won't be too many matches. */ int SearchParameters::getDynamicError(const Word &w) const { size_t len = w.length(); if(len < 2) return LevenshteinIndex::getDefaultError(); else return 2*LevenshteinIndex::getDefaultError(); } ResultFilter& SearchParameters::getResultFilter() { return p->filter; } const ResultFilter& SearchParameters::getResultFilter() const { return p->filter; } void SearchParameters::addNonsearchingField(const Word &w) { p->nosearchFields.insert(w); } bool SearchParameters::isNonsearchingField(const Word &w) const { return p->nosearchFields.find(w) != p->nosearchFields.end(); } int SearchParameters::looseningIterations() const { return 1; } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/libcolumbus.map0000644000015300001610000001050512560622650022177 0ustar pbuserpbgroup00000000000000{ global: extern "C++" { "Columbus::Matcher::Matcher()"; "Columbus::Matcher::~Matcher()"; Columbus::Matcher::match*; Columbus::Matcher::onlineMatch*; Columbus::Matcher::get*; Columbus::Matcher::operator*; Columbus::Matcher::index*; Columbus::Word::Word*; "Columbus::Word::~Word()"; "Columbs::Word::length()"; "Columbus::Word::asUtf8[abi:cxx11]() const"; Columbus::Word::toUtf8*; Columbus::Word::join*; Columbus::Word::operator*; "Columbus::Word::hash() const"; Columbus::WordStore*; Columbus::Corpus*; Columbus::WordList*; 
Columbus::Document*; Columbus::MatchResults::MatchResults*; "Columbus::MatchResults::~MatchResults()"; Columbus::MatchResults::operator*; Columbus::MatchResults::add*; Columbus::MatchResults::copyResult*; "Columbus::MatchResults::size() const"; Columbus::MatchResults::get*; "Columbus::ErrorValues::ErrorValues()"; "Columbus::ErrorValues::~ErrorValues()"; "Columbus::ErrorValues::getInsertionError() const"; "Columbus::ErrorValues::getDeletionError() const"; "Columbus::ErrorValues::getEndDeletionError() const"; Columbus::ErrorValues::getStartInsertionError*; "Columbus::ErrorValues::getTransposeError() const"; "Columbus::ErrorValues::setInsertionError(const int)"; "Columbus::ErrorValues::setDeletionError(const int)"; "Columbus::ErrorValues::setEndDeletionError(const int)"; "Columbus::ErrorValues::setStartInsertionError(const int)"; "Columbus::ErrorValues::setTransposeError(const int)"; Columbus::ErrorValues::setSubstringStartLimit*; Columbus::ErrorValues::getSubstituteError*; "Columbus::ErrorValues::getDefaultError()"; "Columbus::ErrorValues::getDefaultGroupError()"; "Columbus::ErrorValues::getDefaultTypoError()"; "Columbus::ErrorValues::getSubstringDefaultEndDeletionError()"; "Columbus::ErrorValues::getSubstringDefaultStartInsertionError()"; Columbus::ErrorValues::setError*; Columbus::ErrorValues::setGroupError*; Columbus::ErrorValues::addAccents*; "Columbus::ErrorValues::addKeyboardErrors()"; "Columbus::ErrorValues::addNumberpadErrors()"; "Columbus::ErrorValues::addStandardErrors()"; Columbus::ErrorValues::isInGroup*; "Columbus::ErrorValues::clearErrors()"; "Columbus::ErrorValues::setSubstringMode()"; "Columbus::IndexMatches::IndexMatches()"; "Columbus::IndexMatches::~IndexMatches()"; "Columbus::IndexMatches::size() const"; Columbus::IndexMatches::getMatch*; Columbus::IndexMatches::getQuery*; Columbus::IndexMatches::getMatchError*; "Columbus::IndexMatches::clear()"; Columbus::IndexWeights*; "Columbus::LevenshteinIndex::LevenshteinIndex()"; 
"Columbus::LevenshteinIndex::~LevenshteinIndex()"; "Columbus::LevenshteinIndex::getDefaultError()"; Columbus::LevenshteinIndex::insertWord*; Columbus::LevenshteinIndex::hasWord*; Columbus::LevenshteinIndex::findWords*; Columbus::LevenshteinIndex::wordCount*; "Columbus::LevenshteinIndex::maxCount() const"; "Columbus::LevenshteinIndex::numNodes() const"; "Columbus::LevenshteinIndex::numWords() const"; Columbus::SearchParameters*; Columbus::ResultFilter*; "Columbus::hiresTimestamp()"; "Columbus::splitToWords(char const*)"; }; col_word_new; col_word_delete; col_word_length; col_word_as_utf8; col_document_new; col_document_delete; col_document_get_id; col_document_add_text; col_matcher_new; col_matcher_delete; col_matcher_index; col_matcher_match; col_matcher_get_error_values; col_matcher_get_index_weights; col_match_results_new; col_match_results_delete; col_match_results_size; col_match_results_get_id; col_match_results_get_relevancy; col_corpus_new; col_corpus_delete; col_corpus_add_document; col_index_weights_set_weight; col_index_weights_get_weight; col_error_values_add_standard_errors; col_error_values_set_substring_mode; local: extern "C++" { *; }; }; libcolumbus-1.1.0+15.10.20150806/src/ErrorValues.cc0000644000015300001610000002125512560622644021747 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This class implements an error lookup system. 
* It is in the hottest of the hot paths in the entire system. * Use of crazy optimization techniques is approved. */ #include #include #include #include #include #include "ErrorValues.hh" #include "Word.hh" #include "ColumbusSlow.hh" COL_NAMESPACE_START using namespace std; static const char *accentGroupDataFile[] = {"latinAccentedLetterGroups.txt", "greekAccentedLetterGroups.txt"}; const int LUT_BITS = 9; const int LUT_LETTERS = 1 << LUT_BITS; const int LUT_SIZE = (LUT_LETTERS*LUT_LETTERS); #define LUT_OFFSET(l1, l2) ((l1) << LUT_BITS | (l2)) static_assert(LUT_BITS > 0, "LUT_BITS must be larger than zero"); struct ErrorValuesPrivate { map, int> singleErrors; map groupMap; vector groupErrors; int *lut; ErrorValuesPrivate() { lut = new int[LUT_SIZE]; } ~ErrorValuesPrivate() { delete []lut; } }; ErrorValues::ErrorValues() : insertionError(DEFAULT_ERROR), deletionError(DEFAULT_ERROR), endDeletionError(DEFAULT_ERROR), startInsertionError(DEFAULT_ERROR), substituteError(DEFAULT_ERROR), transposeError(DEFAULT_ERROR), substringStartLimit(0) { p = new ErrorValuesPrivate; clearLUT(); } ErrorValues::~ErrorValues() { delete p; } void ErrorValues::clearLUT() { for(int i=0; ilut[LUT_OFFSET(i, j)] = i == j ? 0 : substituteError; } } } void ErrorValues::setError(Letter l1, Letter l2, const int error) { if(l1 > l2) { Letter tmp = l1; l1 = l2; l2 = tmp; } pair in(l1, l2); p->singleErrors[in] = error; addToLUT(l1, l2, error); } int ErrorValues::getSubstituteError(Letter l1, Letter l2) const { if(l1 < LUT_LETTERS && l2 < LUT_LETTERS) { return p->lut[LUT_OFFSET(l1, l2)]; } return substituteErrorSlow(l1, l2); } int ErrorValues::substituteErrorSlow(Letter l1, Letter l2) const { if(l1 == l2) return 0; if(l1 > l2) { Letter tmp = l1; l1 = l2; l2 = tmp; } pair in(l1, l2); auto f = p->singleErrors.find(in); if(f != p->singleErrors.end()) return f->second; // Are the letters in the same error group? Check the bigger // value first, because it is probably a more uncommon letter. 
auto g1 = p->groupMap.find(l2); if(g1 != p->groupMap.end()) { auto g2 = p->groupMap.find(l1); if(g2 != p->groupMap.end()) { if(g1->second == g2->second) { return p->groupErrors[g1->second]; } } } return substituteError; } void ErrorValues::clearErrors() { p->singleErrors.clear(); p->groupErrors.clear(); p->groupMap.clear(); clearLUT(); } void ErrorValues::setGroupError(const Word &groupLetters, const int error) { size_t newGroupID = p->groupErrors.size(); p->groupErrors.push_back(error); for(size_t i = 0; i < groupLetters.length(); i++) { Letter curLetter = groupLetters[i]; if(isInGroup(curLetter)) { if(p->groupMap.find(curLetter)->second != newGroupID) throw runtime_error("Tried to add letter to two different error groups."); } else { p->groupMap[curLetter] = newGroupID; } } addGroupErrorToLUT(groupLetters, error); debugMessage("Added error group: %s\n", groupLetters.asUtf8().c_str()); } void ErrorValues::addGroupErrorToLUT(const Word &groupLetters, const int error) { for(size_t i=0; igroupMap.find(l) != p->groupMap.end(); } void ErrorValues::addAccents(accentGroups group) { const char *baseName = accentGroupDataFile[group]; string dataFile = findDataFile(baseName); string line; if(dataFile.length() == 0) { string s = "Could not find file "; s += baseName; s += ". Run make install or set COLUMBUS_DATADIR env var to your data directory."; throw runtime_error(s); } ifstream ifile(dataFile.c_str()); if(ifile.fail()) { string s = "Could not open data file "; s += dataFile; throw runtime_error(s); } while(getline(ifile, line)) { Word group(line.c_str()); if(group.length() == 0) continue; setGroupError(group, getDefaultGroupError()); } } void ErrorValues::addKeyboardErrors() { int error = getDefaultTypoError(); // Yes, this is a Finnish keyboard. 
const Letter line1[] = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '+'}; const Letter line2[] = {'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', 0xe5}; const Letter line3[] = {'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 0xf6, 0xe4, '\''}; const Letter line4[] = {'z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-'}; const Letter *keyboard_layout[4] = {line1, line2, line3, line4}; const size_t lineLens[] = {11, 11, 12, 10}; for(size_t i = 0; i < 3; i++) { const Letter *cur_row = keyboard_layout[i]; const Letter *next_row = keyboard_layout[i+1]; for(size_t j1=0; j1 < lineLens[i]; j1++) { Letter l1 = cur_row[j1]; if(j1 + 1 < lineLens[i]) setError(l1, cur_row[j1+1], error); if(j1 > 0 && j1-1 < lineLens[i+1]) setError(l1, next_row[j1-1], error); if(j1 < lineLens[i+1]) setError(l1, next_row[j1], error); } } } void ErrorValues::setPadError(const Letter number, const char letters[4], int letterCount, int error) { assert(number >= '0' && number <= '9'); for(int i=0; i 0) setPadError(padNumbers[j][i], padLetters[j][i-1], letterCount[j][i-1], adjacentButton); if(i+1 < w) setPadError(padNumbers[j][i], padLetters[j][i+1], letterCount[j][i+1], adjacentButton); if(j-1 > 0) setPadError(padNumbers[j][i], padLetters[j-1][i], letterCount[j-1][i], adjacentButton); if(j+1 < h) setPadError(padNumbers[j][i], padLetters[j+1][i], letterCount[j+1][i], adjacentButton); } } } void ErrorValues::addStandardErrors() { addAccents(latinAccentGroup); addAccents(greekAccentGroup); addKeyboardErrors(); } void ErrorValues::addToLUT(Letter l1, Letter l2, int value) { if(l1 < LUT_LETTERS && l2 < LUT_LETTERS) { p->lut[LUT_OFFSET(l1, l2)] = value; p->lut[LUT_OFFSET(l2, l1)] = value; } } void ErrorValues::setSubstringMode() { startInsertionError = getSubstringDefaultStartInsertionError(); endDeletionError = getSubstringDefaultEndDeletionError(); substringStartLimit = DEFAULT_SUBSTRING_START_LENGTH; } COL_NAMESPACE_END 
libcolumbus-1.1.0+15.10.20150806/src/Word.cc0000644000015300001610000001535112560622644020411 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include "Word.hh" #include #include #include #include #include "ColumbusHelpers.hh" using namespace std; COL_NAMESPACE_START static_assert(sizeof(size_t) <= sizeof(uint64_t), "Wow, you are running a 128 bit platform. Respect!"); static const int randomArrSize = 256; const static uint64_t randomNumbers[randomArrSize] = { 0x31d04490f9cd0152, 0xcfd220f4878a1427, 0x9b2dd113758d9e8a, 0x35a4419e88a812d5, 0x9f9743e9ee40cd55, 0x7038be807e85f27f, 0x9ca0e3499edabe60, 0x9b3e409e7ffbe39f, 0xc58155e5a1e164e0, 0x1f3f0823c9670283, 0xddc1ff4e8431766f, 0xf708145c12c3a474, 0x1bd343edebb746e8, 0x59363d26f1d34003, 0xade2044c51ce1ab5, 0x86c0607a613fa4e6, 0x4751cef8b5647cf1, 0x618cdd1beaba96a6, 0x9a5616eed71a1b05, 0x90fffcf56ab61b54, 0xc7b408b8542bf4f9, 0x64d8fba24eed76cd, 0x483d04576118f39b, 0x5c9534dee689698, 0x25d7939c3cf11b2d, 0xe020bdf2ba9f78f5, 0xf441f807c4808932, 0x993166a178ddade4, 0x51c7de16e4a0e2bb, 0xa89b70521c0b028, 0x9b3f7f5af8b2f82, 0x6985efce9aa164a7, 0x692607c787097f9c, 0x6afaf7e9f5ee3211, 0xfa34657c280407b4, 0xa160382b0e3e03ec, 0xe8902b92a6dd18c4, 0x7cd35c609f728a7, 0xdd7ac1ab0ce338f3, 0xa7a9e144792de8b4, 0x435dc2030e1bd3bb, 0xba03839edae53f8c, 0x74918b9786b2ecf6, 0x183041d61d4e02d, 0xaa1dc5c7c7c5fb5b, 
0x939564fc52bece9b, 0x3a3faae9201160d0, 0xc20d3f67a52cb7b6, 0x77ad9b3c19bda0f9, 0x65696731011860b4, 0xae6b011d726f2fe, 0xba5217bd2b48005f, 0x8f8e100ae6ba4e9d, 0x51967f54690c822d, 0x261a8bf80c1d6890, 0x58cb529d19f0856f, 0xc45e7d76ca927907, 0xc15c5589af3dbef0, 0xa8175814c7ff20f6, 0xaec21b2f3fddfc14, 0xaf247b61fd25583, 0x2d784f3af2691077, 0x58f3a2b1743759c6, 0x77115ac165a120a9, }; Word::Word() : text(0), len(0){ } Word::Word(const char *utf8Word) : text(0), len(0) { convertString(utf8Word); } Word::Word(const Word &w) : text(0), len(0) { duplicateFrom(w); } Word::Word(Word &&w) : text(w.text), len(w.len) { w.len = 0; w.text = 0; } Word::Word(Letter *letters, size_t length) { if(letters[length-1] == 0) { text = new Letter[length]; len = length-1; } else { text = new Letter[length+1]; text[length] = 0; len = length; } memcpy(text, letters, length*sizeof(Letter)); if(hasWhitespace()) { delete []text; text = nullptr; throw std::invalid_argument("Tried to create a Word with whitespace."); } } Word::Word(const std::string &w) : text(0), len(0) { convertString(w.c_str()); } Word::~Word() { delete []text; } void Word::convertString(const char *utf8Word) { text = utf8ToInternal(utf8Word, len); if(hasWhitespace()) { delete []text; text = nullptr; std::string err("Tried to create a word with whitespace in it: "); err += (const char*)utf8Word; throw std::invalid_argument(err); } } void Word::duplicateFrom(const Word &w) { if(this == &w) { return; } delete []text; len = w.len; if(len == 0) { text = 0; } else { text = new Letter[len+1]; memcpy(text, w.text, (len+1)*sizeof(Letter)); } } Letter Word::operator[](unsigned int i) const { if(i >= len) { std::string msg("Tried to access letter "); msg += i; msg += " in a word of size "; msg += len; msg += "."; throw std::out_of_range(msg); } return text[i]; } Word& Word::operator=(const Word &w) { duplicateFrom(w); return *this; } Word& Word::operator=(Word &&w) { delete []text; text = w.text; len = w.len; w.text = 0; w.len = 0; return 
*this; } /** * A word is not supposed to have any whitespace in it. Verify that we don't. */ bool Word::hasWhitespace() { for(unsigned int i=0; i w.text[i]) return false; } if(w.len > len) return true; return false; } void Word::toUtf8(char *buf, unsigned int bufSize) const { internalToUtf8(text, len, buf, bufSize); } string Word::asUtf8() const { size_t strSize = 4*(len+1); // One codepoint is max 4 bytes in UTF-8. char *u8 = new char[strSize]; toUtf8(u8, strSize); string result(u8); delete []u8; return result; } Word Word::join(const Word &w) const { Word result; size_t newLen = length() + w.length(); result.len = newLen; result.text = new Letter[newLen+1]; memcpy(result.text, text, len*sizeof(Letter)); memcpy(result.text + len, w.text, w.len*sizeof(Letter)); result.text[newLen] = '\0'; return result; } Word& Word::operator=(const char *utf8Word) { delete []text; text = nullptr; len = 0; convertString(utf8Word); return *this; } Word& Word::operator=(const string &utf8Str) { return *this = utf8Str.c_str(); } size_t Word::hash() const { size_t result = 0; const size_t *nums = (const size_t*) randomNumbers; unsigned char *arr = (unsigned char*) text; for(size_t i=0; i * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ #include "Matcher.hh" #include "Corpus.hh" #include "LevenshteinIndex.hh" #include "Word.hh" #include "Document.hh" #include "WordList.hh" #include "IndexMatches.hh" #include "MatchResults.hh" #include "ErrorValues.hh" #include "ColumbusHelpers.hh" #include "IndexWeights.hh" #include "MatcherStatistics.hh" #include "WordStore.hh" #include "ResultFilter.hh" #include "SearchParameters.hh" #include #include #include #include #include #include #ifdef HAS_SPARSE_HASH #include #include using google::sparse_hash_map; using google::sparse_hash_set; #define hashmap sparse_hash_map #define hashset sparse_hash_set #else #include #include #define hashmap unordered_map #define hashset unordered_set #endif COL_NAMESPACE_START using namespace std; struct idhasher : std::unary_function, size_t> { size_t operator() ( const pair &p) const { size_t w1 = p.first; size_t w2 = p.second; if(sizeof(size_t) > sizeof(WordID)) { return (w1 << 32) | w2; // Assuming size_t is 64 bits. } else { return w1 ^ w2; } } }; typedef hashset DocumentSet; typedef hashmap IndexMap; typedef hashmap, DocumentSet, idhasher > ReverseIndexData; // Index name, word, documents. typedef IndexMap::iterator IndIterator; typedef ReverseIndexData::iterator RevIndIterator; typedef map MatchErrorMap; typedef map BestIndexMatches; typedef BestIndexMatches::iterator MatchIndIterator; typedef MatchErrorMap::iterator MatchIterator; class ReverseIndex { private: ReverseIndexData reverseIndex; public: void add(const WordID wordID, const WordID indexID, const DocumentID id); bool documentHasTerm(const WordID wordID, const WordID indexID, DocumentID id); void findDocuments(const WordID wordID, const WordID indexID, std::vector &result); }; struct MatcherPrivate { IndexMap indexes; ReverseIndex reverseIndex; ErrorValues e; IndexWeights weights; MatcherStatistics stats; WordStore store; map, size_t> originalSizes; // Lengths of original documents. 
}; void ReverseIndex::add(const WordID wordID, const WordID indexID, const DocumentID id) { pair p; p.first = indexID; p.second = wordID; auto revIt = reverseIndex.find(p); if(revIt == reverseIndex.end()) { DocumentSet tmp; tmp.insert(id); reverseIndex[p] = tmp; } else { revIt->second.insert(id); } } bool ReverseIndex::documentHasTerm(const WordID wordID, const WordID indexID, DocumentID id) { pair p; p.first = indexID; p.second = wordID; auto revIt = reverseIndex.find(p); if(revIt == reverseIndex.end()) return false; return revIt->second.find(id) != revIt->second.end(); } void ReverseIndex::findDocuments(const WordID wordID, const WordID indexID, std::vector &result) { pair p; p.first = indexID; p.second = wordID; auto revIt = reverseIndex.find(p); if(revIt == reverseIndex.end()) return; DocumentSet &docSet = revIt->second; for(auto docIter = docSet.begin(); docIter != docSet.end(); docIter++) { result.push_back(*docIter); } } /* * These are helper functions for Matcher. They are not member functions to avoid polluting the header * with STL includes. */ static void addMatches(MatcherPrivate */*p*/, BestIndexMatches &bestIndexMatches, const Word &/*queryWord*/, const WordID indexID, IndexMatches &matches) { MatchIndIterator it = bestIndexMatches.find(indexID); map *indexMatches; if(it == bestIndexMatches.end()) { map tmp; bestIndexMatches[indexID] = tmp; it = bestIndexMatches.find(indexID); } indexMatches = &(it->second); for(size_t i=0; i < matches.size(); i++) { const WordID matchWordID = matches.getMatch(i); const int matchError = matches.getMatchError(i); MatchIterator mIt = indexMatches->find(matchWordID); if(mIt == indexMatches->end()) { (*indexMatches)[matchWordID] = matchError; } else { if(mIt->second > matchError) (*indexMatches)[matchWordID] = matchError; } } } /* * A simple relevancy calculator for matched word. 
Better ranking functions exist and should be examined: * http://en.wikipedia.org/wiki/TF_IDF * http://en.wikipedia.org/wiki/Okapi_BM25 */ static double calculateRelevancy(MatcherPrivate *p, const WordID wID, const WordID indexID, int error) { const LevenshteinIndex * const ind = p->indexes[indexID]; double errorMultiplier = 100.0/(100.0+error); // Should be adjusted for maxError or word length. size_t indexCount = ind->wordCount(wID); size_t indexMaxCount = ind->maxCount(); assert(indexCount > 0); assert(indexMaxCount > 0); double frequencyMultiplier = 1.0 - double(indexCount)/(indexMaxCount+1); double indexWeightMultiplier = p->weights.getWeight(p->store.getWord(indexID)); return errorMultiplier*frequencyMultiplier*indexWeightMultiplier; } static void matchIndexes(MatcherPrivate *p, const WordList &query, const SearchParameters ¶ms, const int extraError, BestIndexMatches &bestIndexMatches) { for(size_t i=0; iindexes.begin(); it != p->indexes.end(); it++) { if(params.isNonsearchingField(p->store.getWord(it->first))) { continue; } IndexMatches m; it->second->findWords(w, p->e, maxError, m); addMatches(p, bestIndexMatches, w, it->first, m); debugMessage("Matched word %s in index %s with error %d and got %lu matches.\n", w.asUtf8().c_str(), p->store.getWord(it->first).asUtf8().c_str(), maxError, (unsigned long) m.size()); } } } static void gatherMatchedDocuments(MatcherPrivate *p, BestIndexMatches &bestIndexMatches, map &matchedDocuments) { for(MatchIndIterator it = bestIndexMatches.begin(); it != bestIndexMatches.end(); it++) { for(MatchIterator mIt = it->second.begin(); mIt != it->second.end(); mIt++) { vector tmp; p->reverseIndex.findDocuments(mIt->first, it->first, tmp); debugMessage("Exact searched \"%s\" in field \"%s\", which was found in %lu documents.\n", p->store.getWord(mIt->first).asUtf8().c_str(), p->store.getWord(it->first).asUtf8().c_str(), (unsigned long)tmp.size()); for(size_t i=0; ifirst, it->first, mIt->second); auto doc = 
matchedDocuments.find(curDoc); if(doc == matchedDocuments.end()) matchedDocuments[curDoc] = relevancy; else matchedDocuments[curDoc] += relevancy; } } } } static bool subtermsMatch(MatcherPrivate *p, const ResultFilter &filter, size_t term, DocumentID id) { for(size_t subTerm=0; subTerm < filter.numSubTerms(term); subTerm++) { const Word &filterName = filter.getField(term, subTerm); const Word &value = filter.getWord(term, subTerm); bool termFound = p->reverseIndex.documentHasTerm( p->store.getID(value), p->store.getID(filterName), id); if(!termFound) { return false; } } return true; } Matcher::Matcher() { p = new MatcherPrivate(); } void Matcher::index(const Corpus &c) { double buildStart, buildEnd; buildStart = hiresTimestamp(); buildIndexes(c); buildEnd = hiresTimestamp(); debugMessage("Added %lu documents to matcher. It now has %lu indexes. Index population took %.2f seconds.\n", (unsigned long) c.size(), (unsigned long) p->indexes.size(), buildEnd - buildStart); for(IndIterator it = p->indexes.begin(); it != p->indexes.end(); it++) { debugMessage("Index \"%s\" has %lu words and %lu nodes.\n", p->store.getWord(it->first).asUtf8().c_str(), (unsigned long) it->second->numWords(), (unsigned long) it->second->numNodes()); } } Matcher::~Matcher() { for(IndIterator it = p->indexes.begin(); it != p->indexes.end(); it++) { delete it->second; } delete p; } void Matcher::buildIndexes(const Corpus &c) { for(size_t ci = 0; ci < c.size(); ci++) { const Document &d = c.getDocument(ci); WordList textNames; d.getFieldNames(textNames); for(size_t ti=0; ti < textNames.size(); ti++) { const Word &fieldName = textNames[ti]; const WordID fieldID = p->store.getID(fieldName); const WordList &text = d.getText(fieldName); pair lengths; lengths.first = d.getID(); lengths.second = fieldID; p->originalSizes[lengths] = text.size(); for(size_t wi=0; wistore.getID(word); p->stats.wordProcessed(wordID); addToIndex(word, wordID, fieldID); p->stats.addedWordToIndex(wordID, fieldName); 
p->reverseIndex.add(wordID, fieldID, d.getID()); } } } } void Matcher::addToIndex(const Word &word, const WordID wordID, const WordID indexID) { LevenshteinIndex *target; IndIterator it = p->indexes.find(indexID); if(it == p->indexes.end()) { target = new LevenshteinIndex(); p->indexes[indexID] = target; } else { target = it->second; } target->insertWord(word, wordID); } void Matcher::relevancyMatch(const WordList &query, const SearchParameters ¶ms, const int extraError, MatchResults &matchedDocuments) { map docs; BestIndexMatches bestIndexMatches; double start, indexMatchEnd, gatherEnd, finish; start = hiresTimestamp(); matchIndexes(p, query, params, extraError, bestIndexMatches); indexMatchEnd = hiresTimestamp(); // Now we know all matched words in all indexes. Gather up the corresponding documents. gatherMatchedDocuments(p, bestIndexMatches, docs); gatherEnd = hiresTimestamp(); for(auto it=docs.begin(); it != docs.end(); it++) { matchedDocuments.addResult(it->first, it->second); } debugMessage("Found a total of %lu documents.\n", (unsigned long) matchedDocuments.size()); finish = hiresTimestamp(); debugMessage("Query finished. Index lookups took %.2fs, result gathering %.2fs, result building %.2fs.\n", indexMatchEnd - start, gatherEnd - indexMatchEnd, finish - gatherEnd); } MatchResults Matcher::match(const WordList &query, const SearchParameters ¶ms) { MatchResults matchedDocuments; const int maxIterations = 1; const int increment = LevenshteinIndex::getDefaultError(); const size_t minMatches = 10; MatchResults allMatches; if(query.size() == 0) return matchedDocuments; // Try to search with ever growing error until we find enough matches. for(int i=0; i= minMatches || i == maxIterations-1) { allMatches.addResults(matches); break; } } /* Filter results into final set. 
*/ auto &filter = params.getResultFilter(); for(size_t i=0; ie; } MatchResults Matcher::match(const char *queryAsUtf8, const SearchParameters ¶ms) { return match(splitToWords(queryAsUtf8), params); } IndexWeights& Matcher::getIndexWeights() { return p->weights; } static map countExacts(MatcherPrivate *p, const WordList &query, const WordID indexID) { map matchCounts; for(size_t i=0; istore.hasWord(w)) { continue; } WordID curWord = p->store.getID(w); vector exacts; p->reverseIndex.findDocuments(curWord, indexID, exacts); for(const auto &i : exacts) { matchCounts[i]++; // Default is zero initialisation. } } return matchCounts; } struct DocCount { DocumentID id; size_t matches; }; MatchResults Matcher::onlineMatch(const WordList &query, const Word &primaryIndex) { MatchResults results; set exactMatched; map accumulator; if(!p->store.hasWord(primaryIndex)) { string msg("Index named "); msg += primaryIndex.asUtf8(); msg += " is not known"; throw invalid_argument(msg); } WordID indexID = p->store.getID(primaryIndex); // How many times each document matched with zero error. vector stats; for(const auto &i : countExacts(p, query, indexID)) { DocCount c; pair key; exactMatched.insert(i.first); key.first = i.first; key.second = indexID; c.id = i.first; c.matches = i.second; stats.push_back(c); } for(const auto &i: stats) { accumulator[i.id] = 2*i.matches; if(i.matches == query.size() && i.matches == p->originalSizes[make_pair(i.id, indexID)]) { // Perfect match. accumulator[i.id] += 100; } } // Merge in fuzzy matches. MatchResults fuzzyResults = match(query); for(size_t i = 0; i * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include "Document.hh" #include "Word.hh" #include "WordList.hh" #include "ColumbusHelpers.hh" #include #include COL_NAMESPACE_START using namespace std; struct DocumentPrivate { DocumentID id; map texts; }; typedef map::iterator TextIter; typedef map::const_iterator TextIterC; Document::Document(DocumentID id) { p = new DocumentPrivate(); p->id = id; } Document::Document(const Document& d) { p = new DocumentPrivate(); p->id = d.p->id; p->texts = d.p->texts; } Document::~Document() { delete p; } void Document::addText(const Word &field, const WordList &words) { p->texts[field] = words; } void Document::addText(const Word &field, const char *textAsUtf8) { addText(field, splitToWords(textAsUtf8)); } void Document::addText(const Word &field, const std::string &textAsUtf8) { addText(field, textAsUtf8.c_str()); } const WordList& Document::getText(const Word &field) const { TextIter res = p->texts.find(field); if(res == p->texts.end()) { throw invalid_argument("Tried to access nonexisting text field in Document."); } return res->second; } size_t Document::fieldCount() const { return p->texts.size(); } DocumentID Document::getID() const { return p->id; } void Document::getFieldNames(WordList &list) const { for(TextIter it=p->texts.begin(); it != p->texts.end(); it++) { list.addWord(it->first); } } const Document& Document::operator=(const Document&d) { if(this == &d) return *this; p->id = d.p->id; p->texts = d.p->texts; return *this; } size_t Document::wordCount(const Word &w, const Word field) const { TextIterC it = p->texts.find(field); size_t count = 0; if(it == p->texts.end()) return count; for(size_t i = 0; i < it->second.size(); i++) { if(it->second[i] == w) count++; } return count; } size_t Document::totalWordCount(const Word &w) const { size_t count = 0; for(TextIterC it = 
p->texts.begin(); it != p->texts.end(); it++) { count += wordCount(w, it->first); } return count; } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/ResultFilter.cc0000644000015300001610000000446412560622644022125 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include "ResultFilter.hh" #include "Word.hh" #include #include COL_NAMESPACE_START using namespace std; struct ResultFilterPrivate { vector>> termList; }; ResultFilter::ResultFilter() { p = new ResultFilterPrivate(); addNewTerm(); } ResultFilter::~ResultFilter() { delete p; } void ResultFilter::addNewTerm() { vector > dummy; p->termList.push_back(dummy); } void ResultFilter::addNewSubTerm(const Word &field, const Word &word) { p->termList.back().push_back(make_pair(field, word)); } const Word& ResultFilter::getField(const size_t term, const size_t subTerm) const { if(term >= numTerms()) throw out_of_range("Term access out of bounds in ResultFilter::getField."); if(subTerm >= p->termList[term].size()) throw out_of_range("Subterm access out of bounds in ResultFilter::getField."); return p->termList[term][subTerm].first; } const Word& ResultFilter::getWord(const size_t term, const size_t subTerm) const { if(term >= numTerms()) throw out_of_range("Term access out of bounds in ResultFilter::getField."); if(subTerm >= p->termList[term].size()) throw out_of_range("Subterm access out of bounds in 
ResultFilter::getField."); return p->termList[term][subTerm].second; } size_t ResultFilter::numTerms() const { return p->termList.size(); } size_t ResultFilter::numSubTerms(const size_t term) const { if(term >= numTerms()) throw out_of_range("Access out of bounds in ResultFilter::numSubTerms."); return p->termList[term].size(); } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/ColumbusSlow.cc0000644000015300001610000000261212560622644022130 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include "ColumbusSlow.hh" #include "ColumbusCore.hh" #include #include COL_NAMESPACE_START using namespace std; static bool fileExists(const char *fname) { struct stat foo; return stat(fname, &foo) == 0; } string findDataFile(const string &baseName) { string empty, s; const char *varname = "COLUMBUS_DATADIR"; const char *envvarDir; envvarDir = getenv(varname); if(envvarDir) { s = envvarDir; if(s[s.length()-1] != '/') s += "/"; s += baseName; } else { s = COLUMBUS_DATADIR; s += baseName; } if(fileExists(s.c_str())) return s; return empty; } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/pch/0000755000015300001610000000000012560622775017741 5ustar pbuserpbgroup00000000000000libcolumbus-1.1.0+15.10.20150806/src/pch/columbus_pch.hh0000644000015300001610000000175312560622644022746 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. 
*
 * Authors:
 *    Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This is the precompiled header file for core libcolumbus.
 * It contains only system headers. It must NOT have any internal
 * headers
 */

// NOTE(review): the header names of the following includes were lost in
// extraction (everything between angle brackets was deleted).
#include
#include
#include
#include
//#include
#include
//#include
libcolumbus-1.1.0+15.10.20150806/src/ErrorMatrix.cc0000644000015300001610000000362512560622644021755 0ustar pbuserpbgroup00000000000000/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 *    Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

// NOTE(review): the name of this system header was lost in extraction.
#include
#include "ErrorMatrix.hh"

using namespace std;

COL_NAMESPACE_START

/*
 * Allocates a (rows+1) x (columns+1) matrix of ints.
 * Row 0 is filled with i*insertError and column 0 accumulates
 * deletionError per row; all other cells are left uninitialised
 * until set() is called.
 */
ErrorMatrix::ErrorMatrix(const size_t rows_, const size_t columns_, const int insertError, const int deletionError) :
    rows(rows_), columns(columns_) {
    m = new int*[rows+1];
    for(size_t i=0; i<=rows; i++) {
        m[i] = new int[columns+1];
    }
    for(size_t i=0; i<=columns; i++) {
        m[0][i] = i*insertError;
    }
    for(size_t i=1; i<=rows; i++)
        m[i][0] = m[i-1][0] + deletionError;
}

/*
 * Frees every row and then the row table.
 */
ErrorMatrix::~ErrorMatrix() {
    for(size_t i=0; i<=rows; i++) {
        delete []m[i];
    }
    delete []m;
}

/*
 * Stores an error value. Throws out_of_range when rowNum > rows or
 * colNum > columns (both indices are inclusive upper bounds).
 */
void ErrorMatrix::set(const size_t rowNum, const size_t colNum, const int error) {
    if(rowNum > rows)
        throw out_of_range("Illegal row number.");
    if(colNum > columns)
        throw out_of_range("Illegal column number.");
    m[rowNum][colNum] = error;
}

/*
 * Returns the cell at index columns-1 of the given row.
 * No bounds check is done on rowNum.
 */
int ErrorMatrix::totalError(const size_t rowNum) const {
    return m[rowNum][columns-1];
}

/*
 * Starts from the first cell of the row; the rest of this function is
 * missing from this copy of the file.
 */
int ErrorMatrix::minError(const size_t rowNum) const {
    int result = m[rowNum][0];
    // NOTE(review): extraction deleted the text from the loop condition below
    // up to the middle of the license header of the next archive member
    // (src/ColumbusCAPI.cc). The remnant is kept verbatim.
    for(size_t i=1; i
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/ #include "columbus.h" #include "Word.hh" #include "Document.hh" #include "Matcher.hh" #include "MatchResults.hh" #include "Corpus.hh" #include "ErrorValues.hh" #include "IndexWeights.hh" #include #include using namespace Columbus; using namespace std; #ifdef __cplusplus extern "C" { #endif ColWord col_word_new(const char *utf8_word) { try { Word *w = new Word(utf8_word); return reinterpret_cast(w); } catch(exception &e) { fprintf(stderr, "Error creating Word: %s\n", e.what()); } return nullptr; } void col_word_delete(ColWord w) { try { delete reinterpret_cast(w); } catch(exception &e) { fprintf(stderr, "Error deleting Word: %s\n", e.what()); } } size_t col_word_length(ColWord w) { try { return reinterpret_cast(w)->length(); } catch(exception &e) { fprintf(stderr, "Error getting Word length: %s\n", e.what()); } return 0; } void col_word_as_utf8(ColWord w, char *buf, unsigned int bufSize) { try { reinterpret_cast(w)->toUtf8(buf, bufSize); } catch(exception &e) { fprintf(stderr, "Error converting to Utf-8: %s\n", e.what()); } } ColDocument col_document_new(DocumentID id) { try { return reinterpret_cast(new Document(id)); } catch(exception &e) { fprintf(stderr, "Error creating Document: %s\n", e.what()); } return nullptr; } void col_document_delete(ColDocument doc) { try { delete reinterpret_cast(doc); } catch(exception &e) { fprintf(stderr, "Error deleting Document: %s\n", e.what()); } } DocumentID col_document_get_id(ColDocument doc) { try { return reinterpret_cast(doc)->getID(); } catch(exception &e) { fprintf(stderr, "Error getting Document ID %s\n", e.what()); } return INVALID_DOCID; } void col_document_add_text(ColDocument doc, ColWord field_name, const char *text_as_utf8) { try { Document *d = reinterpret_cast(doc); Word *w = reinterpret_cast(field_name); d->addText(*w, text_as_utf8); } catch(exception &e) { fprintf(stderr, "Error adding text: %s\n", e.what()); } } ColMatcher col_matcher_new() { try { return reinterpret_cast(new Matcher()); } catch(exception 
&e) { fprintf(stderr, "Error creating Matcher: %s\n", e.what()); } return nullptr; } void col_matcher_delete(ColMatcher m) { try { delete reinterpret_cast(m); } catch(exception &e) { fprintf(stderr, "Error deleting Matcher: %s\n", e.what()); } } void col_matcher_index(ColMatcher m, ColCorpus c) { try { Matcher *matcher = reinterpret_cast(m); Corpus *corp = reinterpret_cast(c); matcher->index(*corp); } catch(exception &e) { fprintf(stderr, "Exception when indexing: %s\n", e.what()); } } ColMatchResults col_matcher_match(ColMatcher m, const char *query_as_utf8) { try { Matcher *matcher = reinterpret_cast(m); MatchResults *results = new MatchResults(matcher->match(query_as_utf8)); return reinterpret_cast(results); } catch(exception &e) { fprintf(stderr, "Exception when matching: %s\n", e.what()); return nullptr; } } ColErrorValues col_matcher_get_error_values(ColMatcher m) { try { Matcher *matcher = reinterpret_cast(m); return reinterpret_cast(&matcher->getErrorValues()); } catch(exception &e) { fprintf(stderr, "Error getting ErrorValues: %s\n", e.what()); } return nullptr; } ColIndexWeights col_matcher_get_index_weights(ColMatcher m) { try { Matcher *matcher = reinterpret_cast(m); return reinterpret_cast(&matcher->getIndexWeights()); } catch(exception &e) { fprintf(stderr, "Error getting IndexWeights: %s\n", e.what()); } return nullptr; } ColMatchResults col_match_results_new() { try { return reinterpret_cast(new MatchResults()); } catch(exception &e) { fprintf(stderr, "Error creating MatchResults: %s\n", e.what()); } return nullptr; } void col_match_results_delete(ColMatchResults mr) { try { delete reinterpret_cast(mr); } catch(exception &e) { fprintf(stderr, "Error deleting MatchResults: %s\n", e.what()); } } size_t col_match_results_size(ColMatchResults mr) { try { return reinterpret_cast(mr)->size(); } catch(exception &e) { fprintf(stderr, "Error getting match size: %s\n", e.what()); } return 0; } DocumentID col_match_results_get_id(ColMatchResults mr, size_t i) 
{ try { MatchResults *results = reinterpret_cast(mr); return results->getDocumentID(i); } catch(exception &e) { fprintf(stderr, "Exception when getting result document ID: %s\n", e.what()); } return INVALID_DOCID; } double col_match_results_get_relevancy(ColMatchResults mr, size_t i) { try { MatchResults *results = reinterpret_cast(mr); return results->getDocumentID(i); } catch(exception &e) { fprintf(stderr, "Exception when getting result document ID: %s\n", e.what()); } return -1.0; } ColCorpus col_corpus_new() { try { return reinterpret_cast(new Corpus()); } catch(exception &e) { fprintf(stderr, "Error creating Corpus: %s\n", e.what()); } return nullptr; } void col_corpus_delete(ColCorpus c) { try { delete reinterpret_cast(c); } catch(exception &e) { fprintf(stderr, "Error deleting Corpus: %s\n", e.what()); } } void col_corpus_add_document(ColCorpus c, ColDocument d) { try { Corpus *corp = reinterpret_cast(c); Document *doc = reinterpret_cast(d); corp->addDocument(*doc); } catch(exception &e) { fprintf(stderr, "Error adding document: %s\n", e.what()); } } void col_error_values_add_standard_errors(ColErrorValues ev) { try { ErrorValues *results = reinterpret_cast(ev); results->addStandardErrors(); } catch(exception &e) { fprintf(stderr, "Error adding standard errors: %s\n", e.what()); } } void col_error_values_set_substring_mode(ColErrorValues ev) { try { ErrorValues *results = reinterpret_cast(ev); results->setSubstringMode(); } catch(exception &e) { fprintf(stderr, "Error setting substring mode: %s\n", e.what()); } } void col_index_weights_set_weight(ColIndexWeights weights, const ColWord field, const double new_weight) { try { IndexWeights *cweight = reinterpret_cast(weights); Word *w = reinterpret_cast(field); cweight->setWeight(*w, new_weight); } catch(exception &e) { fprintf(stderr, "Error setting weight: %s\n", e.what()); } } double col_index_weights_get_weight(ColIndexWeights weights, const ColWord field) { try { IndexWeights *cweight = 
reinterpret_cast(weights); Word *w = reinterpret_cast(field); return cweight->getWeight(*w); } catch(exception &e) { fprintf(stderr, "Error getting weight: %s\n", e.what()); } return 1.0; } #ifdef __cplusplus } #endif libcolumbus-1.1.0+15.10.20150806/src/CMakeLists.txt0000644000015300001610000000177112560622644021730 0ustar pbuserpbgroup00000000000000add_library(${COL_LIB_BASENAME} SHARED LevenshteinIndex.cc IndexMatches.cc ErrorValues.cc Word.cc ColumbusHelpers.cc ColumbusSlow.cc WordList.cc Document.cc Corpus.cc Matcher.cc MatchResults.cc IndexWeights.cc MatcherStatistics.cc WordStore.cc ColumbusCAPI.cc ErrorMatrix.cc ResultFilter.cc Trie.cc SearchParameters.cc ) if(ICONV_LIBRARIES) target_link_libraries(${COL_LIB_BASENAME} ${ICONV_LIBRARIES}) endif() target_link_libraries(${COL_LIB_BASENAME} ${ICU_LIBRARIES}) set(symbol_map "${CMAKE_CURRENT_SOURCE_DIR}/libcolumbus.map") set_target_properties(${COL_LIB_BASENAME} PROPERTIES VERSION ${SO_VERSION} SOVERSION ${ABI_VERSION}) set_target_properties(${COL_LIB_BASENAME} PROPERTIES LINK_FLAGS "${ldflags} -Wl,--version-script,${symbol_map}") set_target_properties(${COL_LIB_BASENAME} PROPERTIES LINK_DEPENDS ${symbol_map}) add_pch(pch/columbus_pch.hh ${COL_LIB_BASENAME}) install( TARGETS ${COL_LIB_BASENAME} ARCHIVE DESTINATION ${LIBDIR} RUNTIME DESTINATION bin LIBRARY DESTINATION ${LIBDIR} ) libcolumbus-1.1.0+15.10.20150806/src/Trie.cc0000644000015300001610000002021212560622644020371 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2013 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. 
* * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This class implements a trie as an array. It uses a sparse memory mapped * file for backing storage. This makes it possible to grow the allocation * efficiently with ftruncate. * * The offsets have 32 bits to save memory. 4 gigs of trie should be enough * for everybody. * * This low level bit fiddling makes the code slightly hard to read. It * should still be understandable, though. */ #include"Trie.hh" #include"Word.hh" #include #include #include #include #include #include #include #include #include #include using namespace std; COL_NAMESPACE_START struct TrieHeader { TrieOffset totalSize; TrieOffset firstFree; uint32_t numWords; uint32_t numNodes; }; struct TriePtrs { Letter l; TrieOffset child; TrieOffset sibling; }; struct TrieNode { WordID word; TrieOffset parent; }; struct TriePrivate { FILE *f; char *map; TrieHeader *h; TrieOffset root; }; Trie::Trie() { p = new TriePrivate(); p->f = tmpfile(); if(!p->f) { string msg("Could not create temporary file: "); msg += strerror(errno); throw runtime_error(msg); } p->map = nullptr; expand(); p->h->firstFree = sizeof(TrieHeader); p->root = p->h->firstFree; p->h->numWords = 0; addNewNode(0); } Trie::~Trie() { fclose(p->f); delete p; } void Trie::expand() { TrieOffset newSize; if(p->map) { TrieOffset oldSize = p->h->totalSize; newSize = oldSize*2; if(munmap(p->map, oldSize) != 0) { string err = "Munmap failed: "; err += strerror(errno); throw runtime_error(err); } } else { newSize = 1024; } if(ftruncate(fileno(p->f), newSize) != 0) { string err = "Truncate failed: "; err += strerror(errno); throw runtime_error(err); } p->map = (char*)mmap(NULL, newSize, PROT_READ | PROT_WRITE, MAP_SHARED, fileno(p->f), 0); if(p->map == MAP_FAILED) { string err = "MMap failed: "; err += strerror(errno); throw runtime_error(err); } if(madvise(p->map, newSize, MADV_RANDOM | MADV_WILLNEED) != 0) { fprintf(stderr, 
"Problem with madvise: %s\n", strerror(errno)); } p->h = (TrieHeader*)p->map; p->h->totalSize = newSize; assert(p->h->totalSize > p->h->firstFree); } TrieOffset Trie::append(const char *data, const int size) { TrieOffset result; assert(p->h->totalSize > p->h->firstFree); while(p->h->firstFree + size >= p->h->totalSize) { expand(); } memcpy(p->map + p->h->firstFree, data, size); result = p->h->firstFree; p->h->firstFree += size; assert(p->h->totalSize > p->h->firstFree); return result; } TrieOffset Trie::addNewNode(const TrieOffset parent) { TrieNode n; TriePtrs ptr; TrieOffset nodeoffset; n.word = INVALID_WORDID; n.parent = parent; ptr.child = ptr.sibling = ptr.l = 0; nodeoffset = append((char*)&n, sizeof(n)); append((char*)&ptr, sizeof(ptr)); p->h->numNodes++; return nodeoffset; } TrieOffset Trie::addNewSibling(const TrieOffset node, const TrieOffset sibling, Letter l) { TriePtrs *last; // Assign only at the end so remappings won't invalidate it. TriePtrs ptr; TrieOffset newSibling; ptr.l = l; ptr.child = addNewNode(node); ptr.sibling = 0; newSibling = append((char*) &ptr, sizeof(ptr)); last = (TriePtrs*)(p->map + sibling); assert(last->sibling == 0); last->sibling = newSibling; return ptr.child; } TrieOffset Trie::insertWord(const Word &word, const WordID wordID) { size_t i=0; TrieOffset node = p->root; while(word.length() > i) { Letter l = word[i]; TrieOffset searcher = node; //TrieNode *n = (TrieNode*)(p->map + searcher); TrieOffset sibl = searcher + sizeof(TrieNode); TriePtrs *ptrs = (TriePtrs*)(p->map + sibl); while(ptrs->sibling != 0 && ptrs->l != l) { sibl = ptrs->sibling; ptrs = (TriePtrs*)(p->map + sibl); } if(ptrs->l == l) { node = ptrs->child; } else { node = addNewSibling(node, sibl, l); } i++; } TrieNode *final = (TrieNode*)(p->map + node); if (final->word == INVALID_WORDID) { final->word = wordID; p->h->numWords++; } /* * Theoretically there is nothing wrong with adding the same word with * different IDs. 
In our case it probably means that the word deduplicator * is not working and there is a leak somewhere. So check explicitly. */ assert(final->word == wordID); return node; } bool Trie::hasWord(const Word &word) const { TrieOffset node = findWord(word); if(!node) return false; TrieNode *n = (TrieNode*)(p->map+node); if(n->word != INVALID_WORDID) return true; return false; } TrieOffset Trie::findWord(const Word &word) const { TrieOffset node = p->root; for(size_t i=0; word.length() > i; i++) { Letter l = word[i]; TrieOffset searcher = node; TrieOffset sibl = searcher + sizeof(TrieNode); TriePtrs *ptrs = (TriePtrs*)(p->map + sibl); while(ptrs->sibling != 0 && ptrs->l != l) { sibl = ptrs->sibling; ptrs = (TriePtrs*)(p->map + sibl); } if(ptrs->l != l) return 0; node = ptrs->child; } return node; } TrieOffset Trie::getRoot() const { return p->root; } TrieOffset Trie::getSiblingList(TrieOffset node) const { TriePtrs *ptrs = (TriePtrs*)(p->map + node + sizeof(TrieNode)); return ptrs->sibling; } TrieOffset Trie::getNextSibling(TrieOffset sibling) const { TriePtrs *ptrs = (TriePtrs*)(p->map + sibling); return ptrs->sibling; } Letter Trie::getLetter(TrieOffset sibling) const { TriePtrs *ptrs = (TriePtrs*)(p->map + sibling); return ptrs->l; } TrieOffset Trie::getChild(TrieOffset sibling) const { TriePtrs *ptrs = (TriePtrs*)(p->map + sibling); return ptrs->child; } WordID Trie::getWordID(TrieOffset node) const { TrieNode *n = (TrieNode*)(p->map + node); return n->word; } bool Trie::hasSibling(TrieOffset sibling) const { TriePtrs *ptrs = (TriePtrs*)(p->map + sibling); return ptrs->sibling != 0; } size_t Trie::numWords() const { return p->h->numWords; } size_t Trie::numNodes() const { return p->h->numNodes; } TrieOffset Trie::getParent(TrieOffset node) const { TrieNode *n = (TrieNode*)(p->map + node); return n->parent; } TrieOffset Trie::getSiblingTo(const TrieOffset node, const TrieOffset child) const { TrieOffset sibling = getSiblingList(node); while(getChild(sibling) != 
child) { sibling = getNextSibling(sibling); if(!sibling) throw runtime_error("Trie is corrupted"); } return sibling; } Word Trie::getWord(const TrieOffset startNode) const { vector letters; vector res; TrieOffset node = startNode; if(node == 0) { return Word(); } TrieOffset parent = getParent(node); letters.push_back(0); while(parent) { TrieOffset newParent; letters.push_back(getLetter(getSiblingTo(parent, node))); newParent = getParent(parent); node = parent; parent = newParent; } res.insert(res.begin(), letters.rbegin(), letters.rend()); return Word(&(res[0]), res.size()); } COL_NAMESPACE_END libcolumbus-1.1.0+15.10.20150806/src/LevenshteinIndex.cc0000644000015300001610000001373112560622644022752 0ustar pbuserpbgroup00000000000000/* * Copyright (C) 2012 Canonical, Ltd. * * Authors: * Jussi Pakkanen * * This library is free software; you can redistribute it and/or modify it under * the terms of version 3 of the GNU Lesser General Public License as published * by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This file implements a fast Levenshtein matcher for a dictionary of * words. 
It is a re-implementation of code placed in the public domain
 * here:
 *
 * http://stevehanov.ca/blog/index.php?id=114
 */

// NOTE(review): the names of the system headers below (and the template
// arguments of the typedefs further down) were lost in extraction. The
// remnants are kept exactly as found.
#include
#include
#include
#include
#include "LevenshteinIndex.hh"
#include "ErrorValues.hh"
#include "Word.hh"
#include "ErrorMatrix.hh"
#include "Trie.hh"

// Pick the hash map implementation depending on whether the build defined
// HAS_SPARSE_HASH.
#ifdef HAS_SPARSE_HASH
#include
using google::sparse_hash_map;
#define hashmap sparse_hash_map
#else
#include
#define hashmap unordered_map
#endif

COL_NAMESPACE_START
using namespace std;

typedef vector > ChildList;
typedef ChildList::iterator ChildListIter;
typedef ChildList::const_iterator ChildListConstIter;

typedef hashmap WordCount;

struct LevenshteinIndexPrivate {
    WordCount wordCounts; // How many times the word has been added to this index.
    size_t maxCount; // How many times the most common word has been added.
    size_t numNodes;
    size_t numWords; // How many words are in this index in total.
    size_t longestWordLength; // Longest word that has been added. Same as tree depth.
    Trie trie;
};

/* Creates an empty index. */
LevenshteinIndex::LevenshteinIndex() {
    p = new LevenshteinIndexPrivate();
    p->maxCount = 0;
    p->longestWordLength = 0;
}

LevenshteinIndex::~LevenshteinIndex() {
    delete p;
}

int LevenshteinIndex::getDefaultError() {
    return ErrorValues::getDefaultError();
}

/*
 * Adds one occurrence of the word to the index. Empty words are ignored.
 * Updates the per-word count, the longest word length and the maximum count.
 */
void LevenshteinIndex::insertWord(const Word &word, const WordID wordID) {
    if(word.length() == 0)
        return;
    auto it = p->wordCounts.find(wordID);
    size_t newCount;
    if(it != p->wordCounts.end()) {
        newCount = p->wordCounts[wordID] + 1;
    } else {
        newCount = 1;
    }
    p->trie.insertWord(word, wordID);
    p->wordCounts[wordID] = newCount;
    if(word.length() > p->longestWordLength)
        p->longestWordLength = word.length();
    if(p->maxCount < newCount)
        p->maxCount = newCount;
    return;
}

bool LevenshteinIndex::hasWord(const Word &word) const {
    return p->trie.hasWord(word);
}

/*
 * Collects every indexed word whose error against query is at most maxError
 * into matches, then sorts matches. The trie is walked depth first with one
 * error-matrix row per trie level.
 */
void LevenshteinIndex::findWords(const Word &query, const ErrorValues &e, const int maxError, IndexMatches &matches) const {
    TrieOffset root;
    TrieOffset sibling;
    // NOTE(review): the ErrorMatrix constructor names its third and fourth
    // parameters insertError and deletionError, but receives the deletion
    // error and the start insertion error here -- confirm this is intended.
    ErrorMatrix em(p->longestWordLength+1, query.length()+1,
            e.getDeletionError(),
            e.getStartInsertionError(query.length()));

    assert(em.get(0, 0) == 0);
    if(query.length() > 0)
        assert(em.get(0, 1) == e.getInsertionError());
    root = p->trie.getRoot();
    sibling = p->trie.getSiblingList(root);
    // Start one recursive search per child of the root.
    while(sibling != 0) {
        Letter l = p->trie.getLetter(sibling);
        TrieOffset nextNode = p->trie.getChild(sibling);
        searchRecursive(query, nextNode, e, l, (Letter)0, 1, em, matches, maxError);
        sibling = p->trie.getNextSibling(sibling);
    }
    matches.sort();
}

/*
 * Computes the value of matrix cell (depth, i) as the minimum of the
 * insertion, deletion, substitution and transposition alternatives, each
 * built from an already computed neighbouring cell.
 */
int LevenshteinIndex::findOptimalError(const Letter letter, const Letter previousLetter, const Word &query,
        const size_t i, const size_t depth, const ErrorMatrix &em, const ErrorValues &e) const {
    int insertError = em.get(depth, i-1) + e.getInsertionError();
    int deleteError;
    // Past the end of the query the end-deletion error is used instead.
    if(i >= query.length())
        deleteError = em.get(depth-1, i) + e.getEndDeletionError();
    else
        deleteError = em.get(depth-1, i) + e.getDeletionError();
    int substituteError = em.get(depth-1, i-1) + e.getSubstituteError(query.text[i-1], letter);

    int transposeError;
    // A transposition applies only when the last two letters are swapped.
    if(i > 1 && query.text[i - 1] == previousLetter && query.text[i - 2] == letter) {
        transposeError = em.get(depth-2, i-2) + e.getTransposeError();
    } else {
        transposeError = insertError + 10000; // Ensures this will not be chosen.
    }
    return min(insertError, min(deleteError, min(substituteError, transposeError)));
}

/*
 * Fills matrix row `depth` for the trie node reached via `letter`, records a
 * match when the node holds a word within maxError, and descends into the
 * node's children while the row's minimum error is still within maxError.
 */
void LevenshteinIndex::searchRecursive(const Word &query, TrieOffset node, const ErrorValues &e,
        const Letter letter, const Letter previousLetter, const size_t depth, ErrorMatrix &em,
        IndexMatches &matches, const int maxError) const {
    for(size_t i = 1; i < query.length()+1; i++) {
        int minError = findOptimalError(letter, previousLetter, query, i, depth, em, e);
        em.set(depth, i, minError);
    }
    // Error row evaluated. Now check if a word was found and continue recursively.
    if(em.totalError(depth) <= maxError && p->trie.getWordID(node) != INVALID_WORDID) {
        matches.addMatch(query, p->trie.getWordID(node), em.totalError(depth));
    }
    if(em.minError(depth) <= maxError) {
        TrieOffset sibling = p->trie.getSiblingList(node);
        while(sibling != 0) {
            Letter l = p->trie.getLetter(sibling);
            TrieOffset nextNode = p->trie.getChild(sibling);
            searchRecursive(query, nextNode, e, l, letter, depth+1, em, matches, maxError);
            sibling = p->trie.getNextSibling(sibling);
        }
    }
}

/* Returns how many times the word ID has been inserted, 0 if never. */
size_t LevenshteinIndex::wordCount(const WordID queryID) const {
    auto i = p->wordCounts.find(queryID);
    if(i == p->wordCounts.end())
        return 0;
    return i->second;
}

size_t LevenshteinIndex::maxCount() const {
    return p->maxCount;
}

size_t LevenshteinIndex::numNodes() const {
    return p->trie.numNodes();
}

size_t LevenshteinIndex::numWords() const {
    return p->trie.numWords();
}

COL_NAMESPACE_END
libcolumbus-1.1.0+15.10.20150806/src/ColumbusHelpers.cc0000644000015300001610000001325412560622644022612 0ustar pbuserpbgroup00000000000000/*
 * Copyright (C) 2012 Canonical, Ltd.
 *
 * Authors:
 *    Jussi Pakkanen
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of version 3 of the GNU Lesser General Public License as published
 * by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This file contains various library helper functions.
*/

#include "ColumbusHelpers.hh"
#include "Word.hh"
#include "WordList.hh"
// NOTE(review): the names of the nine system headers below were lost in
// extraction. Parts of three functions in this file were deleted as well
// (marked below), so the C++ code of this archive member is reproduced
// unchanged.
#include #include #include #include #include #include #include #include #include

COL_NAMESPACE_START

static const Letter whitespaceLetters[] = {' ', '\t', '\n', '\r', '\0'};
static const int numWhitespaceLetters = 4;

static Letter lowerLetter(Letter l) {
    return Letter(u_tolower(l)); // Have to use ICU because towlower libc function does not work.
}

Letter* utf8ToInternal(const char *utf8Text, unsigned int &resultStringSize) {
    iconv_t ic = iconv_open(INTERNAL_ENCODING, "UTF-8");
    char *tmp;
    char *txt;
    char *inBuf;
    char *outBuf;
    size_t badConvertedCharacters;
    size_t inBytes, outBytes, outBytesOriginal;
    size_t bytesWritten;
    if (ic == (iconv_t)-1) {
        throw std::runtime_error("Could not create iconv converter.");
    }
    unsigned int inputLen = strlen((const char*)(utf8Text));
    txt = (char*)new Letter[inputLen+1];
    tmp = strdup((const char*)(utf8Text)); // Iconv should take a const pointer but does not. Protect against it screwing up.
    assert(tmp);
    inBytes = inputLen;
    outBytes = sizeof(Letter)*(inBytes+1);
    outBytesOriginal = outBytes;
    inBuf = tmp;
    outBuf = txt;
    badConvertedCharacters = iconv(ic, &inBuf, &inBytes, &outBuf, &outBytes);
    free(tmp);
    iconv_close(ic);
    if(badConvertedCharacters == (size_t)-1) {
        std::string err("Could not convert UTF8-string to internal representation: ");
        err += (const char*)(utf8Text);
        throw std::runtime_error(err);
    }
    bytesWritten = outBytesOriginal - outBytes;
    resultStringSize = bytesWritten/sizeof(Letter);
    if(bytesWritten < inputLen) {
        // Shrink allocated memory size to exactly the produced string.
        size_t newArraySize = bytesWritten + sizeof(Letter);
        char *newtxt = new char[newArraySize];
        memcpy(newtxt, txt, newArraySize);
        delete []txt;
        txt = newtxt;
    }
    Letter* text = reinterpret_cast(txt);
    text[resultStringSize] = 0; // Null terminated.
    // Now convert all letters to lower case, because we don't care about case difference when matching.
    // NOTE(review): extraction deleted the text between the loop condition
    // below and the middle of the next function (the internal-to-UTF-8
    // conversion). The remnant is kept verbatim.
    for(size_t i=0; i(const_cast(source));
    char *outBuf;
    size_t badConvertedCharacters;
    size_t inBytes, outBytes, outBytesOriginal, resultStringSize;
    if (ic == (iconv_t)-1) {
        throw std::runtime_error("Could not create iconv converter.");
    }
    inBytes = characters*sizeof(Letter);
    outBytes = bufsize;
    outBytesOriginal = outBytes;
    outBuf = buf;
    badConvertedCharacters = iconv(ic, &inBuf, &inBytes, &outBuf, &outBytes);
    iconv_close(ic);
    if(badConvertedCharacters == (size_t)-1) {
        throw std::runtime_error("Could not convert internal string to UTF-8.");
    }
    resultStringSize = outBytesOriginal - outBytes;
    buf[resultStringSize] = 0; // Null terminated, just in case.
}

double hiresTimestamp() {
    struct timeval now;
    gettimeofday(&now, NULL);
    return now.tv_sec + now.tv_usec/1000000.0;
}

WordList splitToWords(const char *utf8Text) {
    return split(utf8Text, whitespaceLetters, numWhitespaceLetters);
}

static bool isInList(const Letter l, const Letter *chars, int numChars) {
    // NOTE(review): extraction deleted the text between the loop condition
    // below and the middle of split(). The remnant is kept verbatim.
    for(int i=0; i= strSize) {
            delete []word;
            return list;
        }
        end = begin+1;
        while(!isInList(utf8Text[end], splitChars, numChars) && end < strSize) {
            end++;
        }
        // End points to one past the last letter.
        unsigned int wordLen = end-begin;
        if(wordLen +1 > bufSize) {
            delete[] word;
            word = new char[wordLen+1];
            bufSize = wordLen + 1;
        }
        memcpy(word, utf8Text+begin, wordLen);
        word[wordLen] = '\0';
        try {
            Word w(word);
            list.addWord(w);
        } catch(std::invalid_argument &ex) {
            delete []word;
            throw ex;
        }
    } while(end < strSize);
    delete []word;
    return list;
}

bool isWhitespace(Letter l) {
    return isInList(l, whitespaceLetters, numWhitespaceLetters);
}

COL_NAMESPACE_END
libcolumbus-1.1.0+15.10.20150806/CMakeLists.txt0000644000015300001610000000733412560622644021142 0ustar pbuserpbgroup00000000000000# Top-level build description of libcolumbus.
#
# cmake_minimum_required() must come before project(): it establishes the
# policy settings that project() and everything after it run under.
cmake_minimum_required(VERSION 2.8.9)
project(columbus C CXX)

# Building inside the source tree is not supported.
if(PROJECT_BINARY_DIR STREQUAL PROJECT_SOURCE_DIR)
  message(FATAL_ERROR "In-tree build attempt detected, aborting. Set your build dir outside your source dir, delete CMakeCache.txt from source root and try again.")
endif()

# User-visible switches. Options without an explicit default are OFF.
option(enable_tests "Enable tests." ON)
option(enable_scalability_tests "Additional scalability tests that are potentially very slow to run." OFF)
option(full_warnings "All possible compiler warnings." OFF)
option(debug_messages "Print debug messages.")
option(full_unicode "Enable full Unicode support (takes lots of memory).")
option(use_python2 "Build Python bindings against Python 2 (UNSUPPORTED)." OFF)

if(use_python2)
  message(WARNING "Python 2 bindings are NOT SUPPORTED! If they break, you get to keep both pieces.")
endif()

include(FindPkgConfig)
include(cmake/pch.cmake)
include(cmake/python.cmake)
include(cmake/coverage.cmake)

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -pedantic -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic -Wextra")

# if() dereferences a variable name by itself. Writing if(${var}) expands the
# value first and then evaluates that value a second time as an expression.
if(full_warnings)
  # C does not have any more warning flags.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Weffc++")
endif()

if(debug_messages)
  add_definitions(-DDEBUG_MESSAGES)
endif()

# Symbol visibility
add_definitions(-DBUILDING_COLUMBUS)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden")

set(SO_VERSION_MAJOR "1")
set(SO_VERSION_MINOR "1")
set(SO_VERSION_PATCH "0")

set(SO_VERSION "${SO_VERSION_MAJOR}.${SO_VERSION_MINOR}.${SO_VERSION_PATCH}")
set(COL_LIB_BASENAME "columbus")

# Increment this manually whenever breaking ABI.
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html#AEN135
set(ABI_VERSION 1)

include(GNUInstallDirs)
set(LIBDIR ${CMAKE_INSTALL_LIBDIR})

# Set as cache variable so packaging can override.
set(PYTHONDIR "lib/python3/dist-packages" CACHE PATH "Destination install dir for Python module")

# Choose the iconv encoding name and the C type used for internal letters.
include(TestBigEndian)
TEST_BIG_ENDIAN(IS_BIG_ENDIAN)
if(full_unicode)
  if(IS_BIG_ENDIAN)
    set(INTERNAL_ENCODING "UTF-32BE") # Iconv encoding string.
  else()
    set(INTERNAL_ENCODING "UTF-32LE")
  endif()
  set(LETTER_TYPE "uint32_t")
else()
  if(IS_BIG_ENDIAN)
    set(INTERNAL_ENCODING "UCS-2BE//IGNORE") # Drop everything outside Base Multilingual Plane.
  else()
    set(INTERNAL_ENCODING "UCS-2LE//IGNORE")
  endif()
  set(LETTER_TYPE "uint16_t")
endif()

find_file(HAS_SPARSE_HASH "google/sparse_hash_map")
if(HAS_SPARSE_HASH)
  message(STATUS "Using sparse hash.")
else()
  message(STATUS "Sparse hash not found, using regular hash.")
endif()

# Only required on some platforms.
# If not found, that's ok.
find_library(ICONV_LIBRARIES iconv)

pkg_search_module(GTK3 gtk+-3.0)
if(GTK3_FOUND)
  message(STATUS "GTK+ 3.0 found, building GUI apps.")
else()
  message(STATUS "GTK+ 3.0 not found, not building GUI apps.")
endif()

include_directories(include)
include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

pkg_search_module(ICU icu-uc)
# Quantal and earlier do not have pkg-config files for icu.
# Find it manually.
if(NOT ICU_FOUND)
  find_library(icu1 icuuc)
  find_library(icu2 icudata)
  if(NOT icu1)
    message(FATAL_ERROR "Libicu not found. Please install it and rerun CMake.")
  endif()
  if(NOT icu2)
    message(FATAL_ERROR "Libicu not found. Please install it and rerun CMake.")
  endif()
  set(ICU_LIBRARIES ${icu1} ${icu2})
endif()

add_subdirectory(include)
add_subdirectory(src)
add_subdirectory(tools)
add_subdirectory(share)
if(build_python)
  add_subdirectory(python)
endif()

if(enable_tests)
  enable_testing()
  add_subdirectory(test)
endif()