python-pyahocorasick_1.4.1.orig/.gitignore0000644000000000000000000000026414002634426015635 0ustar00# patterns *.pyc *.pyd *.dot *.swp *.so *.sh *.o # files MANIFEST runtest.sh tags release_checklist.txt .gdb_history # dirs build/ dist/ /pyahocorasick.egg-info/ /tmp/ coverage/ python-pyahocorasick_1.4.1.orig/.travis.yml0000644000000000000000000000025012707733326015762 0ustar00language: c addons: apt: packages: - python3 - python3-dev matrix: include: - compiler: gcc - compiler: clang script: - make python-pyahocorasick_1.4.1.orig/Automaton.c0000644000000000000000000010047614002626547015773 0ustar00/* This is part of pyahocorasick Python module. Automaton class implementation. (this file includes Automaton_pickle.c) Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #include "Automaton.h" #include "slist.h" #include "src/inline_doc.h" #include "src/custompickle/save/automaton_save.h" static PyTypeObject automaton_type; static bool check_store(const int store) { switch (store) { case STORE_LENGTH: case STORE_INTS: case STORE_ANY: return true; default: PyErr_SetString( PyExc_ValueError, "store value must be one of ahocorasick.STORE_LENGTH, STORE_INTS or STORE_ANY" ); return false; } // switch } static bool check_kind(const int kind) { switch (kind) { case EMPTY: case TRIE: case AHOCORASICK: return true; default: PyErr_SetString( PyExc_ValueError, "kind value must be one of ahocorasick.EMPTY, TRIE or AHOCORASICK" ); return false; } } static bool check_key_type(const int store) { switch (store) { case KEY_STRING: case KEY_SEQUENCE: return true; default: PyErr_SetString( PyExc_ValueError, "key_type must have value KEY_STRING or KEY_SEQUENCE" ); return false; } // switch } static PyObject* automaton_create() { Automaton* automaton; automaton = (Automaton*)F(PyObject_New)(Automaton, &automaton_type); if (UNLIKELY(automaton == NULL)) { return NULL; } automaton->kind = EMPTY; automaton->store = STORE_ANY; 
automaton->key_type = KEY_STRING; automaton->count = 0; automaton->longest_word = 0; automaton->version = 0; automaton->stats.version = -1; automaton->root = NULL; return (PyObject*)automaton; } static PyObject* automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs) { Automaton* automaton; int key_type; int store; automaton = (Automaton*)automaton_create(); if (UNLIKELY(automaton == NULL)) return NULL; if (UNLIKELY(PyTuple_Size(args) == 7)) { int word_count; int longest_word; AutomatonKind kind; KeysStore store; KeyType key_type; PyObject* bytes_list = NULL; PyObject* values = NULL; const char* fmt = "OiiiiiO"; if (!F(PyArg_ParseTuple)(args, fmt, &bytes_list, &kind, &store, &key_type, &word_count, &longest_word, &values)) { PyErr_SetString(PyExc_ValueError, "Unable to load from pickle."); goto error; } if (!check_store(store) || !check_kind(kind) || !check_key_type(key_type)) { goto error; } if (!PyList_CheckExact(bytes_list)) { PyErr_SetString(PyExc_TypeError, "Expected list"); goto error; } if (kind != EMPTY) { if (values == Py_None) { Py_XDECREF(values); values = NULL; } if (automaton_unpickle(automaton, bytes_list, values)) { automaton->kind = kind; automaton->store = store; automaton->key_type = key_type; automaton->count = word_count; automaton->longest_word = longest_word; } else goto error; } } else { store = STORE_ANY; key_type = KEY_STRING; // construct new object if (F(PyArg_ParseTuple)(args, "ii", &store, &key_type)) { if (not check_store(store)) { goto error; } if (not check_key_type(key_type)) { goto error; } } else if (F(PyArg_ParseTuple)(args, "i", &store)) { if (not check_store(store)) { goto error; } } PyErr_Clear(); automaton->store = store; automaton->key_type = key_type; } //ok: return (PyObject*)automaton; error: Py_XDECREF(automaton); return NULL; } static void automaton_del(PyObject* self) { #define automaton ((Automaton*)self) automaton_clear(self, NULL); PyObject_Del(self); #undef automaton } static ssize_t 
automaton_len(PyObject* self) { #define automaton ((Automaton*)self) return automaton->count; #undef automaton } static PyObject* automaton_add_word(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) // argument PyObject* py_value = NULL; struct Input input; Py_ssize_t integer = 0; TrieNode* node; bool new_word; if (!prepare_input_from_tuple(self, args, 0, &input)) { return NULL; } switch (automaton->store) { case STORE_ANY: py_value = F(PyTuple_GetItem)(args, 1); if (not py_value) { PyErr_SetString(PyExc_ValueError, "A value object is required as second argument."); goto py_exception; } break; case STORE_INTS: py_value = F(PyTuple_GetItem)(args, 1); if (py_value) { if (F(PyNumber_Check)(py_value)) { integer = F(PyNumber_AsSsize_t)(py_value, PyExc_ValueError); if (integer == -1 and PyErr_Occurred()) goto py_exception; } else { PyErr_SetString(PyExc_TypeError, "An integer value is required as second argument."); goto py_exception; } } else { // default PyErr_Clear(); integer = automaton->count + 1; } break; case STORE_LENGTH: integer = input.wordlen; break; default: PyErr_SetString(PyExc_SystemError, "Invalid value for this key: see documentation for supported values."); goto py_exception; } node = NULL; new_word = false; if (input.wordlen > 0) { node = trie_add_word(automaton, input.word, input.wordlen, &new_word); if (node == NULL) { PyErr_NoMemory(); goto py_exception; } } destroy_input(&input); if (node) { switch (automaton->store) { case STORE_ANY: if (not new_word and node->eow) // replace Py_DECREF(node->output.object); Py_INCREF(py_value); node->output.object = py_value; break; default: node->output.integer = integer; } // switch if (new_word) { automaton->version += 1; // change version only when new word appeared if (input.wordlen > automaton->longest_word) automaton->longest_word = (int)input.wordlen; Py_RETURN_TRUE; } else { Py_RETURN_FALSE; } } Py_RETURN_FALSE; py_exception: destroy_input(&input); return NULL; } static TristateResult 
automaton_remove_word_aux(PyObject* self, PyObject* args, PyObject** value) { #define automaton ((Automaton*)self) struct Input input; if (!prepare_input_from_tuple(self, args, 0, &input)) { return MEMORY_ERROR; } if (input.wordlen == 0) { destroy_input(&input); return FALSE; } *value = trie_remove_word(automaton, input.word, input.wordlen); destroy_input(&input); if (UNLIKELY(PyErr_Occurred() != NULL)) { return MEMORY_ERROR; } else { return (*value != NULL) ? TRUE : FALSE; } } static PyObject* automaton_remove_word(PyObject* self, PyObject* args) { PyObject* value; switch (automaton_remove_word_aux(self, args, &value)) { case FALSE: Py_RETURN_FALSE; break; case TRUE: if (automaton->store == STORE_ANY) { // value is meaningful Py_DECREF(value); } automaton->version += 1; automaton->count -= 1; Py_RETURN_TRUE; break; case MEMORY_ERROR: default: return NULL; } } static PyObject* automaton_pop(PyObject* self, PyObject* args) { PyObject* value; switch (automaton_remove_word_aux(self, args, &value)) { case FALSE: PyErr_SetNone(PyExc_KeyError); return NULL; case TRUE: automaton->version += 1; automaton->count -= 1; return value; // there's no need to increase refcount, the value was removed case MEMORY_ERROR: default: return NULL; } } static void clear_aux(TrieNode* node, KeysStore store) { unsigned i; if (node) { switch (store) { case STORE_INTS: case STORE_LENGTH: // nop break; case STORE_ANY: if (node->eow && node->output.object) Py_DECREF(node->output.object); break; } for (i=0; i < node->n; i++) { TrieNode* child = trienode_get_ith_unsafe(node, i); if (child != node) // avoid self-loops! 
clear_aux(child, store); } trienode_free(node); } #undef automaton } static PyObject* automaton_clear(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) clear_aux(automaton->root, automaton->store); automaton->count = 0; automaton->longest_word = 0; automaton->kind = EMPTY; automaton->root = NULL; automaton->version += 1; Py_RETURN_NONE; #undef automaton } static int automaton_contains(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) TrieNode* node; struct Input input; if (!prepare_input(self, args, &input)) { return -1; } node = trie_find(automaton->root, input.word, input.wordlen); destroy_input(&input); return (node and node->eow); #undef automaton } static PyObject* automaton_exists(PyObject* self, PyObject* args) { PyObject* word; word = F(PyTuple_GetItem)(args, 0); if (word) switch (automaton_contains(self, word)) { case 1: Py_RETURN_TRUE; case 0: Py_RETURN_FALSE; default: return NULL; } else return NULL; } static PyObject* automaton_match(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) TrieNode* node; struct Input input; if (!prepare_input_from_tuple(self, args, 0, &input)) { return NULL; } node = trie_find(automaton->root, input.word, input.wordlen);; destroy_input(&input); if (node) Py_RETURN_TRUE; else Py_RETURN_FALSE; #undef automaton } static PyObject* automaton_longest_prefix(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) int len; struct Input input; if (!prepare_input_from_tuple(self, args, 0, &input)) { return NULL; } len = trie_longest(automaton->root, input.word, input.wordlen); destroy_input(&input); return F(Py_BuildValue)("i", len); #undef automaton } static PyObject* automaton_get(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) struct Input input; PyObject* py_def; Py_ssize_t k; TrieNode* node; k = PyTuple_GET_SIZE(args); if (k < 1 || k > 2) { PyErr_Format(PyExc_TypeError, "get() takes one or two arguments (%ld given)", k); return 
NULL; } if (!prepare_input_from_tuple(self, args, 0, &input)) { return NULL; } node = trie_find(automaton->root, input.word, input.wordlen); destroy_input(&input); if (node and node->eow) { switch (automaton->store) { case STORE_INTS: case STORE_LENGTH: return F(Py_BuildValue)("i", node->output.integer); case STORE_ANY: Py_INCREF(node->output.object); return node->output.object; default: PyErr_SetNone(PyExc_ValueError); return NULL; } } else { py_def = F(PyTuple_GetItem)(args, 1); if (py_def) { Py_INCREF(py_def); return py_def; } else { PyErr_Clear(); PyErr_SetNone(PyExc_KeyError); return NULL; } } #undef automaton } typedef struct AutomatonQueueItem { LISTITEM_data; TrieNode* node; } AutomatonQueueItem; static PyObject* automaton_make_automaton(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) AutomatonQueueItem* item; List queue; unsigned i; TrieNode* node; TrieNode* child; TrieNode* state; TRIE_LETTER_TYPE letter; if (automaton->kind != TRIE) Py_RETURN_FALSE; list_init(&queue); // 1. setup nodes at first level: they fail back to the root ASSERT(automaton->root); for (i=0; i < automaton->root->n; i++) { TrieNode* child = trienode_get_ith_unsafe(automaton->root, i); ASSERT(child); // fail edges go to the root // every other letters loop on root - implicit (see automaton_next) child->fail = automaton->root; item = (AutomatonQueueItem*)list_item_new(sizeof(AutomatonQueueItem)); if (item) { item->node = child; list_append(&queue, (ListItem*)item); } else goto no_mem; } // 2. 
make links while (true) { AutomatonQueueItem* item = (AutomatonQueueItem*)list_pop_first(&queue); if (item == NULL) break; else { node = item->node; memory_free(item); } for (i=0; i < node->n; i++) { child = trienode_get_ith_unsafe(node, i); letter = trieletter_get_ith_unsafe(node, i); ASSERT(child); item = (AutomatonQueueItem*)list_item_new(sizeof(AutomatonQueueItem)); if (item) { item->node = child; list_append(&queue, (ListItem*)item); } else goto no_mem; state = node->fail; ASSERT(state); ASSERT(child); while (state != automaton->root and\ not trienode_get_next(state, letter)) { state = state->fail; ASSERT(state); } child->fail = trienode_get_next(state, letter); if (child->fail == NULL) child->fail = automaton->root; ASSERT(child->fail); } } automaton->kind = AHOCORASICK; automaton->version += 1; list_delete(&queue); Py_RETURN_NONE; #undef automaton no_mem: list_delete(&queue); PyErr_NoMemory(); return NULL; } static PyObject* automaton_find_all(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) struct Input input; ssize_t start; ssize_t end; PyObject* callback; PyObject* callback_ret; ssize_t i; TrieNode* state; TrieNode* tmp; if (automaton->kind != AHOCORASICK) Py_RETURN_NONE; // arg 1 if (!prepare_input_from_tuple(self, args, 0, &input)) { return NULL; } // arg 2 callback = F(PyTuple_GetItem)(args, 1); if (callback == NULL) { destroy_input(&input); return NULL; } else if (not F(PyCallable_Check)(callback)) { PyErr_SetString(PyExc_TypeError, "The callback argument must be a callable such as a function."); destroy_input(&input); return NULL; } // parse start/end if (pymod_parse_start_end(args, 2, 3, 0, input.wordlen, &start, &end)) { destroy_input(&input); return NULL; } state = automaton->root; for (i=start; i < end; i++) { state = tmp = ahocorasick_next(state, automaton->root, input.word[i]); // return output while (tmp) { if (tmp->eow) { if (automaton->store == STORE_ANY) callback_ret = F(PyObject_CallFunction)(callback, "iO", i, 
tmp->output.object); else callback_ret = F(PyObject_CallFunction)(callback, "ii", i, tmp->output.integer); if (callback_ret == NULL) { destroy_input(&input); return NULL; } else Py_DECREF(callback_ret); } tmp = tmp->fail; } } #undef automaton destroy_input(&input); Py_RETURN_NONE; } static PyObject* automaton_items_create(PyObject* self, PyObject* args, const ItemsType type) { #define automaton ((Automaton*)self) PyObject* arg1 = NULL; PyObject* arg2 = NULL; PyObject* arg3 = NULL; TRIE_LETTER_TYPE* word = NULL; TRIE_LETTER_TYPE* tmp = NULL; ssize_t wordlen = 0; TRIE_LETTER_TYPE wildcard; bool use_wildcard = false; PatternMatchType matchtype = MATCH_AT_LEAST_PREFIX; AutomatonItemsIter* iter; bool word_is_copy = false; bool tmp_is_copy = false; // arg 1: prefix/prefix pattern if (args) arg1 = F(PyTuple_GetItem)(args, 0); else arg1 = NULL; if (arg1) { arg1 = pymod_get_string(arg1, &word, &wordlen, &word_is_copy); if (arg1 == NULL) goto error; } else { PyErr_Clear(); word = NULL; wordlen = 0; } // arg 2: wildcard if (args) arg2 = F(PyTuple_GetItem)(args, 1); else arg2 = NULL; if (arg2) { ssize_t len = 0; arg2 = pymod_get_string(arg2, &tmp, &len, &tmp_is_copy); if (arg2 == NULL) { goto error; } else { if (len == 1) { wildcard = tmp[0]; use_wildcard = true; } else { PyErr_SetString(PyExc_ValueError, "Wildcard must be a single character."); goto error; } } } else { PyErr_Clear(); wildcard = 0; use_wildcard = false; } // arg3: matchtype matchtype = MATCH_AT_LEAST_PREFIX; if (args) { arg3 = F(PyTuple_GetItem)(args, 2); if (arg3) { Py_ssize_t val = F(PyNumber_AsSsize_t)(arg3, PyExc_OverflowError); if (val == -1 and PyErr_Occurred()) goto error; switch ((PatternMatchType)val) { case MATCH_AT_LEAST_PREFIX: case MATCH_AT_MOST_PREFIX: case MATCH_EXACT_LENGTH: matchtype = (PatternMatchType)val; break; default: PyErr_SetString(PyExc_ValueError, "The optional how third argument must be one of: " "MATCH_EXACT_LENGTH, MATCH_AT_LEAST_PREFIX or MATCH_AT_LEAST_PREFIX" ); goto error; } } 
else { PyErr_Clear(); if (use_wildcard) matchtype = MATCH_EXACT_LENGTH; else matchtype = MATCH_AT_LEAST_PREFIX; } } // iter = (AutomatonItemsIter*)automaton_items_iter_new( automaton, word, wordlen, use_wildcard, wildcard, matchtype); maybe_decref(word_is_copy, arg1) maybe_decref(tmp_is_copy, arg2) maybe_free(word_is_copy, word) maybe_free(tmp_is_copy, tmp) if (iter) { iter->type = type; return (PyObject*)iter; } else return NULL; error: maybe_decref(word_is_copy, arg1) maybe_decref(tmp_is_copy, arg2) maybe_free(word_is_copy, word) maybe_free(tmp_is_copy, tmp) return NULL; #undef automaton } static PyObject* automaton_keys(PyObject* self, PyObject* args) { return automaton_items_create(self, args, ITER_KEYS); } static PyObject* automaton_iterate(PyObject* self) { return automaton_items_create(self, NULL, ITER_KEYS); } static PyObject* automaton_values(PyObject* self, PyObject* args) { return automaton_items_create(self, args, ITER_VALUES); } static PyObject* automaton_items(PyObject* self, PyObject* args) { return automaton_items_create(self, args, ITER_ITEMS); } static PyObject* automaton_iter(PyObject* self, PyObject* args, PyObject* keywds) { #define automaton ((Automaton*)self) static char *kwlist[] = {"string", "start", "end", "ignore_white_space", NULL}; PyObject* object; ssize_t start, start_tmp = -1; ssize_t end, end_tmp = -1; int ignore_white_space_tmp = -1; bool ignore_white_space = false; if (automaton->kind != AHOCORASICK) { PyErr_SetString(PyExc_AttributeError,"Not an Aho-Corasick automaton yet: " "call add_word to add some keys and call make_automaton to " "convert the trie to an automaton."); return NULL; } if (!F(PyArg_ParseTupleAndKeywords)(args, keywds, "O|iii", kwlist, &object, &start_tmp, &end_tmp, &ignore_white_space_tmp)) { return NULL; } if (ignore_white_space_tmp == 1) { ignore_white_space = true; } if (object) { if (automaton->key_type == KEY_STRING) { #ifdef PY3K #ifdef AHOCORASICK_UNICODE if (F(PyUnicode_Check)(object)) { start = 0; #if 
PY_MINOR_VERSION >= 3 end = PyUnicode_GET_LENGTH(object); #else end = PyUnicode_GET_SIZE(object); #endif } else { PyErr_SetString(PyExc_TypeError, "string required"); return NULL; } #else if (F(PyBytes_Check)(object)) { start = 0; end = PyBytes_GET_SIZE(object); } else { PyErr_SetString(PyExc_TypeError, "bytes required"); return NULL; } #endif #else if (F(PyString_Check)(object)) { start = 0; end = PyString_GET_SIZE(object); } else { PyErr_SetString(PyExc_TypeError, "string required"); return NULL; } #endif } else { if (F(PyTuple_Check)(object)) { start = 0; end = PyTuple_GET_SIZE(object); } else { PyErr_SetString(PyExc_TypeError, "tuple required"); return NULL; } } } else return NULL; if (start_tmp != -1) { start = start_tmp; } if (end_tmp != -1) { end = end_tmp; } return automaton_search_iter_new( automaton, object, (int)start, (int)end, ignore_white_space ); #undef automaton } static PyObject* automaton_iter_long(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) PyObject* object; ssize_t start; ssize_t end; if (automaton->kind != AHOCORASICK) { PyErr_SetString(PyExc_AttributeError, "not an automaton yet; add some words and call make_automaton"); return NULL; } object = PyTuple_GetItem(args, 0); if (object == NULL) return NULL; if (automaton->key_type == KEY_STRING) { #ifdef PY3K #ifdef AHOCORASICK_UNICODE if (F(PyUnicode_Check)(object)) { start = 0; #if PY_MINOR_VERSION >= 3 end = PyUnicode_GET_LENGTH(object); #else end = PyUnicode_GET_SIZE(object); #endif } else { PyErr_SetString(PyExc_TypeError, "string required"); return NULL; } #else if (F(PyBytes_Check)(object)) { start = 0; end = PyBytes_GET_SIZE(object); } else { PyErr_SetString(PyExc_TypeError, "bytes required"); return NULL; } #endif #else if (F(PyString_Check)(object)) { start = 0; end = PyString_GET_SIZE(object); } else { PyErr_SetString(PyExc_TypeError, "string required"); return NULL; } #endif } else { if (F(PyTuple_Check)(object)) { start = 0; end = PyTuple_GET_SIZE(object); } 
else { PyErr_SetString(PyExc_TypeError, "tuple required"); return NULL; } } if (pymod_parse_start_end(args, 1, 2, start, end, &start, &end)) return NULL; return automaton_search_iter_long_new( automaton, object, start, end ); #undef automaton } static void get_stats_aux(TrieNode* node, AutomatonStatistics* stats, int depth) { unsigned i; stats->nodes_count += 1; stats->words_count += (int)(node->eow); stats->links_count += node->n; stats->total_size += trienode_get_size(node); if (depth > stats->longest_word) stats->longest_word = depth; for (i=0; i < node->n; i++) get_stats_aux(trienode_get_ith_unsafe(node, i), stats, depth + 1); } static void get_stats(Automaton* automaton) { automaton->stats.nodes_count = 0; automaton->stats.words_count = 0; automaton->stats.longest_word = 0; automaton->stats.links_count = 0; automaton->stats.sizeof_node = sizeof(TrieNode); automaton->stats.total_size = 0; if (automaton->kind != EMPTY) get_stats_aux(automaton->root, &automaton->stats, 0); automaton->stats.version = automaton->version; } static PyObject* automaton_get_stats(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) PyObject* dict; if (automaton->stats.version != automaton->version) get_stats(automaton); dict = F(Py_BuildValue)( "{s:k,s:k,s:k,s:k,s:i,s:k}", "nodes_count", automaton->stats.nodes_count, "words_count", automaton->stats.words_count, "longest_word", automaton->stats.longest_word, "links_count", automaton->stats.links_count, "sizeof_node", automaton->stats.sizeof_node, "total_size", automaton->stats.total_size ); return dict; #undef automaton } typedef struct DumpAux { PyObject* nodes; PyObject* edges; PyObject* fail; char error; } DumpAux; static int dump_aux(TrieNode* node, const int depth, void* extra) { #define Dump ((DumpAux*)extra) PyObject* tuple; TrieNode* child; unsigned i; #define append_tuple(list) \ if (tuple == NULL) { \ Dump->error = 1; \ return 0; \ } \ else if (PyList_Append(list, tuple) < 0) { \ Dump->error = 1; \ return 0; 
\ } // 1. tuple = F(Py_BuildValue)("ii", node, (int)(node->eow)); append_tuple(Dump->nodes) // 2. for (i=0; i < node->n; i++) { child = trienode_get_ith_unsafe(node, i); tuple = F(Py_BuildValue)("ici", node, trieletter_get_ith_unsafe(node, i), child); append_tuple(Dump->edges) } // 3. if (node->fail) { tuple = F(Py_BuildValue)("ii", node, node->fail); append_tuple(Dump->fail); } return 1; #undef append_tuple #undef Dump } static PyObject* automaton_dump(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) DumpAux dump; if (automaton->kind == EMPTY) Py_RETURN_NONE; dump.nodes = 0; dump.edges = 0; dump.fail = 0; dump.error = 0; dump.nodes = F(PyList_New)(0); dump.edges = F(PyList_New)(0); dump.fail = F(PyList_New)(0); if (dump.edges == NULL or dump.fail == NULL or dump.nodes == NULL) goto error; trie_traverse(automaton->root, dump_aux, &dump); if (dump.error) goto error; else return F(Py_BuildValue)("OOO", dump.nodes, dump.edges, dump.fail); error: Py_XDECREF(dump.nodes); Py_XDECREF(dump.edges); Py_XDECREF(dump.fail); return NULL; #undef automaton } static PyObject* automaton___sizeof__(PyObject* self, PyObject* args) { #define automaton ((Automaton*)self) Py_ssize_t size = sizeof(Automaton); if (automaton->kind != EMPTY) { if (automaton->stats.version != automaton->version) { get_stats(automaton); } size += automaton->stats.total_size; } return Py_BuildValue("i", size); #undef automaton } #include "Automaton_pickle.c" #define method(name, kind) {#name, (PyCFunction)automaton_##name, kind, automaton_##name##_doc} static PyMethodDef automaton_methods[] = { method(add_word, METH_VARARGS), method(remove_word, METH_VARARGS), method(pop, METH_VARARGS), method(clear, METH_NOARGS), method(exists, METH_VARARGS), method(match, METH_VARARGS), method(longest_prefix, METH_VARARGS), method(get, METH_VARARGS), method(make_automaton, METH_NOARGS), method(find_all, METH_VARARGS), method(iter, METH_VARARGS|METH_KEYWORDS), method(iter_long, METH_VARARGS), 
method(keys, METH_VARARGS), method(values, METH_VARARGS), method(items, METH_VARARGS), method(get_stats, METH_NOARGS), method(dump, METH_NOARGS), method(__reduce__, METH_VARARGS), method(__sizeof__, METH_VARARGS), method(save, METH_VARARGS), {NULL, NULL, 0, NULL} }; #undef method static PySequenceMethods automaton_as_sequence; static PyMemberDef automaton_members[] = { { "kind", T_INT, offsetof(Automaton, kind), READONLY, "Read-only attribute maintained automatically.\nKind for this Automaton instance.\nOne of ahocorasick.EMPTY, TRIE or AHOCORASICK." }, { "store", T_INT, offsetof(Automaton, store), READONLY, "Read-only attribute set when creating an Automaton().\nType of values accepted by this Automaton.\nOne of ahocorasick.STORE_ANY, STORE_INTS or STORE_LEN." }, {NULL} }; static PyTypeObject automaton_type = { PY_OBJECT_HEAD_INIT "ahocorasick.Automaton", /* tp_name */ sizeof(Automaton), /* tp_size */ 0, /* tp_itemsize? */ (destructor)automaton_del, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ automaton_constructor_doc, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ automaton_iterate, /* tp_iter */ 0, /* tp_iternext */ automaton_methods, /* tp_methods */ automaton_members, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ 0, /* tp_init */ 0, /* tp_alloc */ automaton_new, /* tp_new */ }; python-pyahocorasick_1.4.1.orig/Automaton.h0000644000000000000000000000713313642657421016000 0ustar00/* This is part of pyahocorasick Python module. 
Automaton class methods Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #ifndef ahocorasick_Automaton_h_included #define ahocorasick_Automaton_h_included #include "common.h" #include "trie.h" typedef enum { EMPTY = 0, TRIE = 1, AHOCORASICK = 2 } AutomatonKind; static bool check_kind(const int kind); typedef enum { STORE_INTS = 10, STORE_LENGTH = 20, STORE_ANY = 30 } KeysStore; static bool check_store(const int store); typedef enum { KEY_STRING = 100, KEY_SEQUENCE = 200 } KeyType; static bool check_key_type(const int key_type); struct Input { Py_ssize_t wordlen; TRIE_LETTER_TYPE* word; PyObject* py_word; bool is_copy; }; typedef struct AutomatonStatistics { int version; ssize_t nodes_count; ///< total number of nodes ssize_t words_count; ///< len(automaton) ssize_t longest_word; ///< longest word ssize_t links_count; ///< links count ssize_t sizeof_node; ///< size of single node (a C structure) ssize_t total_size; ///< total size in bytes } AutomatonStatistics; typedef struct Automaton { PyObject_HEAD AutomatonKind kind; ///< current kind of automaton KeysStore store; ///< type of values: copy of string, bare integer, python object KeyType key_type; ///< type of keys: strings or integer sequences int count; ///< number of distinct words int longest_word; ///< length of the longest word TrieNode* root; ///< root of a trie int version; ///< current version of automaton, incremented by add_word, clean and make_automaton; used to lazy invalidate iterators AutomatonStatistics stats; ///< statistics } Automaton; /*------------------------------------------------------------------------*/ static bool automaton_unpickle( Automaton* automaton, PyObject* bytes_list, PyObject* values ); static PyObject* automaton_create(void); /* __init__ */ static PyObject* automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs); /* clear() */ static PyObject* automaton_clear(PyObject* self, PyObject* args); /* 
len() */ static ssize_t automaton_len(PyObject* self); /* add_word */ static PyObject* automaton_add_word(PyObject* self, PyObject* args); /* clear() */ static PyObject* automaton_clear(PyObject* self, PyObject* args); /* __contains__ */ static int automaton_contains(PyObject* self, PyObject* args); /* exists() */ static PyObject* automaton_exists(PyObject* self, PyObject* args); /* match() */ static PyObject* automaton_match(PyObject* self, PyObject* args); /* get() */ static PyObject* automaton_get(PyObject* self, PyObject* args); /* make_automaton() */ static PyObject* automaton_make_automaton(PyObject* self, PyObject* args); /* find_all() */ static PyObject* automaton_find_all(PyObject* self, PyObject* args); /* keys() */ static PyObject* automaton_keys(PyObject* self, PyObject* args); /* values() */ static PyObject* automaton_values(PyObject* self, PyObject* args); /* items() */ static PyObject* automaton_items(PyObject* self, PyObject* args); /* iter() */ static PyObject* automaton_iter(PyObject* self, PyObject* args, PyObject* keywds); /* iter_long() */ static PyObject* automaton_iter_long(PyObject* self, PyObject* args); /* get_stats() */ static PyObject* automaton_get_stats(PyObject* self, PyObject* args); #endif python-pyahocorasick_1.4.1.orig/AutomatonItemsIter.c0000644000000000000000000002516313417131365017616 0ustar00/* This is part of pyahocorasick Python module. 
AutomatonItemsIter implementation Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #include "AutomatonItemsIter.h" static PyTypeObject automaton_items_iter_type; typedef struct AutomatonItemsStackItem { LISTITEM_data; struct TrieNode* node; TRIE_LETTER_TYPE letter; size_t depth; } AutomatonItemsStackItem; #define StackItem AutomatonItemsStackItem static PyObject* automaton_items_iter_new( Automaton* automaton, const TRIE_LETTER_TYPE* word, const ssize_t wordlen, const bool use_wildcard, const TRIE_LETTER_TYPE wildcard, const PatternMatchType matchtype ) { AutomatonItemsIter* iter; StackItem* new_item; iter = (AutomatonItemsIter*)F(PyObject_New)(AutomatonItemsIter, &automaton_items_iter_type); if (iter == NULL) return NULL; iter->automaton = automaton; iter->version = automaton->version; iter->state = NULL; iter->type = ITER_KEYS; iter->buffer = NULL; #ifndef AHOCORASICK_UNICODE iter->char_buffer = NULL; #endif iter->pattern = NULL; iter->use_wildcard = use_wildcard; iter->wildcard = wildcard; iter->matchtype = matchtype; list_init(&iter->stack); Py_INCREF((PyObject*)iter->automaton); iter->buffer = memory_alloc((automaton->longest_word + 1) * TRIE_LETTER_SIZE); if (iter->buffer == NULL) { goto no_memory; } #ifndef AHOCORASICK_UNICODE iter->char_buffer = memory_alloc(automaton->longest_word + 1); if (iter->char_buffer == NULL) { goto no_memory; } #endif if (word) { iter->pattern = (TRIE_LETTER_TYPE*)memory_alloc(wordlen * TRIE_LETTER_SIZE); if (UNLIKELY(iter->pattern == NULL)) { goto no_memory; } else { iter->pattern_length = wordlen; memcpy(iter->pattern, word, wordlen * TRIE_LETTER_SIZE); } } else iter->pattern_length = 0; new_item = (StackItem*)list_item_new(sizeof(StackItem)); if (UNLIKELY(new_item == NULL)) { goto no_memory; } new_item->node = automaton->root; new_item->depth = 0; list_push_front(&iter->stack, (ListItem*)new_item); return (PyObject*)iter; no_memory: Py_DECREF((PyObject*)iter); 
PyErr_NoMemory(); return NULL; } #define iter ((AutomatonItemsIter*)self) static void automaton_items_iter_del(PyObject* self) { memory_safefree(iter->buffer); memory_safefree(iter->pattern); #ifndef AHOCORASICK_UNICODE memory_safefree(iter->char_buffer); #endif list_delete(&iter->stack); Py_DECREF(iter->automaton); PyObject_Del(self); } static PyObject* automaton_items_iter_iter(PyObject* self) { Py_INCREF(self); return self; } static PyObject* automaton_items_iter_next(PyObject* self) { bool output; TrieNode* node; TRIE_LETTER_TYPE letter; size_t depth; if (UNLIKELY(iter->version != iter->automaton->version)) { PyErr_SetString(PyExc_ValueError, "The underlying automaton has changed: this iterator is no longer valid."); return NULL; } while (true) { StackItem* top = (StackItem*)list_pop_first(&iter->stack); if (top == NULL) return NULL; /* Stop iteration */ if (top->node == NULL) { memory_free(top); return NULL; /* Stop iteration */ } node = top->node; letter = top->letter; depth = top->depth; memory_free(top); if (iter->matchtype != MATCH_AT_LEAST_PREFIX and depth > iter->pattern_length) continue; switch (iter->matchtype) { case MATCH_EXACT_LENGTH: output = (depth == iter->pattern_length); break; case MATCH_AT_MOST_PREFIX: output = (depth <= iter->pattern_length); break; case MATCH_AT_LEAST_PREFIX: default: output = (depth >= iter->pattern_length); break; } iter->state = node; iter->letter = letter; if ((depth >= iter->pattern_length) or (iter->use_wildcard and iter->pattern[depth] == iter->wildcard)) { // process all const int n = iter->state->n; int i; for (i=0; i < n; i++) { StackItem* new_item = (StackItem*)list_item_new(sizeof(StackItem)); if (UNLIKELY(new_item == NULL)) { PyErr_NoMemory(); return NULL; } new_item->node = trienode_get_ith_unsafe(iter->state, i); new_item->letter = trieletter_get_ith_unsafe(iter->state, i); new_item->depth = depth + 1; list_push_front(&iter->stack, (ListItem*)new_item); } } else { // process single letter TrieNode* node = 
trienode_get_next(iter->state, iter->pattern[depth]); if (node) { StackItem* new_item = (StackItem*)list_item_new(sizeof(StackItem)); if (UNLIKELY(new_item == NULL)) { PyErr_NoMemory(); return NULL; } new_item->node = node; new_item->letter = iter->pattern[depth]; new_item->depth = depth + 1; list_push_front(&iter->stack, (ListItem*)new_item); } } if (iter->type != ITER_VALUES) { // update keys when needed iter->buffer[depth] = iter->letter; #ifndef AHOCORASICK_UNICODE iter->char_buffer[depth] = (char)iter->letter; #endif } if (output and iter->state->eow) { PyObject* val; switch (iter->type) { case ITER_KEYS: #if defined PEP393_UNICODE return F(PyUnicode_FromKindAndData)(PyUnicode_4BYTE_KIND, (void*)(iter->buffer + 1), depth); #elif defined AHOCORASICK_UNICODE return PyUnicode_FromUnicode((Py_UNICODE*)(iter->buffer + 1), depth); #else return PyBytes_FromStringAndSize(iter->char_buffer + 1, depth); #endif case ITER_VALUES: switch (iter->automaton->store) { case STORE_ANY: val = iter->state->output.object; Py_INCREF(val); break; case STORE_LENGTH: case STORE_INTS: return F(Py_BuildValue)("i", iter->state->output.integer); default: PyErr_SetString(PyExc_SystemError, "Incorrect 'store' attribute."); return NULL; } return val; case ITER_ITEMS: switch (iter->automaton->store) { case STORE_ANY: return F(Py_BuildValue)( #ifdef PY3K #ifdef AHOCORASICK_UNICODE "(u#O)", /*key*/ iter->buffer + 1, depth, #else "(y#O)", /*key*/ iter->buffer + 1, depth, #endif #else "(s#O)", /*key*/ iter->char_buffer + 1, depth, #endif /*val*/ iter->state->output.object ); case STORE_LENGTH: case STORE_INTS: return F(Py_BuildValue)( #ifdef PY3K #ifdef AHOCORASICK_UNICODE "(u#i)", /*key*/ iter->buffer + 1, depth, #else "(y#i)", /*key*/ iter->buffer + 1, depth, #endif #else "(s#i)", /*key*/ iter->char_buffer + 1, depth, #endif /*val*/ iter->state->output.integer ); default: PyErr_SetString(PyExc_SystemError, "Incorrect 'store' attribute."); return NULL; } // switch } } } } #undef StackItem #undef 
iter static PyTypeObject automaton_items_iter_type = { PY_OBJECT_HEAD_INIT "AutomatonItemsIter", /* tp_name */ sizeof(AutomatonItemsIter), /* tp_size */ 0, /* tp_itemsize? */ (destructor)automaton_items_iter_del, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ 0, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ automaton_items_iter_iter, /* tp_iter */ automaton_items_iter_next, /* tp_iternext */ 0, /* tp_methods */ 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ 0, /* tp_init */ 0, /* tp_alloc */ 0, /* tp_new */ }; python-pyahocorasick_1.4.1.orig/AutomatonItemsIter.h0000644000000000000000000000336513417131365017623 0ustar00/* This is part of pyahocorasick Python module. AutomatonItemsIter const, struct & methods declarations. This class implements iterator walk over trie, that returns words and associated values. Object of this class is returned by 'keys'/'values'/'items' methods of Automaton class. 
Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #ifndef ahocorasick_AutomatonItemsIter_h_included #define ahocorasick_AutomatonItemsIter_h_included #include "common.h" #include "Automaton.h" typedef enum { ITER_KEYS, ITER_VALUES, ITER_ITEMS } ItemsType; typedef enum { MATCH_EXACT_LENGTH, MATCH_AT_MOST_PREFIX, MATCH_AT_LEAST_PREFIX } PatternMatchType; typedef struct AutomatonItemsIter { PyObject_HEAD Automaton* automaton; int version; ///< automaton version TrieNode* state; ///< current automaton node TRIE_LETTER_TYPE letter; ///< current letter List stack; ///< stack ItemsType type; ///< type of iterator (KEYS/VALUES/ITEMS) TRIE_LETTER_TYPE* buffer; ///< buffer to construct key representation #ifndef AHOCORASICK_UNICODE char *char_buffer; #endif size_t pattern_length; TRIE_LETTER_TYPE* pattern; ///< pattern bool use_wildcard; TRIE_LETTER_TYPE wildcard; ///< wildcard char PatternMatchType matchtype; ///< how pattern have to be handled } AutomatonItemsIter; /* new() */ static PyObject* automaton_items_iter_new( Automaton* automaton, const TRIE_LETTER_TYPE* word, const ssize_t wordlen, const bool use_wildcard, const TRIE_LETTER_TYPE wildcard, const PatternMatchType matchtype ); #endif python-pyahocorasick_1.4.1.orig/AutomatonSearchIter.c0000644000000000000000000002650214002636726017742 0ustar00/* This is part of pyahocorasick Python module. 
AutomatonSearchIter implementation Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #include "AutomatonSearchIter.h" #include static PyTypeObject automaton_search_iter_type; #ifdef VARIABLE_LEN_CHARCODES static int automaton_search_iter_substring_index(struct Input* input, int position) { TRIE_LETTER_TYPE letter; int index = 0; int i; for (i=0; i < position; i++) { letter = input->word[index]; if (UNLIKELY(Py_UNICODE_IS_SURROGATE(letter))) { if (UNLIKELY(!Py_UNICODE_IS_HIGH_SURROGATE(letter))) { PyErr_Format(PyExc_ValueError, "Malformed UCS-2 string: expected a high surrogate at %d, got %04x", index, letter); return -1; } index += 1; if (index >= input->wordlen) { PyErr_Format(PyExc_ValueError, "Malformed UCS-2 string: unexpected end of string"); return -1; } letter = input->word[index]; if (UNLIKELY(!Py_UNICODE_IS_LOW_SURROGATE(letter))) { PyErr_Format(PyExc_ValueError, "Malformed UCS-2 string: expected a low surrogate at %d, got %04x", index, letter); return -1; } index += 1; } else { index += 1; } } return index; } #endif // VARIABLE_LEN_CHARCODES static PyObject* automaton_search_iter_new( Automaton* automaton, PyObject* object, int start, int end, bool ignore_white_space ) { AutomatonSearchIter* iter; #ifdef VARIABLE_LEN_CHARCODES int tmp; #endif iter = (AutomatonSearchIter*)F(PyObject_New)(AutomatonSearchIter, &automaton_search_iter_type); if (iter == NULL) return NULL; iter->automaton = automaton; iter->version = automaton->version; iter->state = automaton->root; iter->output= NULL; iter->shift = 0; iter->ignore_white_space = ignore_white_space; init_input(&iter->input); Py_INCREF(iter->automaton); if (!prepare_input((PyObject*)automaton, object, &iter->input)) { goto error; } #ifdef VARIABLE_LEN_CHARCODES if (automaton->key_type == KEY_STRING) { tmp = automaton_search_iter_substring_index(&iter->input, start); if (tmp >= 0) { iter->index = tmp - 1; iter->position = start - 1; } else { goto 
error; } tmp = automaton_search_iter_substring_index(&iter->input, end); if (tmp >= 0) { iter->end = end; } else { goto error; } iter->expected = pyaho_UCS2_Any; } else { iter->index = start - 1; iter->end = end; } #else // -1 because the first instruction in next() increments index iter->index = start - 1; iter->end = end; #endif return (PyObject*)iter; error: Py_DECREF(iter); return NULL; } #define iter ((AutomatonSearchIter*)self) static void automaton_search_iter_del(PyObject* self) { Py_DECREF(iter->automaton); destroy_input(&iter->input); PyObject_Del(self); } static PyObject* automaton_search_iter_iter(PyObject* self) { Py_INCREF(self); return self; } enum { OutputValue, OutputNone, OutputError }; static int automaton_build_output(PyObject* self, PyObject** result) { TrieNode* node; Py_ssize_t idx = 0; while (iter->output && !iter->output->eow) { iter->output = iter->output->fail; } if (iter->output) { node = iter->output; iter->output = iter->output->fail; #ifdef VARIABLE_LEN_CHARCODES idx = iter->shift; if (iter->automaton->key_type == KEY_STRING) { idx += iter->position; } else { idx += iter->index; } #else idx = iter->index + iter->shift; #endif switch (iter->automaton->store) { case STORE_LENGTH: case STORE_INTS: *result = F(Py_BuildValue)("ii", idx, node->output.integer); return OutputValue; case STORE_ANY: *result = F(Py_BuildValue)("iO", idx, node->output.object); return OutputValue; default: PyErr_SetString(PyExc_ValueError, "inconsistent internal state!"); return OutputError; } } return OutputNone; } #ifdef VARIABLE_LEN_CHARCODES static bool automaton_search_iter_advance_index(PyObject* self) { TRIE_LETTER_TYPE letter; iter->index += 1; if (iter->automaton->key_type == KEY_SEQUENCE) { return true; } letter = iter->input.word[iter->index]; if (iter->expected == pyaho_UCS2_Any) { if (UNLIKELY(Py_UNICODE_IS_SURROGATE(letter))) { if (LIKELY(Py_UNICODE_IS_HIGH_SURROGATE(letter))) { iter->expected = pyaho_UCS2_LowSurrogate; } else { 
PyErr_Format(PyExc_ValueError, "Malformed UCS-2 string: expected a high surrogate at %d, got %04x", iter->index, letter); return false; } } else { iter->position += 1; } } else { assert(iter->expected == pyaho_UCS2_LowSurrogate); if (LIKELY(Py_UNICODE_IS_LOW_SURROGATE(letter))) { iter->expected = pyaho_UCS2_Any; iter->position += 1; } else { PyErr_Format(PyExc_ValueError, "Malformed UCS-2 string: expected a low surrogate at %d, got %04x", iter->index, letter); return false; } } return true; } #endif static PyObject* automaton_search_iter_next(PyObject* self) { PyObject* output; if (iter->version != iter->automaton->version) { PyErr_SetString(PyExc_ValueError, "underlaying automaton has changed, iterator is not valid anymore"); return NULL; } return_output: switch (automaton_build_output(self, &output)) { case OutputValue: return output; case OutputNone: break; case OutputError: return NULL; } #ifdef VARIABLE_LEN_CHARCODES if (!automaton_search_iter_advance_index(self)) { return NULL; } #else iter->index += 1; if (iter->ignore_white_space) { while ((iter->index < iter->end) and iswspace(iter->input.word[iter->index])) { iter->index += 1; } } #endif while (iter->index < iter->end) { // process single char iter->state = ahocorasick_next( iter->state, iter->automaton->root, iter->input.word[iter->index] ); ASSERT(iter->state); iter->output = iter->state; goto return_output; #ifdef VARIABLE_LEN_CHARCODES if (!automaton_search_iter_advance_index(self)) { return NULL; } #else iter->index += 1; #endif } // while return NULL; // StopIteration } static PyObject* automaton_search_iter_set(PyObject* self, PyObject* args) { PyObject* object; PyObject* flag; Py_ssize_t position; bool reset; struct Input new_input; // first argument - required string or buffer object = F(PyTuple_GetItem)(args, 0); if (object) { init_input(&new_input); if (!prepare_input((PyObject*)iter->automaton, object, &new_input)) { return NULL; } } else return NULL; // second argument - optional bool flag = 
F(PyTuple_GetItem)(args, 1); if (flag) { switch (PyObject_IsTrue(flag)) { case 0: reset = false; break; case 1: reset = true; break; default: return NULL; } } else { PyErr_Clear(); reset = false; } destroy_input(&iter->input); assign_input(&iter->input, &new_input); if (!reset) { position = iter->index; #ifdef VARIABLE_LEN_CHARCODES if (iter->automaton->key_type == KEY_STRING) { position = iter->position; } #endif iter->shift += (position >= 0) ? position : 0; } iter->index = -1; iter->end = new_input.wordlen; if (reset) { iter->state = iter->automaton->root; iter->shift = 0; iter->output = NULL; #ifdef VARIABLE_LEN_CHARCODES iter->position = -1; iter->expected = pyaho_UCS2_Any; #endif } Py_RETURN_NONE; } #undef iter #define method(name, kind) {#name, automaton_search_iter_##name, kind, automaton_search_iter_##name##_doc} static PyMethodDef automaton_search_iter_methods[] = { method(set, METH_VARARGS), {NULL, NULL, 0, NULL} }; #undef method static PyTypeObject automaton_search_iter_type = { PY_OBJECT_HEAD_INIT "ahocorasick.AutomatonSearchIter", /* tp_name */ sizeof(AutomatonSearchIter), /* tp_size */ 0, /* tp_itemsize? 
*/ (destructor)automaton_search_iter_del, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ automaton_search_iter_doc, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ automaton_search_iter_iter, /* tp_iter */ automaton_search_iter_next, /* tp_iternext */ automaton_search_iter_methods, /* tp_methods */ 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ 0, /* tp_init */ 0, /* tp_alloc */ 0, /* tp_new */ }; python-pyahocorasick_1.4.1.orig/AutomatonSearchIter.h0000644000000000000000000000301113407734122017732 0ustar00/* This is part of pyahocorasick Python module. AutomatonSearchIter const, struct & methods declarations. This class implements iterator walk over Aho-Corasick automaton. Object of this class is returned by 'iter' method of Automaton class. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #ifndef ahocorasick_AutomatonSearchIter_h_included #define ahocorasick_AutomatonSearchIter_h_included #include "common.h" #include "Automaton.h" #ifdef VARIABLE_LEN_CHARCODES typedef enum { pyaho_UCS2_Any, pyaho_UCS2_LowSurrogate } UCS2ExpectedChar; #endif typedef struct AutomatonSearchIter { PyObject_HEAD Automaton* automaton; int version; ///< automaton version struct Input input; ///< input string TrieNode* state; ///< current state of automaton TrieNode* output; ///< current node, i.e. 
yielded value Py_ssize_t index; ///< current index in data Py_ssize_t shift; ///< shift + index => output index Py_ssize_t end; ///< end index bool ignore_white_space; ///< ignore input string white spaces using iswspace() function #ifdef VARIABLE_LEN_CHARCODES int position; ///< position in string UCS2ExpectedChar expected; #endif } AutomatonSearchIter; static PyObject* automaton_search_iter_new( Automaton* automaton, PyObject* object, int start, int end, bool ignore_white_space ); #endif python-pyahocorasick_1.4.1.orig/AutomatonSearchIterLong.c0000644000000000000000000001747214002631330020552 0ustar00/* This is part of pyahocorasick Python module. AutomatonSearchIterLong implementation Author : Wojciech Muła, wojciech_mula@poczta.onet.pl License : 3-clauses BSD (see LICENSE) */ #include "AutomatonSearchIterLong.h" static PyTypeObject automaton_search_iter_long_type; static PyObject* automaton_search_iter_long_new( Automaton* automaton, PyObject* object, int start, int end ) { AutomatonSearchIterLong* iter; iter = (AutomatonSearchIterLong*)PyObject_New(AutomatonSearchIterLong, &automaton_search_iter_long_type); if (iter == NULL) return NULL; iter->automaton = automaton; iter->version = automaton->version; iter->object = object; iter->state = automaton->root; iter->shift = 0; iter->index = start - 1; // -1 because first instruction in next() increments index iter->end = end; iter->last_index = -1; iter->last_node = NULL; Py_INCREF(iter->automaton); Py_INCREF(iter->object); init_input(&iter->input); if (!prepare_input((PyObject*)automaton, object, &iter->input)) { goto error; } return (PyObject*)iter; error: Py_DECREF(iter); return NULL; } #define iter ((AutomatonSearchIterLong*)self) static void automaton_search_iter_long_del(PyObject* self) { Py_DECREF(iter->automaton); Py_DECREF(iter->object); destroy_input(&iter->input); PyObject_Del(self); } static PyObject* automaton_search_iter_long_iter(PyObject* self) { Py_INCREF(self); return self; } static PyObject* 
automaton_build_output_iter_long(PyObject* self) { switch (iter->automaton->store) { case STORE_LENGTH: case STORE_INTS: return Py_BuildValue("ii", iter->shift + iter->last_index, iter->last_node->output.integer); case STORE_ANY: return Py_BuildValue("iO", iter->shift + iter->last_index, iter->last_node->output.object); default: PyErr_SetString(PyExc_ValueError, "inconsistent internal state!"); return NULL; } } static PyObject* automaton_search_iter_long_next(PyObject* self) { PyObject* output; TrieNode* next; if (iter->version != iter->automaton->version) { PyErr_SetString(PyExc_ValueError, "underlaying automaton has changed, iterator is not valid anymore"); return NULL; } return_output: if (iter->last_node) { output = automaton_build_output_iter_long(self); // start over, as we don't want overlapped results // Note: this leads to quadratic complexity in the worst case iter->state = iter->automaton->root; iter->index = iter->last_index; iter->last_node = NULL; iter->last_index = -1; return output; } iter->index += 1; while (iter->index < iter->end) { next = trienode_get_next(iter->state, iter->input.word[iter->index]); if (next) { if (next->eow) { // save the last node on the path iter->last_node = next; iter->last_index = iter->index; } iter->state = next; iter->index += 1; } else { if (iter->last_node) { goto return_output; } else { while (true) { iter->state = iter->state->fail; if (iter->state == NULL) { iter->state = iter->automaton->root; iter->index += 1; break; } else if (trienode_get_next(iter->state, iter->input.word[iter->index])) { break; } } } } } // while if (iter->last_node) { goto return_output; } return NULL; // StopIteration } static PyObject* automaton_search_iter_long_set(PyObject* self, PyObject* args) { PyObject* object; PyObject* flag; bool reset; struct Input new_input; // first argument - required string or buffer object = PyTuple_GetItem(args, 0); if (object) { init_input(&new_input); if (!prepare_input((PyObject*)iter->automaton, object, 
&new_input)) { return NULL; } } else return NULL; // second argument - optional bool flag = PyTuple_GetItem(args, 1); if (flag) { switch (PyObject_IsTrue(flag)) { case 0: reset = false; break; case 1: reset = true; break; default: return NULL; } } else { PyErr_Clear(); reset = false; } // update internal state Py_XDECREF(iter->object); Py_INCREF(object); iter->object = object; destroy_input(&iter->input); assign_input(&iter->input, &new_input); if (!reset) iter->shift += (iter->index >= 0) ? iter->index : 0; iter->index = -1; iter->end = new_input.wordlen; if (reset) { iter->state = iter->automaton->root; iter->shift = 0; iter->last_node = NULL; iter->last_index = -1; } Py_RETURN_NONE; } #undef iter #define method(name, kind) {#name, automaton_search_iter_long_##name, kind, ""} static PyMethodDef automaton_search_iter_long_methods[] = { method(set, METH_VARARGS), {NULL, NULL, 0, NULL} }; #undef method static PyTypeObject automaton_search_iter_long_type = { PY_OBJECT_HEAD_INIT "ahocorasick.AutomatonSearchIterLong", /* tp_name */ sizeof(AutomatonSearchIterLong), /* tp_size */ 0, /* tp_itemsize? 
*/ (destructor)automaton_search_iter_long_del, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ 0, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ automaton_search_iter_long_iter, /* tp_iter */ automaton_search_iter_long_next, /* tp_iternext */ automaton_search_iter_long_methods, /* tp_methods */ 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ 0, /* tp_init */ 0, /* tp_alloc */ 0, /* tp_new */ }; python-pyahocorasick_1.4.1.orig/AutomatonSearchIterLong.h0000644000000000000000000000233213642670543020566 0ustar00/* This is part of pyahocorasick Python module. AutomatonSearchIterLong const, struct & methods declarations. This class implements iterator walk over Aho-Corasick automaton. Object of this class is returnd by 'iter' method of Automaton class. 
Author : Wojciech Muła, wojciech_mula@poczta.onet.pl License : 3-clauses BSD (see LICENSE) */ #ifndef ahocorasick_AutomatonSearchIterLong_h_included #define ahocorasick_AutomatonSearchIterLong_h_included #include "common.h" #include "Automaton.h" typedef struct AutomatonSearchIterLong { PyObject_HEAD Automaton* automaton; int version; ///< automaton version PyObject* object; ///< unicode or buffer struct Input input; ///< input string TrieNode* state; ///< current state of automaton TrieNode* last_node; ///< last node on trie path int last_index; int index; ///< current index in data int shift; ///< shift + index => output index int end; ///< end index } AutomatonSearchIterLong; static PyObject* automaton_search_iter_long_new( Automaton* automaton, PyObject* object, int start, int end ); #endif python-pyahocorasick_1.4.1.orig/Automaton_pickle.c0000644000000000000000000003161713417131365017320 0ustar00/* This is part of pyahocorasick Python module. Implementation of pickling/unpickling routines for Automaton class Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ /* Pickling (automaton___reduce__): 1. assign sequential numbers to nodes in order to replace address with these numbers (pickle_dump_replace_fail_with_id) 2. save in array all nodes data in the same order as numbers, also replace fail and next links with numbers; collect on a list all values (python objects) stored in a trie (pickle_dump_save); Before we start, all nodes of trie are visited and total size of pickled data is calculated. If it is small enough (less than given threshold), all data is saved in a single byte array. Otherwise, data is saved in several byte arrays. In either case, the format of byte array is the same: * 8 first bytes is number of nodes stored in this chunk of memory * the number if followed by some raw data. When there is just one byte array, it's size is fit to needs. 
If data is split, then each array has exactly the same size of bytes, but not all might be used (only the last array is fit). 3. clean up (pickle_dump_undo_replace or pickle_dump_revert_replace) Unpickling (automaton_unpickle, called in Automaton constructor) 1. load all nodes from array 2. make number->node lookup table 3. replace numbers stored in fail and next pointers with real pointers, reassign python objects as values */ #include #include "src/pickle/pickle_data.c" typedef struct NodeID { TrieNode* fail; ///< original fail value Py_uintptr_t id; ///< id } NodeID; typedef struct DumpState { Py_uintptr_t id; ///< next id size_t total_size; ///< number of nodes TrieNode* failed_on; ///< if fail while numerating, save node in order /// to revert changes made in trie } DumpState; static size_t get_pickled_size(TrieNode* node) { ASSERT(node != NULL); return PICKLE_TRIENODE_SIZE + node->n * sizeof(Pair); } // replace fail with pairs (fail, id) static int pickle_dump_replace_fail_with_id(TrieNode* node, const int depth, void* extra) { NodeID* repl; ASSERT(sizeof(NodeID*) <= sizeof(TrieNode*)); #define state ((DumpState*)extra) repl = (NodeID*)memory_alloc(sizeof(NodeID)); if (LIKELY(repl != NULL)) { state->id += 1; state->total_size += get_pickled_size(node); repl->id = state->id; repl->fail = node->fail; node->fail = (TrieNode*)repl; return 1; } else { // error, revert is needed! 
state->failed_on = node; return 0; } #undef state } // revert changes in trie (in case of error) static int pickle_dump_revert_replace(TrieNode* node, const int depth, void* extra) { #define state ((DumpState*)extra) if (state->failed_on != node) { NodeID* repl = (NodeID*)(node->fail); node->fail = repl->fail; memory_free(repl); return 1; } else return 0; #undef state } // revert changes in trie static int pickle_dump_undo_replace(TrieNode* node, const int depth, void* extra) { #define state ((DumpState*)extra) NodeID* repl = (NodeID*)(node->fail); node->fail = repl->fail; memory_free(repl); return 1; #undef state } static int pickle_dump_save(TrieNode* node, const int depth, void* extra) { #define self ((PickleData*)extra) #define NODEID(object) ((NodeID*)((TrieNode*)object)->fail) TrieNode* dump; TrieNode* tmp; Pair* arr; unsigned i; size_t size; size = get_pickled_size(node); if (UNLIKELY(self->top + size > self->size)) { if (UNLIKELY(!pickle_data__add_next_buffer(self))) { self->error = true; return 0; } } dump = (TrieNode*)(self->data + self->top); // we do not save the last pointer in array arr = (Pair*)(self->data + self->top + PICKLE_TRIENODE_SIZE); // append the python object to the list if (node->eow and self->values) { if (PyList_Append(self->values, node->output.object) == -1) { self->error = true; return 0; } } // save node data if (self->values) dump->output.integer = 0; else dump->output.integer = node->output.integer; dump->n = node->n; dump->eow = node->eow; tmp = NODEID(node)->fail; if (tmp) dump->fail = (TrieNode*)(NODEID(tmp)->id); else dump->fail = NULL; // save array of pointers for (i=0; i < node->n; i++) { TrieNode* child = trienode_get_ith_unsafe(node, i); ASSERT(child); arr[i].child = (TrieNode*)(NODEID(child)->id); // save the id of child node arr[i].letter = trieletter_get_ith_unsafe(node, i); } self->top += size; (*self->count) += 1; return 1; #undef NODEID #undef self } static PyObject* automaton___reduce__(PyObject* self, PyObject* 
args) { #define automaton ((Automaton*)self) #define MB ((size_t)(1024*1024)) const size_t array_size = 16*MB; DumpState state; PickleData data; PyObject* tuple; // 0. for an empty automaton do nothing if (automaton->count == 0) { // the class constructor feed with an empty argument build an empty automaton return F(Py_BuildValue)("O()", Py_TYPE(self)); } // 1. numerate nodes state.id = 0; state.failed_on = NULL; state.total_size = 0; trie_traverse(automaton->root, pickle_dump_replace_fail_with_id, &state); if (state.failed_on) { // revert changes (partial) trie_traverse(automaton->root, pickle_dump_revert_replace, &state); // and set error PyErr_NoMemory(); return NULL; } // 2. gather data if (!pickle_data__init(&data, automaton->store, state.total_size, array_size)) goto exception; trie_traverse(automaton->root, pickle_dump_save, &data); if (UNLIKELY(data.error)) { goto exception; } if (UNLIKELY(!pickle_data__shrink_last_buffer(&data))) { goto exception; } if (automaton->store != STORE_ANY) { // always pickle a Python object data.values = Py_None; Py_INCREF(data.values); } /* 3: save tuple: * binary data * automaton->kind * automaton->store * automaton->key_type * automaton->count * automaton->longest_word * list of values */ tuple = F(Py_BuildValue)( "O(OiiiiiO)", Py_TYPE(self), data.bytes_list, automaton->kind, automaton->store, automaton->key_type, automaton->count, automaton->longest_word, data.values ); if (data.values == Py_None) { data.values = NULL; } if (UNLIKELY(tuple == NULL)) { goto exception; } // revert all changes trie_traverse(automaton->root, pickle_dump_undo_replace, NULL); return tuple; exception: // revert all changes trie_traverse(automaton->root, pickle_dump_undo_replace, NULL); // and free memory pickle_data__cleanup(&data); return NULL; #undef automaton } static bool automaton_unpickle__validate_bytes_list(PyObject* bytes_list, size_t* result) { PyObject* bytes; Py_ssize_t k; Py_ssize_t nodes_count; const uint8_t* data; size_t count = 0; 
// calculate the total number of nodes (and do validate data at the same time) for (k=0; k < PyList_GET_SIZE(bytes_list); k++) { bytes = PyList_GET_ITEM(bytes_list, k); if (UNLIKELY(!F(PyBytes_CheckExact)(bytes))) { PyErr_Format(PyExc_ValueError, "Item #%d on the bytes list is not a bytes object", k); return false; } data = (const uint8_t*)PyBytes_AS_STRING(bytes); nodes_count = *((Py_ssize_t*)data); if (UNLIKELY(nodes_count <= 0)) { PyErr_Format(PyExc_ValueError, "Nodes count for item #%d on the bytes list is not positive (%d)", k, nodes_count); return false; } count += nodes_count; } *result = count; return true; } static bool automaton_unpickle( Automaton* automaton, PyObject* bytes_list, PyObject* values ) { TrieNode** id2node = NULL; TrieNode* node; TrieNode* dump; Pair* next; PyObject* bytes; PyObject* value; Py_ssize_t nodes_count; Py_ssize_t i; size_t id; const uint8_t* data; const uint8_t* ptr; const uint8_t* end; size_t k; size_t j; size_t object_idx = 0; size_t index; size_t count; if (!automaton_unpickle__validate_bytes_list(bytes_list, &count)) { goto exception; } id2node = (TrieNode**)memory_alloc((count+1) * sizeof(TrieNode*)); if (UNLIKELY(id2node == NULL)) { goto no_mem; } // 1. 
make nodes id = 1; for (k=0; k < PyList_GET_SIZE(bytes_list); k++) { bytes = PyList_GET_ITEM(bytes_list, k); data = (const uint8_t*)PyBytes_AS_STRING(bytes); nodes_count = *((Py_ssize_t*)data); ptr = data + PICKLE_CHUNK_COUNTER_SIZE; end = ptr + PyBytes_GET_SIZE(bytes) - PICKLE_CHUNK_COUNTER_SIZE; for (i=0; i < nodes_count; i++) { if (UNLIKELY(ptr + PICKLE_TRIENODE_SIZE > end)) { PyErr_Format(PyExc_ValueError, "Data truncated [parsing header of node #%d]: " "chunk #%d @ offset %lu, expected at least %lu bytes", i, k, ptr - data, PICKLE_TRIENODE_SIZE); goto exception; } dump = (TrieNode*)(ptr); node = (TrieNode*)memory_alloc(sizeof(TrieNode)); if (LIKELY(node != NULL)) { node->output = dump->output; node->fail = dump->fail; node->n = dump->n; node->eow = dump->eow; node->next = NULL; } else goto no_mem; ptr += PICKLE_TRIENODE_SIZE; id2node[id++] = node; if (node->n > 0) { if (UNLIKELY(ptr + node->n * sizeof(Pair) > end)) { PyErr_Format(PyExc_ValueError, "Data truncated [parsing children of node #%d]: " "chunk #%d @ offset %lu, expected at least %ld bytes", i, k, ptr - data + i, node->n * sizeof(Pair)); goto exception; } node->next = (Pair*)memory_alloc(node->n * sizeof(Pair)); if (UNLIKELY(node->next == NULL)) { goto no_mem; } next = (Pair*)(ptr); for (j=0; j < node->n; j++) { node->next[j] = next[j]; } ptr += node->n * sizeof(Pair); } } } // 2. 
restore pointers and references to pyobjects for (i=1; i < id; i++) { node = id2node[i]; // references if (values and node->eow) { value = F(PyList_GetItem)(values, object_idx); if (value) { Py_INCREF(value); node->output.object = value; object_idx += 1; } else goto exception; } // pointers if (node->fail) { index = (size_t)(node->fail); if (LIKELY(index < count + 1)) { node->fail = id2node[index]; } else { PyErr_Format(PyExc_ValueError, "Node #%lu malformed: the fail link points to node #%lu, while there are %lu nodes", i - 1, index, count); goto exception; } } for (j=0; j < node->n; j++) { index = (size_t)(node->next[j].child); if (LIKELY(index < count + 1)) { node->next[j].child = id2node[index]; } else { PyErr_Format(PyExc_ValueError, "Node #%lu malformed: next link #%lu points to node #%lu, while there are %lu nodes", i - 1, j, index, count); goto exception; } } } automaton->root = id2node[1]; memory_free(id2node); return 1; no_mem: PyErr_NoMemory(); exception: // free memory if (id2node) { for (i=1; i < id; i++) { trienode_free(id2node[i]); } memory_free(id2node); } // If there is value list and some of its items were already // referenced, release them if (values) { for (i=0; i < object_idx; i++) { Py_XDECREF(F(PyList_GetItem)(values, i)); } } return 0; } python-pyahocorasick_1.4.1.orig/CHANGELOG.rst0000644000000000000000000001122714004050654015664 0ustar001.4.0 (2020-01-26) -------------------------------------------------- - Add method ``iter_long``, that performs the modified Aho-Corasick search procedure matching the longest words from set. 1.4.0 (2019-01-24) -------------------------------------------------- - Change internal trie representation thanks to that performance of common operation is 1.5 - 2.5 times faster. Details are presented in https://github.com/WojciechMula/pyahocorasick/pull/107 Warning: this change breaks compatibility of pickle and ``save()`` format, this won't be possible to load files created in the previous version. 
1.3.0 (2018-12-20) -------------------------------------------------- - Add alternative pickling mechanism ``save()``/``load``, which requires less memory than the standard pickle solution (issue #102) 1.2.0 (2018-12-13) -------------------------------------------------- - Add methods ``remove_word()``/``pop()`` (issue #79) 1.1.13.1 (2018-12-11) -------------------------------------------------- - Fix manifest file 1.1.13 (2018-12-11) -------------------------------------------------- - Fix pickling of large automatons (issue #50); The fix wouldn't be possible without great help and patience of all people involved: * **Emil Stenström** (@EmilStenstrom) * **David Woakes** (@woakesd) * **@Dobatymo** * **Philippe Ombredanne** (@pombredanne) The fix wouldn't also be possible without **Daniel Lemire** (@lemire), who gave me access to decent machines and I was able to test fixes on large data. 1.1.12 (2018-12-03) -------------------------------------------------- - Add support for tuples of ints to ``iter()`` (by **Frankie Robertson**) 1.1.11 (2018-12-02) -------------------------------------------------- - Reworked pickling code - Fix pickling crash (issue #68) - Fix pickling memory leak (issue #62) - Fix documentation (by **Philippe Ombredanne**) - Fix several latent bugs and problems 1.1.10 (2018-10-25) -------------------------------------------------- - Fix handling of unicode in Python 3 (by **Frankie Robertson**) 1.1.9 (2018-10-25) -------------------------------------------------- - Fix documentation typos (by **Sylvain Zimmer**) - Add ability to skip white spaces in the input strings (by **@gladtosee**; issue #84) 1.1.8 (2018-04-25) -------------------------------------------------- - Fix memory leak (issue #81) - Add link to Python implementation from Abusix (by **Frederik Petersen**) - Fix unit tests (by **Renat Nasyrov**) 1.1.7 (2018-02-23) -------------------------------------------------- - Minor documentation fixes (by **Edward Betts**) - Some internal 
improvements 1.1.6 (2017-11-27) -------------------------------------------------- - Fix PyPI building (by **Philippe Ombredanne**; issue #71) 1.1.5 (2017-11-22) -------------------------------------------------- - Fix handling of UCS2-encoded string (issue #53) - Fix pickling error - Several minor fixes and corrections to documentation and infrastructure (thanks to: **Jan Fan**, **@blackelk**, **David Woakes** and **Xiaopeng Xu**) 1.1.4 (2016-08-08) -------------------------------------------------- - Fix URL in documentation (by **Philippe Ombredanne**) 1.1.3 (2016-08-07) -------------------------------------------------- - Rewrite documentation and fix PyPI presentation (by **Philippe Ombredanne**) 1.1.2 (2016-08-06) -------------------------------------------------- - Rewrite documentation continued (by **Philippe Ombredanne**) 1.1.1 (2016-05-29) -------------------------------------------------- - Rewrite documentation, setup readthedocs.io__ page (by **Philippe Ombredanne**) - Make the module compilable in Windows using MSVC compiler (issue #11) - Fix ``get()`` method that crashed when trie was empty (issue #22) - Fix pickling problem (issue #26) - Add ``__sizeof__()`` method (issue #25) __ https://pyahocorasick.readthedocs.io/en/latest/ 1.1.0 (2016-04-26) -------------------------------------------------- - Support for Python 2 (with help from **Philippe Ombredanne**; issue #12) 1.0.3 (2016-04-24) -------------------------------------------------- - Fix memory leak (by **Jonathan Grs**; issue #9) 1.0.2 (2016-04-23) -------------------------------------------------- - Fix range parsing (by **Jonathan Grs**; issue #10) - Fix pickling on 64-bit machines (issue #20) - Update documentation regarding wildcards 1.0.1 (2016-04-19) -------------------------------------------------- - Fix Unicode handling during automaton build (issue #8) - Fix some 64-bit code issues (issue #5) - Fix documentation (thanks to **Pastafarianist**) 1.0.0 (2014-11-25) 
-------------------------------------------------- - The first version available through PyPi python-pyahocorasick_1.4.1.orig/LICENSE0000644000000000000000000000273212707726002014656 0ustar00 Copyright (c) 2011-2016 Wojciech Muła All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the Wojciech Muła nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
python-pyahocorasick_1.4.1.orig/MANIFEST.in0000644000000000000000000000046413404001570015376 0ustar00graft benchmarks graft docs graft msinttypes graft py graft docs graft regression graft stamp graft unresolved_bugs graft src include README.rst include LICENSE include *.py include *.h include *.c include *.cfg include .gitignore include MANIFEST.in include travis.yml include appveyor.yml include Makefile python-pyahocorasick_1.4.1.orig/Makefile0000644000000000000000000000252213401303143015273 0ustar00.SUFFIXES: .PHONY: test clean valgrind export PYTHONPATH := .:$(PYTHONPATH):$(PATH) DEPS=*.c \ *.h \ setup.py \ unittests.py test: stamp/regression_py2 stamp/regression_py3 stamp/build_py2: $(DEPS) python2 setup.py build_ext --inplace touch $@ stamp/unittests_py2: stamp/build_py2 python2 unittests.py touch $@ stamp/regression_py2: stamp/unittests_py2 python2 regression/issue_5.py python2 regression/issue_8.py python2 regression/issue_9.py python2 regression/issue_10.py python2 regression/issue_26.py python2 regression/issue_56.py touch $@ stamp/build_py3: $(DEPS) python3 setup.py build_ext --inplace touch $@ stamp/unittests_py3: stamp/build_py3 python3 unittests.py touch $@ stamp/regression_py3: stamp/unittests_py3 python3 regression/issue_5.py python3 regression/issue_8.py python3 regression/issue_9.py python3 regression/issue_10.py python3 regression/issue_26.py python3 regression/issue_56.py touch $@ benchmark: benchmarks/benchmark.py stamp/build_py2 python2 $^ devbuild2: python2 setup.py build_ext --inplace devbuild3: python3 setup.py build_ext --inplace valgrind: python -c "import sys;print(sys.version)" valgrind --leak-check=full --track-origins=yes --log-file=valgrind.log python unittests.py pip-release: python setup.py sdist upload clean: rm -f stamp/* rm -rf dist build python-pyahocorasick_1.4.1.orig/README.rst0000644000000000000000000002565613642657421015361 0ustar00======================================================================== pyahocorasick 
======================================================================== .. image:: https://travis-ci.org/WojciechMula/pyahocorasick.svg?branch=master :target: https://travis-ci.org/WojciechMula/pyahocorasick :alt: Linux Master branch tests status .. image:: https://ci.appveyor.com/api/projects/status/github/WojciechMula/pyahocorasick?branch=master&svg=true :target: https://ci.appveyor.com/project/WojciechMula/pyahocorasick :alt: Windows Master branch tests status **pyahocorasick** is a fast and memory efficient library for exact or approximate multi-pattern string search meaning that you can find multiple key strings occurrences at once in some input text. The library provides an `ahocorasick` Python module that you can use as a plain dict-like Trie or convert a Trie to an automaton for efficient Aho-Corasick search. It is implemented in C and tested on Python 2.7 and 3.4+. It works on Linux, Mac and Windows. The license_ is BSD-3-clause. Some utilities, such as tests and the pure Python automaton are dedicated to the Public Domain. Download and source code ======================== You can fetch **pyahocorasick** from: - GitHub https://github.com/WojciechMula/pyahocorasick/ - Pypi https://pypi.python.org/pypi/pyahocorasick/ - Conda-Forge https://github.com/conda-forge/pyahocorasick-feedstock/ Quick start =========== This module is written in C. You need a C compiler installed to compile native CPython extensions. To install:: pip install pyahocorasick Then create an Automaton:: >>> import ahocorasick >>> A = ahocorasick.Automaton() You can use the Automaton class as a trie. Add some string keys and their associated value to this trie. Here we associate a tuple of (insertion index, original string) as a value to each key string we add to the trie:: >>> for idx, key in enumerate('he her hers she'.split()): ... 
A.add_word(key, (idx, key)) Then check if some string exists in the trie:: >>> 'he' in A True >>> 'HER' in A False And play with the ``get()`` dict-like method:: >>> A.get('he') (0, 'he') >>> A.get('she') (3, 'she') >>> A.get('cat', 'not exists') 'not exists' >>> A.get('dog') Traceback (most recent call last): File "", line 1, in KeyError Now convert the trie to an Aho-Corasick automaton to enable Aho-Corasick search:: >>> A.make_automaton() Then search all occurrences of the keys (the needles) in an input string (our haystack). Here we print the results and just check that they are correct. The `Automaton.iter()` method return the results as two-tuples of the `end index` where a trie key was found in the input string and the associated `value` for this key. Here we had stored as values a tuple with the original string and its trie insertion order:: >>> for end_index, (insert_order, original_value) in A.iter(haystack): ... start_index = end_index - len(original_value) + 1 ... print((start_index, end_index, (insert_order, original_value))) ... assert haystack[start_index:start_index + len(original_value)] == original_value ... (1, 2, (0, 'he')) (1, 3, (1, 'her')) (1, 4, (2, 'hers')) (4, 6, (3, 'she')) (5, 6, (0, 'he')) You can also create an eventually large automaton ahead of time and `pickle` it to re-load later. Here we just pickle to a string. You would typically pickle to a file instead:: >>> import cPickle >>> pickled = cPickle.dumps(A) >>> B = cPickle.loads(pickled) >>> B.get('he') (0, 'he') See also: - FAQ and Who is using pyahocorasick? https://github.com/WojciechMula/pyahocorasick/wiki/FAQ#who-is-using-pyahocorasick Documentation ============= The full documentation including the API overview and reference is published on `readthedocs `_. Overview With an `Aho-Corasick automaton `_ you can efficiently search all occurrences of multiple strings (the needles) in an input string (the haystack) making a single pass over the input string. 
With pyahocorasick you can eventually build large automatons and pickle them to reuse them over and over as an indexed structure for fast multi pattern string matching. One of the advantages of an Aho-Corasick automaton is that the typical worst-case and best-case **runtimes** are about the same and depends primarily on the size of the input string and secondarily on the number of matches returned. While this may not be the fastest string search algorithm in all cases, it can search for multiple strings at once and its runtime guarantees make it rather unique. Because pyahocorasick is based on a Trie, it stores redundant keys prefixes only once using memory efficiently. A drawback is that it needs to be constructed and "finalized" ahead of time before you can search strings. In several applications where you search for several pre-defined "needles" in a variable "haystacks" this is actually an advantage. **Aho-Corasick automatons** are commonly used for fast multi-pattern matching in intrusion detection systems (such as snort), anti-viruses and many other applications that need fast matching against a pre-defined set of string keys. Internally an Aho-Corasick automaton is typically based on a Trie with extra data for failure links and an implementation of the Aho-Corasick search procedure. Behind the scenes the **pyahocorasick** Python library implements these two data structures: a `Trie `_ and an Aho-Corasick string matching automaton. Both are exposed through the `Automaton` class. In addition to Trie-like and Aho-Corasick methods and data structures, **pyahocorasick** also implements dict-like methods: The pyahocorasick **Automaton** is a **Trie** a dict-like structure indexed by string keys each associated with a value object. You can use this to retrieve an associated value in a time proportional to a string key length. pyahocorasick is available in two flavors: * a CPython **C-based extension**, compatible with Python 2 and 3. 
* a simpler pure Python module, compatible with Python 2 and 3. This is only available in the source repository (not on Pypi) under the py/ directory and has a slightly different API. Unicode and bytes ----------------- The type of strings accepted and returned by ``Automaton`` methods are either **unicode** or **bytes**, depending on a compile time settings (preprocessor definition of ``AHOCORASICK_UNICODE`` as set in `setup.py`). The ``Automaton.unicode`` attributes can tell you how the library was built. On Python 3, unicode is the default. On Python 2, bytes is the default and only value. .. warning:: When the library is built with unicode support on Python 3, an Automaton will store 2 or 4 bytes per letter, depending on your Python installation. When built for bytes, only one byte per letter is needed. Unicode is **NOT supported** on Python 2 for now. Build and install from PyPi =========================== To install for common operating systems, use pip. Pre-built wheels should be available on Pypi at some point in the future:: pip install pyahocorasick To build from sources you need to have a C compiler installed and configured which should be standard on Linux and easy to get on MacOSX. On Windows and Python 2.7 you need the `Microsoft Visual C++ Compiler for Python 2.7 `_ (aka. Visual Studio 2008). There have been reports that `pyahocorasick` does not build yet with MinGW. It may build with cygwin but this has not been tested. If you get this working with these platforms, please report in a ticket! To build from sources, clone the git repository or download and extract the source archive. Install `pip` (and its `setuptools` companion) and then run (in a `virtualenv` of course!):: pip install . If compilation succeeds, the module is ready to use. Support ======= Support is available through the `GitHub issue tracker `_ to report bugs or ask questions. Contributing ============ You can submit contributions through `GitHub pull requests `_. 
Authors ======= The initial author and maintainer is Wojciech Muła. `Philippe Ombredanne `_, the current co-owner, rewrote documentation, setup CI servers and did a whole lot of work to make this module better accessible to end users. Alphabetic list of authors: * **Andrew Grigorev** * **Bogdan** * **David Woakes** * **Edward Betts** * **Frankie Robertson** * **Frederik Petersen** * **gladtosee** * **INADA Naoki** * **Jan Fan** * **Pastafarianist** * **Philippe Ombredanne** * **Renat Nasyrov** * **Sylvain Zimmer** * **Xiaopeng Xu** This library would not be possible without help of many people, who contributed in various ways. They created `pull requests `_, reported bugs as `GitHub issues `_ or via direct messages, proposed fixes, or spent their valuable time on testing. Thank you. License ======= This library is licensed under very liberal `BSD-3-Clause `_ license. Some portions of the code are dedicated to the public domain such as the pure Python automaton and test code. Full text of license is available in LICENSE file. Other Aho-Corasick implementations for Python you can consider ============================================================== While **pyahocorasick** tries to be the finest and fastest Aho Corasick library for Python you may consider these other libraries: * `py_aho_corasick `_ by Jan * Written in pure Python. * Poor performance. * `ahocorapy `_ by abusix * Written in pure Python. * Better performance than py-aho-corasick. * Using pypy, ahocorapy's search performance is only slightly worse than pyahocorasick's. * Performs additional suffix shortcutting (more setup overhead, less search overhead for suffix lookups). * Includes visualization tool for resulting automaton (using pygraphviz). * MIT-licensed, 100% test coverage, tested on all major python versions (+ pypy) * `noaho `_ by Jeff Donner * Written in C. Does not return overlapping matches. * Does not compile on Windows (July 2016). * No support for the pickle protocol. 
* `acora `_ by Stefan Behnel * Written in Cython. * Large automaton may take a long time to build (July 2016) * No support for a dict-like protocol to associate a value to a string key. * `ahocorasick `_ by Danny Yoo * Written in C. * seems unmaintained (last update in 2005). * GPL-licensed. python-pyahocorasick_1.4.1.orig/allsources.c0000644000000000000000000000043213406755432016172 0ustar00#include "src/custompickle/custompickle.c" #include "src/custompickle/pyhelpers.c" #include "src/custompickle/save/savebuffer.c" #include "src/custompickle/save/automaton_save.c" #include "src/custompickle/load/loadbuffer.c" #include "src/custompickle/load/module_automaton_load.c" python-pyahocorasick_1.4.1.orig/appveyor.yml0000644000000000000000000000116413574761352016251 0ustar00version: 1.0.{build} environment: matrix: - PYTHON: "C:\\Python27" - PYTHON: "C:\\Python27-x64" - PYTHON: "C:\\Python36" - PYTHON: "C:\\Python36-x64" - PYTHON: "C:\\Python37" - PYTHON: "C:\\Python37-x64" - PYTHON: "C:\\Python38" - PYTHON: "C:\\Python38-x64" install: - cmd: "%PYTHON%\\python.exe -m pip install --upgrade pip wheel" build: off test_script: - cmd: "%PYTHON%\\python.exe -m pip install ." - cmd: "%PYTHON%\\python.exe setup.py test" - dir - mkdir wheels - cmd: "%PYTHON%\\python.exe -m pip wheel . --wheel-dir=wheels" - dir wheels artifacts: - path: wheels\* python-pyahocorasick_1.4.1.orig/benchmarks/0000755000000000000000000000000013001457257015763 5ustar00python-pyahocorasick_1.4.1.orig/common.h0000644000000000000000000000450514002633220015277 0ustar00/* This is part of pyahocorasick Python module. 
common definitions and includes Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain */ #ifndef ahocorasick_common_h_included__ #define ahocorasick_common_h_included__ #define PY_SSIZE_T_CLEAN #include #include // PyMemberDef #include #define DEBUG #if defined(_MSC_VER) // Visual Studio compiler # include "windows.h" #else # if defined(__CYGWIN__) # include "cygwin.h" # else # include "posix.h" # endif #endif #if PY_MAJOR_VERSION >= 3 #define PY3K #if PY_MINOR_VERSION >= 3 || PY_MAJOR_VERSION > 3 #define PEP393 #ifdef AHOCORASICK_UNICODE #define PEP393_UNICODE #endif #endif #else #ifdef AHOCORASICK_UNICODE #warning "No support for unicode in version for Python2" #endif #undef AHOCORASICK_UNICODE #endif // setup supported character set #ifdef AHOCORASICK_UNICODE # if defined PEP393_UNICODE || defined Py_UNICODE_WIDE // Either Python uses UCS-4 or we don't know what Python uses, // but we use UCS-4 # define TRIE_LETTER_TYPE uint32_t # define TRIE_LETTER_SIZE 4 # else // Python use UCS-2 # define TRIE_LETTER_TYPE uint16_t # define TRIE_LETTER_SIZE 2 # define VARIABLE_LEN_CHARCODES 1 # endif #else // only bytes are supported # define TRIE_LETTER_TYPE uint16_t # define TRIE_LETTER_SIZE 2 #endif #ifdef __GNUC__ # define LIKELY(x) __builtin_expect(x, 1) # define UNLIKELY(x) __builtin_expect(x, 0) # define ALWAYS_INLINE __attribute__((always_inline)) # define PURE __attribute__((pure)) # define UNUSED __attribute__((unused)) #else # define LIKELY(x) x # define UNLIKELY(x) x # define ALWAYS_INLINE # define PURE # define UNUSED #endif #ifdef DEBUG # include # define ASSERT(expr) do {if (!(expr)) {fprintf(stderr, "%s:%s:%d - %s failed!\n", __FILE__, __FUNCTION__, __LINE__, #expr); fflush(stderr); exit(1);} }while(0) #else # define ASSERT(expr) #endif #if defined(PYCALLS_INJECT_FAULTS) && defined(PY3K) # include "src/pycallfault/pycallfault.h" #else # define F(name) name #endif typedef char bool; #define true 1 #define false 0 
#endif python-pyahocorasick_1.4.1.orig/cygwin.h0000644000000000000000000000054313632334274015324 0ustar00/* This is part of pyahocorasick Python module. CYGWIN declarations. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #ifndef PYAHCORASICK_CYGWIN_H__ #define PYAHCORASICK_CYGWIN_H__ #define PY_OBJECT_HEAD_INIT PyVarObject_HEAD_INIT(NULL, 0) #endif python-pyahocorasick_1.4.1.orig/docs/0000755000000000000000000000000012746664234014610 5ustar00python-pyahocorasick_1.4.1.orig/dump2dot.py0000644000000000000000000000327013406760402015756 0ustar00""" Aho-Corasick string search algorithm. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import ahocorasick import os from ahocorasick import EMPTY, TRIE, AHOCORASICK; def dump2dot(automaton, file): def writeln(text=""): file.write(text + "\n") def nodename(nodeid): return 'node%x' % (nodeid & 0xffffffff) if automaton.kind == EMPTY: writeln("digraph empty {}") return if automaton.kind == TRIE: name = "trie" else: name = "ahocorasick" writeln("digraph %s {" % name) nodes, edges, fail = automaton.dump() # nodes for nodeid, end in nodes: if end: attr = '[shape=doublecircle, label=""]' else: attr = '[shape=circle, label=""]' writeln("\t%s %s" % (nodename(nodeid), attr)) def format_label(label): label = str(label, 'ascii') label = label.replace('"', r'\"') return '"%s"' % label # trie edges for nodeid, label, destid in edges: writeln("\t%s -> %s [label=%s]" % (nodename(nodeid), nodename(destid), format_label(label))) # fail links for nodeid, failid in fail: writeln("\t%s -> %s [color=blue]" % (nodename(nodeid), nodename(failid))) writeln("}") def show(automaton): path = '/dev/shm/%s.dot' % os.getpid() with open(path, 'wt') as f: dump2dot(automaton, f) os.system("xdot %s" % path) #os.system("dotty %s" % path) os.unlink(path) if __name__ == '__main__': A = ahocorasick.Automaton(ahocorasick.STORE_LENGTH) 
A.add_word("he") A.add_word("her") A.add_word("hers") A.add_word("she") A.add_word("cat") A.add_word("shield") with open('trie.dot', 'wt') as f: dump2dot(A, f) A.make_automaton() with open('ahocorasick.dot', 'wt') as f: dump2dot(A, f) python-pyahocorasick_1.4.1.orig/msinttypes/0000755000000000000000000000000013035462050016057 5ustar00python-pyahocorasick_1.4.1.orig/posix.h0000644000000000000000000000055013642657421015167 0ustar00/* This is part of pyahocorasick Python module. POSIX declarations. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #ifndef PYAHCORASICK_POSIX_H__ #define PYAHCORASICK_POSIX_H__ #define PY_OBJECT_HEAD_INIT PyVarObject_HEAD_INIT(&PyType_Type, 0) #endif python-pyahocorasick_1.4.1.orig/py/0000755000000000000000000000000012231541307014270 5ustar00python-pyahocorasick_1.4.1.orig/pyahocorasick.c0000644000000000000000000000645113642657421016665 0ustar00/* This is part of pyahocorasick Python module. Python module. This file include all code from *.c files. 
Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #include "common.h" #include "slist.h" #include "trienode.h" #include "trie.h" #include "Automaton.h" #include "AutomatonSearchIter.h" #include "AutomatonSearchIterLong.h" #include "AutomatonItemsIter.h" #include "src/inline_doc.h" #include "src/custompickle/load/module_automaton_load.h" /* code */ #include "utils.c" #include "trienode.c" #include "trie.c" #include "slist.c" #include "Automaton.c" #include "AutomatonItemsIter.c" #include "AutomatonSearchIter.c" #include "AutomatonSearchIterLong.c" #ifdef PYCALLS_INJECT_FAULTS #include "src/pycallfault/pycallfault.c" #endif #include "allsources.c" static PyMethodDef ahocorasick_module_methods[] = { {"load", module_automaton_load, METH_VARARGS, module_load_doc}, {NULL, NULL, 0, NULL} }; #ifdef PY3K static PyModuleDef ahocorasick_module = { PyModuleDef_HEAD_INIT, "ahocorasick", module_doc, -1, ahocorasick_module_methods }; #endif #ifdef PY3K #define init_function PyInit_ahocorasick #define init_return(value) return (value) #else #define init_function initahocorasick #define init_return(unused) return #endif PyMODINIT_FUNC init_function(void) { PyObject* module; #ifdef MEMORY_DEBUG PyErr_WarnEx(PyExc_RuntimeWarning, "This is a developer version of pyahcorosick. " "The module was compiled with flag MEMORY_DEBUG.", 1); initialize_memory_debug(); #endif #ifdef PYCALLS_INJECT_FAULTS PyErr_WarnEx(PyExc_RuntimeWarning, "This is a developer version of pyahcorosick. " "The module was compiled with flag PYCALLS_INJECT_FAULTS.", 1); initialize_pycallfault(); #endif #if DEBUG_LAYOUT PyErr_WarnEx(PyExc_RuntimeWarning, "This is a developer version of pyahcorosick. 
" "The module was compiled with flag DEBUG_LAYOUT.", 1); trienode_dump_layout(); #endif automaton_as_sequence.sq_length = automaton_len; automaton_as_sequence.sq_contains = automaton_contains; automaton_type.tp_as_sequence = &automaton_as_sequence; #ifdef PY3K module = PyModule_Create(&ahocorasick_module); #else module = Py_InitModule3("ahocorasick", ahocorasick_module_methods, module_doc); #endif if (module == NULL) init_return(NULL); if (PyType_Ready(&automaton_type) < 0) { Py_DECREF(module); init_return(NULL); } else PyModule_AddObject(module, "Automaton", (PyObject*)&automaton_type); #define add_enum_const(name) PyModule_AddIntConstant(module, #name, name) add_enum_const(TRIE); add_enum_const(AHOCORASICK); add_enum_const(EMPTY); add_enum_const(STORE_LENGTH); add_enum_const(STORE_INTS); add_enum_const(STORE_ANY); add_enum_const(KEY_STRING); add_enum_const(KEY_SEQUENCE); add_enum_const(MATCH_EXACT_LENGTH); add_enum_const(MATCH_AT_MOST_PREFIX); add_enum_const(MATCH_AT_LEAST_PREFIX); #undef add_enum_const #ifdef AHOCORASICK_UNICODE PyModule_AddIntConstant(module, "unicode", 1); #else PyModule_AddIntConstant(module, "unicode", 0); #endif init_return(module); } python-pyahocorasick_1.4.1.orig/regression/0000755000000000000000000000000012514500132016013 5ustar00python-pyahocorasick_1.4.1.orig/release_checklist.txt0000644000000000000000000000064514002634426020062 0ustar00action python2 python3 -------------------------------------------------- ./runtest.sh unit [ ] [ ] ./runtest.sh unpickle [ ] [ ] ./runtest.sh valgrind [ ] [ ] ./runtest.sh mallocfaults [ ] [ ] ./runtest.sh reallocfaults [ ] [ ] ./runtest.sh pycallfaults [ ] [ ] ./runtest.sh coverage [ ] [ ] python-pyahocorasick_1.4.1.orig/runtest.sh0000755000000000000000000002035714002626257015720 0ustar00#!/bin/bash TMPDIR=/dev/shm if [[ ${PYTHON} == "" ]] then PYTHON=python fi function print_help { echo "Utility to run various tests" echo echo "Define variable PYTHON to point custom executable (if needed);" echo "by 
default standard python command is invoked." echo echo "Current settings:" echo "- Python interpreter: '${PYTHON}'" echo "- CFLAGS: '${CFLAGS}'" echo "- Selected unit tests: '${UNITTEST}' (empty value means 'all')" echo " This flag is used just for 'mallocfaults' and 'pycallfaults'" echo " as these tests might be really time consuming" echo usage } function usage { echo "$0 unit|unpickle|leaks|valgrind|mallocfaults|pycallfaults" echo echo "unit - run default unit tests" echo "unpickle - run unpickle tests, which depend on machine" echo "leaks - recompile module with flag MEMORY_DEBUG," echo " then run unittests and check if there were memory leaks" echo "valgrind - run unittests in valgrind and check if there are" echo " any leaks from pyahocorasick" echo "mallocfaults - recompile module with flag MEMORY_DEBUG," echo " then run unnitests injecting malloc faults" echo "reallocfaults - recompile module with flag MEMORY_DEBUG," echo " then run unnitests injecting realloc faults" echo "pycallfaults - recompile module with flag MEMORY_DEBUG," echo " then run unnitests injecting faults in python C-API calls" echo "coverage - create coverage report in 'coverage' subdir" echo echo "release - run unit, unpickle, leaks, mallocfaults and reallocfaults" echo " meant to run before relese" } ###################################################################### ACTIONS="unit unpickle leaks valgrind mallocfaults reallocfaults pycallfaults coverage release" if [[ $# != 1 || $1 == '-h' || $1 == '--help' ]] then print_help exit 1 fi ACTION= REBUILD=1 ###################################################################### RED='\033[31m' GREEN='\033[32m' RESET='\033[0m' MEMORY_DEBUG_PATH="${TMPDIR}/memory.dump" MEMORY_DEBUG="-DMEMORY_DEBUG -DMEMORY_DUMP_PATH='\"${MEMORY_DEBUG_PATH}\"'" function rebuild { ${PYTHON} setup.py build_ext --inplace if [[ $? 
!= 0 ]] then echo -e "${RED}Build failed${RESET}" exit 1 fi } function force_rebuild { if [[ ${REBUILD} == 1 ]] then rm -r build ahocorasick*.so 2> /dev/null rebuild fi } function run_unittests { ${PYTHON} unittests.py ${UNITTEST} if [[ $? != 0 ]] then echo -e "${RED}Unit tests failed${RESET}" exit 1 fi } function handle_unit { force_rebuild run_unittests } function run_unpickletests { ${PYTHON} unpickle_test.py if [[ $? != 0 ]] then echo -e "${RED}Unpickle tests failed${RESET}" exit 1 fi } function handle_unpickle { force_rebuild run_unpickletests } function run_leaktest { ${PYTHON} tests/memdump_check.py ${MEMORY_DEBUG_PATH} if [[ $? != 0 ]] then echo -e "${RED}Memory leaks detected${RESET}" exit 1 fi } function handle_leaks { export CFLAGS="${CFLAGS} ${MEMORY_DEBUG}" force_rebuild run_unittests run_leaktest } function handle_valgrind { if ! command -v valgrind > /dev/null then echo "Valgrind not found" exit 1 fi force_rebuild local LOGFILE=${TMPDIR}/valgrind.log echo "Running valgrind..." valgrind --log-file=${LOGFILE} --leak-check=full --track-origins=yes ${PYTHON} unittests.py ${PYTHON} tests/valgrind_check.py . ${LOGFILE} } function run_mallocfaults { # obtain max allocation number unset ALLOC_FAIL unset REALLOC_FAIL run_unittests local MINID=0 echo ${MEMORY_DEBUG_PATH} local MAXID=$(${PYTHON} tests/memdump_maxalloc.py ${MEMORY_DEBUG_PATH}) # simulate failures of all allocations for ID in `seq ${MINID} ${MAXID}` do echo -ne "Checking memalloc fail ${ID} of ${MAXID}\r" mallocfault ${ID} done echo } function mallocfault { export ALLOC_NODUMP=1 export ALLOC_FAIL=$1 local LOG=${TMPDIR}/mallocfault${ID}.log ${PYTHON} unittests.py ${UNITTEST} -q > ${LOG} 2>&1 if [[ $? == 139 ]] then echo -e "${RED}SEGFAULT${RESET}" exit 1 fi ${PYTHON} tests/unittestlog_check.py ${LOG} if [[ $? 
!= 0 ]] then echo -e "${RED}Possible error${RESET}" echo "Inspect ${LOG}, there are errors other than expected MemoryError" exit 1 fi } function handle_mallocfaults { export CFLAGS=${MEMORY_DEBUG} force_rebuild run_mallocfaults } function run_reallocfaults { # obtain max allocation number unset ALLOC_FAIL unset REALLOC_FAIL run_unittests local MINID=0 echo ${MEMORY_DEBUG_PATH} local MAXID=$(${PYTHON} tests/memdump_maxrealloc.py ${MEMORY_DEBUG_PATH}) # simulate failures of all allocations for ID in `seq ${MINID} ${MAXID}` do echo -ne "\rChecking realloc fail ${ID} of ${MAXID}" reallocfault ${ID} done echo } function reallocfault { export ALLOC_NODUMP=1 export REALLOC_FAIL=$1 local LOG=${TMPDIR}/reallocfault${ID}.log ${PYTHON} unittests.py ${UNITTEST} -q > ${LOG} 2>&1 if [[ $? == 139 ]] then echo -e "${RED}SEGFAULT${RESET}" exit 1 fi ${PYTHON} tests/unittestlog_check.py ${LOG} if [[ $? != 0 ]] then echo -e "${RED}Possible error${RESET}" echo "Inspect ${LOG}, there are errors other than expected MemoryError" exit 1 fi } function handle_reallocfaults { export CFLAGS=${MEMORY_DEBUG} force_rebuild run_reallocfaults } function handle_pycallfaults { export CFLAGS="-DPYCALLS_INJECT_FAULTS" force_rebuild local TMP=${TMPDIR}/pycallfaults ${PYTHON} unittests.py ${UNITTEST} > ${TMP} local MINID=0 local MAXID=$(awk ' /^Fail ID: / {if ($3 > max) max=$3} END {print max} ' ${TMP}) # simulate failures of all call to Python C-API for ID in `seq 0 ${MAXID}` do echo -n "Checking Python C-API fail ${ID} of ${MAXID}" local LOG=${TMPDIR}/pycallfaults${ID}.log export PYCALL_FAIL=${ID} ${PYTHON} unittests.py ${UNITTEST} > ${LOG} 2>&1 echo " return code $?" ${PYTHON} tests/pyfault_check.py ${LOG} done } function handle_coverage { if ! 
command -v gcovr > /dev/null then echo "gcovr not found" exit 1 fi export CFLAGS="--coverage" force_rebuild run_unittests local DIR=coverage local INDEX=pyahocorasick.html mkdir ${DIR} 2> /dev/null gcovr --html-details -o ${DIR}/${INDEX} echo "Navigate to ${DIR}/${INDEX}" } function handle_release { unset ALLOC_FAIL unset UNITTEST unset CFLAGS # 1. build with default settings and run unit tests and unpickle tests if true then force_rebuild > /dev/null 2>&1 run_unittests run_unpickletests fi # 2. build with memory debug and run unit tests and unpickle tests if true then export CFLAGS="${MEMORY_DEBUG}" force_rebuild > /dev/null 2>&1 rm -f ${MEMORY_DUMP_PATH} run_unittests run_leaktest rm -f ${MEMORY_DUMP_PATH} run_unpickletests run_leaktest fi # 3. inject malloc faults if true then export CFLAGS="${MEMORY_DEBUG}" force_rebuild > /dev/null 2>&1 run_mallocfaults fi echo -e "${GREEN}All OK${RESET}" } ################################################### arg=$1 case "${arg}" in unit) handle_unit ;; unpickle) handle_unpickle ;; leaks) handle_leaks ;; valgrind) handle_valgrind ;; mallocfaults) handle_mallocfaults ;; reallocfaults) handle_reallocfaults ;; pycallfaults) handle_pycallfaults ;; coverage) handle_coverage ;; release) handle_release ;; *) echo "Unknown action '${arg}'" usage exit 2 ;; esac python-pyahocorasick_1.4.1.orig/setup.cfg0000644000000000000000000000015212746701515015471 0ustar00[metadata] license_file = LICENSE [build_sphinx] source-dir = docs/ build-dir = docs/_build all_files = 1python-pyahocorasick_1.4.1.orig/setup.py0000644000000000000000000000742014004050702015347 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. 
Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) """ try: from setuptools import setup, Extension except ImportError: from distutils.core import setup, Extension from sys import version_info as python_version def get_long_description(): """ Strip the content index from the long description. """ import codecs with codecs.open('README.rst', encoding='UTF-8') as f: readme = [line for line in f if not line.startswith('.. contents::')] return ''.join(readme) if python_version.major not in [2, 3]: raise ValueError('Python %s is not supported' % python_version) if python_version.major == 3: macros = [ # when defined unicode strings are supported ('AHOCORASICK_UNICODE', ''), ] else: # On Python 2, unicode strings are not supported (yet). macros = [] module = Extension( 'ahocorasick', sources=[ 'pyahocorasick.c', ], define_macros=macros, depends=[ 'common.h', 'Automaton.c', 'Automaton.h', 'Automaton_pickle.c', 'AutomatonItemsIter.c', 'AutomatonItemsIter.h', 'AutomatonSearchIter.c', 'AutomatonSearchIter.h', 'AutomatonSearchIterLong.c', 'AutomatonSearchIterLong.h', 'trie.c', 'trie.h', 'slist.c', 'utils.c', 'trienode.c', 'trienode.h', 'msinttypes/stdint.h', 'src/inline_doc.h', 'src/pickle/pickle.h', 'src/pickle/pickle_data.h', 'src/pickle/pickle_data.c', 'src/custompickle/custompickle.h', 'src/custompickle/custompickle.c', 'src/custompickle/pyhelpers.h', 'src/custompickle/pyhelpers.c', 'src/custompickle/save/automaton_save.h', 'src/custompickle/save/automaton_save.c', 'src/custompickle/save/savebuffer.h', 'src/custompickle/save/savebuffer.c', 'src/custompickle/load/module_automaton_load.h', 'src/custompickle/load/module_automaton_load.c', 'src/custompickle/load/loadbuffer.h', 'src/custompickle/load/loadbuffer.c', 'src/pycallfault/pycallfault.h', 'src/pycallfault/pycallfault.c', ], ) setup( name='pyahocorasick', version='1.4.1', ext_modules=[module], description=( 'pyahocorasick is a fast and memory efficient library 
for exact or ' 'approximate multi-pattern string search. With the ahocorasick.Automaton ' 'class, you can find multiple key strings occurrences at once in some input ' 'text. You can use it as a plain dict-like Trie or convert a Trie to an ' 'automaton for efficient Aho-Corasick search. Implemented in C and tested ' 'on Python 2.7 and 3.4+. Works on Linux, Mac and Windows. BSD-3-clause license.' ), author='Wojciech Muła', author_email='wojciech_mula@poczta.onet.pl', maintainer='Wojciech Muła', maintainer_email='wojciech_mula@poczta.onet.pl', url='http://github.com/WojciechMula/pyahocorasick', platforms=['Linux', 'MacOSX', 'Windows'], license=' BSD-3-Clause and Public-Domain', long_description=get_long_description(), long_description_content_type="text/x-rst", keywords=[ 'aho-corasick', 'trie', 'automaton', 'dictionary', ], classifiers=[ 'Development Status :: 5 - Production/Stable', 'License :: OSI Approved :: BSD License', 'Programming Language :: C', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 3', 'Topic :: Software Development :: Libraries', 'Topic :: Text Editors :: Text Processing', ], ) python-pyahocorasick_1.4.1.orig/slist.c0000644000000000000000000000333013407734122015146 0ustar00/* This is part of pyahocorasick Python module. Linked list implementation. 
Const time of: * append * prepend * pop first * get first/last Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain */ #include "slist.h" ListItem* list_item_new(const size_t size) { ListItem* item = (ListItem*)memory_alloc(size); if (item) { item->__next = 0; } return item; } void list_item_delete(ListItem* item) { memory_free(item); } void list_init(List* list) { if (list) { list->head = 0; list->last = 0; } } int list_delete(List* list) { ListItem* item; ListItem* tmp; ASSERT(list); item = list->head; while (item) { tmp = item; item = item->__next; memory_free(tmp); } list->head = list->last = NULL; return 0; } ListItem* list_append(List* list, ListItem* item) { ASSERT(list); if (item) { if (list->last) { list->last->__next = item; // append list->last = item; // set as last node } else list->head = list->last = item; } return item; } ListItem* list_push_front(List* list, ListItem* item) { ASSERT(list); if (list->head) { item->__next = list->head; list->head = item; } else list->head = list->last = item; return item; } ListItem* list_pop_first(List* list) { ListItem* item; ASSERT(list); if (list->head) { item = list->head; list->head = item->__next; if (!list->head) list->last = 0; return item; } else return NULL; } python-pyahocorasick_1.4.1.orig/slist.h0000644000000000000000000000271713407734122015163 0ustar00/* This is part of pyahocorasick Python module. Linked list declarations. Const time of: * append * prepend * pop first * get first/last Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain */ #ifndef ahocorasick_slist_h_included #define ahocorasick_slist_h_included #include "common.h" /** base structure for list */ #define LISTITEM_data struct ListItem* __next /** list item node */ typedef struct ListItem { LISTITEM_data; } ListItem; /** Create new item */ ListItem* list_item_new(const size_t size); /** Deallocate list item. 
*/ void list_item_delete(ListItem* item); /** Returns pointer to next item */ #define list_item_next(item) (((ListItem*)(item))->__next) /** Set new pointer to next item */ #define list_item_setnext(item, next) list_item_next(item) = (ListItem*)(next) /** List. */ typedef struct { ListItem* head; ///< first node ListItem* last; ///< last node } List; /** Initialize list. */ void list_init(List* list); /** Deallocate all elements of list. */ int list_delete(List* list); /** Append item at the end of list. */ ListItem* list_append(List* list, ListItem* item); /** Prepend item at front of list. */ ListItem* list_push_front(List* list, ListItem* item); /** Unlink first item from list. */ ListItem* list_pop_first(List* list); /** Test if list is empty. */ #define list_empty(list) ((list)->head == NULL) #endif python-pyahocorasick_1.4.1.orig/src/0000755000000000000000000000000013403224132014423 5ustar00python-pyahocorasick_1.4.1.orig/stamp/0000755000000000000000000000000012707723621014775 5ustar00python-pyahocorasick_1.4.1.orig/test.py0000644000000000000000000000155713346410306015203 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. 
Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import ahocorasick import sys print(dir(ahocorasick)) def is_python_2(): return sys.version_info.major == 2 a = ahocorasick.Automaton() words = b"he e hers his she hi him man he" if not is_python_2(): words = words.decode('utf8') for i,w in enumerate(words.split()): a.add_word(w, (i, w)) #print(len(a), len(set(words))) #print(a.get(b"hea", 1)) """ for w in a.keys(): print(w) for w in a.values(): print(w) for w in a.items(): print(w) """ s = b"he rshershidamanza " if not is_python_2(): s = s.decode('utf8') a.make_automaton() for item in a.iter(s, 2, 8): print(item) print("==") def callback(index, item): print(index, item) a.find_all(s, callback, 2, 11) python-pyahocorasick_1.4.1.orig/tests/0000755000000000000000000000000013403215123014776 5ustar00python-pyahocorasick_1.4.1.orig/trie.c0000644000000000000000000001225513417131365014762 0ustar00/* This is part of pyahocorasick Python module. Trie implementation Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #include "trie.h" static TrieNode* trie_add_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t wordlen, bool* new_word) { TrieNode* node; TrieNode* child; unsigned i; if (automaton->kind == EMPTY) { ASSERT(automaton->root == NULL); automaton->root = trienode_new(false); if (automaton->root == NULL) return NULL; } node = automaton->root; for (i=0; i < wordlen; i++) { const TRIE_LETTER_TYPE letter = word[i]; child = trienode_get_next(node, letter); if (child == NULL) { child = trienode_new(false); if (LIKELY(child != NULL)) { if (UNLIKELY(trienode_set_next(node, letter, child) == NULL)) { memory_free(child); return NULL; } } else { // Note: in case of memory error, the already allocate nodes // are still reachable from the root and will be free // upon automaton destruction. 
return NULL; } } node = child; } if (node->eow == false) { node->eow = true; *new_word = true; automaton->count += 1; } else *new_word = false; automaton->kind = TRIE; return node; } static PyObject* trie_remove_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t wordlen) { PyObject* object; TrieNode* node; TrieNode* tmp; TrieNode* last_multiway; unsigned last_multiway_index; unsigned i; if (automaton->root == NULL) { return NULL; } node = automaton->root; last_multiway = node; last_multiway_index = 0; for (i=0; i < wordlen; i++) { const TRIE_LETTER_TYPE letter = word[i]; node = trienode_get_next(node, letter); if (node == NULL) { return NULL; } // Save the last node along path which has more children // or is a terminating node. if (node->n > 1 || (node->n == 1 && node->eow)) { last_multiway = node; last_multiway_index = i + 1; } } if (node->eow != true) { return NULL; } object = node->output.object; if (trienode_is_leaf(node)) { // Remove a linear list that starts at the last_multiway node // and ends at the last [found] one. // 1. Unlink the tail from the trie node = trienode_get_next(last_multiway, word[last_multiway_index]); ASSERT(node != NULL); if (UNLIKELY(trienode_unset_next_pointer(last_multiway, node) == MEMORY_ERROR)) { PyErr_NoMemory(); return NULL; } // 2. 
Free the tail (reference to value from the last element was already saved) for (i = last_multiway_index + 1; i < wordlen; i++) { tmp = trienode_get_next(node, word[i]); ASSERT(tmp->n <= 1); trienode_free(node); node = tmp; } trienode_free(node); } else { // just unmark the terminating node node->eow = false; } automaton->kind = TRIE; return object; } static TrieNode* PURE trie_find(TrieNode* root, const TRIE_LETTER_TYPE* word, const size_t wordlen) { TrieNode* node; size_t i; node = root; if (node != NULL) { for (i=0; i < wordlen; i++) { node = trienode_get_next(node, word[i]); if (node == NULL) return NULL; } } return node; } static int PURE trie_longest(TrieNode* root, const TRIE_LETTER_TYPE* word, const size_t wordlen) { TrieNode* node; int len = 0; size_t i; node = root; for (i=0; i < wordlen; i++) { node = trienode_get_next(node, word[i]); if (node == NULL) break; else len += 1; } return len; } static TrieNode* PURE ahocorasick_next(TrieNode* node, TrieNode* root, const TRIE_LETTER_TYPE letter) { TrieNode* next = node; TrieNode* tmp; while (next) { tmp = trienode_get_next(next, letter); if (tmp) // found link return tmp; else // or go back through fail edges next = next->fail; } // or return root node return root; } static int trie_traverse_aux( TrieNode* node, const int depth, trie_traverse_callback callback, void *extra ) { unsigned i; if (callback(node, depth, extra) == 0) return 0; for (i=0; i < node->n; i++) { if (trie_traverse_aux(trienode_get_ith_unsafe(node, i), depth + 1, callback, extra) == 0) return 0; } return 1; } static void trie_traverse( TrieNode* root, trie_traverse_callback callback, void *extra ) { ASSERT(root); ASSERT(callback); trie_traverse_aux(root, 0, callback, extra); } size_t PURE trienode_get_size(const TrieNode* node) { return sizeof(TrieNode) + node->n * sizeof(TrieNode*); } python-pyahocorasick_1.4.1.orig/trie.h0000644000000000000000000000274713407734122014773 0ustar00/* This is part of pyahocorasick Python module. 
Trie declarations Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #ifndef ahocorasick_trie_h_included #define ahocorasick_trie_h_included #include "common.h" #include "trienode.h" #include "Automaton.h" /* add new word to a trie, returns last node on a path for that word */ static TrieNode* trie_add_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t wordlen, bool* new_word); /* remove word from a trie, returns associated object if was any */ static PyObject* trie_remove_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t wordlen); /* returns last node on a path for given word */ static TrieNode* PURE trie_find(TrieNode* root, const TRIE_LETTER_TYPE* word, const size_t wordlen); /* returns node linked by edge labeled with letter including paths going through fail links */ static TrieNode* PURE ahocorasick_next(TrieNode* node, TrieNode* root, const TRIE_LETTER_TYPE letter); typedef int (*trie_traverse_callback)(TrieNode* node, const int depth, void* extra); /* traverse trie in DFS order, for each node callback is called if callback returns false, then traversing stop */ static void trie_traverse( TrieNode* root, trie_traverse_callback callback, void *extra ); /* returns total size of node and it's internal structures */ size_t PURE trienode_get_size(const TrieNode* node); #endif python-pyahocorasick_1.4.1.orig/trienode.c0000644000000000000000000001014113417131365015620 0ustar00/* This is part of pyahocorasick Python module. 
Trie implementation Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #include "trienode.h" static TrieNode* trienode_new(const char eow) { TrieNode* node = (TrieNode*)memory_alloc(sizeof(TrieNode)); if (node) { node->output.integer = 0; node->output.object = NULL; node->fail = NULL; node->n = 0; node->eow = eow; node->next = NULL; } return node; } static void trienode_free(TrieNode* node) { ASSERT(node); if (node->n > 0) { memory_free(node->next); } memory_free(node); } static TrieNode* PURE trienode_get_next(TrieNode* node, const TRIE_LETTER_TYPE letter) { unsigned i; Pair* next; ASSERT(node); next = (Pair*)node->next; for (i=0; i < node->n; i++) if (next[i].letter == letter) { return next[i].child; } return NULL; } static TristateResult trienode_unset_next_pointer(TrieNode* node, TrieNode* child) { unsigned i; unsigned index; Pair* next; ASSERT(node); for (i=0; i < node->n; i++) { if (node->next[i].child == child) { index = i; goto found; } } return FALSE; found: if (node->n == 1) { // there is just one node node->n = 0; memory_free(node->next); node->next = NULL; return TRUE; } // there are more nodes, reallocation is needed next = (Pair*)memory_alloc((node->n - 1) * sizeof(Pair)); if (UNLIKELY(next == NULL)) { return MEMORY_ERROR; } for (i=0; i < index; i++) { next[i] = node->next[i]; } for (i=index + 1; i < node->n; i++) { next[i - 1] = node->next[i]; } memory_free(node->next); node->next = next; node->n -= 1; return TRUE; } static TrieNode* PURE trienode_get_ith_unsafe(TrieNode* node, size_t index) { ASSERT(node); return node->next[index].child; } static TRIE_LETTER_TYPE PURE trieletter_get_ith_unsafe(TrieNode* node, size_t index) { ASSERT(node); return node->next[index].letter; } static TrieNode* trienode_set_next(TrieNode* node, const TRIE_LETTER_TYPE letter, TrieNode* child) { int n; void* next; ASSERT(node); ASSERT(child); ASSERT(trienode_get_next(node, letter) == NULL); n = node->n; next = 
(TrieNode**)memory_realloc(node->next, (n + 1) * (sizeof(Pair))); if (next) { node->next = next; node->next[n].letter = letter; node->next[n].child = child; node->n += 1; return child; } else return NULL; } #ifdef DEBUG_LAYOUT void trienode_dump_layout() { #define field_size(TYPE, name) sizeof(((TYPE*)NULL)->name) #define field_ofs(TYPE, name) offsetof(TYPE, name) #define field_dump(TYPE, name) printf("- %-12s: %d %d\n", #name, field_size(TYPE, name), field_ofs(TYPE, name)); printf("TrieNode (size=%lu):\n", sizeof(TrieNode)); field_dump(TrieNode, output); field_dump(TrieNode, fail); field_dump(TrieNode, n); field_dump(TrieNode, eow); field_dump(TrieNode, next); printf("Pair (size=%lu):\n", sizeof(Pair)); field_dump(Pair, letter); field_dump(Pair, child); #undef field_dump #undef field_size #undef field_ofs } #endif UNUSED static void trienode_dump_to_file(TrieNode* node, FILE* f) { unsigned i; ASSERT(node != NULL); ASSERT(f != NULL); if (node->n == 0) fprintf(f, "leaf "); fprintf(f, "node %p\n", node); if (node->eow) fprintf(f, "- eow [%p]\n", node->output.object); fprintf(f, "- fail: %p\n", node->fail); if (node->n > 0) { if (node->next == NULL) { fprintf(f, "- %d next: %p\n", node->n, node->next); } else { fprintf(f, "- %d next: [(%d; %p)", node->n, node->next[0].letter, node->next[0].child); for (i=1; i < node->n; i++) fprintf(f, ", (%d; %p)", node->next[i].letter, node->next[i].child); fprintf(f, "]\n"); } } } python-pyahocorasick_1.4.1.orig/trienode.h0000644000000000000000000000424113417131365015631 0ustar00/* This is part of pyahocorasick Python module. 
Trie node declarations Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #ifndef ahocorasick_trienode_h_included #define ahocorasick_trienode_h_included #include "common.h" struct TrieNode; #pragma pack(push) #pragma pack(1) typedef struct Pair { TRIE_LETTER_TYPE letter; ///< edge label struct TrieNode* child; ///< next pointer } Pair; #pragma pack(pop) /* links to children nodes are stored in dynamic table */ typedef struct TrieNode { union { PyObject* object; ///< valid when kind = STORE_ANY Py_uintptr_t integer; ///< valid when kind in [STORE_LENGTH, STORE_INTS] } output; ///< output function, valid when eow is true struct TrieNode* fail; ///< fail node #if TRIE_LETTER_SIZE == 1 uint16_t n; ///< length of next #else uint32_t n; ///< length of next #endif uint8_t eow; ///< end of word marker Pair* next; ///< table of letters and associated next pointers } TrieNode; typedef enum { MEMORY_ERROR, TRUE, FALSE } TristateResult; /* allocate new node */ static TrieNode* trienode_new(const char eow); /* free node */ static void trienode_free(TrieNode* node); /* returns child node linked by edge labelled with letter */ static TrieNode* PURE trienode_get_next(TrieNode* node, const TRIE_LETTER_TYPE letter); /* link with child node by edge labelled with letter */ static TrieNode* trienode_set_next(TrieNode* node, const TRIE_LETTER_TYPE letter, TrieNode* child); /* remove link to given children */ static TristateResult trienode_unset_next_pointer(TrieNode* node, TrieNode* child); static TrieNode* PURE trienode_get_ith_unsafe(TrieNode* node, size_t letter); static TRIE_LETTER_TYPE PURE trieletter_get_ith_unsafe(TrieNode* node, size_t letter); #define trienode_is_leaf(node) ((node)->n == 0) static void trienode_dump_to_file(TrieNode* node, FILE* f); #define trienode_dump(node) trienode_dump_to_file(node, stdout) #ifdef DEBUG_LAYOUT void trienode_dump_layout(); #endif #endif 
python-pyahocorasick_1.4.1.orig/unittests.py0000644000000000000000000012267713642670543016307 0ustar00# -*- coding: utf-8 -*- """ This is part of pyahocorasick Python module. Unit tests for the C-based ahocorasick module. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl/proj/pyahocorasick/ License : public domain """ import sys import os import unittest import ahocorasick try: import _pickle except ImportError: _pickle = None if ahocorasick.unicode: conv = lambda x: x else: if sys.version_info.major >= 3: conv = lambda x: bytes(x, 'ascii') else: conv = lambda x: x class TestCase(unittest.TestCase): def __init__(self, *args): super(TestCase, self).__init__(*args) if not hasattr(self, 'assertRaisesRegex'): # fixup for Py2 self.assertRaisesRegex = self.assertRaisesRegexp def assertEmpty(self, collection): self.assertEqual(0, len(collection)) def assertNotEmpty(self, collection): self.assertGreater(len(collection), 0) class TestConstructor(TestCase): def test_constructor_wrong_store(self): with self.assertRaisesRegex(ValueError, "store value must be one of.*"): ahocorasick.Automaton(-42) def test_constructor_wrong_key_type(self): with self.assertRaisesRegex(ValueError, "key_type must have value.*"): ahocorasick.Automaton(ahocorasick.STORE_ANY, -42) class TestTrieStorePyObjectsBase(TestCase): def setUp(self): self.A = ahocorasick.Automaton(); self.words = "word python aho corasick \x00\x00\x00".split() self.inexisting = "test foo bar dword".split() class TestTrieMethods(TestTrieStorePyObjectsBase): "Test basic methods related to trie structure" def test_empty(self): A = self.A self.assertTrue(A.kind == ahocorasick.EMPTY) self.assertTrue(len(A) == 0) def test_add_word(self): A = self.A self.assertTrue(A.kind == ahocorasick.EMPTY) n = 0 for word in self.words: n += 1 A.add_word(conv(word), None) self.assertEqual(A.kind, ahocorasick.TRIE) self.assertEqual(len(A), n) # dupliacted entry A.add_word(conv(self.words[0]), None) self.assertTrue(A.kind == 
ahocorasick.TRIE) self.assertTrue(len(A) == n) def test_add_empty_word(self): if ahocorasick.unicode: self.assertFalse(self.A.add_word("", None)) else: self.assertFalse(self.A.add_word(b"", None)) self.assertEqual(len(self.A), 0) self.assertEqual(self.A.kind, ahocorasick.EMPTY) def test_clear(self): A = self.A self.assertTrue(A.kind == ahocorasick.EMPTY) for w in self.words: A.add_word(conv(w), w) self.assertEqual(len(A), len(self.words)) A.clear() self.assertEqual(A.kind, ahocorasick.EMPTY) self.assertEqual(len(A), 0) def test_exists(self): A = self.A for w in self.words: A.add_word(conv(w), w) for w in self.words: self.assertTrue(A.exists(conv(w))) for w in self.inexisting: self.assertFalse(A.exists(conv(w))) def test_contains(self): A = self.A for w in self.words: A.add_word(conv(w), w) for w in self.words: self.assertTrue(conv(w) in A) for w in self.inexisting: self.assertTrue(conv(w) not in A) def test_match(self): A = self.A for word in self.words: A.add_word(conv(word), word) prefixes = "w wo wor word p py pyt pyth pytho python \x00 \x00\x00 \x00\x00\x00".split() for word in prefixes: self.assertTrue(A.match(conv(word))) inexisting = "wa apple pyTon \x00\x00\x00\x00".split() for word in inexisting: self.assertFalse(A.match(conv(word))) def test_get1(self): A = self.A for i, w in enumerate(self.words): A.add_word(conv(w), i + 1) for i, w in enumerate(self.words): self.assertEqual(A.get(conv(w)), i + 1) def test_get2(self): A = self.A for i, w in enumerate(self.words): A.add_word(conv(w), i + 1) for w in self.inexisting: self.assertEqual(A.get(conv(w), None), None) def test_get3(self): A = self.A for i, w in enumerate(self.words): A.add_word(conv(w), i + 1) for w in self.inexisting: with self.assertRaises(KeyError): A.get(conv(w)) def test_get_from_an_empty_automaton(self): A = ahocorasick.Automaton() r = A.get('foo', None) self.assertEqual(r, None) def test_longest_prefix(self): A = self.A for i, w in enumerate(self.words): A.add_word(conv(w), i + 1) # there 
is "word" self.assertEqual(A.longest_prefix(conv("wo")), 2) self.assertEqual(A.longest_prefix(conv("working")), 3) self.assertEqual(A.longest_prefix(conv("word")), 4) self.assertEqual(A.longest_prefix(conv("wordbook")), 4) self.assertEqual(A.longest_prefix(conv("void")), 0) self.assertEqual(A.longest_prefix(conv("")), 0) def test_stats_have_valid_structure(self): A = self.A for i, w in enumerate(self.words): A.add_word(conv(w), i + 1) platform_dependent = None reference = { 'longest_word': 8, 'total_size': platform_dependent, 'sizeof_node': platform_dependent, 'nodes_count': 25, 'words_count': 5, 'links_count': 24 } s = A.get_stats() self.assertEqual(len(s), len(reference)) for key in reference: self.assertIn(key, s) for key in (key for key in reference if reference[key] != platform_dependent): self.assertEqual(reference[key], s[key]) def test_stats_for_empty_tire_are_empty(self): s = self.A.get_stats() self.assertTrue(len(s) > 0) for key in s: if key != "sizeof_node": self.assertEqual(s[key], 0) class TestTrieRemoveWord(TestTrieStorePyObjectsBase): def test_remove_word_from_empty_trie(self): self.assertFalse(self.A.remove_word("test")) def test_remove_existing_word(self): A = self.A words = ["he", "her", "hi", "him", "his"] for w in words: A.add_word(conv(w), w) expected_len = len(A) for w in words: self.assertTrue(self.A.remove_word(w)) self.assertFalse(self.A.exists(w)) expected_len -= 1 self.assertEqual(expected_len, len(A)) def test_remove_inexisting_word(self): A = self.A words = ["he", "her", "hi", "him", "his"] for w in words: A.add_word(conv(w), w) expected_len = len(A) for w in ["cat", "dog", "tree"]: self.assertFalse(self.A.exists(w)) self.assertFalse(self.A.remove_word(w)) self.assertEqual(expected_len, len(A)) def test_remove__case1(self): words = ["k", "ki", "kit", "kitt", "kitte", "kitten" , "kitc", "kitch", "kitche", "kitchen"] A = self.A for w in words: A.add_word(conv(w), w) expected_set = set(words) for w in words: 
class TestTriePop(TestTrieStorePyObjectsBase):
    """Exercise Automaton.pop(): removal that returns the stored value."""

    def test_pop_from_empty_trie(self):
        # popping from an empty automaton must raise
        with self.assertRaises(KeyError):
            self.A.pop("test")

    def test_pop_existing_word(self):
        words = ["he", "her", "hi", "him", "his"]
        for word in words:
            self.A.add_word(conv(word), word)

        remaining = len(self.A)
        for word in words:
            self.assertEqual(word, self.A.pop(word))
            self.assertFalse(self.A.exists(word))
            remaining -= 1
            self.assertEqual(remaining, len(self.A))

    def test_pop_inexisting_word(self):
        words = ["he", "her", "hi", "him", "his"]
        for word in words:
            self.A.add_word(conv(word), word)

        size_before = len(self.A)
        for missing in ["cat", "dog", "tree"]:
            with self.assertRaises(KeyError):
                self.A.pop(missing)
            self.assertEqual(size_before, len(self.A))

    def test_pop__case1(self):
        # pop in insertion order, verifying the key set after each step
        words = ["k", "ki", "kit", "kitt", "kitte", "kitten",
                 "kitc", "kitch", "kitche", "kitchen"]
        for word in words:
            self.A.add_word(conv(word), word)

        expected = set(words)
        for word in words:
            self.assertEqual(word, self.A.pop(word))
            expected.discard(word)
            self.assertEqual(expected, set(self.A.keys()))
            self.assertEqual(len(expected), len(self.A))

    def test_pop__case2(self):
        # pop in reverse insertion order, verifying the key set each time
        words = ["k", "ki", "kit", "kitt", "kitte", "kitten",
                 "kitc", "kitch", "kitche", "kitchen"]
        for word in words:
            self.A.add_word(conv(word), word)

        expected = set(words)
        for word in reversed(words):
            self.assertEqual(word, self.A.pop(word))
            expected.discard(word)
            self.assertEqual(expected, set(self.A.keys()))
            self.assertEqual(len(expected), len(self.A))

    def test_pop_changes_type_of_automaton(self):
        # a successful pop downgrades AHOCORASICK back to TRIE;
        # a failed pop leaves the kind untouched
        for word in ["he", "her", "hi", "him", "his"]:
            self.A.add_word(conv(word), word)

        self.A.make_automaton()
        self.assertEqual(ahocorasick.AHOCORASICK, self.A.kind)

        with self.assertRaises(KeyError):
            self.A.pop("inexisting")
        self.assertEqual(ahocorasick.AHOCORASICK, self.A.kind)

        self.assertEqual("hi", self.A.pop("hi"))
        self.assertEqual(ahocorasick.TRIE, self.A.kind)
word) I = list(map(conv, "he her hers".split())) L = [x for x in A.keys(conv("he"))] self.assertEqual(len(L), len(I)) self.assertEqual(set(L), set(I)) def test_items_with_prefix_invalid(self): A = self.A words = "he she her hers star ham".split() for word in words: A.add_word(conv(word), word) I = [] L = [x for x in A.keys(conv("cat"))] self.assertEqual(len(L), len(I)) self.assertEqual(set(L), set(I)) def test_items_with_valid_pattern(self): A = self.A words = "abcde aXcd aZcdef aYc Xbcdefgh".split() for word in words: A.add_word(conv(word), word) I = ["aXcd"] L = [x for x in A.keys(conv("a?cd"), conv("?"))] self.assertEqual(set(I), set(L)) def test_items_with_valid_pattern2(self): A = self.A words = "abcde aXcde aZcdef aYc Xbcdefgh".split() for word in words: A.add_word(conv(word), word) L = [x for x in A.keys(conv("a?c??"), conv("?"), ahocorasick.MATCH_EXACT_LENGTH)] I = ["abcde", "aXcde"] self.assertEqual(set(I), set(L)) L = [x for x in A.keys(conv("a?c??"), conv("?"), ahocorasick.MATCH_AT_MOST_PREFIX)] I = ["aYc", "abcde", "aXcde"] self.assertEqual(set(I), set(L)) L = [x for x in A.keys(conv("a?c??"), conv("?"), ahocorasick.MATCH_AT_LEAST_PREFIX)] I = ["abcde", "aXcde", "aZcdef"] self.assertEqual(set(I), set(L)) def test_items_wrong_wildcrard(self): with self.assertRaisesRegex(ValueError, "Wildcard must be a single character.*"): self.A.keys(conv("anything"), conv("??")) def test_items_wrong_match_enum(self): with self.assertRaisesRegex(ValueError, "The optional how third argument must be one of"): self.A.keys(conv("anything"), conv("?"), -42) class TestTrieIteratorsInvalidate(TestTrieStorePyObjectsBase): "Test invalidating iterator when trie is changed" def helper(self, method): A = self.A for i, w in enumerate(self.words): A.add_word(conv(w), i + 1) it = method() w = next(it) # word already exists, just change associated value # iterator is still valid A.add_word(conv(self.words[0]), 2) w = next(it) # new word, iterator is invalidated A.add_word(conv("should 
fail"), 1) with self.assertRaises(ValueError): w = next(it) def test_keys(self): self.helper(self.A.keys) def test_values(self): self.helper(self.A.values) def test_items(self): self.helper(self.A.items) class TestAutomatonBase(TestCase): def setUp(self): self.A = ahocorasick.Automaton(); self.words = "he her hers she".split() self.string = "_sherhershe_" self.correct_positons = [ (3, "she"), (3, "he"), (4, "her"), (6, "he"), (7, "her"), (8, "hers"), (10, "she"), (10, "he") ] def add_words(self): for word in self.words: self.A.add_word(conv(word), word) return self.A def add_words_and_make_automaton(self): self.add_words() self.A.make_automaton() return self.A class TestAutomatonConstruction(TestAutomatonBase): "Test converting trie to Aho-Corasick automaton" def test_make_automaton1(self): A = self.A self.assertEqual(A.kind, ahocorasick.EMPTY) A.make_automaton() # empty trie is never converted to automaton self.assertEqual(A.kind, ahocorasick.EMPTY) def test_make_automaton2(self): A = self.A self.assertEqual(A.kind, ahocorasick.EMPTY) self.add_words() self.assertEqual(A.kind, ahocorasick.TRIE) A.make_automaton() self.assertEqual(A.kind, ahocorasick.AHOCORASICK) def test_make_automaton3(self): A = self.A self.assertEqual(A.kind, ahocorasick.EMPTY) self.add_words() self.assertEqual(A.kind, ahocorasick.TRIE) A.make_automaton() self.assertEqual(A.kind, ahocorasick.AHOCORASICK) A.add_word(conv("rollback?"), True) self.assertEqual(A.kind, ahocorasick.TRIE) class TestAutomatonSearch(TestAutomatonBase): "Test searching using constructed automaton (method find_all)" def test_find_all1(self): "no action is performed until automaton is constructed" A = self.A self.assertEqual(A.kind, ahocorasick.EMPTY) self.assertEqual(A.find_all(self.string, conv("any arg")), None) A.add_word(conv("word"), None) self.assertEqual(A.kind, ahocorasick.TRIE) self.assertEqual(A.find_all(self.string, conv("any arg")), None) def test_find_all2(self): A = self.add_words_and_make_automaton() L = [] 
def callback(index, word): L.append((index, word)) A.find_all(conv(self.string), callback) C = self.correct_positons self.assertEqual(L, C) def test_find_all3(self): A = self.add_words_and_make_automaton() L = [] def callback(index, word): L.append((index, word)) start = 4 end = 9 L = [] A.find_all(conv(self.string[start:end]), callback) C = [(pos + start, word) for pos, word in L] L = [] A.find_all(conv(self.string), callback, start, end) self.assertEqual(L, C) def test_find_all__not_a_callable_object(self): A = self.add_words_and_make_automaton() with self.assertRaisesRegex(TypeError, "The callback argument must be a callable such as a function."): A.find_all(conv(self.string), None) def test_find_all__wrong_range__case_1(self): A = self.add_words_and_make_automaton() L = [] def callback(index, word): L.append((index, word)) with self.assertRaisesRegex(IndexError, "end index not in range 0..12"): A.find_all(conv(self.string), callback, 0, len(self.string) + 5) def test_find_all__wrong_range__case_2(self): A = self.add_words_and_make_automaton() L = [] def callback(index, word): L.append((index, word)) with self.assertRaisesRegex(IndexError, "start index not in range 0..12"): A.find_all(conv(self.string), callback, -len(self.string) - 1, 3) def test_find_all__end_index_not_given(self): A = self.add_words_and_make_automaton() L = [] def callback(index, word): L.append((index, word)) A.find_all(conv(self.string), callback, 0) def test_find_all__start_is_negative(self): A = self.add_words_and_make_automaton() L = [] def callback(index, word): L.append((index, word)) A.find_all(conv(self.string), callback, -3, 4) def test_find_all__end_is_negative(self): A = self.add_words_and_make_automaton() L = [] def callback(index, word): L.append((index, word)) A.find_all(conv(self.string), callback, 0, -1) class TestAutomatonIterSearch(TestAutomatonBase): "Test searching using constructed automaton (iterator)" def test_iter1(self): A = self.A self.assertEqual(A.kind, 
ahocorasick.EMPTY) with self.assertRaises(AttributeError): A.iter(conv(self.string)) A.add_word(conv("word"), None) self.assertEqual(A.kind, ahocorasick.TRIE) with self.assertRaises(AttributeError): A.iter(conv(self.string)) def test_iter2(self): A = self.add_words_and_make_automaton() L = [] for index, word in A.iter(conv(self.string)): L.append((index, word)) C = self.correct_positons self.assertEqual(L, C) def test_iter3(self): A = self.add_words_and_make_automaton() start = 4 end = 9 C = [] for index, word in A.iter(conv(self.string[start:end])): C.append((index + start, word)) L = [] for index, word in A.iter(conv(self.string), start, end): L.append((index, word)) self.assertEqual(L, C) def test_iter_set(self): A = self.add_words_and_make_automaton() parts = "_sh erhe rshe _".split() expected = { '_sh' : [], 'erhe' : [(3, 'she'), (3, 'he'), (4, 'her'), (6, 'he')], 'rshe' : [(7, 'her'), (8, 'hers'), (10, 'she'), (10, 'he')], '_' : [] } it = A.iter(conv("")) result = {} for part in parts: it.set(conv(part)) result[part] = [] for item in it: result[part].append(item) self.assertEqual(expected, result) def test_iter_set__with_reset(self): A = self.add_words_and_make_automaton() expected = { 'he' : [(1, 'he')], 'she' : [(2, 'she'), (2, 'he')], } it = A.iter(conv("")) result = {} for part in ["he", "she"]: it.set(conv(part), True) result[part] = [] for item in it: result[part].append(item) self.assertEqual(expected, result) def test_iter_compare_with_find_all(self): A = self.add_words_and_make_automaton() # results from find_all L = [] def callback(index, word): L.append((index, word)) A.find_all(conv(self.string), callback) # results from iterator C = [] for index, word in A.iter(conv(self.string)): C.append((index, word)) self.assertEqual(L, C) def test_iter_wrong_argument_type(self): A = self.add_words_and_make_automaton() with self.assertRaisesRegex(TypeError, "string required"): A.iter(None) class TestAutomatonIterSearchWithIgnoreWhiteSpace(TestAutomatonBase): 
"Test searching using constructed automaton (iterator)" def setUp(self): self.A = ahocorasick.Automaton() self.words = "he her hers she".split() self.string = "_sh e rher she_" self.correct_positons = [ (4, "she"), (4, "he"), (6, "her"), (8, "he"), (9, "her"), (11, "hers"), (13, "she"), (13, "he") ] self.correct_positons_start_12 = [ (13, "he") ] def test_iter1(self): self.add_words_and_make_automaton() A = self.A self.assertEqual(A.kind, ahocorasick.AHOCORASICK) L = [] for index, word in A.iter(conv(self.string), ignore_white_space=True): L.append((index, word)) self.assertEqual(L, self.correct_positons) def test_iter2(self): self.add_words_and_make_automaton() A = self.A self.assertEqual(A.kind, ahocorasick.AHOCORASICK) L = [] for index, word in A.iter(conv(self.string), ignore_white_space=True, start=12): L.append((index, word)) self.assertEqual(L, self.correct_positons_start_12) def test_wrong_keyword(self): self.add_words_and_make_automaton() A = self.A self.assertEqual(A.kind, ahocorasick.AHOCORASICK) with self.assertRaises(TypeError): A.iter(conv(self.string), ignore_white_space2=True) class TestAutomatonIterInvalidate(TestAutomatonBase): "Test if searching iterator is invalidated when trie/automaton change" def test_iter1(self): A = self.add_words_and_make_automaton() it = A.iter(conv(self.string)) w = next(it) A.add_word(conv("should fail"), 1) with self.assertRaises(ValueError): w = next(it) def test_iter2(self): A = self.add_words_and_make_automaton() it = A.iter(conv(self.string)) w = next(it) A.clear() with self.assertRaises(ValueError): w = next(it) print_dumps = False class TestPickle(TestAutomatonBase): "Test pickling/unpickling" def test_pickle(self): import pickle A = self.add_words_and_make_automaton(); reduced = A.__reduce__() self.assertEqual(len(reduced), 2) if print_dumps: print(pickle.dumps(A)) def test_unpickle(self): import pickle A = self.add_words_and_make_automaton(); dump = pickle.dumps(A) B = pickle.loads(dump) 
self.compare_automatons(A, B) def test_unicode(self): # sample Russian words from issue #8 import pickle test_sentences_rus = ["!ASM Print", "!ASM Print, tyre компания er", "!ASM Print, рекламно-производственная компания rr", "!Action Pact!", "!T.O.O.H.!", "!YES, лингвистический центр", "!ts, магазин", "!ФЕСТ", '"100-th" department store', '"1000 мелочей"', '"1001 мелочь"', '"19 отряд Федеральной противопожарной службы по Ленинградской области"', '"У Друзей"', '"ШТОРЫ и не только..."'] A = ahocorasick.Automaton() for sentences in test_sentences_rus[-7:]: for index, word in enumerate(sentences.split(' ')): A.add_word(word, (index, word)) dump = pickle.dumps(A) B = pickle.loads(dump) self.compare_automatons(A, B) def test_empty(self): import pickle A = ahocorasick.Automaton() dump = pickle.dumps(A) B = pickle.loads(dump) self.compare_automatons(A, B) def compare_automatons(self, A, B): if print_dumps: print([x for x in B.items()]) print([x for x in A.items()]) self.assertEqual(len(A), len(B)) for item in zip(A.items(), B.items()): (AK, AV), (BK, BV) = item self.assertEqual(AK, BK) self.assertEqual(AV, BV) class TestPickleStoreInts(TestCase): "Test pickling/unpickling for automaton of kind STORE_INTS/STORE_LEN" def add_words_and_make_automaton(self): A = ahocorasick.Automaton(ahocorasick.STORE_INTS) words = "tree trie bark branch barrier brag".split() for index, word in enumerate(words): A.add_word(word, index) A.make_automaton() return A def test_pickle_and_unpickle(self): import pickle A = self.add_words_and_make_automaton(); dump = pickle.dumps(A) B = pickle.loads(dump) self.compare_automatons(A, B) def compare_automatons(self, A, B): if print_dumps: print([x for x in B.items()]) print([x for x in A.items()]) self.assertEqual(len(A), len(B)) for item in zip(A.items(), B.items()): (AK, AV), (BK, BV) = item self.assertEqual(AK, BK) self.assertEqual(AV, BV) class TestTrieStoreInts(TestCase): "Test storing plain ints as values (instead of python objects)" def 
setUp(self): self.A = ahocorasick.Automaton(ahocorasick.STORE_INTS); self.words = "word python aho corasick \x00\x00\x00".split() def test_add_word1(self): A = self.A # by default next values are stored for word in self.words: A.add_word(conv(word)) I = list(range(1, len(self.words) + 1)) L = [A.get(conv(word)) for word in self.words] self.assertEqual(I, L) def test_add_word2(self): A = self.A # store arbitrary ints for i, word in enumerate(self.words): A.add_word(conv(word), i + 123) I = list(range(123, 123 + len(self.words))) L = [A.get(conv(word)) for word in self.words] self.assertEqual(I, L) def test_add_word3(self): # not a number with self.assertRaises(TypeError): self.A.add_word(conv("xyz"), None) def test_iter(self): A = self.A for word in self.words: A.add_word(conv(word)); I = set(range(1, len(A) + 1)) L1 = [val for val in A.values()] L2 = [val for key, val in A.items()] self.assertEqual(L1, L2) self.assertEqual(set(L1), I) def test_find_all_and_iter(self): words = "he her hers she".split() string = "_sherhershe_" A = self.A for word in words: A.add_word(conv(word)) A.make_automaton() # find_all() C = [] def callback(index, value): C.append((index, value)) A.find_all(conv(string), callback); # iter() L = [(index, value) for index, value in A.iter(conv(string))] # self.assertEqual(C, L) class TestTrieStoreLengths(TestCase): """Test storing plain ints -- length of words --- as values (instead of python objects)""" def setUp(self): self.A = ahocorasick.Automaton(ahocorasick.STORE_LENGTH); self.words = "word python aho corasick \x00\x00\x00".split() def test_add_word1(self): A = self.A # by default next values are stored for word in self.words: A.add_word(conv(word)) for key, value in A.items(): self.assertEqual(len(key), value) class TestSizeOf(TestCase): def setUp(self): self.A = ahocorasick.Automaton(); words = "word python aho corasick tree bark branch root".split() for word in words: self.A.add_word(conv(word), 1) def test_sizeof(self): size1 = 
sys.getsizeof(self.A) # grow memory self.A.add_word("kitten", "fluffy") size2 = sys.getsizeof(self.A) # just change the assigned value, no changes to the trie structure self.A.add_word("word", "other value") size3 = sys.getsizeof(self.A) self.assertTrue(size2 > size1) self.assertTrue(size3 == size2) class TestBugAutomatonSearch(TestAutomatonBase): """Bug in search""" def setUp(self): self.A = ahocorasick.Automaton() self.words = ['GT-C3303', 'SAMSUNG-GT-C3303K/'] def test_bug(self): self.add_words_and_make_automaton() text = 'SAMSUNG-GT-C3303i/1.0 NetFront/3.5 Profile/MIDP-2.0 Configuration/CLDC-1.1' res = list(self.A.iter(conv(text))) self.assertEqual([(15, 'GT-C3303')], res) class TestIntSequenceBase(TestCase): def setUp(self): self.A = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_SEQUENCE); class TestIntSequence__TrieMethods(TestIntSequenceBase): def test_add__case_1(self): A = self.A ret = A.add_word((1, 2, 3), "foo") self.assertTrue(ret) self.assertTrue(A.kind == ahocorasick.TRIE) self.assertEqual(len(A), 1) self.assertTrue((1, 2, 3) in A) def test_add__case_2(self): A = self.A A.add_word((1, 2, 3), "foo") ret = A.add_word((1, 2, 3), "bar") self.assertFalse(ret) def test_add__case_3(self): A = self.A A.add_word((1, 2, 3), "foo") A.add_word((1, 2, 3, 4, 5), "bar") A.add_word((1, 3, 4, 5), "baz") self.assertEqual(len(A), 3); self.assertEqual(A.get((1, 2, 3)), "foo"); self.assertEqual(A.get((1, 2, 3, 4, 5)), "bar"); self.assertEqual(A.get((1, 3, 4, 5)), "baz"); def test_add__case_4(self): A = self.A ret = A.add_word((), "foo") self.assertFalse(ret) def test_add__case_5__wrong_argument_type(self): A = self.A with self.assertRaises(TypeError) as e: A.add_word("hello!", "foo") self.assertEqual(str(e.exception), "argument is not a supported sequence type") def test_add__case_6__wrong_item_type(self): A = self.A with self.assertRaises(ValueError) as e: A.add_word((1, 2, "hello!"), "foo") self.assertEqual(str(e.exception), "item #2 is not a number") def 
test_add__case_7__wrong_value(self): A = self.A with self.assertRaises(ValueError) as e: A.add_word((1, -1, 12), "foo") errmsg = str(e.exception) msgs = [ "item #1: value -1 outside range [0..65535]", "item #1: value -1 outside range [0..4294967295]", ] self.assertIn(errmsg, msgs) def test_add__case_8__wrong_value(self): A = self.A with self.assertRaises(ValueError) as e: A.add_word((2**42, 0, 12), "foo") # Depending on python's version the message might be different, # but the type remains the same. errmsg = str(e.exception) msgs = [ "item #0: value 4398046511104 outside range [0..65535]", "item #0: value 4398046511104 outside range [0..4294967295]", "item #0 is not a number", ] self.assertIn(errmsg, msgs) def test_match(self): A = self.A ret = A.add_word((1, 2, 3), "foo") self.assertTrue(A.match((1,))) self.assertTrue(A.match((1, 2))) self.assertTrue(A.match((1, 2, 3))) def test_longest_prefix(self): A = self.A ret = A.add_word((1, 2, 3, 4, 5, 6), "foo") self.assertEqual(A.longest_prefix((1, 2, 3, 111, 1111, 11111)), 3); self.assertEqual(A.longest_prefix((111, 1111, 11111)), 0); def test_iter1(self): A = self.A A.add_word((1, 2, 3), "foo") A.add_word((2, 3, 4, 5), "bar") A.add_word((2, 3, 5), "baz") A.make_automaton() L = [(index, value) for index, value in A.iter((1, 2, 3, 5))] self.assertEqual(L, [ (2, "foo"), (3, "baz"), ]) def test_iter2(self): A = self.A A.add_word((43, 89), (43, 89)) A.add_word((43, 89, 64), (43, 89, 64)) A.add_word((89, 64), (89, 64)) A.add_word((89, 100), (89, 100)) A.make_automaton() L = [ (index, value) for index, value in A.iter((80, 80, 43, 89, 90, 89, 64, 100, 43, 89, 100)) ] self.assertEqual(L, [ (3, (43, 89)), (6, (89, 64)), (9, (43, 89)), (10, (89, 100)), ]) def test_iter_wrong_argument_type(self): A = self.A A.add_word((89, 100), (89, 100)) A.make_automaton() with self.assertRaisesRegex(TypeError, "tuple required"): self.A.iter(None) class TestDump(TestAutomatonBase): def test_dump_empty(self): self.assertIsNone(self.A.dump()) 
def test_dump_trie(self): self.add_words() ret = self.A.dump() self.assertEqual(3, len(ret)) self.assertNotEmpty(ret[0]) # list of nodes self.assertNotEmpty(ret[1]) # list of edges self.assertEmpty(ret[2]) # list of fail links -- empty, if not an automaton def test_dump_automaton(self): self.add_words_and_make_automaton() ret = self.A.dump() self.assertEqual(3, len(ret)) self.assertNotEmpty(ret[0]) # list of nodes self.assertNotEmpty(ret[1]) # list of edges self.assertNotEmpty(ret[2]) # list of fail links class TestIssue53(TestCase): """ Problems with handling of UCS-2 encoding """ def test_case1(self): # test contributed by @woakesd (David Woakes) a = ahocorasick.Automaton() a.add_word('test', 'test') a.make_automaton() test_string = 'test 🙈 test?!' # wrongly calculated matching position for item in a.iter(test_string): start = item[0] - len(item[1]) + 1 match = test_string[start:item[0] + 1] self.assertEqual(match, "test") def test_case2(self): a = ahocorasick.Automaton() a.add_word('test', 'test') a.make_automaton() test_string = '🙈' * 1000 # wrongly calculated the input's length for item in a.iter(test_string): pass class TestIssue68(TestCase): """ Test problems with pickling """ def test_case1(self): if _pickle is None: print("module _pickle not available") return A = ahocorasick.Automaton() for i in range(0, 65): A.add_word(str(i), (i, i)) path = 'TestIssue68.test_case1' with open(path, 'wb') as f: _pickle.dump(A, f) with open(path, 'rb') as f: _pickle.load(f) try: os.unlink(path) except: pass class TestLoadSave(TestAutomatonBase): def __init__(self, *args): super(TestAutomatonBase, self).__init__(*args) if os.path.isdir("/dev/shm"): tmp = "/dev/shm" else: tmp = "/tmp" self.path = conv(os.path.join(tmp, "test.dat")) def test_save__invalid_number_of_arguments(self): A = self.add_words_and_make_automaton(); with self.assertRaisesRegex(ValueError, "expected exactly two arguments"): A.save() def test_save__invalid_argument_1(self): A = 
self.add_words_and_make_automaton(); with self.assertRaisesRegex(TypeError, "the first argument must be a string"): A.save(None, None) def test_save__invalid_argument_2(self): A = self.add_words_and_make_automaton(); with self.assertRaisesRegex(TypeError, "the second argument must be a callable object"): A.save(self.path, None) def test_load__invalid_number_of_arguments(self): with self.assertRaisesRegex(ValueError, "expected exactly two arguments"): ahocorasick.load() def test_load__invalid_argument_1(self): with self.assertRaisesRegex(TypeError, "the first argument must be a string"): ahocorasick.load(None, None) def test_load__invalid_argument_2(self): with self.assertRaisesRegex(TypeError, "the second argument must be a callable object"): ahocorasick.load("/dev/shm/test.dump", None) def test_save(self): import pickle A = self.add_words_and_make_automaton(); A.save(self.path, pickle.dumps) def test_save_and_load_empty(self): import pickle A = ahocorasick.Automaton() A.save(self.path, pickle.dumps) B = ahocorasick.load(self.path, pickle.loads) self.compare_automatons(A, B) def test_save_and_load_trie(self): import pickle A = self.add_words() A.save(self.path, pickle.dumps) B = ahocorasick.load(self.path, pickle.loads) self.compare_automatons(A, B) def test_save_and_load_automaton(self): import pickle A = self.add_words_and_make_automaton(); A.save(self.path, pickle.dumps) B = ahocorasick.load(self.path, pickle.loads) self.compare_automatons(A, B) def test_save_ints(self): A = ahocorasick.Automaton(ahocorasick.STORE_INTS) with self.assertRaisesRegex(ValueError, "expected exactly one argument"): A.save(self.path, None) def test_save_and_load_ints(self): import pickle A = ahocorasick.Automaton(ahocorasick.STORE_INTS) for i, word in enumerate(conv("he she her cat car carriage zoo")): A.add_word(word, i) A.save(self.path) B = ahocorasick.load(self.path, pickle.loads) self.compare_automatons(A, B) def compare_automatons(self, A, B): if print_dumps: print([x for x in 
B.items()]) print([x for x in A.items()]) self.assertEqual(len(A), len(B)) A = list(A.items()) B = list(B.items()) for item in zip(A, B): (AK, AV), (BK, BV) = item self.assertEqual(AK, BK) self.assertEqual(AV, BV) class TestLongIterString(TestAutomatonBase): def test_match(self): A = ahocorasick.Automaton(); for word in "he here her".split(): A.add_word(word, word) A.make_automaton() result = list(A.iter_long("he here her")) self.assertEqual(result[0], (1, "he")) self.assertEqual(result[1], (6, "here")) self.assertEqual(result[2], (10, "her")) class TestLongIterSequence(TestAutomatonBase): def test_match(self): A = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_SEQUENCE); for word in [(1, 2), (1, 2, 3), (1, 2, 3, 4)]: A.add_word(word, word) A.make_automaton() result = list(A.iter_long((0, 1, 2, 3, 4, 0, 0, 1, 2, 0, 1, 3, 1, 2, 3, 0))) # ^^^^^^^^^^ ^^^^ ^^^^^^^ # index 4 8 14 self.assertEqual(result[0], (4, (1, 2, 3, 4))) self.assertEqual(result[1], (8, (1, 2))) self.assertEqual(result[2], (14, (1, 2, 3))) if __name__ == '__main__': unittest.main() python-pyahocorasick_1.4.1.orig/unpickle_test.py0000644000000000000000000003207713770620457017111 0ustar00# -*- coding: utf-8 -*- import ahocorasick import unittest import struct import sys class TreeNodeBuilderBase(object): def __init__(self): self.integer = 0 self.fail = 0 self.n = 0 self.eow = 0 self.next = [] def dump(self): assert self.n == len(self.next) next = b'' for letter, node in self.next: next += self.dump_edge(letter, node) return self.dump_node() + next if sys.version_info.major == 3: class TreeNodeBuilderPy3(TreeNodeBuilderBase): def dump_node(self): """ On Debian 64-bit, GCC 7.3 python3: integer : size 8, offset 0 fail : size 8, offset 8 n : size 4, offset 16 eow : size 1, offset 20 padding : size 3 next : size 8, offset 24 -- omitted in dump python2: integer : size 8, offset 0 fail : size 8, offset 8 n : size 4, offset 16 eow : size 1, offset 20 padding : size 1 next : size 8, offset 22 -- 
omitted in dump """ node = struct.pack('=QQIBxxx', self.integer, self.fail, self.n, self.eow) assert len(node) == 24 return node def dump_edge(self, letter, node): assert ord(letter) < 256 b = struct.pack('=IQ', ord(letter), node) assert len(b) == 12 return b TreeNodeBuilder = TreeNodeBuilderPy3 elif sys.version_info.major == 2: class TreeNodeBuilderPy2(TreeNodeBuilderBase): def dump_node(self): """ On Debian 64-bit, GCC 7.3 python2: integer : size 8, offset 0 fail : size 8, offset 8 n : size 4, offset 16 eow : size 1, offset 20 padding : size 3 next : size 8, offset 24 -- omitted in dump """ node = struct.pack('QQIBxxx', self.integer, self.fail, self.n, self.eow) assert len(node) == 24 return node def dump_edge(self, letter, node): assert ord(letter) < 256 b = struct.pack('=HQ', ord(letter), node) assert len(b) == 10 return b TreeNodeBuilder = TreeNodeBuilderPy2 USE_EXACT_RAW = True class TestUnpickleRaw(unittest.TestCase): def __init__(self, *args): super(TestUnpickleRaw, self).__init__(*args) if not hasattr(self, 'assertRaisesRegex'): # fixup for Py2 self.assertRaisesRegex = self.assertRaisesRegexp # raw constructor get 7-tuple (see Automaton.c): # 1. serialized nodes (as list of bytes or strings) # 2. kind # 3. store # 4. key type # 5. word count # 6. length of the longest word # 7. 
python values saved in a trie (if store == ahocorasick.STORE_ANY) def setUp(self): self.count = 0 self.raw = b'' self.kind = ahocorasick.EMPTY self.store = ahocorasick.STORE_ANY self.key_type = ahocorasick.KEY_STRING self.word_count = 0 self.longest = 0 self.values = [] def create_automaton(self, use_exact_raw=False): # alter values that were set in setUp if use_exact_raw: raw = self.raw else: raw = [self.create_raw_count(self.count) + self.raw] args = (raw, self.kind, self.store, self.key_type, self.word_count, self.longest, self.values); return ahocorasick.Automaton(*args) def create_node_builder(self, eow, children): builder = TreeNodeBuilder() builder.next = [(letter, i + 1) for letter, i in children] # starts from 1 builder.n = len(children) builder.eow = eow return builder def create_raw_count(self, n): return struct.pack('Q', n) def create_raw_node(self, eow, children): return self.create_node_builder(eow, children).dump() # -------------------------------------------------- def test__construct_empty(self): A = self.create_automaton() self.assertTrue(A.kind == ahocorasick.EMPTY) self.assertTrue(len(A) == 0) def test__construct_simple_trie(self): r""" trie for set {he, her, his, him, it} #0 -> [h #1 ] -> [e #2*] -> [r #3*] | \-> [i #4 ] -> [s #5*] | \-> [m #6*] | +--> [i #7 ] -> [t #8 ] """ values = ["HE", "HER", "HIS", "HIM", "IT"] node0 = self.create_raw_node(0, [('h', 1), ('i', 7)]) node1 = self.create_raw_node(0, [('e', 2), ('i', 4)]) node2 = self.create_raw_node(1, [('r', 3)]) # HE node3 = self.create_raw_node(1, []) # HER node4 = self.create_raw_node(0, [('s', 5), ('m', 6)]) node5 = self.create_raw_node(1, []) # HIS node6 = self.create_raw_node(1, []) # HIM node7 = self.create_raw_node(0, [('t', 8)]) node8 = self.create_raw_node(1, []) # IT self.count = 9 self.raw = node0 + node1 + node2 + node3 + node4 + node5 + node6 + node7 + node8 self.kind = ahocorasick.TRIE self.values = values self.word_count = 5 A = self.create_automaton() 
self.assertEqual(len(A), 5) self.assertEqual(A.get("he"), "HE") self.assertEqual(A.get("her"), "HER") self.assertEqual(A.get("him"), "HIM") self.assertEqual(A.get("his"), "HIS") self.assertEqual(A.get("it"), "IT") def test__construct_simple_trie__split_across_a_few_chunks(self): r""" trie for set {he, her, his, him, it} #0 -> [h #1 ] -> [e #2*] -> [r #3*] | \-> [i #4 ] -> [s #5*] | \-> [m #6*] | +--> [i #7 ] -> [t #8 ] """ values = ["HE", "HER", "HIS", "HIM", "IT"] node0 = self.create_raw_node(0, [('h', 1), ('i', 7)]) node1 = self.create_raw_node(0, [('e', 2), ('i', 4)]) node2 = self.create_raw_node(1, [('r', 3)]) # HE node3 = self.create_raw_node(1, []) # HER node4 = self.create_raw_node(0, [('s', 5), ('m', 6)]) node5 = self.create_raw_node(1, []) # HIS node6 = self.create_raw_node(1, []) # HIM node7 = self.create_raw_node(0, [('t', 8)]) node8 = self.create_raw_node(1, []) # IT self.count = 9 self.raw = [ self.create_raw_count(2) + node0 + node1, self.create_raw_count(3) + node2 + node3 + node4, self.create_raw_count(1) + node5, self.create_raw_count(3) + node6 + node7 + node8 ] self.kind = ahocorasick.TRIE self.values = values self.word_count = 5 A = self.create_automaton(USE_EXACT_RAW) self.assertEqual(len(A), 5) self.assertEqual(A.get("he"), "HE") self.assertEqual(A.get("her"), "HER") self.assertEqual(A.get("him"), "HIM") self.assertEqual(A.get("his"), "HIS") self.assertEqual(A.get("it"), "IT") def test__construct_wrong_kind(self): self.kind = 10000 with self.assertRaisesRegex(ValueError, "kind value.*"): self.create_automaton() def test__construct_wrong_store(self): self.store = 10000 with self.assertRaisesRegex(ValueError, "store value.*"): self.create_automaton() def test__construct_wrong_key_type(self): self.key_type = 10000 with self.assertRaisesRegex(ValueError, "key_type must have.*"): self.create_automaton() def test__construct_simple_trie__wrong_index(self): """ trie for set {he} #0 -> [h #1*] -> [e #2*] """ node0 = self.create_raw_node(0, [('h', 1)]) 
node1 = self.create_raw_node(1, [('e', 2)]) # expect python value node2 = self.create_raw_node(1, []) # also python value self.count = 3 self.raw = node0 + node1 + node2 self.kind = ahocorasick.TRIE self.values = ["HE"] # but we provide a too short collection self.word_count = 2 with self.assertRaises(IndexError): self.create_automaton() def test__truncated_raw__case_1(self): self.count = 1 # we're saying this is a non-empty trie, but given empty data self.raw = b'' self.kind = ahocorasick.TRIE with self.assertRaisesRegex(ValueError, r"Data truncated \[parsing header of node #0\].*"): self.create_automaton() def test__truncated_raw__case_2(self): """ trie for set {he} #0 -> [h #1 ] -> [e #2*] """ node0 = self.create_raw_node(0, [('h', 1)]) node1 = self.create_raw_node(0, [('e', 2)]) node2 = self.create_raw_node(1, []) raw = node0 + node1 + node2 self.count = 3 self.kind = ahocorasick.TRIE for length in range(len(raw)): self.raw = raw[:length] # truncate data and expect fail with self.assertRaisesRegex(ValueError, "Data truncated.*"): self.create_automaton() def test__malicious_next_pointer(self): """ #0 -> [? 
#1 ] """ node0 = self.create_raw_node(0, [('?', 1)]) node1 = self.create_raw_node(0, [('x', 16)]) # the second node point to non-existent node self.count = 2 self.raw = node0 + node1 self.kind = ahocorasick.TRIE with self.assertRaisesRegex(ValueError, "Node #1 malformed: next link #0 points to.*"): self.create_automaton() def test__malicious_fail_pointer(self): """ trie with just one node """ builder = self.create_node_builder(0, []) builder.fail = 42 self.count = 1 self.raw = builder.dump() self.kind = ahocorasick.TRIE with self.assertRaisesRegex(ValueError, "Node #0 malformed: the fail link points to.*"): self.create_automaton() def test__values_leaks(self): # create not connected nodes, but each hold a value good_nodes = 1000 raw = b'' values = [] for i in range(good_nodes): raw += self.create_raw_node(1, []) values.append(tuple("node %d" % i)) # create the last node that will cause error -- malformed next pointer raw += self.create_raw_node(1, [('_', 10000)]) values.append(tuple("never reached")) self.count = good_nodes + 1 self.raw = raw self.kind = ahocorasick.TRIE self.values = values with self.assertRaises(ValueError): self.create_automaton() def test__wrong_type_of_chunk_container(self): self.count = 9 self.raw = () # this should be a list self.kind = ahocorasick.TRIE self.values = None self.word_count = 5 with self.assertRaisesRegex(TypeError, "Expected list"): A = self.create_automaton(USE_EXACT_RAW) def test__wrong_type_of_chunk(self): self.count = 9 self.raw = [42] # list items must be strings/bytes self.kind = ahocorasick.TRIE self.values = None self.word_count = 5 with self.assertRaisesRegex(ValueError, "Item #0 on the bytes list is not a bytes object"): A = self.create_automaton(USE_EXACT_RAW) def test__wrong_count_of_nodes_in_chunk__case1(self): self.count = 9 self.raw = [ self.create_raw_count(0) # count must be greater than 0 ] self.kind = ahocorasick.TRIE self.values = None self.word_count = 5 with self.assertRaisesRegex(ValueError, r"Nodes 
count for item #0 on the bytes list is not positive \(0\)"): A = self.create_automaton(USE_EXACT_RAW) def test__wrong_count_of_nodes_in_chunk__case2(self): self.count = 9 self.raw = [ self.create_raw_count(-12 & 0xffffffffffffffff) # count must be greater than 0 ] self.kind = ahocorasick.TRIE self.values = None self.word_count = 5 with self.assertRaisesRegex(ValueError, r"Nodes count for item #0 on the bytes list is not positive \(-12\)"): A = self.create_automaton(USE_EXACT_RAW) if __name__ == '__main__': print("WARNING: these tests deal with in-memory representation (see TreeNodeBuilder),") print(" they were meant to test low-level implementation of pickling.") print(" Might segfault on your machine which is not necessary a bug in pyahocorasick.") unittest.main() python-pyahocorasick_1.4.1.orig/unresolved_bugs/0000755000000000000000000000000013050545546017057 5ustar00python-pyahocorasick_1.4.1.orig/update_inlinedoc.py0000644000000000000000000000730414002651246017526 0ustar00from pathlib import Path import sys import os import textwrap import xml.etree.ElementTree as ET def main(): dstpath = Path('src/inline_doc.h') app = Application(dstpath) app.run() HEADER = """#pragma once // DO NOT EDIT. File generated by script update_inlinedoc.py. 
""" class Application(object): def __init__(self, dstpath): self.dstpath = dstpath def run(self): content = HEADER for path, name in self.__get_files(): content += '\n' + self.__format_file(path, name) oldcontent = None if self.dstpath.exists(): oldcontent = self.dstpath.read_text() if content != oldcontent: print("Creating %s" % self.dstpath) self.dstpath.write_text(content) def __format_file(self, path, name): print("Parsing %s" % path) cmd = 'rst2xml %s' % path xml = os.popen(cmd).read() f = Formatter(xml, name) return f.format() def __get_files(self): rootdir = Path('docs') for path in sorted(rootdir.glob("*.rst")): if path.name != 'index.rst': name = path.stem + '_doc' yield (path, name) WIDTH = 60 class Formatter(object): def __init__(self, xml_string, name): self.xml = ET.fromstring(xml_string) self.name = name def format(self): self.lines = [] for node in next(self.xml.iter('document')): if node.tag == 'title': self.format_title(node) elif node.tag == 'paragraph': self.format_paragraph(node) elif node.tag == 'bullet_list': self.format_bullet_list(node) elif node.tag == 'section': break # do not add extra sections else: raise ValueError("tag '%s' not supported" % node.tag) return self.format_c_define() def format_title(self, node): self.lines.append(node.text) def format_paragraph(self, node): self.lines.append('') self.lines.extend(textwrap.wrap(self.normalize(node), width=WIDTH)) def format_bullet_list(self, node): for list_item in node.iter('list_item'): for paragraph in list_item.iter('paragraph'): text = self.normalize(paragraph) lines = textwrap.wrap(text, width=(WIDTH - 2)) for i, line in enumerate(lines): if i == 0: prefix = '- ' else: prefix = ' ' self.lines.append(prefix + line) def normalize(self, node): t = ET.tostring(node, method='text', encoding='unicode') t = t.split() return ' '.join(t) def format_c_define(self): lines = [] prevline = '' # 1. 
do preformatting for line in self.lines: line = line.rstrip() if line == '' and prevline == '': continue # compress multiple empty lines prevline = line line = line.replace(r'\\', r'\\\\') line = line.replace('"', r'\"') lines.append(line) # 2. remove empty lines from the end while lines: if lines[-1] == '': del lines[-1] else: break # 3. add qutations n = len(lines) indent = '\t' result = '#define %s \\\n' % self.name for i, line in enumerate(lines): result += indent if i < n - 1: result += '"%s\\n" \\\n' % line else: result += '"%s"\n' % line return result if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/utils.c0000644000000000000000000002415714002631620015151 0ustar00/* This is part of pyahocorasick Python module. Helpers functions. This file is included directly. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain */ //#define MEMORY_DEBUG #ifdef MEMORY_DEBUG #ifndef MEMORY_DUMP_PATH # define MEMORY_DUMP_PATH "memory.dump" #endif const char* debug_path = MEMORY_DUMP_PATH; FILE* debug_file; int memory_dump = 1; // dump to file int alloc_num = 0; // id of allocation int alloc_fail = -1; // id of allocation that will fail int alloc_trap_on_fail = 0; // rather failing, execute trap (for gdb use) int realloc_num = 0; // id of allocation int realloc_fail = -1; // id of allocation that will fail int realloc_trap_on_fail = 0; // rather failing, execute trap (for gdb use) static int env_getint(const char* name, int def) { const char* val = getenv(name); if (val != NULL) return atoi(val); else return def; } static int env_exists(const char* name) { return (getenv(name) != NULL); } static void initialize_memory_debug(void) { if (env_exists("ALLOC_NODUMP")) { memory_dump = 0; } alloc_fail = env_getint("ALLOC_FAIL", alloc_fail); realloc_fail = env_getint("REALLOC_FAIL", realloc_fail); alloc_trap_on_fail = env_exists("ALLOC_TRAP"); realloc_trap_on_fail = env_exists("REALLOC_TRAP"); if (memory_dump) { debug_file = 
fopen(debug_path, "wt"); if (debug_file == NULL) { PyErr_WarnEx(PyExc_RuntimeWarning, "Cannot open file, logging on stderr", 1); debug_file = stderr; } } } #endif void* memory_alloc(ssize_t size) { #ifdef MEMORY_DEBUG if (alloc_num == alloc_fail) { if (alloc_trap_on_fail) { __builtin_trap(); } printf("DEBUG: allocation #%d failed\n", alloc_num); alloc_num += 1; return NULL; } #endif void* res = PyMem_Malloc(size); #ifdef MEMORY_DEBUG alloc_num += 1; if (memory_dump) fprintf(debug_file, "A %d %p %ld\n", alloc_num, res, size); #endif return res; } void* memory_realloc(void* ptr, size_t size) { #ifdef MEMORY_DEBUG if (realloc_num == realloc_fail) { if (realloc_trap_on_fail) { __builtin_trap(); } printf("DEBUG: reallocation #%d failed\n", realloc_num); realloc_num += 1; return NULL; } #endif void* res = PyMem_Realloc(ptr, size); #ifdef MEMORY_DEBUG realloc_num += 1; if (memory_dump) { fprintf(debug_file, "R %d %p %p %ld\n", realloc_num, ptr, res, size); } #endif return res; } void memory_free(void* ptr) { #ifdef MEMORY_DEBUG if (memory_dump) fprintf(debug_file, "F %p\n", ptr); #endif PyMem_Free(ptr); } void memory_safefree(void* ptr) { if (ptr != NULL) { memory_free(ptr); } } #if !defined(PY3K) || !defined(AHOCORASICK_UNICODE) // define when pymod_get_string makes a copy of string # define INPUT_KEEPS_COPY #endif #if defined INPUT_KEEPS_COPY # define maybe_free(flag, word) memory_free(word); # define maybe_decref(flag, ref) #elif defined PEP393_UNICODE # define maybe_free(flag, word) if (flag) { memory_free(word); } # define maybe_decref(flag, ref) if (ref && !flag) { Py_DECREF(ref); } #else # define maybe_free(flag, word) # define maybe_decref(flag, ref) if (ref) { Py_DECREF(ref); } #endif /* returns bytes or unicode internal buffer */ static PyObject* pymod_get_string(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen, bool* is_copy) { #ifdef INPUT_KEEPS_COPY ssize_t i; char* bytes; #endif #if defined PEP393_UNICODE if (F(PyUnicode_Check)(obj)) { 
PyUnicode_READY(obj); if (PyUnicode_KIND(obj) == PyUnicode_4BYTE_KIND) { *word = (TRIE_LETTER_TYPE*)(PyUnicode_4BYTE_DATA(obj)); *wordlen = PyUnicode_GET_LENGTH(obj); *is_copy = false; Py_INCREF(obj); return obj; } else { *word = PyUnicode_AsUCS4Copy(obj); *wordlen = PyUnicode_GET_LENGTH(obj); *is_copy = true; // No INCREF - we have our copy return obj; } } else { PyErr_SetString(PyExc_TypeError, "string expected"); return NULL; } #elif defined PY3K # ifdef AHOCORASICK_UNICODE if (F(PyUnicode_Check)(obj)) { *word = (TRIE_LETTER_TYPE*)(PyUnicode_AS_UNICODE(obj)); *wordlen = PyUnicode_GET_SIZE(obj); Py_INCREF(obj); return obj; } else { PyErr_SetString(PyExc_TypeError, "string expected"); return NULL; } # else # ifndef INPUT_KEEPS_COPY # error "defines inconsistency" # endif if (F(PyBytes_Check)(obj)) { *wordlen = PyBytes_GET_SIZE(obj); *word = (TRIE_LETTER_TYPE*)memory_alloc(*wordlen * TRIE_LETTER_SIZE); if (*word == NULL) { PyErr_NoMemory(); return NULL; } bytes = PyBytes_AS_STRING(obj); for (i=0; i < *wordlen; i++) { (*word)[i] = bytes[i]; } // Note: there is no INCREF return obj; } else { PyErr_SetString(PyExc_TypeError, "bytes expected"); return NULL; } # endif #else // PY_MAJOR_VERSION == 3 # ifndef INPUT_KEEPS_COPY # error "defines inconsistency" # endif if (F(PyString_Check)(obj)) { *wordlen = PyString_GET_SIZE(obj); *word = (TRIE_LETTER_TYPE*)memory_alloc(*wordlen * TRIE_LETTER_SIZE); if (*word == NULL) { PyErr_NoMemory(); return NULL; } bytes = PyString_AS_STRING(obj); for (i=0; i < *wordlen; i++) { (*word)[i] = bytes[i]; }; Py_INCREF(obj); return obj; } else { PyErr_SetString(PyExc_TypeError, "string required"); return NULL; } #endif } static bool __read_sequence__from_tuple(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen) { Py_ssize_t i; Py_ssize_t size = PyTuple_GET_SIZE(obj); TRIE_LETTER_TYPE* tmpword; tmpword = (TRIE_LETTER_TYPE*)memory_alloc(size * TRIE_LETTER_SIZE); if (UNLIKELY(tmpword == NULL)) { PyErr_NoMemory(); return false; } for (i=0; 
i < size; i++) { Py_ssize_t value = F(PyNumber_AsSsize_t)(F(PyTuple_GetItem)(obj, i), PyExc_ValueError); if (value == -1 && PyErr_Occurred()) { PyErr_Format(PyExc_ValueError, "item #%zd is not a number", i); memory_free(tmpword); return false; } // TODO: both min and max values should be configured #if TRIE_LETTER_SIZE == 4 #define MAX_VAL 4294967295l #else #define MAX_VAL 65535ul #endif if (value < 0 || value > MAX_VAL) { PyErr_Format(PyExc_ValueError, "item #%zd: value %zd outside range [%d..%lu]", i, value, 0, MAX_VAL); memory_free(tmpword); return false; } tmpword[i] = (TRIE_LETTER_TYPE)value; } *word = tmpword; *wordlen = size; return true; } static bool pymod_get_sequence(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen) { if (LIKELY(F(PyTuple_Check)(obj))) { return __read_sequence__from_tuple(obj, word, wordlen); } else { PyErr_Format(PyExc_TypeError, "argument is not a supported sequence type"); return false; } } /* parse optional indexes used in few functions [start, [end]] */ static int pymod_parse_start_end( PyObject* args, int idx_start, int idx_end, const ssize_t min, const ssize_t max, ssize_t* Start, ssize_t* End ) { PyObject* obj; #define start (*Start) #define end (*End) start = min; end = max; // first argument obj = F(PyTuple_GetItem)(args, idx_start); if (obj == NULL) { PyErr_Clear(); return 0; } obj = F(PyNumber_Index)(obj); if (obj == NULL) return -1; start = F(PyNumber_AsSsize_t)(obj, PyExc_IndexError); Py_DECREF(obj); if (start == -1 and PyErr_Occurred()) return -1; if (start < 0) start = max + start; if (start < min or start >= max) { PyErr_Format(PyExc_IndexError, "start index not in range %zd..%zd", min, max); return -1; } // second argument obj = F(PyTuple_GetItem)(args, idx_end); if (obj == NULL) { PyErr_Clear(); return 0; } obj = F(PyNumber_Index)(obj); if (obj == NULL) return -1; end = F(PyNumber_AsSsize_t)(obj, PyExc_IndexError); Py_DECREF(obj); if (end == -1 and PyErr_Occurred()) return -1; if (end < 0) end = max - 1 + end; 
if (end < min or end > max) { PyErr_Format(PyExc_IndexError, "end index not in range %zd..%zd", min, max); return -1; } return 0; #undef start #undef end } void init_input(struct Input* input) { input->word = NULL; input->py_word = NULL; } bool prepare_input(PyObject* self, PyObject* tuple, struct Input* input) { #define automaton ((Automaton*)self) if (automaton->key_type == KEY_STRING) { input->py_word = pymod_get_string(tuple, &input->word, &input->wordlen, &input->is_copy); if (not input->py_word) return false; } else { input->is_copy = true; // we always create a copy of sequence input->py_word = NULL; if (not pymod_get_sequence(tuple, &input->word, &input->wordlen)) { return false; } } #undef automaton return true; } bool prepare_input_from_tuple(PyObject* self, PyObject* args, int index, struct Input* input) { PyObject* tuple; tuple = F(PyTuple_GetItem)(args, index); if (tuple) return prepare_input(self, tuple, input); else return false; } void destroy_input(struct Input* input) { maybe_decref(input->is_copy, input->py_word) maybe_free(input->is_copy, input->word) } void assign_input(struct Input* dst, struct Input* src) { dst->wordlen = src->wordlen; dst->word = src->word; dst->py_word = src->py_word; // Note: there is no INCREF } python-pyahocorasick_1.4.1.orig/windows.bat0000644000000000000000000000172413207220350016022 0ustar00@echo off @rem A python interperter must be available through PATH. SET PYTHONPATH=. 
IF [%1]==[clean] ( del /Q stamp\*_pyW exit /B ) IF NOT EXIST stamp\build_pyW ( python setup.py build_ext --inplace IF %ERRORLEVEL% NEQ 0 EXIT /B type nul > stamp\build_pyW ) ELSE echo the extension was built IF NOT EXIST stamp\unittests_pyW ( python unittests.py IF %ERRORLEVEL% NEQ 0 EXIT /B type nul > stamp\unittests_pyW ) ELSE echo unittests were run IF NOT EXIST stamp\regression_pyW ( python regression/issue_5.py IF %ERRORLEVEL% NEQ 0 EXIT /B python regression/issue_8.py IF %ERRORLEVEL% NEQ 0 EXIT /B python regression/issue_9.py IF %ERRORLEVEL% NEQ 0 EXIT /B python regression/issue_10.py IF %ERRORLEVEL% NEQ 0 EXIT /B python regression/issue_26.py IF %ERRORLEVEL% NEQ 0 EXIT /B python regression/issue_56.py IF %ERRORLEVEL% NEQ 0 EXIT /B type nul > stamp\regression_pyW ) ELSE echo regression tests were run python-pyahocorasick_1.4.1.orig/windows.h0000644000000000000000000000060013407734122015504 0ustar00/* This is part of pyahocorasick Python module. Windows declarations Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : BSD-3-Clause (see LICENSE) */ #ifndef PYAHCORASICK_WINDOWS_H__ #define PYAHCORASICK_WINDOWS_H__ #include "msinttypes/stdint.h" #define PY_OBJECT_HEAD_INIT PyVarObject_HEAD_INIT(NULL, 0) #endif python-pyahocorasick_1.4.1.orig/benchmarks/benchmark.py0000644000000000000000000000505213001457257020271 0ustar00from time import clock from random import choice, randint, seed from sys import stdout import ahocorasick def write(str): stdout.write(str) stdout.flush() def writeln(str): stdout.write(str) stdout.write('\n') class ElapsedTime: def __init__(self, msg): self.msg = msg def __enter__(self): write("%-40s: " % self.msg) self.start = clock() def __exit__(self, a1, a2, a3): self.stop = clock() writeln("%0.3f s" % self.get_time()) def get_time(self): return self.stop - self.start class Test: def __init__(self, max_word_length, count): self.min_word_length = 3 self.max_word_length = max_word_length self.count = count 
self.words = [] self.inexisting = [] self.input = "" self.automaton = None seed(0) # make sure that tests will be repeatable def init_data(self): def random_word(length): chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" return ''.join(choice(chars) for _ in xrange(length)) for i in xrange(self.count): length = randint(self.min_word_length, self.max_word_length) self.words.append(random_word(length)) length = randint(self.min_word_length, self.max_word_length) self.inexisting.append(random_word(length)) self.input = random_word(self.count) assert(len(self.words) == len(self.inexisting)) def add_words(self): self.automaton = ahocorasick.Automaton() A = self.automaton for word in self.words: A.add_word(word, word) def build(self): self.automaton.make_automaton() def lookup(self): n = len(self.words) A = self.automaton for i in xrange(n): A.get(self.words[i]) A.get(self.inexisting[i], "unknown") def search(self): A = self.automaton n = 0 for item in A.iter(self.input): n += 1 def run(self): with ElapsedTime("Generating data (%d words)" % self.count): self.init_data() with ElapsedTime("Add words"): self.add_words() with ElapsedTime("Building automaton"): self.build() with ElapsedTime("Look up"): self.lookup() with ElapsedTime("Search"): self.search() def main(): test = Test(32, 1000000) test.run() if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/benchmarks/benchmark3.py0000644000000000000000000000504713034165504020355 0ustar00from time import clock from random import choice, randint, seed from sys import stdout import ahocorasick def write(str): stdout.write(str) stdout.flush() def writeln(str): stdout.write(str) stdout.write('\n') class ElapsedTime: def __init__(self, msg): self.msg = msg def __enter__(self): write("%-40s: " % self.msg) self.start = clock() def __exit__(self, a1, a2, a3): self.stop = clock() writeln("%0.3f s" % self.get_time()) def get_time(self): return self.stop - self.start class Test: def __init__(self, 
max_word_length, count): self.min_word_length = 3 self.max_word_length = max_word_length self.count = count self.words = [] self.inexisting = [] self.input = "" self.automaton = None seed(0) # make sure that tests will be repeatable def init_data(self): def random_word(length): chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" return ''.join(choice(chars) for _ in range(length)) for i in range(self.count): length = randint(self.min_word_length, self.max_word_length) self.words.append(random_word(length)) length = randint(self.min_word_length, self.max_word_length) self.inexisting.append(random_word(length)) self.input = random_word(self.count) assert(len(self.words) == len(self.inexisting)) def add_words(self): self.automaton = ahocorasick.Automaton() A = self.automaton for word in self.words: A.add_word(word, word) def build(self): self.automaton.make_automaton() def lookup(self): n = len(self.words) A = self.automaton for i in range(n): A.get(self.words[i]) A.get(self.inexisting[i], "unknown") def search(self): A = self.automaton n = 0 for item in A.iter(self.input): n += 1 def run(self): with ElapsedTime("Generating data (%d words)" % self.count): self.init_data() with ElapsedTime("Add words"): self.add_words() with ElapsedTime("Building automaton"): self.build() with ElapsedTime("Look up"): self.lookup() with ElapsedTime("Search"): self.search() def main(): test = Test(32, 1000000) test.run() if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/benchmarks/results/0000755000000000000000000000000013001457454017463 5ustar00python-pyahocorasick_1.4.1.orig/benchmarks/results/python2-westmere.txt0000644000000000000000000000055613001457454023466 0ustar00CPU: Intel(R) Core(TM) i5 CPU M 540 @ 2.53GHz $ make benchmark python2 benchmarks/benchmark.py stamp/build_py2 Generating data (1000000 words) : 24.886 s Add words : 4.627 s Building automaton : 33.362 s Look up : 5.946 s Search : 1.762 s 
python-pyahocorasick_1.4.1.orig/benchmarks/results/python3-broadwell-u.txt0000644000000000000000000000050013034171244024032 0ustar00CPU: Intel(R) Core(TM) i7 CPU 5600U @ 2.60GHz Python 3.5.2 64 bit Generating data (1000000 words) : 35.752 s Add words : 2.833 s Building automaton : 15.418 s Look up : 2.667 s Search : 0.740 s python-pyahocorasick_1.4.1.orig/docs/.gitignore0000644000000000000000000000001112746664234016570 0ustar00/_build/ python-pyahocorasick_1.4.1.orig/docs/Makefile0000644000000000000000000001672212746664234016260 0ustar00# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
.PHONY: help help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " epub3 to make an epub3" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" @echo " dummy to check syntax errors of document sources" .PHONY: clean clean: rm -rf $(BUILDDIR)/* .PHONY: html html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." .PHONY: dirhtml dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." .PHONY: singlehtml singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. 
The HTML page is in $(BUILDDIR)/singlehtml." .PHONY: pickle pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." .PHONY: json json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." .PHONY: htmlhelp htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." .PHONY: qthelp qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyahocorasick.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyahocorasick.qhc" .PHONY: applehelp applehelp: $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp @echo @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." @echo "N.B. You won't be able to view it unless you put it in" \ "~/Library/Documentation/Help or install it in your application" \ "bundle." .PHONY: devhelp devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/pyahocorasick" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyahocorasick" @echo "# devhelp" .PHONY: epub epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." .PHONY: epub3 epub3: $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 @echo @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 
.PHONY: latex latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." .PHONY: latexpdf latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." .PHONY: latexpdfja latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." .PHONY: text text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." .PHONY: man man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." .PHONY: texinfo texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." .PHONY: info info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." .PHONY: gettext gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." .PHONY: changes changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." 
.PHONY: linkcheck linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." .PHONY: doctest doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." .PHONY: coverage coverage: $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." .PHONY: xml xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." .PHONY: pseudoxml pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." .PHONY: dummy dummy: $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy @echo @echo "Build finished. Dummy builder generates no files." python-pyahocorasick_1.4.1.orig/docs/automaton___reduce__.rst0000644000000000000000000000021213407704416021457 0ustar00__reduce__() ---------------------------------------------------------------------- Return pickle-able data for this automaton instance. python-pyahocorasick_1.4.1.orig/docs/automaton___sizeof__.rst0000644000000000000000000000031713407704416021515 0ustar00Return the approximate size in bytes occupied by the Automaton instance in memory excluding the size of associated objects when the Automaton is created with Automaton() or Automaton(ahocorasick.STORE_ANY). python-pyahocorasick_1.4.1.orig/docs/automaton_add_word.rst0000644000000000000000000000430313407704416021204 0ustar00add_word(key, [value]) -> boolean -------------------------------------------------------------------------------- Add a key string to the dict-like trie and associate this key with a value. 
value is optional or mandatory depending how the ``Automaton`` instance was created. Return True if the word key is inserted and did not exists in the trie or False otherwise. The value associated with an existing word is replaced. The value is either mandatory or optional: - If the Automaton was created without argument (the default) as ``Automaton()`` or with ``Automaton(ahocorasik.STORE_ANY)`` then the value is required and can be any Python object. - If the Automaton was created with ``Automaton(ahocorasik.STORE_INTS)`` then the value is optional. If provided it must be an integer, otherwise it defaults to ``len(automaton)`` which is therefore the order index in which keys are added to the trie. - If the Automaton was created with ``Automaton(ahocorasik.STORE_LENGTH)`` then associating a value is not allowed - ``len(word)`` is saved automatically as a value instead. Calling add_word() invalidates all iterators only if the new key did not exist in the trie so far (i.e. the method returned True). Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("pyahocorasick") Traceback (most recent call last): File "", line 1, in ValueError: A value object is required as second argument. >>> A.add_word("pyahocorasick", (42, 'text')) True >>> A.get("pyhocorasick") (42, 'text') >>> A.add_word("pyahocorasick", 12) False >>> A.get("pyhocorasick") 12 .. 
code:: python >>> import ahocorasick >>> B = ahocorasick.Automaton(ahocorasick.STORE_INTS) >>> B.add_word("cat") True >>> B.get() Traceback (most recent call last): File "", line 1, in IndexError: tuple index out of range >>> B.get("cat") 1 >>> B.add_word("dog") True >>> B.get("dog") 2 >>> B.add_word("tree", 42) True >>> B.get("tree") 42 >>> B.add_word("cat", 43) False >>> B.get("cat") 43 python-pyahocorasick_1.4.1.orig/docs/automaton_clear.rst0000644000000000000000000000073013407704416020507 0ustar00clear() ---------------------------------------------------------------------- Remove all keys from the trie. This method invalidates all iterators. Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("cat", 1) True >>> A.add_word("dog", 2) True >>> A.add_word("elephant", 3) True >>> len(A) 3 >>> A.clear() >>> len(A) 0 python-pyahocorasick_1.4.1.orig/docs/automaton_constructor.rst0000644000000000000000000000260713407704416022013 0ustar00Automaton(value_type=ahocorasick.STORE_ANY, [key_type]) -------------------------------------------------------------------------------- Create a new empty Automaton. Both ``value_type`` and ``key_type`` are optional. ``value_type`` is one of these constants: - ahocorasick.STORE_ANY [*default*] : The associated value can be any Python object. - ahocorasick.STORE_LENGTH : The length of an added string key is automatically used as the associated value stored in the trie for that key. - ahocorasick.STORE_INTS : The associated value must be a 32-bit integer. ``key_type`` defines the type of data that can be stored in an automaton; it is one of these constants and defines type of data might be stored: - ahocorasick.KEY_STRING [*default*] : string - ahocorasick.KEY_SEQUENCE : sequences of integers; The size of integer depends the version and platform Python, but for versions of Python >= 3.3, it is guaranteed to be 32-bits. 
Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A >>> B = ahocorasick.Automaton(ahocorasick.STORE_ANY) >>> B >>> C = ahocorasick.Automaton(ahocorasick.STORE_INTS, ahocorasick.KEY_STRING) >>> C python-pyahocorasick_1.4.1.orig/docs/automaton_dump.rst0000644000000000000000000000074113407704416020370 0ustar00dump() ---------------------------------------------------------------------- Return a three-tuple of lists describing the Automaton as a graph of **nodes**, **edges**, **failure links**. - nodes: each item is a pair (node id, end of word marker) - edges: each item is a triple (node id, label char, child node id) - failure links: each item is a pair (source node id, node if connected by fail node) For each of these, the node id is a unique number and a label is a number. python-pyahocorasick_1.4.1.orig/docs/automaton_exists.rst0000644000000000000000000000075013407704416020742 0ustar00exists(key) -> boolean ---------------------------------------------------------------------- Return True if the ``key`` is present in the trie. Same as using the 'in' keyword. Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("cat", 1) True >>> A.exists("cat") True >>> A.exists("dog") False >>> 'elephant' in A False >>> 'cat' in A True python-pyahocorasick_1.4.1.orig/docs/automaton_find_all.rst0000644000000000000000000000134013407704416021167 0ustar00find_all(string, callback, [start, [end]]) ---------------------------------------------------------------------- Perform the Aho-Corasick search procedure using the provided input ``string`` and iterate over the matching tuples (``end_index``, ``value``) for keys found in string. Invoke the ``callback`` callable for each matching tuple. 
The callback callable must accept two positional arguments: - end_index is the end index in the input string where a trie key string was found. - value is the value associated with the found key string. The start and end optional arguments can be used to limit the search to an input string slice as in string[start:end]. Equivalent to a loop on iter() calling a callable at each iteration. python-pyahocorasick_1.4.1.orig/docs/automaton_get.rst0000644000000000000000000000125313407704416020201 0ustar00get(key[, default]) ---------------------------------------------------------------------- Return the value associated with the key string. Raise a ``KeyError`` exception if the key is not in the trie and no default is provided. Return the optional default value if provided and the key is not in the trie. Example ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("cat", 42) True >>> A.get("cat") 42 >>> A.get("dog") Traceback (most recent call last): File "", line 1, in KeyError >>> A.get("dog", "good dog") 'good dog' python-pyahocorasick_1.4.1.orig/docs/automaton_get_stats.rst0000644000000000000000000000161613407704416021422 0ustar00get_stats() -> dict ---------------------------------------------------------------------- Return a dictionary containing Automaton statistics. - *nodes_count* - total number of nodes - *words_count* - number of distinct words (same as ``len(automaton)``) - *longest_word* - length of the longest word - *links_count* - number of edges - *sizeof_node* - size of single node in bytes - *total_size* - total size of trie in bytes (about nodes_count * size_of node + links_count * size of pointer). Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("he", None) True >>> A.add_word("her", None) True >>> A.add_word("hers", None) True >>> A.get_stats() {'nodes_count': 5, 'words_count': 3, 'longest_word': 4, 'links_count': 4, 'sizeof_node': 40, 'total_size': 232} python-pyahocorasick_1.4.1.orig/docs/automaton_items.rst0000644000000000000000000000037713407704416020551 0ustar00items([prefix, [wildcard, [how]]]) ---------------------------------------------------------------------- Return an iterator on tuples of (key, value). Keys are matched optionally to the prefix using the same logic and arguments as in the keys() method. python-pyahocorasick_1.4.1.orig/docs/automaton_iter.rst0000644000000000000000000000124613407704416020367 0ustar00iter(string, [start, [end]], ignore_white_space=False) ---------------------------------------------------------------------- Perform the Aho-Corasick search procedure using the provided input string. Return an iterator of tuples (``end_index``, ``value``) for keys found in string where: - ``end_index`` is the end index in the input string where a trie key string was found. - ``value`` is the value associated with the found key string. The ``start`` and ``end`` optional arguments can be used to limit the search to an input string slice as in ``string[start:end]``. The ``ignore_white_space`` optional arguments can be used to ignore white spaces from input string. python-pyahocorasick_1.4.1.orig/docs/automaton_iter_long.rst0000644000000000000000000000271613642662762021421 0ustar00iter_long(string, [start, [end]]) ---------------------------------------------------------------------- Perform the modified Aho-Corasick search procedure which matches the longest words from set. Return an iterator of tuples (``end_index``, ``value``) for keys found in string where: - ``end_index`` is the end index in the input string where a trie key string was found. 
- ``value`` is the value associated with the found key string. The ``start`` and ``end`` optional arguments can be used to limit the search to an input string slice as in ``string[start:end]``. Example ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The default Aho-Corasick algorithm returns all occurrences of words stored in the automaton, including substring of other words from string. Method ``iter_long`` reports only the longest match. For set of words {"he", "her", "here"} and a needle "he here her" the default algorithm finds following words: "he", "he", "her", "here", "he", "her", while the modified one yields only: "he", "here", "her". .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("he", "he") True >>> A.add_word("her", "her") True >>> A.add_word("here", "here") True >>> A.make_automaton() >>> needle = "he here her" >>> list(A.iter_long(needle)) [(1, 'he'), (6, 'here'), (10, 'her')] >>> list(A.iter(needle)) [(1, 'he'), (4, 'he'), (5, 'her'), (6, 'here'), (9, 'he'), (10, 'her')] python-pyahocorasick_1.4.1.orig/docs/automaton_keys.rst0000644000000000000000000000150113407704416020371 0ustar00keys([prefix, [wildcard, [how]]]) ---------------------------------------------------------------------- Return an iterator on keys. If the optional ``prefix`` string is provided, only yield keys starting with this prefix. If the optional ``wildcard`` is provided as a single character string, then the prefix is treated as a simple pattern using this character as a wildcard. The optional ``how`` argument is used to control how strings are matched using one of these possible values: - **ahocorasick.MATCH_EXACT_LENGTH** (default) Yield matches that have the same exact length as the prefix length. - **ahocorasick.MATCH_AT_LEAST_PREFIX** Yield matches that have a length greater or equal to the prefix length. - **ahocorasick.MATCH_AT_MOST_PREFIX** Yield matches that have a length lesser or equal to the prefix length. 
python-pyahocorasick_1.4.1.orig/docs/automaton_len.rst0000644000000000000000000000066413407704416020205 0ustar00len() -> integer ---------------------------------------------------------------------- Return the number of distinct keys added to the trie. Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> len(A) 0 >>> A.add_word("python", 1) True >>> len(A) 1 >>> A.add_word("elephant", True) True >>> len(A) 2 python-pyahocorasick_1.4.1.orig/docs/automaton_longest_prefix.rst0000644000000000000000000000101713407704416022450 0ustar00longest_prefix(string) => integer ---------------------------------------------------------------------- Return the length of the longest prefix of string that exists in the trie. Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("he", True) True >>> A.add_word("her", True) True >>> A.add_word("hers", True) True >>> A.longest_prefix("she") 0 >>> A.longest_prefix("herself") 4 python-pyahocorasick_1.4.1.orig/docs/automaton_make_automaton.rst0000644000000000000000000000050013407704416022420 0ustar00make_automaton() ---------------------------------------------------------------------- Finalize and create the Aho-Corasick automaton based on the keys already added to the trie. This does not require additional memory. After successful creation the ``Automaton.kind`` attribute is set to ``ahocorasick.AHOCORASICK``. python-pyahocorasick_1.4.1.orig/docs/automaton_match.rst0000644000000000000000000000155513407704416020523 0ustar00match(key) -> bool ---------------------------------------------------------------------- Return True if there is a prefix (or key) equal to key present in the trie. For example if the key 'example' has been added to the trie, then calls to match('e'), match('ex'), ..., match('exampl') or match('example') all return True. 
But exists() is True only when calling exists('example'). Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("example", True) True >>> A.match("e") True >>> A.match("ex") True >>> A.match("exa") True >>> A.match("exam") True >>> A.match("examp") True >>> A.match("exampl") True >>> A.match("example") True >>> A.match("examples") False >>> A.match("python") False python-pyahocorasick_1.4.1.orig/docs/automaton_pop.rst0000644000000000000000000000130413407704416020215 0ustar00pop(word) -------------------------------------------------------------------------------- Remove given word from a trie and return associated values. Raise a ``KeyError`` if the word was not found. Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("cat", 1) True >>> A.add_word("dog", 2) True >>> A.pop("elephant") Traceback (most recent call last): File "", line 1, in KeyError >>> A.pop("cat") 1 >>> A.pop("dog") 2 >>> A.pop("cat") Traceback (most recent call last): File "", line 1, in KeyError python-pyahocorasick_1.4.1.orig/docs/automaton_remove_word.rst0000644000000000000000000000107213407704416021751 0ustar00remove_word(word) -> bool -------------------------------------------------------------------------------- Remove given word from a trie. Return True if words was found, False otherwise. Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
code:: python >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> A.add_word("cat", 1) True >>> A.add_word("dog", 2) True >>> A.remove_word("cat") True >>> A.remove_word("cat") False >>> A.remove_word("dog") True >>> A.remove_word("dog") False >>> python-pyahocorasick_1.4.1.orig/docs/automaton_save.rst0000644000000000000000000000047013407704416020360 0ustar00save(path, serializer) ---------------------------------------------------------------------- Save content of automaton in an on-disc file. ``Serializer`` is a callable object that is used when automaton store type is ``STORE_ANY``. This method converts a python object into bytes; it can be ``pickle.dumps``. python-pyahocorasick_1.4.1.orig/docs/automaton_search_iter.rst0000644000000000000000000000027013407704416021710 0ustar00This class is not available directly but instances of AutomatonSearchIter are returned by the iter() method of an Automaton. This iterator can be manipulated through its set() method. python-pyahocorasick_1.4.1.orig/docs/automaton_search_iter_set.rst0000644000000000000000000000057313407704416022571 0ustar00set(string, reset=False) ---------------------------------------------------------------------- Set a new string to search. When the reset argument is False (default) then the Aho-Corasick procedure is continued and the internal state of the Automaton and end index of the string being searched are not reset. This allow to search for large strings in multiple smaller chunks. python-pyahocorasick_1.4.1.orig/docs/automaton_values.rst0000644000000000000000000000041213407704416020715 0ustar00values([prefix, [wildcard, [how]]]) ---------------------------------------------------------------------- Return an iterator on values associated with each keys. Keys are matched optionally to the prefix using the same logic and arguments as in the keys() method. 
python-pyahocorasick_1.4.1.orig/docs/conf.py0000644000000000000000000002366613414402730016104 0ustar00# -*- coding: utf-8 -*- # # pyahocorasick documentation build configuration file, created by # sphinx-quickstart on Fri Jul 29 14:38:56 2016. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import sys sys.path.insert(0, os.path.abspath('../') ) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', # see https://sphinxcontrib-napoleon.readthedocs.io/en/latest/ # we use this for better docstrings 'sphinx.ext.napoleon' ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The encoding of source files. # source_encoding = 'utf-8' # The master toctree document. master_doc = 'index' # General information about the project. project = u'ahocorasick' copyright = u'2019, Wojciech Muła' author = u'Wojciech Muła' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. 
# # The short X.Y version. #version = u'1.1.0' # The full version, including alpha/beta/rc tags. #release = version # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # # today = '' # # Else, today_fmt is used as the format for a strftime call. # # today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'bin', 'include', 'lib'] # The reST default role (used for this markup: `text`) to use for all # documents. # # default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. # # add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). # # add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. # # show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. # keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
# #html_theme = 'alabaster' # for unknown reasons, this theme has empty "Navigation" bar, makes documentation useless html_theme = 'classic' # html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. # html_theme_path = [] # The name for this set of Sphinx documents. # " v documentation" by default. # # html_title = u'pyahocorasick v1.1.0' # A shorter title for the navigation bar. Default is the same as html_title. # # html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. # # html_logo = None # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. # # html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. # # html_extra_path = [] # If not None, a 'Last updated on:' timestamp is inserted at every page # bottom, using the given strftime format. # The empty string is equivalent to '%b %d, %Y'. # # html_last_updated_fmt = None # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. # # html_use_smartypants = True # Custom sidebar templates, maps document names to template names. # # html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. 
# # html_additional_pages = {} # If false, no module index is generated. # # html_domain_indices = True # If false, no index is generated. # html_use_index = False # If true, the index is split into individual pages for each letter. # # html_split_index = False # If true, links to the reST sources are added to the pages. html_show_sourcelink = False # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. # # html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. # # html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. # # html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). # html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' # # html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # 'ja' uses this config value. # 'zh' user can custom change `jieba` dictionary path. # # html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. # # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. htmlhelp_basename = 'pyahocorasickdoc' # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. 
# # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'pyahocorasick.tex', u'pyahocorasick Documentation', u'Wojciech Muła', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. # # latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. # # latex_use_parts = False # If true, show page references after internal links. # # latex_show_pagerefs = False # If true, show URL addresses after external links. # # latex_show_urls = False # Documents to append as an appendix to all manuals. # # latex_appendices = [] # It false, will not define \strong, \code, itleref, \crossref ... but only # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added # packages. # # latex_keep_old_macro_names = True # If false, no module index is generated. # # latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'pyahocorasick', u'pyahocorasick Documentation', [author], 1) ] # If true, show URL addresses after external links. # # man_show_urls = False # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'pyahocorasick', u'pyahocorasick Documentation', author, 'pyahocorasick', 'One line description of project.', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. # # texinfo_appendices = [] # If false, no module index is generated. 
# # texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. # # texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. # # texinfo_no_detailmenu = False python-pyahocorasick_1.4.1.orig/docs/index.rst0000644000000000000000000002635013642662762016457 0ustar00 .. include:: ../README.rst API Overview ============ This is a quick tour of the API for the C **ahocorasick** module. See the full API doc for more details. The pure Python module has a slightly different interface. The module ``ahocorasick`` contains a few constants and the main ``Automaton`` class. Module constants ---------------- - ``ahocorasick.unicode`` --- see `Unicode and bytes`_ - ``ahocorasick.STORE_ANY``, ``ahocorasick.STORE_INTS``, ``ahocorasick.STORE_LENGTH`` --- see `Automaton class`_ - ``ahocorasick.KEY_STRING`` ``ahocorasick.KEY_SEQUENCE`` --- see `Automaton class`_ - ``ahocorasick.EMPTY``, ``ahocorasick.TRIE``, ``ahocorasick.AHOCORASICK`` --- see `Automaton Attributes`_ - ``ahocorasick.MATCH_EXACT_LENGTH``, ``ahocorasick.MATCH_AT_MOST_PREFIX``, ``ahocorasick.MATCH_AT_LEAST_PREFIX`` --- see description of the keys method Automaton class --------------- Note: ``Automaton`` instances are `pickle-able `_ meaning that you can create ahead of time an eventually large automaton then save it to disk and re-load it later to reuse it over and over as a persistent multi-string search index. Internally, Automaton implements the ``__reduce__() magic method``. ``Automaton([value_type], [key_type])`` Create a new empty Automaton optionally passing a `value_type` to indicate what is the type of associated values (default to any Python object type). It can be one of ``ahocorasick.STORE_ANY``, ``ahocorasick.STORE_INTS`` or ``ahocorasick.STORE_LENGTH``. In the last case the length of the key will be stored in the automaton. The optional argument `key_type` can be ``ahocorasick.KEY_STRING`` or ``ahocorasick.KEY_SEQUENCE``. 
In the latter case keys will be tuples of integers. The size of integer depends on the version and platform Python is running on, but for versions of Python >= 3.3, it is guaranteed to be 32-bits. Automaton Trie methods ---------------------- The Automaton class has the following main trie-like methods: ``add_word(key, [value]) => bool`` Add a ``key`` string to the dict-like trie and associate this key with a ``value``. ``remove_word(key) => bool`` Remove a ``key`` string from the dict-like trie. ``pop(key) => value`` Remove a ``key`` string from the dict-like trie and return the associated ``value``. ``exists(key) => bool`` or ``key in ...`` Return True if the key is present in the trie. ``match(key) => bool`` Return True if there is a prefix (or key) equal to ``key`` present in the trie. Automaton Dictionary-like methods --------------------------------- A pyahocorasick Automaton trie behaves more or less like a Python dictionary and implements a subset of dict-like methods. Some of them are: ``get(key[, default])`` Return the value associated with the ``key`` string. Similar to `dict.get()`. ``keys([prefix, [wildcard, [how]]]) => yield strings`` Return an iterator on keys. ``values([prefix, [wildcard, [how]]]) => yield object`` Return an iterator on values associated with each keys. ``items([prefix, [wildcard, [how]]]) => yield tuple (string, object)`` Return an iterator on tuples of (key, value). Wildcard search ~~~~~~~~~~~~~~~ The methods ``keys``, ``values`` and ``items`` can be called with an optional **wildcard**. A wildcard character is equivalent to a question mark used in glob patterns (?) or a dot (.) in regular expressions. You can use any character you like as a wildcard. Note that it is not possible to escape a wildcard to match it exactly. You need instead to select another wildcard character not present in the provided prefix. For example:: automaton.keys("hi?", "?") # would match "him", "his" automaton.keys("XX?", "X") # would match "me?", "he?" 
or "it?" Aho-Corasick methods -------------------- The Automaton class has the following main Aho-Corasick methods: ``make_automaton()`` Finalize and create the Aho-Corasick automaton. ``iter(string, [start, [end]])`` Perform the Aho-Corasick search procedure using the provided input ``string``. Return an iterator of tuples (end_index, value) for keys found in string. ``iter_long(string, [start, [end]])`` Returns iterator (object of class AutomatonSearchIterLong) that searches for longest, non-overlapping matches. AutomatonSearchIter class ~~~~~~~~~~~~~~~~~~~~~~~~~ Instances of this class are returned by the ``iter`` method of an ``Automaton``. This iterator can be manipulated through its `set()` method. ``set(string, [reset]) => None`` Set a new string to search eventually keeping the current Automaton state to continue searching for the next chunk of a string. For example:: >>> it = A.iter(b"") >>> while True: ... buffer = receive(server_address, 4096) ... if not buffer: ... break ... it.set(buffer) ... for index, value in it: ... print(index, '=>', value) When ``reset`` is ``True`` then processing is restarted. For example this code:: >>> for string in string_set: ... for index, value in A.iter(string) ... print(index, '=>', value) does the same job as:: >>> it = A.iter(b"") >>> for string in string_set: ... it.set(it, True) ... for index, value in it: ... print(index, '=>', value) Automaton Attributes -------------------- The Automaton class has the following attributes: ``kind`` [readonly] Return the state of the ``Automaton`` instance. ``store`` [readonly] Return the type of values stored in the Automaton as specified at creation. Saving and loading automaton ---------------------------- There is support for two method of saving and loading an automaton: * the standard ``pickle`` protocol, * custom ``save`` and ``load`` methods. While pickling is more convenient to use, it has quite high memory requirements. 
The ``save``/``load`` method try to overcome this problem. .. warning:: Neither format of pickle nor save are safe. Although there are a few sanity checks, they are not sufficient to detect all possible input errors. Pickle ~~~~~~ .. code:: python import ahocorasick import pickle # build automaton A = ahocorasick.Automaton() # ... A.add_data, A.make_automaton # save current state with open(path, 'wb') as f: pickle.dump(A, f) # load saved state with open(path, 'rb') as f: B = pickle.load(f) Save/load methods ~~~~~~~~~~~~~~~~~ .. code:: python import ahocorasick import pickle # build automaton A = ahocorasick.Automaton() # ... A.add_data, A.make_automaton # save current state A.save(path, pickle.dumps) # load saved state B = ahocorasick.load(path, pickle.loads) Automaton method ``save`` requires ``path`` to the file which will store data. If the automaton type is ``STORE_ANY``, i.e. values associated with words are any python objects, then ``save`` requires also another argument, a callable. The callable serializes python object into bytes; in the example above we use standard pickle ``dumps`` function. Module method ``load`` also requires ``path`` to file that has data previously saved. Because at the moment of loading data we don't know what is the store attribute of automaton, the second argument - a callable - is required. The callable must convert back given bytes object into python value, that will be stored in automaton. Similarly, standard ``pickle.loads`` function can be passed. Other Automaton methods ----------------------- The Automaton class has a few other interesting methods: ``dump() => (list of nodes, list of edges, list of fail links)`` Return a three-tuple of lists describing the Automaton as a graph of (nodes, edges, failure links). The source repository and source package also contains the ``dump2dot.py`` script that converts ``dump()`` results to a `graphviz `_ dot format for convenient visualization of the trie and Automaton data structure. 
``get_stats() => dict`` Return a dictionary containing Automaton statistics. Note that the real size occupied by the data structure could be larger because of `internal memory fragmentation `_ that can occur in a memory manager. ``__sizeof__() => int`` Return the approximate size in bytes occupied by the Automaton instance. Also available by calling sys.getsizeof(automaton instance). Examples ======== :: >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> # add some key words to trie >>> for index, word in enumerate('he her hers she'.split()): ... A.add_word(word, (index, word)) >>> # test that these key words exists in the trie all right >>> 'he' in A True >>> 'HER' in A False >>> A.get('he') (0, 'he') >>> A.get('she') (3, 'she') >>> A.get('cat', '') '' >>> A.get('dog') Traceback (most recent call last): File "", line 1, in KeyError >>> A.remove_word('he') True >>> A.remove_word('he') False >>> A.pop('she') (3, 'she') >>> 'she' in A False >>> # convert the trie in an Aho-Corasick automaton >>> A = ahocorasick.Automaton() >>> for index, word in enumerate('he her hers she'.split()): ... A.add_word(word, (index, word)) >>> A.make_automaton() >>> # then find all occurrences of the stored keys in a string >>> for item in A.iter('_hershe_'): ... print(item) ... (2, (0, 'he')) (3, (1, 'her')) (4, (2, 'hers')) (6, (3, 'she')) (6, (0, 'he')) Example of the keys method behavior ----------------------------------- :: >>> import ahocorasick >>> A = ahocorasick.Automaton() >>> # add some key words to trie >>> for index, word in enumerate('cat catastropha rat rate bat'.split()): ... A.add_word(word, (index, word)) >>> # Search some prefix >>> list(A.keys('cat')) ['cat', 'catastropha'] >>> # Search with a wildcard: here '?' is used as a wildcard. You can use any character you like. 
>>> list(A.keys('?at', '?', ahocorasick.MATCH_EXACT_LENGTH)) ['bat', 'cat', 'rat'] >>> list(A.keys('?at?', '?', ahocorasick.MATCH_AT_MOST_PREFIX)) ['bat', 'cat', 'rat', 'rate'] >>> list(A.keys('?at?', '?', ahocorasick.MATCH_AT_LEAST_PREFIX)) ['rate'] API Reference ============= .. include:: automaton_constructor.rst .. include:: automaton_add_word.rst .. include:: automaton_exists.rst .. include:: automaton_get.rst .. include:: automaton_longest_prefix.rst .. include:: automaton_match.rst .. include:: automaton_len.rst .. include:: automaton_remove_word.rst .. include:: automaton_pop.rst .. include:: automaton_clear.rst .. include:: automaton_keys.rst .. include:: automaton_items.rst .. include:: automaton_values.rst .. include:: automaton_make_automaton.rst .. include:: automaton_iter.rst .. include:: automaton_iter_long.rst .. include:: automaton_find_all.rst .. include:: automaton___reduce__.rst .. include:: automaton_save.rst .. include:: module_load.rst .. include:: automaton___sizeof__.rst .. include:: automaton_get_stats.rst .. include:: automaton_dump.rst .. include:: automaton_search_iter_set.rst python-pyahocorasick_1.4.1.orig/docs/module.rst0000644000000000000000000000030513407731766016625 0ustar00**pyahocorasick** is a fast and memory efficient library for exact or approximate multi-pattern string search meaning that you can find multiple key strings occurrences at once in some input text. python-pyahocorasick_1.4.1.orig/docs/module_load.rst0000644000000000000000000000043713407731766017632 0ustar00load(path, deserializer) => Automaton ---------------------------------------------------------------------- Load automaton previously stored on disc using ``save`` method. ``Deserializer`` is a callable object which converts bytes back into python object; it can be ``pickle.loads``. 
python-pyahocorasick_1.4.1.orig/msinttypes/inttypes.h0000644000000000000000000001757413035462050020125 0ustar00// ISO C9x compliant inttypes.h for Microsoft Visual Studio // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 // // Copyright (c) 2006-2013 Alexander Chemeris // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // 3. Neither the name of the product nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // /////////////////////////////////////////////////////////////////////////////// #ifndef _MSC_VER // [ #error "Use this header only with Microsoft Visual C++ compilers!" 
#endif // _MSC_VER ] #ifndef _MSC_INTTYPES_H_ // [ #define _MSC_INTTYPES_H_ #if _MSC_VER > 1000 #pragma once #endif #include "stdint.h" // 7.8 Format conversion of integer types typedef struct { intmax_t quot; intmax_t rem; } imaxdiv_t; // 7.8.1 Macros for format specifiers #if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 // The fprintf macros for signed integers are: #define PRId8 "d" #define PRIi8 "i" #define PRIdLEAST8 "d" #define PRIiLEAST8 "i" #define PRIdFAST8 "d" #define PRIiFAST8 "i" #define PRId16 "hd" #define PRIi16 "hi" #define PRIdLEAST16 "hd" #define PRIiLEAST16 "hi" #define PRIdFAST16 "hd" #define PRIiFAST16 "hi" #define PRId32 "I32d" #define PRIi32 "I32i" #define PRIdLEAST32 "I32d" #define PRIiLEAST32 "I32i" #define PRIdFAST32 "I32d" #define PRIiFAST32 "I32i" #define PRId64 "I64d" #define PRIi64 "I64i" #define PRIdLEAST64 "I64d" #define PRIiLEAST64 "I64i" #define PRIdFAST64 "I64d" #define PRIiFAST64 "I64i" #define PRIdMAX "I64d" #define PRIiMAX "I64i" #define PRIdPTR "Id" #define PRIiPTR "Ii" // The fprintf macros for unsigned integers are: #define PRIo8 "o" #define PRIu8 "u" #define PRIx8 "x" #define PRIX8 "X" #define PRIoLEAST8 "o" #define PRIuLEAST8 "u" #define PRIxLEAST8 "x" #define PRIXLEAST8 "X" #define PRIoFAST8 "o" #define PRIuFAST8 "u" #define PRIxFAST8 "x" #define PRIXFAST8 "X" #define PRIo16 "ho" #define PRIu16 "hu" #define PRIx16 "hx" #define PRIX16 "hX" #define PRIoLEAST16 "ho" #define PRIuLEAST16 "hu" #define PRIxLEAST16 "hx" #define PRIXLEAST16 "hX" #define PRIoFAST16 "ho" #define PRIuFAST16 "hu" #define PRIxFAST16 "hx" #define PRIXFAST16 "hX" #define PRIo32 "I32o" #define PRIu32 "I32u" #define PRIx32 "I32x" #define PRIX32 "I32X" #define PRIoLEAST32 "I32o" #define PRIuLEAST32 "I32u" #define PRIxLEAST32 "I32x" #define PRIXLEAST32 "I32X" #define PRIoFAST32 "I32o" #define PRIuFAST32 "I32u" #define PRIxFAST32 "I32x" #define PRIXFAST32 "I32X" #define PRIo64 "I64o" #define PRIu64 "I64u" #define 
PRIx64 "I64x" #define PRIX64 "I64X" #define PRIoLEAST64 "I64o" #define PRIuLEAST64 "I64u" #define PRIxLEAST64 "I64x" #define PRIXLEAST64 "I64X" #define PRIoFAST64 "I64o" #define PRIuFAST64 "I64u" #define PRIxFAST64 "I64x" #define PRIXFAST64 "I64X" #define PRIoMAX "I64o" #define PRIuMAX "I64u" #define PRIxMAX "I64x" #define PRIXMAX "I64X" #define PRIoPTR "Io" #define PRIuPTR "Iu" #define PRIxPTR "Ix" #define PRIXPTR "IX" // The fscanf macros for signed integers are: #define SCNd8 "d" #define SCNi8 "i" #define SCNdLEAST8 "d" #define SCNiLEAST8 "i" #define SCNdFAST8 "d" #define SCNiFAST8 "i" #define SCNd16 "hd" #define SCNi16 "hi" #define SCNdLEAST16 "hd" #define SCNiLEAST16 "hi" #define SCNdFAST16 "hd" #define SCNiFAST16 "hi" #define SCNd32 "ld" #define SCNi32 "li" #define SCNdLEAST32 "ld" #define SCNiLEAST32 "li" #define SCNdFAST32 "ld" #define SCNiFAST32 "li" #define SCNd64 "I64d" #define SCNi64 "I64i" #define SCNdLEAST64 "I64d" #define SCNiLEAST64 "I64i" #define SCNdFAST64 "I64d" #define SCNiFAST64 "I64i" #define SCNdMAX "I64d" #define SCNiMAX "I64i" #ifdef _WIN64 // [ # define SCNdPTR "I64d" # define SCNiPTR "I64i" #else // _WIN64 ][ # define SCNdPTR "ld" # define SCNiPTR "li" #endif // _WIN64 ] // The fscanf macros for unsigned integers are: #define SCNo8 "o" #define SCNu8 "u" #define SCNx8 "x" #define SCNX8 "X" #define SCNoLEAST8 "o" #define SCNuLEAST8 "u" #define SCNxLEAST8 "x" #define SCNXLEAST8 "X" #define SCNoFAST8 "o" #define SCNuFAST8 "u" #define SCNxFAST8 "x" #define SCNXFAST8 "X" #define SCNo16 "ho" #define SCNu16 "hu" #define SCNx16 "hx" #define SCNX16 "hX" #define SCNoLEAST16 "ho" #define SCNuLEAST16 "hu" #define SCNxLEAST16 "hx" #define SCNXLEAST16 "hX" #define SCNoFAST16 "ho" #define SCNuFAST16 "hu" #define SCNxFAST16 "hx" #define SCNXFAST16 "hX" #define SCNo32 "lo" #define SCNu32 "lu" #define SCNx32 "lx" #define SCNX32 "lX" #define SCNoLEAST32 "lo" #define SCNuLEAST32 "lu" #define SCNxLEAST32 "lx" #define SCNXLEAST32 "lX" #define SCNoFAST32 "lo" 
#define SCNuFAST32 "lu" #define SCNxFAST32 "lx" #define SCNXFAST32 "lX" #define SCNo64 "I64o" #define SCNu64 "I64u" #define SCNx64 "I64x" #define SCNX64 "I64X" #define SCNoLEAST64 "I64o" #define SCNuLEAST64 "I64u" #define SCNxLEAST64 "I64x" #define SCNXLEAST64 "I64X" #define SCNoFAST64 "I64o" #define SCNuFAST64 "I64u" #define SCNxFAST64 "I64x" #define SCNXFAST64 "I64X" #define SCNoMAX "I64o" #define SCNuMAX "I64u" #define SCNxMAX "I64x" #define SCNXMAX "I64X" #ifdef _WIN64 // [ # define SCNoPTR "I64o" # define SCNuPTR "I64u" # define SCNxPTR "I64x" # define SCNXPTR "I64X" #else // _WIN64 ][ # define SCNoPTR "lo" # define SCNuPTR "lu" # define SCNxPTR "lx" # define SCNXPTR "lX" #endif // _WIN64 ] #endif // __STDC_FORMAT_MACROS ] // 7.8.2 Functions for greatest-width integer types // 7.8.2.1 The imaxabs function #define imaxabs _abs64 // 7.8.2.2 The imaxdiv function // This is modified version of div() function from Microsoft's div.c found // in %MSVC.NET%\crt\src\div.c #ifdef STATIC_IMAXDIV // [ static #else // STATIC_IMAXDIV ][ _inline #endif // STATIC_IMAXDIV ] imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) { imaxdiv_t result; result.quot = numer / denom; result.rem = numer % denom; if (numer < 0 && result.rem > 0) { // did division wrong; must fix up ++result.quot; result.rem -= denom; } return result; } // 7.8.2.3 The strtoimax and strtoumax functions #define strtoimax _strtoi64 #define strtoumax _strtoui64 // 7.8.2.4 The wcstoimax and wcstoumax functions #define wcstoimax _wcstoi64 #define wcstoumax _wcstoui64 #endif // _MSC_INTTYPES_H_ ] python-pyahocorasick_1.4.1.orig/msinttypes/stdint.h0000644000000000000000000001764513035462050017552 0ustar00// ISO C9x compliant stdint.h for Microsoft Visual Studio // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 // // Copyright (c) 2006-2013 Alexander Chemeris // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following 
conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // 3. Neither the name of the product nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // /////////////////////////////////////////////////////////////////////////////// #ifndef _MSC_VER // [ #error "Use this header only with Microsoft Visual C++ compilers!" 
#endif // _MSC_VER ] #ifndef _MSC_STDINT_H_ // [ #define _MSC_STDINT_H_ #if _MSC_VER > 1000 #pragma once #endif #if _MSC_VER >= 1600 // [ #include #else // ] _MSC_VER >= 1600 [ #include // For Visual Studio 6 in C++ mode and for many Visual Studio versions when // compiling for ARM we should wrap include with 'extern "C++" {}' // or compiler give many errors like this: // error C2733: second C linkage of overloaded function 'wmemchr' not allowed #ifdef __cplusplus extern "C" { #endif # include #ifdef __cplusplus } #endif // Define _W64 macros to mark types changing their size, like intptr_t. #ifndef _W64 # if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 # define _W64 __w64 # else # define _W64 # endif #endif // 7.18.1 Integer types // 7.18.1.1 Exact-width integer types // Visual Studio 6 and Embedded Visual C++ 4 doesn't // realize that, e.g. char has the same size as __int8 // so we give up on __intX for them. #if (_MSC_VER < 1300) typedef signed char int8_t; typedef signed short int16_t; typedef signed int int32_t; typedef unsigned char uint8_t; typedef unsigned short uint16_t; typedef unsigned int uint32_t; #else typedef signed __int8 int8_t; typedef signed __int16 int16_t; typedef signed __int32 int32_t; typedef unsigned __int8 uint8_t; typedef unsigned __int16 uint16_t; typedef unsigned __int32 uint32_t; #endif typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; // 7.18.1.2 Minimum-width integer types typedef int8_t int_least8_t; typedef int16_t int_least16_t; typedef int32_t int_least32_t; typedef int64_t int_least64_t; typedef uint8_t uint_least8_t; typedef uint16_t uint_least16_t; typedef uint32_t uint_least32_t; typedef uint64_t uint_least64_t; // 7.18.1.3 Fastest minimum-width integer types typedef int8_t int_fast8_t; typedef int16_t int_fast16_t; typedef int32_t int_fast32_t; typedef int64_t int_fast64_t; typedef uint8_t uint_fast8_t; typedef uint16_t uint_fast16_t; typedef uint32_t uint_fast32_t; typedef 
uint64_t uint_fast64_t; // 7.18.1.4 Integer types capable of holding object pointers #ifdef _WIN64 // [ typedef signed __int64 intptr_t; typedef unsigned __int64 uintptr_t; #else // _WIN64 ][ typedef _W64 signed int intptr_t; typedef _W64 unsigned int uintptr_t; #endif // _WIN64 ] // 7.18.1.5 Greatest-width integer types typedef int64_t intmax_t; typedef uint64_t uintmax_t; // 7.18.2 Limits of specified-width integer types #if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 // 7.18.2.1 Limits of exact-width integer types #define INT8_MIN ((int8_t)_I8_MIN) #define INT8_MAX _I8_MAX #define INT16_MIN ((int16_t)_I16_MIN) #define INT16_MAX _I16_MAX #define INT32_MIN ((int32_t)_I32_MIN) #define INT32_MAX _I32_MAX #define INT64_MIN ((int64_t)_I64_MIN) #define INT64_MAX _I64_MAX #define UINT8_MAX _UI8_MAX #define UINT16_MAX _UI16_MAX #define UINT32_MAX _UI32_MAX #define UINT64_MAX _UI64_MAX // 7.18.2.2 Limits of minimum-width integer types #define INT_LEAST8_MIN INT8_MIN #define INT_LEAST8_MAX INT8_MAX #define INT_LEAST16_MIN INT16_MIN #define INT_LEAST16_MAX INT16_MAX #define INT_LEAST32_MIN INT32_MIN #define INT_LEAST32_MAX INT32_MAX #define INT_LEAST64_MIN INT64_MIN #define INT_LEAST64_MAX INT64_MAX #define UINT_LEAST8_MAX UINT8_MAX #define UINT_LEAST16_MAX UINT16_MAX #define UINT_LEAST32_MAX UINT32_MAX #define UINT_LEAST64_MAX UINT64_MAX // 7.18.2.3 Limits of fastest minimum-width integer types #define INT_FAST8_MIN INT8_MIN #define INT_FAST8_MAX INT8_MAX #define INT_FAST16_MIN INT16_MIN #define INT_FAST16_MAX INT16_MAX #define INT_FAST32_MIN INT32_MIN #define INT_FAST32_MAX INT32_MAX #define INT_FAST64_MIN INT64_MIN #define INT_FAST64_MAX INT64_MAX #define UINT_FAST8_MAX UINT8_MAX #define UINT_FAST16_MAX UINT16_MAX #define UINT_FAST32_MAX UINT32_MAX #define UINT_FAST64_MAX UINT64_MAX // 7.18.2.4 Limits of integer types capable of holding object pointers #ifdef _WIN64 // [ # define INTPTR_MIN 
INT64_MIN # define INTPTR_MAX INT64_MAX # define UINTPTR_MAX UINT64_MAX #else // _WIN64 ][ # define INTPTR_MIN INT32_MIN # define INTPTR_MAX INT32_MAX # define UINTPTR_MAX UINT32_MAX #endif // _WIN64 ] // 7.18.2.5 Limits of greatest-width integer types #define INTMAX_MIN INT64_MIN #define INTMAX_MAX INT64_MAX #define UINTMAX_MAX UINT64_MAX // 7.18.3 Limits of other integer types #ifdef _WIN64 // [ # define PTRDIFF_MIN _I64_MIN # define PTRDIFF_MAX _I64_MAX #else // _WIN64 ][ # define PTRDIFF_MIN _I32_MIN # define PTRDIFF_MAX _I32_MAX #endif // _WIN64 ] #define SIG_ATOMIC_MIN INT_MIN #define SIG_ATOMIC_MAX INT_MAX #ifndef SIZE_MAX // [ # ifdef _WIN64 // [ # define SIZE_MAX _UI64_MAX # else // _WIN64 ][ # define SIZE_MAX _UI32_MAX # endif // _WIN64 ] #endif // SIZE_MAX ] // WCHAR_MIN and WCHAR_MAX are also defined in #ifndef WCHAR_MIN // [ # define WCHAR_MIN 0 #endif // WCHAR_MIN ] #ifndef WCHAR_MAX // [ # define WCHAR_MAX _UI16_MAX #endif // WCHAR_MAX ] #define WINT_MIN 0 #define WINT_MAX _UI16_MAX #endif // __STDC_LIMIT_MACROS ] // 7.18.4 Limits of other integer types #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 // 7.18.4.1 Macros for minimum-width integer constants #define INT8_C(val) val##i8 #define INT16_C(val) val##i16 #define INT32_C(val) val##i32 #define INT64_C(val) val##i64 #define UINT8_C(val) val##ui8 #define UINT16_C(val) val##ui16 #define UINT32_C(val) val##ui32 #define UINT64_C(val) val##ui64 // 7.18.4.2 Macros for greatest-width integer constants // These #ifndef's are needed to prevent collisions with . // Check out Issue 9 for the details. 
#ifndef INTMAX_C // [ # define INTMAX_C INT64_C #endif // INTMAX_C ] #ifndef UINTMAX_C // [ # define UINTMAX_C UINT64_C #endif // UINTMAX_C ] #endif // __STDC_CONSTANT_MACROS ] #endif // _MSC_VER >= 1600 ] #endif // _MSC_STDINT_H_ ] python-pyahocorasick_1.4.1.orig/py/README.rst0000644000000000000000000000023712744130105015760 0ustar00This directory contains a simpler pure python module, compatible with Python 2 and 3. It has a slightly different API. It may fail at pickling for long keys. python-pyahocorasick_1.4.1.orig/py/exportdot.py0000644000000000000000000000314712744172027016707 0ustar00""" Aho-Corasick string search algorithm. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import pyahocorasick def exportdot(trie, file): def writeln(text=""): file.write(text + "\n") writeln("digraph ahocorasick {") def walk(node): queue = [node] while queue: node = queue.pop() yield node for child in node.children.itervalues(): if child != node: queue.append(child) nodes = list(walk(trie.root)) # nodes for node in nodes: if node.output != pyahocorasick.nil: writeln("\tnode%d [shape=doublecircle, label=\"\"]" % id(node)) else: writeln("\tnode%d [shape=circle, label=\"\"]" % id(node)) # trie edges for node in nodes: for letter, child in node.children.iteritems(): nodeid = id(node) destid = id(child) if destid == id(trie.root): # do not show self-links of root node created during make_automaton continue if letter.isalnum(): label = letter else: label = '%02x' % ord(letter) writeln("\tnode%d -> node%d [label=\"%s\"]" % (nodeid, destid, label)) # fail links for node in nodes: nodeid = id(node) failid = id(node.fail) if failid != pyahocorasick.nil: writeln("\tnode%d -> node%d [color=blue]" % (nodeid, failid)) writeln("}") if __name__ == '__main__': A = pyahocorasick.Trie() A.add_word("he", 0) A.add_word("her", 1) A.add_word("hers", 2) A.add_word("she", 3) A.add_word("cat", 4) A.add_word("shield", 5) with open('trie.dot', 'wt') 
as f: exportdot(A, f) A.make_automaton() with open('ahocorasick.dot', 'wt') as f: exportdot(A, f) python-pyahocorasick_1.4.1.orig/py/issue_21.py0000644000000000000000000000253112744172027016305 0ustar00""" Aho-Corasick string search algorithm. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import pyahocorasick test_cases = [ # example provided by @Ulitochka { 'words' : ["alpha", "alpha beta", "gamma", "gamma alpha"], 'input' : "I went to alpha beta the alpha other day gamma alpha to pick up some spam", 'expected' : [("alpha beta", 19), ("alpha", 29), ("gamma alpha", 51)] }, { 'words' : ["alpha", "alpha beta", "beta gamma", "gamma"], 'input' : "Cats have not idea what alpha beta gamma means", 'expected' : [("alpha beta", 33), ("gamma", 39)] }, { 'words' : ["alpha", "alpha beta", "beta gamma", "gamma"], 'input' : "Cats have not idea what alpha beta gamma", 'expected' : [("alpha beta", 33), ("gamma", 39)] }, ] def test(case): tree = pyahocorasick.Trie() for word in case['words']: tree.add_word(word, word) tree.make_automaton() actual = [item for item in tree.iter_long(case['input'])] if actual != case['expected']: print("ERROR:") print(actual) print(case['expected']) assert(False) if __name__ == '__main__': for data in test_cases: test(data) print("OK") python-pyahocorasick_1.4.1.orig/py/pyahocorasick.py0000644000000000000000000001415312731675410017515 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. 
Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ from collections import deque nil = object() # used to distinguish from None class TrieNode(object): """ Node of trie/Aho-Corasick automaton """ __slots__ = ['char', 'output', 'fail', 'children'] def __init__(self, char): """ Constructs an empty node """ self.char = char # character self.output = nil # an output function for this node self.fail = nil # fail link used by Aho-Corasick automaton self.children = {} # children def __repr__(self): """ Textual representation of node. """ if self.output is not nil: return "" % (self.char, self.output) else: return "" % self.char class Trie(object): """ Trie/Aho-Corasick automaton. """ def __init__(self): """ Construct an empty trie """ self.root = TrieNode('') def __get_node(self, word): """ Private function retrieving a final node of trie for given word Returns node or None, if the trie doesn't contain the word. """ node = self.root for c in word: try: node = node.children[c] except KeyError: return None return node def get(self, word, default=nil): """ Retrieves output value associated with word. If there is no word returns default value, and if default is not given rises KeyError. """ node = self.__get_node(word) output = nil if node: output = node.output if output is nil: if default is nil: raise KeyError("no key '%s'" % word) else: return default else: return output def keys(self): """ Generator returning all keys (i.e. word) stored in trie """ for key, _ in self.items(): yield key def values(self): """ Generator returning all values associated with words stored in a trie. """ for _, value in self.items(): yield value def items(self): """ Generator returning all keys and values stored in a trie. 
""" L = [] def aux(node, s): s = s + node.char if node.output is not nil: L.append((s, node.output)) for child in node.children.values(): if child is not node: aux(child, s) aux(self.root, '') return iter(L) def __len__(self): """ Calculates number of words in a trie. """ stack = deque() stack.append(self.root) n = 0 while stack: node = stack.pop() if node.output is not nil: n += 1 for child in node.children.values(): stack.append(child) return n def add_word(self, word, value): """ Adds word and associated value. If word already exists, its value is replaced. """ if not word: return node = self.root for c in word: try: node = node.children[c] except KeyError: n = TrieNode(c) node.children[c] = n node = n node.output = value def clear(self): """ Clears trie. """ self.root = TrieNode('') def exists(self, word): """ Checks if whole word is present in the trie. """ node = self.__get_node(word) if node: return bool(node.output != nil) else: return False def match(self, word): """ Checks if word is a prefix of any existing word in the trie. """ return (self.__get_node(word) is not None) def make_automaton(self): """ Converts trie to Aho-Corasick automaton. """ queue = deque() # 1. for i in range(256): c = chr(i) if c in self.root.children: node = self.root.children[c] node.fail = self.root # f(s) = 0 queue.append(node) else: self.root.children[c] = self.root # 2. 
while queue: r = queue.popleft() for node in r.children.values(): queue.append(node) state = r.fail while node.char not in state.children: state = state.fail node.fail = state.children.get(node.char, self.root) def iter(self, string): """ Generator performs Aho-Corasick search string algorithm, yielding tuples containing two values: - position in string - outputs associated with matched strings """ state = self.root for index, c in enumerate(string): while c not in state.children: state = state.fail state = state.children.get(c, self.root) tmp = state output = [] while tmp is not nil: if tmp.output is not nil: output.append(tmp.output) tmp = tmp.fail if output: yield (index, output) def iter_long(self, string): """ Generator performs a modified Aho-Corasick search string algorithm, which maches only the longest word. """ state = self.root last = None index = 0 while index < len(string): c = string[index] if c in state.children: state = state.children[c] if state.output is not nil: # save the last node on the path last = (state.output, index) index += 1 else: if last: # return the saved match yield last # and start over, as we don't want overlapped results # Note: this leads to quadratic complexity in the worst case index = last[1] + 1 state = self.root last = None else: # if no output, perform classic Aho-Corasick algorithm while c not in state.children: state = state.fail # corner case if last: yield last def find_all(self, string, callback): """ Wrapper on iter method, callback gets an iterator result """ for index, output in self.iter(string): callback(index, output) if __name__ == '__main__': def demo(): words = "he hers his she hi him man".split() t = Trie(); for w in words: t.add_word(w, w) s = "he rshershidamanza " t.make_automaton() for res in t.items(): print(res) for res in t.iter(s): print print('%s' % s) pos, matches = res for fragment in matches: print('%s%s' % ((pos - len(fragment) + 1)*' ', fragment)) demo() def bug(): patterns = 
['GT-C3303','SAMSUNG-GT-C3303K/'] text = 'SAMSUNG-GT-C3303i/1.0 NetFront/3.5 Profile/MIDP-2.0 Configuration/CLDC-1.1' t = Trie() for pattern in patterns: ret = t.add_word(pattern, (0, pattern)) t.make_automaton() res = list(t.iter(text)) assert len(res) == 1, 'failed' bug() # vim: ts=4 sw=4 nowrap python-pyahocorasick_1.4.1.orig/py/unittests.py0000644000000000000000000001122112744172027016711 0ustar00""" Aho-Corasick string search algorithm. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import unittest from pyahocorasick import Trie class TestTrie(unittest.TestCase): def testEmptyTrieShouldNotContainsAnyWords(self): t = Trie() self.assertEqual(len(t), 0) def testAddedWordShouldBeCountedAndAvailableForRetrieval(self): t = Trie() t.add_word('python', 'value') self.assertEqual(len(t), 1) self.assertEqual(t.get('python'), 'value') def testAddingExistingWordShouldReplaceAssociatedValue(self): t = Trie() t.add_word('python', 'value') self.assertEqual(len(t), 1) self.assertEqual(t.get('python'), 'value') t.add_word('python', 'other') self.assertEqual(len(t), 1) self.assertEqual(t.get('python'), 'other') def testGetUnknowWordWithoutDefaultValueShouldRaiseException(self): t = Trie() with self.assertRaises(KeyError): t.get('python') def testGetUnknowWordWithDefaultValueShouldReturnDefault(self): t = Trie() self.assertEqual(t.get('python', 'default'), 'default') def testExistShouldDetectAddedWords(self): t = Trie() t.add_word('python', 'value') t.add_word('ada', 'value') self.assertTrue(t.exists('python')) self.assertTrue(t.exists('ada')) def testExistShouldReturnFailOnUnknownWord(self): t = Trie() t.add_word('python', 'value') self.assertFalse(t.exists('ada')) def testMatchShouldDetecAllPrefixesIncludingWord(self): t = Trie() t.add_word('python', 'value') t.add_word('ada', 'value') self.assertTrue(t.match('a')) self.assertTrue(t.match('ad')) self.assertTrue(t.match('ada')) self.assertTrue(t.match('p')) 
self.assertTrue(t.match('py')) self.assertTrue(t.match('pyt')) self.assertTrue(t.match('pyth')) self.assertTrue(t.match('pytho')) self.assertTrue(t.match('python')) def testItemsShouldReturnAllItemsAlreadyAddedToTheTrie(self): t = Trie() t.add_word('python', 1) t.add_word('ada', 2) t.add_word('perl', 3) t.add_word('pascal', 4) t.add_word('php', 5) result = list(t.items()) self.assertEquals(len(result), 5) self.assertIn(('python', 1), result) self.assertIn(('ada', 2), result) self.assertIn(('perl', 3), result) self.assertIn(('pascal', 4), result) self.assertIn(('php', 5), result) def testKeysShouldReturnAllKeysAlreadyAddedToTheTrie(self): t = Trie() t.add_word('python', 1) t.add_word('ada', 2) t.add_word('perl', 3) t.add_word('pascal', 4) t.add_word('php', 5) result = list(t.keys()) self.assertEquals(len(result), 5) self.assertIn('python',result) self.assertIn('ada', result) self.assertIn('perl', result) self.assertIn('pascal',result) self.assertIn('php', result) def testValuesShouldReturnAllValuesAlreadyAddedToTheTrie(self): t = Trie() t.add_word('python', 1) t.add_word('ada', 2) t.add_word('perl', 3) t.add_word('pascal', 4) t.add_word('php', 5) result = list(t.values()) self.assertEquals(len(result), 5) self.assertIn(1, result) self.assertIn(2, result) self.assertIn(3, result) self.assertIn(4, result) self.assertIn(5, result) def testClearShouldRemoveEveryting(self): t = Trie() t.add_word('python', 1) t.add_word('ada', 2) t.add_word('perl', 3) t.add_word('pascal', 4) t.add_word('php', 5) self.assertEqual(len(t), 5) self.assertEqual(len(list(t.items())), 5) t.clear() self.assertEqual(len(t), 0) self.assertEqual(len(list(t.items())), 0) def testIterShouldMatchAllStrings(self): def get_test_automaton(): words = "he her hers his she hi him man himan".split() t = Trie(); for w in words: t.add_word(w, w) t.make_automaton() return t test_string = "he she himan" t = get_test_automaton() result = list(t.iter(test_string)) # there are 5 matching positions 
self.assertEquals(len(result), 5) # result should have be valid, i.e. returned position and substring # must match substring from test string for end_index, strings in result: for s in strings: n = len(s) self.assertEqual(s, test_string[end_index - n + 1 : end_index + 1]) def testFindAllShouldGetTheSameDataAsIter(self): def get_test_automaton(): words = "he her hers his she hi him man himan".split() t = Trie(); for w in words: t.add_word(w, w) t.make_automaton() return t find_all_arguments = [] def find_all_callback(end_index, strings): find_all_arguments.append((end_index, strings)) t = get_test_automaton() test_string = "he she himan" t.find_all(test_string, find_all_callback) result_items = list(t.iter(test_string)) self.assertEquals(find_all_arguments, result_items) if __name__ == '__main__': unittest.main() python-pyahocorasick_1.4.1.orig/regression/issue_10.py0000644000000000000000000000122512744172027020032 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import ahocorasick ac = ahocorasick.Automaton() ac.add_word('S', 1) ac.make_automaton() buffer = 'SSS' def case_1(): count = 0 for item in ac.iter(buffer, 0, 3): # this causes an error print(item) count += 1 assert(count == 3) def case_2(): count = 0 for item in ac.iter(buffer, 0, 2): # no error, but it misses the last 'S' in the buffer print(item) count += 1 assert(count == 2) case_1() case_2() python-pyahocorasick_1.4.1.orig/regression/issue_19.py0000644000000000000000000000054112744172027020043 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. 
Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import ahocorasick A = ahocorasick.Automaton() for index, word in enumerate("he her hers she".split()): A.add_word(word, (index, word)) A.clear() python-pyahocorasick_1.4.1.orig/regression/issue_26.py0000644000000000000000000000051712744172027020044 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import pickle import ahocorasick as aho a = aho.Automaton(aho.STORE_INTS) a.add_word('abc', 12) a.make_automaton() p = pickle.dumps(a) python-pyahocorasick_1.4.1.orig/regression/issue_5.py0000644000000000000000000000061312744172027017756 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import ahocorasick A = ahocorasick.Automaton() # add some words to trie for index, word in enumerate("he her hers she".split()): A.add_word(word, (index, word)) A = None #### segfault here python-pyahocorasick_1.4.1.orig/regression/issue_50-part1.py0000644000000000000000000000032213051776644021067 0ustar00from ahocorasick import Automaton from pickle import load, dump auto = Automaton() auto.add_word('abc', 'abc') auto.add_word('def', 'def') with open('automaton-wee.pickle', 'wb') as dest: dump(auto, dest) python-pyahocorasick_1.4.1.orig/regression/issue_50-part2.py0000644000000000000000000000020413051776644021067 0ustar00from ahocorasick import Automaton from pickle import load, dump with open('automaton-wee.pickle', 'rb') as src: auto = load(src) python-pyahocorasick_1.4.1.orig/regression/issue_53.py0000644000000000000000000000053113054044727020040 0ustar00from ahocorasick import Automaton auto = Automaton() auto.add_word('wounded', 'wounded') auto.make_automaton() for item in auto.iter('Winning \U0001F629 so gutted, can\'t do anything for 
4 weeks... Myth. #wounded'): print(item) for item in auto.iter('Winning so gutted, can\'t do anything for 4 weeks... Myth. #wounded'): print(item) python-pyahocorasick_1.4.1.orig/regression/issue_56.py0000644000000000000000000000115113050613275020036 0ustar00import ahocorasick def iter_results(s): r = [] for x in A.iter(teststr): r.append(x) return r def find_all_results(s): r = [] def append(x, s): r.append((x, s)) A.find_all(s, append) return r A = ahocorasick.Automaton() for word in ("poke", "go", "pokegois", "egoist"): A.add_word(word, word) A.make_automaton() teststr = 'pokego pokego pokegoist' expected = iter_results(teststr) findall = find_all_results(teststr) if findall != expected: print("expected: %s" % expected) print("findall : %s" % findall) assert findall == expected python-pyahocorasick_1.4.1.orig/regression/issue_8.py0000644000000000000000000000267312744172027017771 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import ahocorasick test_sentences_rus = ["!ASM Print", "!ASM Print, tyre компания er", "!ASM Print, рекламно-производственная компания rr", "!Action Pact!", "!T.O.O.H.!", "!YES, лингвистический центр", "!ts, магазин", "!ФЕСТ", '"100-th" department store', '"1000 мелочей"', '"1001 мелочь"', '"19 отряд Федеральной противопожарной службы по Ленинградской области"', '"У Друзей"', '"ШТОРЫ и не только..."'] test_sentences_pl = [ "wąż", # a snake "mąż", # a husband - why so similar :) "żółć", "aż", "waży" ] def create_sutomata_rus(): A = ahocorasick.Automaton() for sentences in test_sentences_rus[-7:]: for index, word in enumerate(sentences.split(' ')): A.add_word(word, (index, word)) A.make_automaton() def create_and_iter_sutomata_pl(): A = ahocorasick.Automaton() for index, word in enumerate(test_sentences_pl): A.add_word(word, (index, word)) A.make_automaton() for item in A.iter("wyważyć"): print(item) if __name__ == 
'__main__': create_sutomata_rus() create_and_iter_sutomata_pl() python-pyahocorasick_1.4.1.orig/regression/issue_9.py0000644000000000000000000000214513036643734017767 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import os import sys import ahocorasick ac = ahocorasick.Automaton() ac.add_word('SSSSS', 1) ac.make_automaton() try: range = xrange # for Py2 except NameError: pass def get_memory_usage(): # Linux only pid = os.getpid() lines = [] try: with open('/proc/%d/status' % pid, 'rt') as f: lines = f.readlines() except: pass for line in lines: if line.startswith('VmSize'): return float(line.split()[1]) return 0 def test(): with open('README.rst', 'r') as f: data = f.read()[:1024 * 2] for loop in range(1000): for start in range(0, len(data) - 20): ac.iter(data, start) if __name__ == '__main__': before = get_memory_usage() test() after = get_memory_usage() print("Memory's usage growth: %s (before = %s, after = %s)" % (after - before, before, after)) assert(before == after) python-pyahocorasick_1.4.1.orig/src/custompickle/0000755000000000000000000000000013406755432017144 5ustar00python-pyahocorasick_1.4.1.orig/src/inline_doc.h0000644000000000000000000002503613642662762016731 0ustar00#pragma once // DO NOT EDIT. File generated by script update_inlinedoc.py. #define automaton___reduce___doc \ "__reduce__()\n" \ "\n" \ "Return pickle-able data for this automaton instance." #define automaton___sizeof___doc \ "Return the approximate size in bytes occupied by the\n" \ "Automaton instance in memory excluding the size of\n" \ "associated objects when the Automaton is created with\n" \ "Automaton() or Automaton(ahocorasick.STORE_ANY)." #define automaton_add_word_doc \ "add_word(key, [value]) -> boolean\n" \ "\n" \ "Add a key string to the dict-like trie and associate this\n" \ "key with a value. 
value is optional or mandatory depending\n" \ "how the Automaton instance was created. Return True if the\n" \ "word key is inserted and did not exists in the trie or False\n" \ "otherwise. The value associated with an existing word is\n" \ "replaced.\n" \ "\n" \ "The value is either mandatory or optional:\n" \ "- If the Automaton was created without argument (the\n" \ " default) as Automaton() or with\n" \ " Automaton(ahocorasik.STORE_ANY) then the value is required\n" \ " and can be any Python object.\n" \ "- If the Automaton was created with\n" \ " Automaton(ahocorasik.STORE_INTS) then the value is\n" \ " optional. If provided it must be an integer, otherwise it\n" \ " defaults to len(automaton) which is therefore the order\n" \ " index in which keys are added to the trie.\n" \ "- If the Automaton was created with\n" \ " Automaton(ahocorasik.STORE_LENGTH) then associating a\n" \ " value is not allowed - len(word) is saved automatically as\n" \ " a value instead.\n" \ "\n" \ "Calling add_word() invalidates all iterators only if the new\n" \ "key did not exist in the trie so far (i.e. the method\n" \ "returned True)." #define automaton_clear_doc \ "clear()\n" \ "\n" \ "Remove all keys from the trie. This method invalidates all\n" \ "iterators." #define automaton_constructor_doc \ "Automaton(value_type=ahocorasick.STORE_ANY, [key_type])\n" \ "\n" \ "Create a new empty Automaton. 
Both value_type and key_type\n" \ "are optional.\n" \ "\n" \ "value_type is one of these constants:\n" \ "- ahocorasick.STORE_ANY [default] : The associated value can\n" \ " be any Python object.\n" \ "- ahocorasick.STORE_LENGTH : The length of an added string\n" \ " key is automatically used as the associated value stored\n" \ " in the trie for that key.\n" \ "- ahocorasick.STORE_INTS : The associated value must be a\n" \ " 32-bit integer.\n" \ "\n" \ "key_type defines the type of data that can be stored in an\n" \ "automaton; it is one of these constants and defines type of\n" \ "data might be stored:\n" \ "- ahocorasick.KEY_STRING [default] : string\n" \ "- ahocorasick.KEY_SEQUENCE : sequences of integers; The size\n" \ " of integer depends the version and platform Python, but\n" \ " for versions of Python >= 3.3, it is guaranteed to be\n" \ " 32-bits." #define automaton_dump_doc \ "dump()\n" \ "\n" \ "Return a three-tuple of lists describing the Automaton as a\n" \ "graph of nodes, edges, failure links.\n" \ "- nodes: each item is a pair (node id, end of word marker)\n" \ "- edges: each item is a triple (node id, label char, child\n" \ " node id)\n" \ "- failure links: each item is a pair (source node id, node\n" \ " if connected by fail node)\n" \ "\n" \ "For each of these, the node id is a unique number and a\n" \ "label is a number." #define automaton_exists_doc \ "exists(key) -> boolean\n" \ "\n" \ "Return True if the key is present in the trie. Same as using\n" \ "the 'in' keyword." #define automaton_find_all_doc \ "find_all(string, callback, [start, [end]])\n" \ "\n" \ "Perform the Aho-Corasick search procedure using the provided\n" \ "input string and iterate over the matching tuples\n" \ "(end_index, value) for keys found in string. 
Invoke the\n" \ "callback callable for each matching tuple.\n" \ "\n" \ "The callback callable must accept two positional arguments:\n" \ "- end_index is the end index in the input string where a\n" \ "trie key string was found. - value is the value associated\n" \ "with the found key string.\n" \ "\n" \ "The start and end optional arguments can be used to limit\n" \ "the search to an input string slice as in string[start:end].\n" \ "\n" \ "Equivalent to a loop on iter() calling a callable at each\n" \ "iteration." #define automaton_get_doc \ "get(key[, default])\n" \ "\n" \ "Return the value associated with the key string.\n" \ "\n" \ "Raise a KeyError exception if the key is not in the trie and\n" \ "no default is provided.\n" \ "\n" \ "Return the optional default value if provided and the key is\n" \ "not in the trie." #define automaton_get_stats_doc \ "get_stats() -> dict\n" \ "\n" \ "Return a dictionary containing Automaton statistics.\n" \ "- nodes_count - total number of nodes\n" \ "- words_count - number of distinct words (same as\n" \ " len(automaton))\n" \ "- longest_word - length of the longest word\n" \ "- links_count - number of edges\n" \ "- sizeof_node - size of single node in bytes\n" \ "- total_size - total size of trie in bytes (about\n" \ " nodes_count * size_of node + links_count * size of\n" \ " pointer)." #define automaton_items_doc \ "items([prefix, [wildcard, [how]]])\n" \ "\n" \ "Return an iterator on tuples of (key, value). Keys are\n" \ "matched optionally to the prefix using the same logic and\n" \ "arguments as in the keys() method." 
#define automaton_iter_doc \ "iter(string, [start, [end]], ignore_white_space=False)\n" \ "\n" \ "Perform the Aho-Corasick search procedure using the provided\n" \ "input string.\n" \ "\n" \ "Return an iterator of tuples (end_index, value) for keys\n" \ "found in string where:\n" \ "- end_index is the end index in the input string where a\n" \ " trie key string was found.\n" \ "- value is the value associated with the found key string.\n" \ "\n" \ "The start and end optional arguments can be used to limit\n" \ "the search to an input string slice as in string[start:end].\n" \ "\n" \ "The ignore_white_space optional arguments can be used to\n" \ "ignore white spaces from input string." #define automaton_iter_long_doc \ "iter_long(string, [start, [end]])\n" \ "\n" \ "Perform the modified Aho-Corasick search procedure which\n" \ "matches the longest words from set.\n" \ "\n" \ "Return an iterator of tuples (end_index, value) for keys\n" \ "found in string where:\n" \ "- end_index is the end index in the input string where a\n" \ " trie key string was found.\n" \ "- value is the value associated with the found key string.\n" \ "\n" \ "The start and end optional arguments can be used to limit\n" \ "the search to an input string slice as in string[start:end]." #define automaton_keys_doc \ "keys([prefix, [wildcard, [how]]])\n" \ "\n" \ "Return an iterator on keys. 
If the optional prefix string is\n" \ "provided, only yield keys starting with this prefix.\n" \ "\n" \ "If the optional wildcard is provided as a single character\n" \ "string, then the prefix is treated as a simple pattern using\n" \ "this character as a wildcard.\n" \ "\n" \ "The optional how argument is used to control how strings are\n" \ "matched using one of these possible values:\n" \ "- ahocorasick.MATCH_EXACT_LENGTH (default) Yield matches\n" \ " that have the same exact length as the prefix length.\n" \ "- ahocorasick.MATCH_AT_LEAST_PREFIX Yield matches that have\n" \ " a length greater or equal to the prefix length.\n" \ "- ahocorasick.MATCH_AT_MOST_PREFIX Yield matches that have a\n" \ " length lesser or equal to the prefix length." #define automaton_len_doc \ "len() -> integer\n" \ "\n" \ "Return the number of distinct keys added to the trie." #define automaton_longest_prefix_doc \ "longest_prefix(string) => integer\n" \ "\n" \ "Return the length of the longest prefix of string that\n" \ "exists in the trie." #define automaton_make_automaton_doc \ "make_automaton()\n" \ "\n" \ "Finalize and create the Aho-Corasick automaton based on the\n" \ "keys already added to the trie. This does not require\n" \ "additional memory. After successful creation the\n" \ "Automaton.kind attribute is set to ahocorasick.AHOCORASICK." #define automaton_match_doc \ "match(key) -> bool\n" \ "\n" \ "Return True if there is a prefix (or key) equal to key\n" \ "present in the trie.\n" \ "\n" \ "For example if the key 'example' has been added to the trie,\n" \ "then calls to match('e'), match('ex'), ..., match('exampl')\n" \ "or match('example') all return True. But exists() is True\n" \ "only when calling exists('example')." #define automaton_pop_doc \ "pop(word)\n" \ "\n" \ "Remove given word from a trie and return associated values.\n" \ "Raise a KeyError if the word was not found." 
#define automaton_remove_word_doc \ "remove_word(word) -> bool\n" \ "\n" \ "Remove given word from a trie. Return True if words was\n" \ "found, False otherwise." #define automaton_save_doc \ "save(path, serializer)\n" \ "\n" \ "Save content of automaton in an on-disc file.\n" \ "\n" \ "Serializer is a callable object that is used when automaton\n" \ "store type is STORE_ANY. This method converts a python\n" \ "object into bytes; it can be pickle.dumps." #define automaton_search_iter_doc \ "This class is not available directly but instances of\n" \ "AutomatonSearchIter are returned by the iter() method of an\n" \ "Automaton. This iterator can be manipulated through its\n" \ "set() method." #define automaton_search_iter_set_doc \ "set(string, reset=False)\n" \ "\n" \ "Set a new string to search. When the reset argument is False\n" \ "(default) then the Aho-Corasick procedure is continued and\n" \ "the internal state of the Automaton and end index of the\n" \ "string being searched are not reset. This allow to search\n" \ "for large strings in multiple smaller chunks." #define automaton_values_doc \ "values([prefix, [wildcard, [how]]])\n" \ "\n" \ "Return an iterator on values associated with each keys. Keys\n" \ "are matched optionally to the prefix using the same logic\n" \ "and arguments as in the keys() method." #define module_doc \ "pyahocorasick is a fast and memory efficient library for\n" \ "exact or approximate multi-pattern string search meaning\n" \ "that you can find multiple key strings occurrences at once\n" \ "in some input text." #define module_load_doc \ "load(path, deserializer) => Automaton\n" \ "\n" \ "Load automaton previously stored on disc using save method.\n" \ "\n" \ "Deserializer is a callable object which converts bytes back\n" \ "into python object; it can be pickle.loads." 
python-pyahocorasick_1.4.1.orig/src/pickle/0000755000000000000000000000000013403255600015676 5ustar00python-pyahocorasick_1.4.1.orig/src/pycallfault/0000755000000000000000000000000013403235140016744 5ustar00python-pyahocorasick_1.4.1.orig/src/custompickle/custompickle.c0000644000000000000000000000302713417131365022007 0ustar00#include "custompickle.h" #include "../../Automaton.h" static const char CUSTOMPICKLE_MAGICK[16] = { 'p', 'y', 'a', 'h', 'o', 'c', 'o', 'r', 'a', 's', 'i', 'c', 'k', // signature '0', '0', '2' // format version }; void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton) { ASSERT(header != NULL); ASSERT(automaton != NULL); memcpy(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)); header->data.kind = automaton->kind; header->data.store = automaton->store; header->data.key_type = automaton->key_type; header->data.words_count = automaton->count; header->data.longest_word = automaton->longest_word; } void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodes_count) { ASSERT(footer != NULL); memcpy(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)); footer->nodes_count = nodes_count; } int custompickle_validate_header(CustompickleHeader* header) { if (memcmp(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)) != 0) return false; if (!check_store(header->data.store)) return false; if (!check_kind(header->data.kind)) return false; if (!check_key_type(header->data.key_type)) return false; return true; } int custompickle_validate_footer(CustompickleFooter* footer) { return (memcmp(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)) == 0); } python-pyahocorasick_1.4.1.orig/src/custompickle/custompickle.h0000644000000000000000000000147413406755432022025 0ustar00#pragma once #include "../../Automaton.h" typedef struct AutomatonData { AutomatonKind kind; KeysStore store; KeyType key_type; size_t words_count; int longest_word; } AutomatonData; typedef 
struct CustompickleHeader { char magick[16]; // CUSTOMPICKLE_MAGICK AutomatonData data; } CustompickleHeader; typedef struct CustompickleFooter { size_t nodes_count; char magick[16]; // CUSTOMPICKLE_MAGICK } CustompickleFooter; void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton); void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodescount); int custompickle_validate_header(CustompickleHeader* header); int custompickle_validate_footer(CustompickleFooter* footer); python-pyahocorasick_1.4.1.orig/src/custompickle/load/0000755000000000000000000000000013406755432020063 5ustar00python-pyahocorasick_1.4.1.orig/src/custompickle/pyhelpers.c0000644000000000000000000000304413406755432021324 0ustar00#include "pyhelpers.h" bool automaton_save_load_parse_args(KeysStore store, PyObject* args, SaveLoadParameters* result) { PyObject* string; if (store == STORE_ANY) { if (PyTuple_GET_SIZE(args) != 2) { PyErr_SetString(PyExc_ValueError, "expected exactly two arguments"); return false; } } else { if (PyTuple_GET_SIZE(args) != 1) { PyErr_SetString(PyExc_ValueError, "expected exactly one argument"); return false; } } string = F(PyTuple_GetItem)(args, 0); if (UNLIKELY(string == NULL)) { return false; } #if defined(PY3K) if (UNLIKELY(!F(PyUnicode_Check)(string))) { PyErr_SetString(PyExc_TypeError, "the first argument must be a string"); return false; } #else if (UNLIKELY(!F(PyString_Check)(string))) { PyErr_SetString(PyExc_TypeError, "the first argument must be a string"); return false; } #endif if (store == STORE_ANY) { result->callback = F(PyTuple_GetItem)(args, 1); if (UNLIKELY(result->callback == NULL)) { return false; } if (UNLIKELY(!F(PyCallable_Check)(result->callback))) { PyErr_SetString(PyExc_TypeError, "the second argument must be a callable object"); return false; } } #if defined(PY3K) result->path = F(PyUnicode_AsUTF8String)(string); #else result->path = string; Py_INCREF(string); #endif if (UNLIKELY(result->path == 
NULL)) { return false; } return true; } python-pyahocorasick_1.4.1.orig/src/custompickle/pyhelpers.h0000644000000000000000000000033113406755432021325 0ustar00#pragma once typedef struct SaveLoadParameters { PyObject* path; PyObject* callback; } SaveLoadParameters; bool automaton_save_load_parse_args(KeysStore store, PyObject* args, SaveLoadParameters* result); python-pyahocorasick_1.4.1.orig/src/custompickle/save/0000755000000000000000000000000013406755432020102 5ustar00python-pyahocorasick_1.4.1.orig/src/custompickle/load/loadbuffer.c0000644000000000000000000000626513406755432022351 0ustar00#include "loadbuffer.h" int loadbuffer_open(LoadBuffer* input, const char* path, PyObject* deserializer) { ASSERT(input != NULL); ASSERT(path != NULL); input->file = NULL; input->lookup = NULL; input->size = 0; input->capacity = 0; input->deserializer = deserializer; input->file = fopen(path, "rb"); if (UNLIKELY(input->file == NULL)) { PyErr_SetFromErrno(PyExc_IOError); return 0; } return 1; } int loadbuffer_load(LoadBuffer* input, char* buffer, size_t size) { size_t read; ASSERT(input != NULL); ASSERT(buffer != NULL); if (UNLIKELY(size == 0)) { PyErr_SetString(PyExc_ValueError, "logic error: tried to read 0 bytes"); return 0; } read = fread(buffer, 1, size, input->file); if (read != size) { PyErr_SetFromErrno(PyExc_IOError); return 0; } return 1; } int loadbuffer_init(LoadBuffer* input, CustompickleHeader* header, CustompickleFooter* footer) { long pos; int ret; ASSERT(input != NULL); ASSERT(header != NULL); ASSERT(footer != NULL); ret = loadbuffer_loadinto(input, header, CustompickleHeader); if (UNLIKELY(!ret)) { return 0; } pos = ftell(input->file); if (UNLIKELY(pos < 0)) { PyErr_SetFromErrno(PyExc_IOError); return 0; } ret = fseek(input->file, -sizeof(CustompickleFooter), SEEK_END); if (UNLIKELY(ret < 0)) { PyErr_SetFromErrno(PyExc_IOError); return 0; } ret = loadbuffer_loadinto(input, footer, CustompickleFooter); if (UNLIKELY(!ret)) { return 0; } ret = fseek(input->file, 
pos, SEEK_SET); if (UNLIKELY(ret < 0)) { PyErr_SetFromErrno(PyExc_IOError); return 0; } if (UNLIKELY(!custompickle_validate_header(header))) { PyErr_Format(PyExc_ValueError, "invalid header"); return 0; } if (UNLIKELY(!custompickle_validate_footer(footer))) { PyErr_Format(PyExc_ValueError, "invalid footer"); return 0; } input->store = header->data.store; input->kind = header->data.kind; input->size = 0; input->capacity = footer->nodes_count; input->lookup = (AddressPair*)memory_alloc(sizeof(AddressPair) * input->capacity); if (UNLIKELY(input->lookup == NULL)) { PyErr_NoMemory(); return 0; } return 1; } void loadbuffer_invalidate(LoadBuffer* input) { ASSERT(input != NULL); input->size = 0; } void loadbuffer_close(LoadBuffer* input) { TrieNode* node; size_t i; if (input->file != NULL) { fclose(input->file); } if (input->lookup) { for (i=0; i < input->size; i++) { node = input->lookup[i].current; if (node->eow && input->store == STORE_ANY) { Py_DECREF(node->output.object); } trienode_free(node); } memory_free(input->lookup); } } void loadbuffer_dump(LoadBuffer* input, FILE* out) { AddressPair* pair; size_t i; for (i=0; i < input->size; i++) { pair = &(input->lookup[i]); fprintf(out, "%p -> %p\n", pair->original, pair->current); } } python-pyahocorasick_1.4.1.orig/src/custompickle/load/loadbuffer.h0000644000000000000000000000162713406755432022353 0ustar00#pragma once #include #include "../../../trienode.h" #include "../custompickle.h" typedef struct AddressPair { TrieNode* original; TrieNode* current; } AddressPair; typedef struct LoadBuffer { PyObject* deserializer; FILE* file; KeysStore store; AutomatonKind kind; AddressPair* lookup; size_t size; size_t capacity; } LoadBuffer; int loadbuffer_open(LoadBuffer* input, const char* path, PyObject* deserializer); int loadbuffer_load(LoadBuffer* input, char* output, size_t size); #define loadbuffer_loadinto(input, variable, type) \ loadbuffer_load(input, (char*)(variable), sizeof(type)) int loadbuffer_init(LoadBuffer* 
input, CustompickleHeader* header, CustompickleFooter* footer); void loadbuffer_invalidate(LoadBuffer* input); void loadbuffer_close(LoadBuffer* input); void loadbuffer_dump(LoadBuffer* input, FILE* out); python-pyahocorasick_1.4.1.orig/src/custompickle/load/module_automaton_load.c0000644000000000000000000001514713417131365024605 0ustar00#include "module_automaton_load.h" #include "../../../Automaton.h" #include "loadbuffer.h" // --- public ----------------------------------------------------------- static bool automaton_load_impl(Automaton* automaton, const char* path, PyObject* deserializer); PyObject* module_automaton_load(PyObject* module, PyObject* args) { SaveLoadParameters params; Automaton* automaton; int ret; automaton = (Automaton*)automaton_create(); if (UNLIKELY(automaton == NULL)) { return NULL; } if (UNLIKELY(!automaton_save_load_parse_args(automaton->store, args, ¶ms))) { Py_DECREF(automaton); return NULL; } ret = automaton_load_impl(automaton, PyBytes_AsString(params.path), params.callback); Py_DECREF(params.path); if (LIKELY(ret)) return (PyObject*)automaton; else return NULL; } // ----private ---------------------------------------------------------- static bool automaton_load_node(LoadBuffer* input); static TrieNode* automaton_load_fixup_pointers(LoadBuffer* input); static bool automaton_load_impl(Automaton* automaton, const char* path, PyObject* deserializer) { TrieNode* root; LoadBuffer input; CustompickleHeader header; CustompickleFooter footer; size_t i; if (!loadbuffer_open(&input, path, deserializer)) { return false; } if (!loadbuffer_init(&input, &header, &footer)) { goto exception; } if (header.data.kind == TRIE || header.data.kind == AHOCORASICK) { for (i=0; i < input.capacity; i++) { if (UNLIKELY(!automaton_load_node(&input))) { goto exception; } } root = automaton_load_fixup_pointers(&input); if (UNLIKELY(root == NULL)) { goto exception; } } else if (header.data.kind == EMPTY) { root = NULL; } else { PyErr_SetString(PyExc_ValueError, 
"automaton kind save in file is invalid"); goto exception; } loadbuffer_close(&input); // setup object automaton->kind = header.data.kind; automaton->store = header.data.store; automaton->key_type = header.data.key_type; automaton->count = header.data.words_count; automaton->longest_word = header.data.longest_word; automaton->version = 0; automaton->stats.version = -1; automaton->root = root; return true; exception: loadbuffer_close(&input); return false; } static bool automaton_load_node(LoadBuffer* input) { PyObject* bytes; // XXX: it might be reused (i.e. be part of input) PyObject* object; TrieNode* original; TrieNode* node; size_t size; int ret; // 1. get original address of upcoming node ret = loadbuffer_loadinto(input, &original, TrieNode*); if (UNLIKELY(!ret)) { return false; } // 2. load node data node = (TrieNode*)memory_alloc(sizeof(TrieNode)); if (UNLIKELY(node == NULL)) { PyErr_NoMemory(); return false; } ret = loadbuffer_load(input, (char*)node, PICKLE_TRIENODE_SIZE); if (UNLIKELY(!ret)) { memory_free(node); return false; } node->next = NULL; // 3. load next pointers if (node->n > 0) { size = sizeof(Pair) * node->n; node->next = (Pair*)memory_alloc(size); if (UNLIKELY(node->next == NULL)) { PyErr_NoMemory(); goto exception; } ret = loadbuffer_load(input, (char*)(node->next), size); if (UNLIKELY(!ret)) { goto exception; } } // 4. 
load custom python object if (node->eow && input->store == STORE_ANY) { size = (size_t)(node->output.integer); bytes = F(PyBytes_FromStringAndSize)(NULL, size); if (UNLIKELY(bytes == NULL)) { goto exception; } ret = loadbuffer_load(input, PyBytes_AS_STRING(bytes), size); if (UNLIKELY(!ret)) { Py_DECREF(bytes); goto exception; } object = F(PyObject_CallFunction)(input->deserializer, "O", bytes); if (UNLIKELY(object == NULL)) { Py_DECREF(bytes); goto exception; } node->output.object = object; Py_DECREF(bytes); } input->lookup[input->size].original = original; input->lookup[input->size].current = node; input->size += 1; return true; exception: memory_safefree(node->next); memory_free(node); return false; } static int addresspair_cmp(const void* a, const void *b) { const TrieNode* Aptr; const TrieNode* Bptr; uintptr_t A; uintptr_t B; Aptr = ((AddressPair*)a)->original; Bptr = ((AddressPair*)b)->original; A = (uintptr_t)Aptr; B = (uintptr_t)Bptr; if (A < B) { return -1; } else if (A > B) { return +1; } else { return 0; } } static TrieNode* lookup_address(LoadBuffer* input, TrieNode* original) { AddressPair* pair; pair = (AddressPair*)bsearch(&original, input->lookup, input->size, sizeof(AddressPair), addresspair_cmp); if (LIKELY(pair != NULL)) { return pair->current; } else { return NULL; } } static bool automaton_load_fixup_node(LoadBuffer* input, TrieNode* node) { size_t i; if (input->kind == AHOCORASICK && node->fail != NULL) { node->fail = lookup_address(input, node->fail); if (UNLIKELY(node->fail == NULL)) { return false; } } if (node->n > 0) { for (i=0; i < node->n; i++) { node->next[i].child = lookup_address(input, node->next[i].child); if (UNLIKELY(node->next[i].child == NULL)) { return false; } } } return true; } static TrieNode* automaton_load_fixup_pointers(LoadBuffer* input) { TrieNode* root; TrieNode* node; size_t i; ASSERT(input != NULL); // 1. root is the first node stored in the array root = input->lookup[0].current; // 2. 
sort array to make it bsearch-able qsort(input->lookup, input->size, sizeof(AddressPair), addresspair_cmp); // 3. convert all next and fail pointers to current pointers for (i=0; i < input->size; i++) { node = input->lookup[i].current; if (UNLIKELY(!automaton_load_fixup_node(input, node))) { PyErr_Format(PyExc_ValueError, "Detected malformed pointer during unpickling node %lu", i); return NULL; } } loadbuffer_invalidate(input); return root; } python-pyahocorasick_1.4.1.orig/src/custompickle/load/module_automaton_load.h0000644000000000000000000000022413406755432024605 0ustar00#pragma once #define module_automaton_load_doc \ "Load automaton from a file" PyObject* module_automaton_load(PyObject* module, PyObject* args); python-pyahocorasick_1.4.1.orig/src/custompickle/save/automaton_save.c0000644000000000000000000000676213417131365023301 0ustar00#include "automaton_save.h" #include "../custompickle.h" #include "../pyhelpers.h" #include "savebuffer.h" // --- public ----------------------------------------------------------- static bool automaton_save_impl(Automaton* automaton, const char* path, PyObject* serializer); PyObject* automaton_save(PyObject* self, PyObject* args) { SaveLoadParameters params; Automaton* automaton; int ret; automaton = (Automaton*)self; if (UNLIKELY(!automaton_save_load_parse_args(automaton->store, args, ¶ms))) { return NULL; } ret = automaton_save_impl(automaton, PyBytes_AsString(params.path), params.callback); Py_DECREF(params.path); if (LIKELY(ret)) Py_RETURN_NONE; else return NULL; } // --- private ---------------------------------------------------------- static int automaton_save_node(TrieNode* node, const int depth, void* extra); static bool automaton_save_impl(Automaton* automaton, const char* path, PyObject* serializer) { CustompickleHeader header; CustompickleFooter footer; SaveBuffer output; int ret; ret = savebuffer_init(&output, serializer, automaton->store, path, SAVEBUFFER_DEFAULT_SIZE); if (!ret) return false; 
custompickle_initialize_header(&header, automaton); // 1. save header savebuffer_store(&output, (const char*)&header, sizeof(header)); // 2. save nodes if (automaton->kind != EMPTY) { trie_traverse(automaton->root, automaton_save_node, &output); if (UNLIKELY(PyErr_Occurred() != NULL)) { goto exception; } } // 3. save footer custompickle_initialize_footer(&footer, output.nodes_count); savebuffer_store(&output, (const char*)&footer, sizeof(footer)); savebuffer_finalize(&output); return true; exception: savebuffer_finalize(&output); return false; } static int automaton_save_node(TrieNode* node, const int depth, void* extra) { SaveBuffer* output; TrieNode* dump; PyObject* bytes; output = (SaveBuffer*)extra; // 1. save actual address of node savebuffer_store_pointer(output, (void*)node); // 2. obtain buffer dump = (TrieNode*)savebuffer_acquire(output, PICKLE_TRIENODE_SIZE); if (output->store != STORE_ANY) dump->output.integer = node->output.integer; dump->n = node->n; dump->eow = node->eow; dump->fail = node->fail; // 3. pickle python value associated with word if (node->eow && output->store == STORE_ANY) { bytes = F(PyObject_CallFunction)(output->serializer, "O", node->output.object); if (UNLIKELY(bytes == NULL)) { return 0; } if (UNLIKELY(!F(PyBytes_CheckExact)(bytes))) { PyErr_SetString(PyExc_TypeError, "serializer must return bytes object"); return 0; } // store the size of buffer in trie node [which is not saved yet in the file] *(size_t*)(&dump->output.integer) = PyBytes_GET_SIZE(bytes); } else { bytes = NULL; } // 4. save array of pointers if (node->n > 0) { savebuffer_store(output, (const char*)node->next, node->n * sizeof(Pair)); } // 5. 
save pickled data, if any if (bytes) { savebuffer_store(output, PyBytes_AS_STRING(bytes), PyBytes_GET_SIZE(bytes)); Py_DECREF(bytes); } output->nodes_count += 1; return 1; } python-pyahocorasick_1.4.1.orig/src/custompickle/save/automaton_save.h0000644000000000000000000000014713407731766023307 0ustar00#pragma once #include "../../../common.h" PyObject* automaton_save(PyObject* self, PyObject* args); python-pyahocorasick_1.4.1.orig/src/custompickle/save/savebuffer.c0000644000000000000000000000513413417131365022374 0ustar00#include "savebuffer.h" bool savebuffer_init(SaveBuffer* output, PyObject* serializer, KeysStore store, const char* path, size_t capacity) { output->store = store; output->file = NULL; output->buffer = NULL; output->size = 0; output->capacity = capacity; output->serializer = serializer; output->nodes_count = 0; if (PICKLE_SIZE_T_SIZE < sizeof(PyObject*)) { // XXX: this must be reworked, likely moved to module level PyErr_SetString(PyExc_SystemError, "unable to save data due to technical reasons"); return false; } if (UNLIKELY(store == STORE_ANY && serializer == NULL)) { PyErr_SetString(PyExc_ValueError, "for automatons with STORE_ANY serializer must be given"); return false; } output->buffer = (char*)memory_alloc(capacity); if (UNLIKELY(output->buffer == NULL)) { PyErr_NoMemory(); return false; } output->file = fopen(path, "wb"); if (output->file == NULL) { memory_free(output->buffer); output->buffer = NULL; PyErr_SetFromErrno(PyExc_IOError); return false; } return true; } void savebuffer_flush(SaveBuffer* output) { if (output->size != fwrite(output->buffer, 1, output->size, output->file)) { PyErr_SetFromErrno(PyExc_IOError); } output->size = 0; } char* savebuffer_acquire(SaveBuffer* output, size_t request) { char* ptr; if (UNLIKELY(request > output->capacity)) { return NULL; } if (UNLIKELY(output->size + request > output->capacity)) { savebuffer_flush(output); } ptr = output->buffer + output->size; output->size += request; return ptr; } void 
savebuffer_store(SaveBuffer* output, const char* data, size_t size) { if (UNLIKELY(size > output->capacity)) { savebuffer_flush(output); if (fwrite(data, 1, size, output->file) != size) { PyErr_SetFromErrno(PyExc_IOError); } return; } if (UNLIKELY(output->size + size >= output->capacity)) { savebuffer_flush(output); } memcpy(output->buffer + output->size, data, size); output->size += size; } void savebuffer_store_pointer(SaveBuffer* save, void* ptr) { char* buf; buf = savebuffer_acquire(save, sizeof(void*)); *((void**)buf) = ptr; } void savebuffer_finalize(SaveBuffer* output) { if (output->buffer != NULL && output->file != NULL && output->size > 0) { savebuffer_flush(output); } memory_safefree(output->buffer); if (output->file != NULL) { fclose(output->file); } } python-pyahocorasick_1.4.1.orig/src/custompickle/save/savebuffer.h0000644000000000000000000000135513406755432022407 0ustar00#pragma once #include "../../../Automaton.h" #define SAVEBUFFER_DEFAULT_SIZE (32 * 1024lu) typedef struct SaveBuffer { KeysStore store; FILE* file; char* buffer; size_t size; size_t capacity; PyObject* serializer; size_t nodes_count; ///< the total number of stored nodes } SaveBuffer; bool savebuffer_init(SaveBuffer* save, PyObject* serializer, KeysStore store, const char* path, size_t capacity); void savebuffer_flush(SaveBuffer* save); char* savebuffer_acquire(SaveBuffer* save, size_t request); void savebuffer_store(SaveBuffer* save, const char* data, size_t size); void savebuffer_store_pointer(SaveBuffer* save, void* ptr); void savebuffer_finalize(SaveBuffer* save); python-pyahocorasick_1.4.1.orig/src/pickle/pickle.h0000644000000000000000000000052213417131365017323 0ustar00#pragma once #include "../../trienode.h" // We save all TrieNode's fields except the last one, which is a pointer to array, // as we're store that array just after the node #define PICKLE_TRIENODE_SIZE (sizeof(TrieNode) - sizeof(Pair*)) #define PICKLE_SIZE_T_SIZE (sizeof(size_t)) #define PICKLE_CHUNK_COUNTER_SIZE 
(sizeof(Py_ssize_t)) python-pyahocorasick_1.4.1.orig/src/pickle/pickle_data.c0000644000000000000000000000450513406760402020312 0ustar00#include "pickle.h" #include "pickle_data.h" static void pickle_data__init_default(PickleData* data) { ASSERT(data != NULL); data->bytes_list = NULL; data->chunked = false; data->size = 0; data->data = NULL; data->count = NULL; data->top = 0; data->values = 0; data->error = false; } static void pickle_data__cleanup(PickleData* data) { ASSERT(data != NULL); Py_XDECREF(data->bytes_list); Py_XDECREF(data->values); } static bool pickle_data__add_next_buffer(PickleData* data) { PyObject* bytes; void* raw; ASSERT(data != NULL); bytes = F(PyBytes_FromStringAndSize)(NULL, data->size); if (UNLIKELY(bytes == NULL)) { return false; } if (UNLIKELY(F(PyList_Append)(data->bytes_list, bytes) < 0)) { Py_DECREF(bytes); return false; } raw = PyBytes_AS_STRING(bytes); data->count = (Py_ssize_t*)raw; (*data->count) = 0; data->data = (uint8_t*)raw; data->top = PICKLE_CHUNK_COUNTER_SIZE; return true; } static bool pickle_data__shrink_last_buffer(PickleData* data) { PyObject* bytes; PyObject* new; Py_ssize_t last_idx; ASSERT(data != NULL); if (data->top >= data->size) { return true; } ASSERT(data->bytes_list); last_idx = PyList_GET_SIZE(data->bytes_list) - 1; bytes = F(PyList_GetItem)(data->bytes_list, last_idx); if (UNLIKELY(bytes == NULL)) { return false; } new = F(PyBytes_FromStringAndSize)(PyBytes_AS_STRING(bytes), data->top); if (UNLIKELY(new == NULL)) { return false; } if (F(PyList_SetItem)(data->bytes_list, last_idx, new) < 0) { return false; } return true; } static int pickle_data__init(PickleData* data, KeysStore store, size_t total_size, size_t max_array_size) { pickle_data__init_default(data); ASSERT(total_size > 0); ASSERT(max_array_size > PICKLE_TRIENODE_SIZE * 1024); data->bytes_list = F(PyList_New)(0); if (UNLIKELY(data->bytes_list == NULL)) { return false; } if (store == STORE_ANY) { data->values = F(PyList_New)(0); if 
(UNLIKELY(data->values == NULL)) { Py_DECREF(data->bytes_list); return false; } } if (total_size <= max_array_size) { data->size = total_size + PICKLE_CHUNK_COUNTER_SIZE; data->chunked = false; } else { // TODO: more heuristic here: what if total_size > 100MB? what if > 1GB, > 10GB? data->size = max_array_size; data->chunked = true; } return pickle_data__add_next_buffer(data); } python-pyahocorasick_1.4.1.orig/src/pickle/pickle_data.h0000644000000000000000000000147514002633220020310 0ustar00#pragma once typedef struct PickleData { PyObject* bytes_list; ///< PyList of PyBytes bool chunked; ///< bytes_list has more than one element size_t size; ///< size of single array uint8_t* data; ///< current array Py_ssize_t* count; ///< ptr to number of nodes stored in the current array size_t top; ///< first free address in the current array PyObject* values; ///< a list (if store == STORE_ANY) bool error; ///< error occurred during pickling } PickleData; static void pickle_data__init_default(PickleData* data); static void pickle_data__cleanup(PickleData* data); static bool pickle_data__add_next_buffer(PickleData* data); static bool pickle_data__shrink_last_buffer(PickleData* data); static int pickle_data__init(PickleData* data, KeysStore store, size_t total_size, size_t max_array_size); python-pyahocorasick_1.4.1.orig/src/pycallfault/pycallfault.c0000644000000000000000000000140713406760402021441 0ustar00#include "pycallfault.h" #include static int pycall = -1; static int pycall_fail = -1; static int pycall_trap = 0; void initialize_pycallfault(void) { const char* fail = getenv("PYCALL_FAIL"); const char* trap = getenv("PYCALL_TRAP"); if (fail != NULL) { pycall_fail = atoi(fail); } if (trap != NULL) { pycall_trap = 1; } } int check(void) { pycall += 1; printf("Fail ID: %d\n", pycall); if (pycall == pycall_fail) { if (pycall_trap) { __builtin_trap(); } printf("Failed pycall #%d\n", pycall); return 1; } return 0; } int check_and_set_error(void) { if (check()) { 
PyErr_NoMemory(); return 1; } return 0; } python-pyahocorasick_1.4.1.orig/src/pycallfault/pycallfault.h0000644000000000000000000000436014002633220021435 0ustar00#ifndef PYCALLFAULT_H_ #define PYCALLFAULT_H_ #define F(name) name##_custom void initialize_pycallfault(void); // --- python function wrappers ----------------------------------------- int check(void); int check_and_set_error(void); #define PyObject_New_custom(...) (check_and_set_error() ? NULL : PyObject_New(__VA_ARGS__)) #define PyArg_ParseTuple_custom(...) (check() ? 0 : PyArg_ParseTuple(__VA_ARGS__)) #define PyTuple_GetItem_custom(...) (check_and_set_error() ? NULL : PyTuple_GetItem(__VA_ARGS__)) #define PyList_New_custom(arg) (check_and_set_error() ? NULL : PyList_New(arg)) #define PyList_GetItem_custom(...) (check_and_set_error() ? NULL : PyList_GetItem(__VA_ARGS__)) #define PyList_SetItem_custom(...) (check_and_set_error() ? -1 : PyList_SetItem(__VA_ARGS__)) #define PyList_Append_custom(...) (check_and_set_error() ? -1 : PyList_Append(__VA_ARGS__)) #define PyNumber_AsSsize_t_custom(...) (check_and_set_error() ? -1 : PyNumber_AsSsize_t(__VA_ARGS__)) #define Py_BuildValue_custom(...) (check_and_set_error() ? NULL : Py_BuildValue(__VA_ARGS__)) #define PyCallable_Check_custom(arg) (check() ? 0 : PyCallable_Check(arg)) #define PyString_Check_custom(arg) (check() ? 0 : PyString_Check(arg)) #define PyUnicode_Check_custom(arg) (check() ? 0 : PyUnicode_Check(arg)) #define PyBytes_Check_custom(arg) (check() ? 0 : PyBytes_Check(arg)) #define PyBytes_CheckExact_custom(arg) (check() ? 0 : PyBytes_CheckExact(arg)) #define PyNumber_Check_custom(arg) (check() ? 0 : PyNumber_Check(arg)) #define PyTuple_Check_custom(arg) (check() ? 0 : PyTuple_Check(arg)) #define PyObject_CallFunction_custom(...) (check_and_set_error() ? NULL : PyObject_CallFunction(__VA_ARGS__)) #define PyArg_ParseTupleAndKeywords_custom(...) (check_and_set_error() ? 
0 : PyArg_ParseTupleAndKeywords(__VA_ARGS__)) #define PyNumber_Index_custom(arg) (check_and_set_error() ? NULL : PyNumber_Index(arg)) #define PyUnicode_FromKindAndData_custom(...) (check_and_set_error() ? NULL : PyUnicode_FromKindAndData(__VA_ARGS__)) #define PyUnicode_AsUTF8String_custom(...) (check_and_set_error() ? NULL : PyUnicode_AsUTF8String(__VA_ARGS__)) #define PyBytes_FromStringAndSize_custom(...) (check_and_set_error() ? NULL : PyBytes_FromStringAndSize(__VA_ARGS__)) #endif // PYCALLFAULT_H_ python-pyahocorasick_1.4.1.orig/stamp/.gitignore0000644000000000000000000000000612707723621016761 0ustar00*_py? python-pyahocorasick_1.4.1.orig/tests/generate_random_words.py0000644000000000000000000000271513405536435021743 0ustar00import sys import os import random import gzip import pickle import optparse import time from optparse import OptionParser def main(): options = parse_args() app = TestApplication(options) app.run() chars = 'abcdefghijklmnopqestuvwxyzABCDEFGHIJKLMNOPQESTUVWXYZ0123456789.,;:-' class TestApplication(object): def __init__(self, options): self.options = options random.seed(options.seed) def run(self): n = self.options.words for i in range(n): print(self.generate_random_word()) def generate_random_word(self): n = random.randint(1, self.options.maxlength + 1) s = '' for i in range(n): s += random.choice(chars) return s def parse_args(): parser = OptionParser() parser.add_option( "--max-words", dest='words', type=int, default=50000, metavar='N', help="maximum number of words generated/loaded" ) parser.add_option( "--random", dest='random', action='store_true', default=False, help="generate random words" ) parser.add_option( "--seed", dest='seed', type=int, default=0, metavar='INT', help="random seed" ) parser.add_option( "--random-max-len", dest='maxlength', type=int, default=100, metavar='K', help="maximum count of characters in a word" ) (options, rest) = parser.parse_args() return options if __name__ == '__main__': main() 
python-pyahocorasick_1.4.1.orig/tests/memdump_check.py0000644000000000000000000000350413407740645020173 0ustar00import sys def main(): try: path = sys.argv[1] except IndexError: path = 'memory.dump' app = Application(path) if app.run(): sys.exit(0) else: sys.exit(1) class Application(object): def __init__(self, path): self.path = path self.memory = {} def run(self): with open(self.path, 'rt') as f: self.analyze(f) self.print_leaks() return len(self.memory) == 0 def analyze(self, file): self.memory = {} for i, line in enumerate(file): fields = line.split() action = fields[0] if action == 'A': id = fields[1] addr = fields[2] size = int(fields[3]) assert addr not in self.memory self.memory[addr] = (id, size) elif action == 'R': id = fields[1] oldaddr = fields[2] newaddr = fields[3] size = int(fields[4]) try: key = int(oldaddr, 16) del self.memory[oldaddr] except ValueError: pass assert newaddr not in self.memory self.memory[newaddr] = (id, size) elif action == 'F': addr = fields[1] if addr in self.memory: del self.memory[addr] def print_leaks(self): n = len(self.memory) if n == 0: return print('There are %d leaks:' % n) tmp = [(int(id), addr, size) for addr, (id, size) in self.memory.items()] tmp.sort(key=lambda item: item[0]) for id, addr, size in tmp: print('#%s: %s %d' % (id, addr, size)) if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/tests/memdump_maxalloc.py0000644000000000000000000000104513403221153020674 0ustar00import sys def main(): try: path = sys.argv[1] except IndexError: path = 'memory.dump' app = Application(path) app.run() class Application(object): def __init__(self, path): self.path = path def run(self): with open(self.path, 'rt') as f: print(max(self.ids(f))) def ids(self, file): for i, line in enumerate(file): fields = line.split() if fields[0] == 'A': yield int(fields[1]) if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/tests/memdump_maxrealloc.py0000644000000000000000000000104513407740645021243 0ustar00import 
sys def main(): try: path = sys.argv[1] except IndexError: path = 'memory.dump' app = Application(path) app.run() class Application(object): def __init__(self, path): self.path = path def run(self): with open(self.path, 'rt') as f: print(max(self.ids(f))) def ids(self, file): for i, line in enumerate(file): fields = line.split() if fields[0] == 'R': yield int(fields[1]) if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/tests/pickle_stresstest.py0000644000000000000000000001622213406760402021135 0ustar00import sys import os import random import gzip import pickle import optparse import time import ahocorasick from optparse import OptionParser def main(): options = parse_args() app = TestApplication(options) app.run() chars = 'abcdefghijklmnopqestuvwxyzABCDEFGHIJKLMNOPQESTUVWXYZ0123456789.,;:-' class TestApplication(object): def __init__(self, options): self.options = options self.words = set() random.seed(options.seed) def run(self): self.A = ahocorasick.Automaton() if self.options.compare and (not self.options.pickle and not self.options.save): self.generate_words() if self.options.pickle or self.options.save: self.add_words() if self.options.pickle: t1 = time.time() self.pickle() t2 = time.time() print(" time: %0.2fs" % (t2 - t1)) self.A.clear() if self.options.save: t1 = time.time() self.save() t2 = time.time() print(" time: %0.2fs" % (t2 - t1)) self.A.clear() if self.options.unpickle: t1 = time.time() self.unpickle() t2 = time.time() print(" time: %0.2fs" % (t2 - t1)) if self.options.load: t1 = time.time() self.load() t2 = time.time() print(" time: %0.2fs" % (t2 - t1)) if self.options.compare: self.compare() def add_words(self): if self.options.random: self.__add_random_words() else: self.__add_from_file() print("Automaton statistics:") d = self.A.get_stats() print("- nodes_count : %d" % d['nodes_count']) print("- words_count : %d" % d['words_count']) print("- links_count : %d" % d['links_count']) print("- longest_word : %d" % d['longest_word']) 
print("- sizeof_node : %d" % d['sizeof_node']) print("- total_size : %d" % d['total_size']) def __add_random_words(self): n = self.options.words print("Adding %d words" % n) while n > 0: word = self.generate_random_word() if self.options.compare: self.words.add(word) if self.A.add_word(word, True): n -= 1 def __add_from_file(self): n = self.options.words print("Adding %d words from %s" % (n, self.options.file_gz)) for i, word in enumerate(self.read()): if i > n: return if self.options.compare: self.words.add(word) self.A.add_word(word, True) def generate_words(self): if self.options.random: self.__generate_random_words() else: self.__load_words() def __generate_random_words(self): n = self.options.words print("Generating %d words" % n) while len(self.words) < n: word = self.generate_random_word() self.words.add(word) def __load_words(self): n = self.options.words print ("Loading %d words from %s" % (n, self.options.file_gz)) for i, word in enumerate(self.read()): if i < n: self.words.add(word) else: return def read(self): with gzip.open(self.options.file_gz, "rt", encoding="utf-8") as f: for line in f: yield line.strip() def pickle(self): path = self.options.picklepath print("Pickling automaton in %s" % path) with open(path, 'wb') as f: pickle.dump(self.A, f) size = os.path.getsize(path) print(" file size is %s" % format_size(size)) def unpickle(self): path = self.options.picklepath print("Unpickling automaton from %s" % path) with open(path, 'rb') as f: self.A = pickle.load(f) def save(self): path = self.options.picklepath print("Saving automaton in %s" % path) self.A.save(path, pickle.dumps); size = os.path.getsize(path) print(" file size is %s" % format_size(size)) def load(self): path = self.options.picklepath print("Loading automaton from %s" % path) self.A = ahocorasick.load(path, pickle.loads) def compare(self): print("Comparing added words with restored automaton") for word in self.A: self.words.remove(word) if self.words: print("Not all words were restored 
(%d missing)" % len(self.words)) def generate_random_word(self): n = random.randint(1, self.options.maxlength + 1) s = '' for i in range(n): s += random.choice(chars) return s def format_size(size): units = [ ('GB', 1024**3), ('MB', 1024**2), ('kB', 1024), ] for suffix, threshold in units: if size > threshold: return '%0.2f %s (%d bytes)' % (float(size)/threshold, suffix, size) return '%d bytes' % size def parse_args(): parser = OptionParser() parser.add_option( "--pickle-path", dest='picklepath', default='pickle_stresstest.pickle', help="path used in pickling/unpickling" ) parser.add_option( "-p", "--pickle", dest='pickle', action='store_true', default=False, help="perform pickle operation on generated/loaded words" ) parser.add_option( "-u", "--unpickle", dest='unpickle', action='store_true', default=False, help="perform unpickle operation on previously pickled data" ) parser.add_option( "-s", "--save", dest='save', action='store_true', default=False, help="perform save operation on generated/loaded words" ) parser.add_option( "-l", "--load", dest='load', action='store_true', default=False, help="perform load operation on previously saved data" ) parser.add_option( "-c", "--compare", action='store_true', default=False, help="compare generated/loaded words with unpickled data" ) parser.add_option( "--max-words", dest='words', type=int, default=50000, metavar='N', help="maximum number of words generated/loaded" ) parser.add_option( "--random", dest='random', action='store_true', default=False, help="generate random words" ) parser.add_option( "--seed", dest='seed', type=int, default=0, metavar='INT', help="random seed" ) parser.add_option( "--random-max-len", dest='maxlength', type=int, default=100, metavar='K', help="maximum count of characters in a word" ) parser.add_option( "--file-gz", metavar='FILE', help="load words from utf8-encoded gz file" ) (options, rest) = parser.parse_args() if not (options.file_gz or options.random): raise parser.error("pass --random 
or --file-gz option") if (options.pickle or options.unpickle) and (options.save or options.load): raise parser.error("use separately --pickle/--unpickle and --save/--load") return options if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/tests/pyfault_check.py0000644000000000000000000000205713403234605020203 0ustar00import sys def main(): path = sys.argv[1] app = Application(path) app.run() class Application(object): def __init__(self, path): self.path = path def run(self): with open(self.path, 'rt') as f: lines = [line.rstrip() for line in f if line.rstrip()] self.analyze(lines) def analyze(self, lines): error_sep = '======================================================================' traceback_sep = '----------------------------------------------------------------------' index = 0 while True: try: index = lines.index(error_sep, index) except ValueError: break index += 1 function = lines[index] index += 1 start = lines.index(traceback_sep, index) end = lines.index(traceback_sep, start + 1) index = end + 1 error = lines[end - 1] print('%s: %s' % (function, error)) if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/tests/removeword_stresstest.py0000644000000000000000000001073413404135655022065 0ustar00import sys import os import random import gzip import pickle import optparse import ahocorasick from optparse import OptionParser def main(): options = parse_args() app = TestApplication(options) app.run() chars = 'abcdefghijklmnopqestuvwxyzABCDEFGHIJKLMNOPQESTUVWXYZ0123456789.,;:-' class TestApplication(object): def __init__(self, options): self.options = options self.words = [] random.seed(options.seed) def run(self): self.A = ahocorasick.Automaton() self.add_words() self.remove() def add_words(self): if self.options.random: self.__add_random_words() else: self.__add_from_file() print("Automaton statistics:") d = self.A.get_stats() print("- nodes_count : %d" % d['nodes_count']) print("- words_count : %d" % d['words_count']) print("- 
links_count : %d" % d['links_count']) print("- longest_word : %d" % d['longest_word']) print("- sizeof_node : %d" % d['sizeof_node']) print("- total_size : %d" % d['total_size']) def remove(self): print("Removing %d words" % len(self.words)) random.shuffle(self.words) for word in self.words: self.A.remove_word(word) print("Automaton statistics:") d = self.A.get_stats() print("- nodes_count : %d" % d['nodes_count']) print("- words_count : %d" % d['words_count']) print("- links_count : %d" % d['links_count']) print("- longest_word : %d" % d['longest_word']) print("- sizeof_node : %d" % d['sizeof_node']) print("- total_size : %d" % d['total_size']) def __add_random_words(self): n = self.options.words print("Adding %d words" % n) while n > 0: word = self.generate_random_word() if self.A.add_word(word, True): n -= 1 self.words.append(word) def __add_from_file(self): n = self.options.words print("Adding %d words from %s" % (n, self.options.file_gz)) for i, word in enumerate(self.read()): if i > n: return self.A.add_word(word, True) self.words.append(word) def generate_words(self): if self.options.random: self.__generate_random_words() else: self.__load_words() def __generate_random_words(self): n = self.options.words print("Generating %d words" % n) while len(self.words) < n: word = self.generate_random_word() self.words.add(word) def __load_words(self): n = self.options.words print ("Loading %d words from %s" % (n, self.options.file_gz)) for i, word in enumerate(self.read()): if i < n: self.words.add(word) else: return def read(self): with gzip.open(self.options.file_gz, "rt", encoding="utf-8") as f: for line in f: yield line.strip() def generate_random_word(self): n = random.randint(1, self.options.maxlength + 1) s = '' for i in range(n): s += random.choice(chars) return s def format_size(size): units = [ ('GB', 1024**3), ('MB', 1024**2), ('kB', 1024), ] for suffix, threshold in units: if size > threshold: return '%0.2f %s (%d bytes)' % (float(size)/threshold, suffix, 
size) return '%d bytes' % size def parse_args(): parser = OptionParser() parser.add_option( "--max-words", dest='words', type=int, default=50000, metavar='N', help="maximum number of words generated/loaded" ) parser.add_option( "--random", dest='random', action='store_true', default=False, help="generate random words" ) parser.add_option( "--seed", dest='seed', type=int, default=0, metavar='INT', help="random seed" ) parser.add_option( "--random-max-len", dest='maxlength', type=int, default=100, metavar='K', help="maximum count of characters in a word" ) parser.add_option( "--file-gz", metavar='FILE', help="load words from utf8-encoded gz file" ) (options, rest) = parser.parse_args() if not (options.file_gz or options.random): raise parser.error("pass --random or --file-gz option") return options if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/tests/unittestlog_check.py0000644000000000000000000000237413403221153021074 0ustar00import sys def main(): path = sys.argv[1] app = Application(path) if app.run(): sys.exit(0) else: sys.exit(1) class Application(object): def __init__(self, path): self.path = path def run(self): with open(self.path, 'rt') as f: lines = [line.rstrip() for line in f if line.rstrip()] errors = self.analyze(lines) return errors def analyze(self, lines): error_sep = '======================================================================' traceback_sep = '----------------------------------------------------------------------' index = 0 result = True while True: try: index = lines.index(error_sep, index) except ValueError: break index += 1 function = lines[index] index += 1 start = lines.index(traceback_sep, index) end = lines.index(traceback_sep, start + 1) index = end + 1 error = lines[end - 1] if error != 'MemoryError': print('%s: %s' % (function, error)) result = False return result if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/tests/valgrind_check.py0000644000000000000000000000341113403215123020312 
0ustar00import os import sys def main(): app = Application(sys.argv[1], sys.argv[2]) if app.run(): sys.exit(0) else: sys.exit(1) class Application(object): def __init__(self, srcdir, path): self.srcdir = srcdir self.path = path self.sources = set() self.dump = 0 def run(self): self.gather_sources() with open(self.path, 'rt') as f: leaks = self.analyze(f) if not leaks: return True else: print("Following references found in %s (file -> line no)" % self.path) for name in sorted(leaks): lines = ', '.join(map(str, leaks[name])) print("- %s: %s" % (name, lines)) def gather_sources(self): for path in os.listdir(self.srcdir): if path.endswith('.c'): self.sources.add(path) def analyze(self, file): result = {} for k, line in enumerate(file): if 'by 0x' in line or 'at 0x' in line: try: # by 0xfffff: function (file.c:1234) # ^^^^^^ index = line.rindex('(') + 1 name = line[index:] if name.startswith('in '): continue index = name.index(':') name = name[:index] if self.dump: print(name) except ValueError: continue if name in self.sources: if name not in result: result[name] = [] result[name].append(k) return result if __name__ == '__main__': main() python-pyahocorasick_1.4.1.orig/unresolved_bugs/.gitignore0000644000000000000000000000000613050546452021040 0ustar00*.txt python-pyahocorasick_1.4.1.orig/unresolved_bugs/bug_81.py0000644000000000000000000000213713267725271020526 0ustar00# -*- coding: utf-8 -*- """ Aho-Corasick string search algorithm. 
Author : Wojciech Muła, wojciech_mula@poczta.onet.pl WWW : http://0x80.pl License : public domain """ import os import sys import ahocorasick try: range = xrange # for Py2 except NameError: pass def get_memory_usage(): # Linux only pid = os.getpid() lines = [] try: with open('/proc/%d/status' % pid, 'rt') as f: lines = f.readlines() except: pass for line in lines: if line.startswith('VmSize'): return float(line.split()[1]) return 0 def test(): with open('README.rst', 'r') as f: data = f.read().split() ac = ahocorasick.Automaton() for i, word in enumerate(data): ac.add_word(word, i) ac.make_automaton() for i in range(1024): s = list(ac.keys()) if __name__ == '__main__': before = get_memory_usage() test() after = get_memory_usage() print("Memory's usage growth: %s (before = %s, after = %s)" % (after - before, before, after)) assert(before == after)