././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1725263730.7794456 efficient_apriori-2.0.5/0000755000175100001770000000000014665267563014630 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263710.0 efficient_apriori-2.0.5/LICENSE0000644000175100001770000000204614665267536015637 0ustar00runnerdockerMIT License Copyright (c) 2018 Tommy Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1725263730.7794456 efficient_apriori-2.0.5/PKG-INFO0000644000175100001770000001504614665267563015733 0ustar00runnerdockerMetadata-Version: 2.1 Name: efficient_apriori Version: 2.0.5 Summary: An efficient Python implementation of the Apriori algorithm. Author-email: tommyod License: MIT License Copyright (c) 2018 Tommy Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Project-URL: Source, https://github.com/tommyod/Efficient-Apriori Project-URL: Homepage, https://github.com/tommyod/Efficient-Apriori Project-URL: Documentation, https://github.com/tommyod/Efficient-Apriori#readme Project-URL: Repository, https://github.com/tommyod/Efficient-Apriori.git Classifier: Development Status :: 5 - Production/Stable Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 Classifier: Programming Language :: Python :: 3.12 Requires-Python: >=3.8 Description-Content-Type: text/markdown License-File: LICENSE # Efficient-Apriori ![Build Status](https://github.com/tommyod/Efficient-Apriori/workflows/Python%20CI/badge.svg?branch=master) [![PyPI version](https://badge.fury.io/py/efficient-apriori.svg)](https://pypi.org/project/efficient-apriori/) [![Documentation Status](https://readthedocs.org/projects/efficient-apriori/badge/?version=latest)](https://efficient-apriori.readthedocs.io/en/latest/?badge=latest) [![Downloads](https://pepy.tech/badge/efficient-apriori)](https://pepy.tech/project/efficient-apriori) [![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) An efficient pure Python implementation of the Apriori algorithm. The apriori algorithm uncovers hidden structures in categorical data. The classical example is a database containing purchases from a supermarket. Every purchase has a number of items associated with it. We would like to uncover association rules such as `{bread, eggs} -> {bacon}` from the data. This is the goal of [association rule learning](https://en.wikipedia.org/wiki/Association_rule_learning), and the [Apriori algorithm](https://en.wikipedia.org/wiki/Apriori_algorithm) is arguably the most famous algorithm for this problem. This repository contains an efficient, well-tested implementation of the apriori algorithm as described in the [original paper](https://www.macs.hw.ac.uk/~dwcorne/Teaching/agrawal94fast.pdf) by Agrawal et al, published in 1994. **The code is stable and in widespread use.** It's cited in the book "*Mastering Machine Learning Algorithms*" by Bonaccorso. **The code is fast.** See timings in [this PR](https://github.com/tommyod/Efficient-Apriori/pull/40). ## Example Here's a minimal working example. Notice that in every transaction with `eggs` present, `bacon` is present too. Therefore, the rule `{eggs} -> {bacon}` is returned with 100 % confidence. ```python from efficient_apriori import apriori transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')] itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=1) print(rules) # [{eggs} -> {bacon}, {soup} -> {bacon}] ``` If your data is in a pandas DataFrame, you must [convert it to a list of tuples](https://github.com/tommyod/Efficient-Apriori/issues/12). Do you have **missing values**, or does the algorithm **run for a long time**? See [this comment](https://github.com/tommyod/Efficient-Apriori/issues/30#issuecomment-626129085). **More examples are included below.** ## Installation The software is available through GitHub, and through [PyPI](https://pypi.org/project/efficient-apriori/). You may install the software using `pip`. ```bash pip install efficient-apriori ``` ## Contributing You are very welcome to scrutinize the code and make pull requests if you have suggestions and improvements. Your submitted code must be PEP8 compliant, and all tests must pass. See list of contributors [here](https://github.com/tommyod/Efficient-Apriori/graphs/contributors). ## More examples ### Filtering and sorting association rules It's possible to filter and sort the returned list of association rules. ```python from efficient_apriori import apriori transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')] itemsets, rules = apriori(transactions, min_support=0.2, min_confidence=1) # Print out every rule with 2 items on the left hand side, # 1 item on the right hand side, sorted by lift rules_rhs = filter(lambda rule: len(rule.lhs) == 2 and len(rule.rhs) == 1, rules) for rule in sorted(rules_rhs, key=lambda rule: rule.lift): print(rule) # Prints the rule and its confidence, support, lift, ... ``` ### Transactions with IDs If you need to know which transactions occurred in the frequent itemsets, set the `output_transaction_ids` parameter to `True`. This changes the output to contain `ItemsetCount` objects for each itemset. The objects have a `members` property containing is the set of ids of frequent transactions as well as a `count` property. The ids are the enumeration of the transactions in the order they appear. ```python from efficient_apriori import apriori transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')] itemsets, rules = apriori(transactions, output_transaction_ids=True) print(itemsets) # {1: {('bacon',): ItemsetCount(itemset_count=3, members={0, 1, 2}), ... ``` ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263710.0 efficient_apriori-2.0.5/README.md0000644000175100001770000001067614665267536016121 0ustar00runnerdocker# Efficient-Apriori ![Build Status](https://github.com/tommyod/Efficient-Apriori/workflows/Python%20CI/badge.svg?branch=master) [![PyPI version](https://badge.fury.io/py/efficient-apriori.svg)](https://pypi.org/project/efficient-apriori/) [![Documentation Status](https://readthedocs.org/projects/efficient-apriori/badge/?version=latest)](https://efficient-apriori.readthedocs.io/en/latest/?badge=latest) [![Downloads](https://pepy.tech/badge/efficient-apriori)](https://pepy.tech/project/efficient-apriori) [![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) An efficient pure Python implementation of the Apriori algorithm. The apriori algorithm uncovers hidden structures in categorical data. The classical example is a database containing purchases from a supermarket. Every purchase has a number of items associated with it. We would like to uncover association rules such as `{bread, eggs} -> {bacon}` from the data. This is the goal of [association rule learning](https://en.wikipedia.org/wiki/Association_rule_learning), and the [Apriori algorithm](https://en.wikipedia.org/wiki/Apriori_algorithm) is arguably the most famous algorithm for this problem. This repository contains an efficient, well-tested implementation of the apriori algorithm as described in the [original paper](https://www.macs.hw.ac.uk/~dwcorne/Teaching/agrawal94fast.pdf) by Agrawal et al, published in 1994. **The code is stable and in widespread use.** It's cited in the book "*Mastering Machine Learning Algorithms*" by Bonaccorso. **The code is fast.** See timings in [this PR](https://github.com/tommyod/Efficient-Apriori/pull/40). ## Example Here's a minimal working example. Notice that in every transaction with `eggs` present, `bacon` is present too. Therefore, the rule `{eggs} -> {bacon}` is returned with 100 % confidence. ```python from efficient_apriori import apriori transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')] itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=1) print(rules) # [{eggs} -> {bacon}, {soup} -> {bacon}] ``` If your data is in a pandas DataFrame, you must [convert it to a list of tuples](https://github.com/tommyod/Efficient-Apriori/issues/12). Do you have **missing values**, or does the algorithm **run for a long time**? See [this comment](https://github.com/tommyod/Efficient-Apriori/issues/30#issuecomment-626129085). **More examples are included below.** ## Installation The software is available through GitHub, and through [PyPI](https://pypi.org/project/efficient-apriori/). You may install the software using `pip`. ```bash pip install efficient-apriori ``` ## Contributing You are very welcome to scrutinize the code and make pull requests if you have suggestions and improvements. Your submitted code must be PEP8 compliant, and all tests must pass. See list of contributors [here](https://github.com/tommyod/Efficient-Apriori/graphs/contributors). ## More examples ### Filtering and sorting association rules It's possible to filter and sort the returned list of association rules. ```python from efficient_apriori import apriori transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')] itemsets, rules = apriori(transactions, min_support=0.2, min_confidence=1) # Print out every rule with 2 items on the left hand side, # 1 item on the right hand side, sorted by lift rules_rhs = filter(lambda rule: len(rule.lhs) == 2 and len(rule.rhs) == 1, rules) for rule in sorted(rules_rhs, key=lambda rule: rule.lift): print(rule) # Prints the rule and its confidence, support, lift, ... ``` ### Transactions with IDs If you need to know which transactions occurred in the frequent itemsets, set the `output_transaction_ids` parameter to `True`. This changes the output to contain `ItemsetCount` objects for each itemset. The objects have a `members` property containing is the set of ids of frequent transactions as well as a `count` property. The ids are the enumeration of the transactions in the order they appear. ```python from efficient_apriori import apriori transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')] itemsets, rules = apriori(transactions, output_transaction_ids=True) print(itemsets) # {1: {('bacon',): ItemsetCount(itemset_count=3, members={0, 1, 2}), ... ``` ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1725263730.7754457 efficient_apriori-2.0.5/efficient_apriori/0000755000175100001770000000000014665267563020311 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263710.0 efficient_apriori-2.0.5/efficient_apriori/__init__.py0000644000175100001770000000125014665267536022420 0ustar00runnerdocker#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Implementation of the Apriori algorithm. """ import importlib.metadata from efficient_apriori.apriori import apriori from efficient_apriori.itemsets import itemsets_from_transactions from efficient_apriori.rules import Rule, generate_rules_apriori # We use semantic versioning # See https://semver.org/ __version__ = importlib.metadata.version("efficient_apriori") __all__ = ["apriori", "itemsets_from_transactions", "Rule", "generate_rules_apriori"] def run_tests(): """ Run all tests. """ import pytest import os base, _ = os.path.split(__file__) pytest.main(args=[base, "--doctest-modules"]) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263710.0 efficient_apriori-2.0.5/efficient_apriori/apriori.py0000644000175100001770000000555714665267536022344 0ustar00runnerdocker#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ High-level implementations of the apriori algorithm. """ import typing from efficient_apriori.itemsets import itemsets_from_transactions from efficient_apriori.rules import generate_rules_apriori def apriori( transactions: typing.Iterable[typing.Union[set, tuple, list]], min_support: float = 0.5, min_confidence: float = 0.5, max_length: int = 8, verbosity: int = 0, output_transaction_ids: bool = False, ): """ The classic apriori algorithm as described in 1994 by Agrawal et al. The Apriori algorithm works in two phases. Phase 1 iterates over the transactions several times to build up itemsets of the desired support level. Phase 2 builds association rules of the desired confidence given the itemsets found in Phase 1. Both of these phases may be correctly implemented by exhausting the search space, i.e. generating every possible itemset and checking it's support. The Apriori prunes the search space efficiently by deciding apriori if an itemset possibly has the desired support, before iterating over the entire dataset and checking. Parameters ---------- transactions : list of transactions (sets/tuples/lists). Each element in the transactions must be hashable. min_support : float The minimum support of the rules returned. The support is frequency of which the items in the rule appear together in the data set. min_confidence : float The minimum confidence of the rules returned. Given a rule X -> Y, the confidence is the probability of Y, given X, i.e. P(Y|X) = conf(X -> Y) max_length : int The maximum length of the itemsets and the rules. verbosity : int The level of detail printing when the algorithm runs. Either 0, 1 or 2. output_transaction_ids : bool If set to true, the output contains the ids of transactions that contain a frequent itemset. The ids are the enumeration of the transactions in the sequence they appear. Examples -------- >>> transactions = [('a', 'b', 'c'), ('a', 'b', 'd'), ('f', 'b', 'g')] >>> itemsets, rules = apriori(transactions, min_confidence=1) >>> rules [{a} -> {b}] """ itemsets, num_trans = itemsets_from_transactions( transactions, min_support, max_length, verbosity, output_transaction_ids=True, ) itemsets_raw = { length: {item: counter.itemset_count for (item, counter) in itemsets.items()} for (length, itemsets) in itemsets.items() } rules = generate_rules_apriori(itemsets_raw, min_confidence, num_trans, verbosity) if output_transaction_ids: return itemsets, list(rules) return itemsets_raw, list(rules) if __name__ == "__main__": import pytest pytest.main(args=[".", "--doctest-modules", "-v"]) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263710.0 efficient_apriori-2.0.5/efficient_apriori/itemsets.py0000644000175100001770000003253414665267536022527 0ustar00runnerdocker#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Implementations of algorithms related to itemsets. """ import itertools import numbers import typing import collections from dataclasses import field, dataclass import collections.abc @dataclass class ItemsetCount: itemset_count: int = 0 members: set = field(default_factory=set) class TransactionManager: # The brilliant transaction manager idea is due to: # https://github.com/ymoch/apyori/blob/master/apyori.py def __init__(self, transactions: typing.Iterable[typing.Iterable[typing.Hashable]]): # A lookup that returns indices of transactions for each item self.indices_by_item = collections.defaultdict(set) # Populate i = -1 for i, transaction in enumerate(transactions): for item in transaction: self.indices_by_item[item].add(i) # Total number of transactions self._transactions = i + 1 @property def items(self): return set(self.indices_by_item.keys()) def __len__(self): return self._transactions def transaction_indices(self, transaction: typing.Iterable[typing.Hashable]): """Return the indices of the transaction.""" transaction = set(transaction) # Copy item = transaction.pop() indices = self.indices_by_item[item] while transaction: item = transaction.pop() indices = indices.intersection(self.indices_by_item[item]) return indices def transaction_indices_sc(self, transaction: typing.Iterable[typing.Hashable], min_support: float = 0): """Return the indices of the transaction, with short-circuiting. Returns (over_or_equal_to_min_support, set_of_indices) """ # Sort items by number of transaction rows the item appears in, # starting with the item beloning to the most transactions transaction = sorted(transaction, key=lambda item: len(self.indices_by_item[item]), reverse=True) # Pop item appearing in the fewest item = transaction.pop() indices = self.indices_by_item[item] support = len(indices) / len(self) if support < min_support: return False, None # The support is a non-increasing function # Sorting by number of transactions the items appear in is a heuristic # to make the support drop as quickly as possible while transaction: item = transaction.pop() indices = indices.intersection(self.indices_by_item[item]) support = len(indices) / len(self) if support < min_support: return False, None # No short circuit happened return True, indices def join_step(itemsets: typing.List[tuple]): """ Join k length itemsets into k + 1 length itemsets. This algorithm assumes that the list of itemsets are sorted, and that the itemsets themselves are sorted tuples. Instead of always enumerating all n^2 combinations, the algorithm only has n^2 runtime for each block of itemsets with the first k - 1 items equal. Parameters ---------- itemsets : list of itemsets A list of itemsets of length k, to be joined to k + 1 length itemsets. Examples -------- >>> # This is an example from the 1994 paper by Agrawal et al. >>> itemsets = [(1, 2, 3), (1, 2, 4), (1, 3, 4), (1, 3, 5), (2, 3, 4)] >>> list(join_step(itemsets)) [(1, 2, 3, 4), (1, 3, 4, 5)] """ i = 0 # Iterate over every itemset in the itemsets while i < len(itemsets): # The number of rows to skip in the while-loop, initially set to 1 skip = 1 # Get all but the last item in the itemset, and the last item *itemset_first, itemset_last = itemsets[i] # We now iterate over every itemset following this one, stopping # if the first k - 1 items are not equal. If we're at (1, 2, 3), # we'll consider (1, 2, 4) and (1, 2, 7), but not (1, 3, 1) # Keep a list of all last elements, i.e. tail elements, to perform # 2-combinations on later on tail_items = [itemset_last] tail_items_append = tail_items.append # Micro-optimization # Iterate over ever itemset following this itemset for j in range(i + 1, len(itemsets)): # Get all but the last item in the itemset, and the last item *itemset_n_first, itemset_n_last = itemsets[j] # If it's the same, append and skip this itemset in while-loop if itemset_first == itemset_n_first: # Micro-optimization tail_items_append(itemset_n_last) skip += 1 # If it's not the same, break out of the for-loop else: break # For every 2-combination in the tail items, yield a new candidate # itemset, which is sorted. itemset_first_tuple = tuple(itemset_first) for a, b in itertools.combinations(tail_items, 2): yield itemset_first_tuple + (a,) + (b,) # Increment the while-loop counter i += skip def prune_step(itemsets: typing.Iterable[tuple], possible_itemsets: typing.List[tuple]): """ Prune possible itemsets whose subsets are not in the list of itemsets. Parameters ---------- itemsets : list of itemsets A list of itemsets of length k. possible_itemsets : list of itemsets A list of possible itemsets of length k + 1 to be pruned. Examples ------- >>> itemsets = [('a', 'b', 'c'), ('a', 'b', 'd'), ... ('b', 'c', 'd'), ('a', 'c', 'd')] >>> possible_itemsets = list(join_step(itemsets)) >>> list(prune_step(itemsets, possible_itemsets)) [('a', 'b', 'c', 'd')] """ # For faster lookups itemsets = set(itemsets) # Go through every possible itemset for possible_itemset in possible_itemsets: # Remove 1 from the combination, same as k-1 combinations # The itemsets created by removing the last two items in the possible # itemsets must be part of the itemsets by definition, # due to the way the `join_step` function merges the sorted itemsets for i in range(len(possible_itemset) - 2): removed = possible_itemset[:i] + possible_itemset[i + 1 :] # If every k combination exists in the set of itemsets, # yield the possible itemset. If it does not exist, then it's # support cannot be large enough, since supp(A) >= supp(AB) for # all B, and if supp(S) is large enough, then supp(s) must be large # enough for every s which is a subset of S. # This is the downward-closure property of the support function. if removed not in itemsets: break # If we have not breaked yet else: yield possible_itemset def apriori_gen(itemsets: typing.List[tuple]): """ Compute all possible k + 1 length supersets from k length itemsets. This is done efficiently by using the downward-closure property of the support function, which states that if support(S) > k, then support(s) > k for every subset s of S. Parameters ---------- itemsets : list of itemsets A list of itemsets of length k. Examples ------- >>> # This is an example from the 1994 paper by Agrawal et al. >>> itemsets = [(1, 2, 3), (1, 2, 4), (1, 3, 4), (1, 3, 5), (2, 3, 4)] >>> possible_itemsets = list(join_step(itemsets)) >>> list(prune_step(itemsets, possible_itemsets)) [(1, 2, 3, 4)] """ possible_extensions = join_step(itemsets) yield from prune_step(itemsets, possible_extensions) def itemsets_from_transactions( transactions: typing.Iterable[typing.Union[set, tuple, list]], min_support: float, max_length: int = 8, verbosity: int = 0, output_transaction_ids: bool = False, ): """ Compute itemsets from transactions by building the itemsets bottom up and iterating over the transactions to compute the support repedately. This is the heart of the Apriori algorithm by Agrawal et al. in the 1994 paper. Parameters ---------- transactions : a list of itemsets (tuples/sets/lists with hashable entries) min_support : float The minimum support of the itemsets, i.e. the minimum frequency as a percentage. max_length : int The maximum length of the itemsets. verbosity : int The level of detail printing when the algorithm runs. Either 0, 1 or 2. output_transaction_ids : bool If set to true, the output contains the ids of transactions that contain a frequent itemset. The ids are the enumeration of the transactions in the sequence they appear. Examples -------- >>> # This is an example from the 1994 paper by Agrawal et al. >>> transactions = [(1, 3, 4), (2, 3, 5), (1, 2, 3, 5), (2, 5)] >>> itemsets, _ = itemsets_from_transactions(transactions, min_support=2/5) >>> itemsets[1] == {(1,): 2, (2,): 3, (3,): 3, (5,): 3} True >>> itemsets[2] == {(1, 3): 2, (2, 3): 2, (2, 5): 3, (3, 5): 2} True >>> itemsets[3] == {(2, 3, 5): 2} True """ # STEP 0 - Sanitize user inputs # ----------------------------- if not (isinstance(min_support, numbers.Number) and (0 <= min_support <= 1)): raise ValueError("`min_support` must be a number between 0 and 1.") # Store in transaction manager manager = TransactionManager(transactions) # If no transactions are present transaction_count = len(manager) if transaction_count == 0: return dict(), 0 # large_itemsets, num_transactions # STEP 1 - Generate all large itemsets of size 1 # ---------------------------------------------- if verbosity > 0: print("Generating itemsets.") print(" Counting itemsets of length 1.") candidates: typing.Dict[tuple, int] = {(item,): len(indices) for item, indices in manager.indices_by_item.items()} large_itemsets: typing.Dict[int, typing.Dict[tuple, int]] = { 1: {item: count for (item, count) in candidates.items() if (count / len(manager)) >= min_support} } if verbosity > 0: print(" Found {} candidate itemsets of length 1.".format(len(manager.items))) print(" Found {} large itemsets of length 1.".format(len(large_itemsets.get(1, dict())))) if verbosity > 1: print(" {}".format(list(item for item in large_itemsets.get(1, dict()).keys()))) # If large itemsets were found, convert to dictionary if not large_itemsets.get(1, dict()): return dict(), 0 # large_itemsets, num_transactions # STEP 2 - Build up the size of the itemsets # ------------------------------------------ # While there are itemsets of the previous size k = 2 while large_itemsets[k - 1] and (max_length != 1): if verbosity > 0: print(" Counting itemsets of length {}.".format(k)) # STEP 2a) - Build up candidate of larger itemsets # Retrieve the itemsets of the previous size, i.e. of size k - 1 # They must be sorted to maintain the invariant when joining/pruning itemsets_list = sorted(item for item in large_itemsets[k - 1].keys()) # Gen candidates of length k + 1 by joining, prune, and copy as set # This algorithm assumes that the list of itemsets are sorted, # and that the itemsets themselves are sorted tuples C_k: typing.List[tuple] = list(apriori_gen(itemsets_list)) if verbosity > 0: print(" Found {} candidate itemsets of length {}.".format(len(C_k), k)) if verbosity > 1: print(" {}".format(C_k)) # If no candidate itemsets were found, break out of the loop if not C_k: break # Prepare counts of candidate itemsets (from the prune step) if verbosity > 1: print(" Iterating over transactions.") # Keep only large transactions found_itemsets: typing.Dict[tuple, int] = dict() for candidate in C_k: over_min_support, indices = manager.transaction_indices_sc(candidate, min_support=min_support) if over_min_support: found_itemsets[candidate] = len(indices) # If no itemsets were found, break out of the loop if not found_itemsets: break # Candidate itemsets were found, add them large_itemsets[k] = {i: counts for (i, counts) in found_itemsets.items()} if verbosity > 0: num_found = len(large_itemsets[k]) print(" Found {} large itemsets of length {}.".format(num_found, k)) if verbosity > 1: print(" {}".format(list(large_itemsets[k].keys()))) k += 1 # Break out if we are about to consider larger itemsets than the max if k > max_length: break if verbosity > 0: print("Itemset generation terminated.\n") if output_transaction_ids: itemsets_out = { length: { item: ItemsetCount(itemset_count=count, members=manager.transaction_indices(set(item))) for (item, count) in itemsets.items() } for (length, itemsets) in large_itemsets.items() } return itemsets_out, len(manager) return large_itemsets, len(manager) if __name__ == "__main__": import pytest pytest.main(args=[".", "--doctest-modules", "-v"]) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263710.0 efficient_apriori-2.0.5/efficient_apriori/rules.py0000644000175100001770000003455514665267536022031 0ustar00runnerdocker#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Implementations of algorithms related to association rules. """ import typing import numbers import itertools from efficient_apriori.itemsets import apriori_gen class Rule: """ A class for a rule. """ # Number of decimals used for printing _decimals = 3 def __init__( self, lhs: tuple, rhs: tuple, count_full: int = 0, count_lhs: int = 0, count_rhs: int = 0, num_transactions: int = 0, ): """ Initialize a new rule. This call is a thin wrapper around some data. Parameters ---------- lhs : tuple The left hand side (antecedent) of the rule. Each item in the tuple must be hashable, e.g. a string or an integer. rhs : tuple The right hand side (consequent) of the rule. count_full : int The count of the union of the lhs and rhs in the dataset. count_lhs : int The count of the lhs in the dataset. count_rhs : int The count of the rhs in the dataset. num_transactions : int The number of transactions in the dataset. Examples -------- >>> r = Rule(('a', 'b'), ('c',), 50, 100, 150, 200) >>> r.confidence # Probability of 'c', given 'a' and 'b' 0.5 >>> r.support # Probability of ('a', 'b', 'c') in the data 0.25 >>> # Ratio of observed over expected support if lhs, rhs = independent >>> r.lift == 2 / 3 True >>> print(r) {a, b} -> {c} (conf: 0.500, supp: 0.250, lift: 0.667, conv: 0.500) >>> r {a, b} -> {c} """ self.lhs = lhs # antecedent self.rhs = rhs # consequent self.count_full = count_full self.count_lhs = count_lhs self.count_rhs = count_rhs self.num_transactions = num_transactions @property def confidence(self): """ The confidence of a rule is the probability of the rhs given the lhs. If X -> Y, then the confidence is P(Y|X). """ try: return self.count_full / self.count_lhs except ZeroDivisionError: return None except AttributeError: return None @property def support(self): """ The support of a rule is the frequency of which the lhs and rhs appear together in the dataset. If X -> Y, then the support is P(Y and X). """ try: return self.count_full / self.num_transactions except ZeroDivisionError: return None except AttributeError: return None @property def lift(self): """ The lift of a rule is the ratio of the observed support to the expected support if the lhs and rhs were independent.If X -> Y, then the lift is given by the fraction P(X and Y) / (P(X) * P(Y)). """ try: observed_support = self.count_full / self.num_transactions prod_counts = self.count_lhs * self.count_rhs expected_support = prod_counts / self.num_transactions**2 return observed_support / expected_support except ZeroDivisionError: return None except AttributeError: return None @property def conviction(self): """ The conviction of a rule X -> Y is the ratio P(not Y) / P(not Y | X). It's the proportion of how often Y does not appear in the data to how often Y does not appear in the data, given X. If the ratio is large, then the confidence is large and Y appears often. """ try: eps = 10e-10 # Avoid zero division prob_not_rhs = 1 - self.count_rhs / self.num_transactions prob_not_rhs_given_lhs = 1 - self.confidence return prob_not_rhs / (prob_not_rhs_given_lhs + eps) except ZeroDivisionError: return None except AttributeError: return None @property def rpf(self): """ The RPF (Rule Power Factor) is the confidence times the support. """ try: return self.confidence * self.support except ZeroDivisionError: return None except AttributeError: return None @staticmethod def _pf(s): """ Pretty formatting of an iterable. """ return "{" + ", ".join(str(k) for k in s) + "}" def __repr__(self): """ Representation of a rule. """ return "{} -> {}".format(self._pf(self.lhs), self._pf(self.rhs)) def __str__(self): """ Printing of a rule. """ conf = "conf: {0:.3f}".format(self.confidence) supp = "supp: {0:.3f}".format(self.support) lift = "lift: {0:.3f}".format(self.lift) conv = "conv: {0:.3f}".format(self.conviction) return "{} -> {} ({}, {}, {}, {})".format(self._pf(self.lhs), self._pf(self.rhs), conf, supp, lift, conv) def __eq__(self, other): """ Equality of two rules. """ return (set(self.lhs) == set(other.lhs)) and (set(self.rhs) == set(other.rhs)) def __hash__(self): """ Hashing a rule for efficient set and dict representation. """ return hash(frozenset(self.lhs + self.rhs)) def __len__(self): """ The length of a rule, defined as the number of items in the rule. """ return len(self.lhs + self.rhs) def generate_rules_simple( itemsets: typing.Dict[int, typing.Dict], min_confidence: float, num_transactions: int, ): """ DO NOT USE. This is a simple top-down algorithm for generating association rules. It is included here for testing purposes, and because it is mentioned in the 1994 paper by Agrawal et al. It is slow because it does not enumerate the search space efficiently: it produces duplicates, and it does not prune the search space efficiently. Simple algorithm for generating association rules from itemsets. """ # Iterate over every size for size in itemsets.keys(): # Do not consider itemsets of size 1 if size < 2: continue # This algorithm returns duplicates, so we keep track of items yielded # in a set to avoid yielding duplicates yielded: set = set() yielded_add = yielded.add # Iterate over every itemset of the prescribed size for itemset in itemsets[size].keys(): # Generate rules for result in _genrules(itemset, itemset, itemsets, min_confidence, num_transactions): # If the rule has been yieded, keep going, else add and yield if result in yielded: continue yielded_add(result) yield result def _genrules(l_k, a_m, itemsets, min_conf, num_transactions): """ DO NOT USE. This is the gen-rules algorithm from the 1994 paper by Agrawal et al. It's a subroutine called by `generate_rules_simple`. However, the algorithm `generate_rules_simple` should not be used. The naive algorithm from the original paper. Parameters ---------- l_k : tuple The itemset containing all elements to be considered for a rule. a_m : tuple The itemset to take m-length combinations of, an move to the left of l_k. The itemset a_m is a subset of l_k. """ def count(itemset): """ Helper function to retrieve the count of the itemset in the dataset. """ return itemsets[len(itemset)][itemset] # Iterate over every k - 1 combination of a_m to produce # rules of the form a -> (l - a) for a_m in itertools.combinations(a_m, len(a_m) - 1): # Compute the count of this rule, which is a_m -> (l_k - a_m) confidence = count(l_k) / count(a_m) # Keep going if the confidence level is too low if confidence < min_conf: continue # Create the right hand set: rhs = (l_k - a_m) , and keep it sorted rhs = set(l_k).difference(set(a_m)) rhs = tuple(sorted(rhs)) # Create new rule object and yield it yield Rule(a_m, rhs, count(l_k), count(a_m), count(rhs), num_transactions) # If the left hand side has one item only, do not recurse the function if len(a_m) <= 1: continue yield from _genrules(l_k, a_m, itemsets, min_conf, num_transactions) def generate_rules_apriori( itemsets: typing.Dict[int, typing.Dict[tuple, int]], min_confidence: float, num_transactions: int, verbosity: int = 0, ): """ Bottom up algorithm for generating association rules from itemsets, very similar to the fast algorithm proposed in the original 1994 paper by Agrawal et al. The algorithm is based on the observation that for {a, b} -> {c, d} to hold, both {a, b, c} -> {d} and {a, b, d} -> {c} must hold, since in general conf( {a, b, c} -> {d} ) >= conf( {a, b} -> {c, d} ). In other words, if either of the two one-consequent rules do not hold, then there is no need to ever consider the two-consequent rule. Parameters ---------- itemsets : dict of dicts The first level of the dictionary is of the form (length, dict of item sets). The second level is of the form (itemset, count_in_dataset)). min_confidence : float The minimum confidence required for the rule to be yielded. num_transactions : int The number of transactions in the data set. verbosity : int The level of detail printing when the algorithm runs. Either 0, 1 or 2. Examples -------- >>> itemsets = {1: {('a',): 3, ('b',): 2, ('c',): 1}, ... 2: {('a', 'b'): 2, ('a', 'c'): 1}} >>> list(generate_rules_apriori(itemsets, 1.0, 3)) [{b} -> {a}, {c} -> {a}] """ # Validate user inputs if not ((0 <= min_confidence <= 1) and isinstance(min_confidence, numbers.Number)): raise ValueError("`min_confidence` must be a number between 0 and 1.") if not ((num_transactions >= 0) and isinstance(num_transactions, numbers.Number)): raise ValueError("`num_transactions` must be a number greater than 0.") def count(itemset): """ Helper function to retrieve the count of the itemset in the dataset. """ return itemsets[len(itemset)][itemset] if verbosity > 0: print("Generating rules from itemsets.") # For every itemset of a perscribed size for size in itemsets.keys(): # Do not consider itemsets of size 1 if size < 2: continue if verbosity > 0: print(" Generating rules of size {}.".format(size)) # For every itemset of this size for itemset in itemsets[size].keys(): # Generate combinations to start off of. These 1-combinations will # be merged to 2-combinations in the function `_ap_genrules` H_1 = [] # Special case to capture rules such as {others} -> {1 item} for removed in itertools.combinations(itemset, 1): # Compute the left hand side remaining = set(itemset).difference(set(removed)) lhs = tuple(sorted(remaining)) # If the confidence is high enough, yield the rule conf = count(itemset) / count(lhs) if conf >= min_confidence: yield Rule( lhs, removed, count(itemset), count(lhs), count(removed), num_transactions, ) # Consider the removed item for 2-combinations in the function `_ap_genrules` H_1.append(removed) # If H_1 is empty, there is nothing for passing to _ap_genrules, so continue to the next itemset if len(H_1) == 0: continue yield from _ap_genrules(itemset, H_1, itemsets, min_confidence, num_transactions) if verbosity > 0: print("Rule generation terminated.\n") def _ap_genrules( itemset: tuple, H_m: typing.List[tuple], itemsets: typing.Dict[int, typing.Dict[tuple, int]], min_conf: float, num_transactions: int, ): """ Recursively build up rules by adding more items to the right hand side. This algorithm is called `ap-genrules` in the original paper. It is called by the `generate_rules_apriori` generator above. See it's docs. Parameters ---------- itemset : tuple The itemset under consideration. H_m : tuple Subsets of the itemset of length m, to be considered for rhs of a rule. itemsets : dict of dicts All itemsets and counts for in the data set. min_conf : float The minimum confidence for a rule to be returned. num_transactions : int The number of transactions in the data set. """ def count(itemset): """ Helper function to retrieve the count of the itemset in the dataset. """ return itemsets[len(itemset)][itemset] # If H_1 is so large that calling `apriori_gen` will produce right-hand # sides as large as `itemset`, there will be no right hand side. # This should not happen happen, so we return. if len(itemset) <= (len(H_m[0]) + 1): return # Generate right-hand itemsets of length k + 1 if H is of length k H_m = list(apriori_gen(H_m)) H_m_copy = H_m.copy() # For every possible right hand side for h_m in H_m: # Compute the left hand side of the rule lhs = tuple(sorted(set(itemset).difference(set(h_m)))) # If the confidence is high enough, yield the rule, else remove from # the upcoming recursive generator call if (count(itemset) / count(lhs)) >= min_conf: yield Rule( lhs, h_m, count(itemset), count(lhs), count(h_m), num_transactions, ) else: H_m_copy.remove(h_m) # Unless the list of right-hand sides is empty, recurse the generator call if H_m_copy: yield from _ap_genrules(itemset, H_m_copy, itemsets, min_conf, num_transactions) if __name__ == "__main__": import pytest pytest.main(args=[".", "--doctest-modules", "-v"]) ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1725263730.7794456 efficient_apriori-2.0.5/efficient_apriori.egg-info/0000755000175100001770000000000014665267563022003 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263730.0 efficient_apriori-2.0.5/efficient_apriori.egg-info/PKG-INFO0000644000175100001770000001504614665267562023105 0ustar00runnerdockerMetadata-Version: 2.1 Name: efficient_apriori Version: 2.0.5 Summary: An efficient Python implementation of the Apriori algorithm. Author-email: tommyod License: MIT License Copyright (c) 2018 Tommy Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Project-URL: Source, https://github.com/tommyod/Efficient-Apriori Project-URL: Homepage, https://github.com/tommyod/Efficient-Apriori Project-URL: Documentation, https://github.com/tommyod/Efficient-Apriori#readme Project-URL: Repository, https://github.com/tommyod/Efficient-Apriori.git Classifier: Development Status :: 5 - Production/Stable Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 Classifier: Programming Language :: Python :: 3.12 Requires-Python: >=3.8 Description-Content-Type: text/markdown License-File: LICENSE # Efficient-Apriori ![Build Status](https://github.com/tommyod/Efficient-Apriori/workflows/Python%20CI/badge.svg?branch=master) [![PyPI version](https://badge.fury.io/py/efficient-apriori.svg)](https://pypi.org/project/efficient-apriori/) [![Documentation Status](https://readthedocs.org/projects/efficient-apriori/badge/?version=latest)](https://efficient-apriori.readthedocs.io/en/latest/?badge=latest) [![Downloads](https://pepy.tech/badge/efficient-apriori)](https://pepy.tech/project/efficient-apriori) [![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) An efficient pure Python implementation of the Apriori algorithm. The apriori algorithm uncovers hidden structures in categorical data. The classical example is a database containing purchases from a supermarket. Every purchase has a number of items associated with it. We would like to uncover association rules such as `{bread, eggs} -> {bacon}` from the data. This is the goal of [association rule learning](https://en.wikipedia.org/wiki/Association_rule_learning), and the [Apriori algorithm](https://en.wikipedia.org/wiki/Apriori_algorithm) is arguably the most famous algorithm for this problem. This repository contains an efficient, well-tested implementation of the apriori algorithm as described in the [original paper](https://www.macs.hw.ac.uk/~dwcorne/Teaching/agrawal94fast.pdf) by Agrawal et al, published in 1994. **The code is stable and in widespread use.** It's cited in the book "*Mastering Machine Learning Algorithms*" by Bonaccorso. **The code is fast.** See timings in [this PR](https://github.com/tommyod/Efficient-Apriori/pull/40). ## Example Here's a minimal working example. Notice that in every transaction with `eggs` present, `bacon` is present too. Therefore, the rule `{eggs} -> {bacon}` is returned with 100 % confidence. ```python from efficient_apriori import apriori transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')] itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=1) print(rules) # [{eggs} -> {bacon}, {soup} -> {bacon}] ``` If your data is in a pandas DataFrame, you must [convert it to a list of tuples](https://github.com/tommyod/Efficient-Apriori/issues/12). Do you have **missing values**, or does the algorithm **run for a long time**? See [this comment](https://github.com/tommyod/Efficient-Apriori/issues/30#issuecomment-626129085). **More examples are included below.** ## Installation The software is available through GitHub, and through [PyPI](https://pypi.org/project/efficient-apriori/). You may install the software using `pip`. ```bash pip install efficient-apriori ``` ## Contributing You are very welcome to scrutinize the code and make pull requests if you have suggestions and improvements. Your submitted code must be PEP8 compliant, and all tests must pass. See list of contributors [here](https://github.com/tommyod/Efficient-Apriori/graphs/contributors). ## More examples ### Filtering and sorting association rules It's possible to filter and sort the returned list of association rules. ```python from efficient_apriori import apriori transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')] itemsets, rules = apriori(transactions, min_support=0.2, min_confidence=1) # Print out every rule with 2 items on the left hand side, # 1 item on the right hand side, sorted by lift rules_rhs = filter(lambda rule: len(rule.lhs) == 2 and len(rule.rhs) == 1, rules) for rule in sorted(rules_rhs, key=lambda rule: rule.lift): print(rule) # Prints the rule and its confidence, support, lift, ... ``` ### Transactions with IDs If you need to know which transactions occurred in the frequent itemsets, set the `output_transaction_ids` parameter to `True`. This changes the output to contain `ItemsetCount` objects for each itemset. The objects have a `members` property containing is the set of ids of frequent transactions as well as a `count` property. The ids are the enumeration of the transactions in the order they appear. ```python from efficient_apriori import apriori transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')] itemsets, rules = apriori(transactions, output_transaction_ids=True) print(itemsets) # {1: {('bacon',): ItemsetCount(itemset_count=3, members={0, 1, 2}), ... ``` ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263730.0 efficient_apriori-2.0.5/efficient_apriori.egg-info/SOURCES.txt0000644000175100001770000000047014665267562023667 0ustar00runnerdockerLICENSE README.md pyproject.toml efficient_apriori/__init__.py efficient_apriori/apriori.py efficient_apriori/itemsets.py efficient_apriori/rules.py efficient_apriori.egg-info/PKG-INFO efficient_apriori.egg-info/SOURCES.txt efficient_apriori.egg-info/dependency_links.txt efficient_apriori.egg-info/top_level.txt././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263730.0 efficient_apriori-2.0.5/efficient_apriori.egg-info/dependency_links.txt0000644000175100001770000000000114665267562026050 0ustar00runnerdocker ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263730.0 efficient_apriori-2.0.5/efficient_apriori.egg-info/top_level.txt0000644000175100001770000000002214665267562024526 0ustar00runnerdockerefficient_apriori ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1725263710.0 efficient_apriori-2.0.5/pyproject.toml0000644000175100001770000000257614665267536017556 0ustar00runnerdocker[build-system] requires = ["setuptools", "wheel"] build-backend = "setuptools.build_meta" [project] name = "efficient_apriori" version = "2.0.5" dynamic = ["readme"] description = "An efficient Python implementation of the Apriori algorithm." license = {file = "LICENSE"} authors = [ {name = "tommyod", email = "tommy.odland@gmail.com"}, ] classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] requires-python = ">=3.8" [project.urls] Source = "https://github.com/tommyod/Efficient-Apriori" Homepage = "https://github.com/tommyod/Efficient-Apriori" Documentation = "https://github.com/tommyod/Efficient-Apriori#readme" Repository = "https://github.com/tommyod/Efficient-Apriori.git" [tool.setuptools] packages = ["efficient_apriori"] [tool.setuptools.dynamic] readme = { file = "README.md", content-type = "text/markdown"} [tool.pytest.ini_options] testpaths = ["efficient_apriori/tests"] addopts = "-v -ra -q" log_cli = true log_cli_level = "INFO" log_format = "%(asctime)s %(levelname)s %(message)s" log_date_format = "%Y-%m-%d %H:%M:%S" minversion = "3.8" filterwarnings = "ignore" norecursedirs = "docs" ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1725263730.7794456 efficient_apriori-2.0.5/setup.cfg0000644000175100001770000000004614665267563016451 0ustar00runnerdocker[egg_info] tag_build = tag_date = 0