pax_global_header00006660000000000000000000000064146023340760014517gustar00rootroot0000000000000052 comment=015d9a943fec65a7541d9afada738207ba089299 imbalanced-learn-0.12.2/000077500000000000000000000000001460233407600147575ustar00rootroot00000000000000imbalanced-learn-0.12.2/.circleci/000077500000000000000000000000001460233407600166125ustar00rootroot00000000000000imbalanced-learn-0.12.2/.circleci/config.yml000066400000000000000000000031371460233407600206060ustar00rootroot00000000000000version: 2 jobs: doc: docker: - image: cimg/python:3.8.12 environment: - USERNAME: "glemaitre" - ORGANIZATION: "imbalanced-learn" - DOC_REPO: "imbalanced-learn.github.io" - DOC_URL: "" - EMAIL: "g.lemaitre58@gmail.com" - MINICONDA_PATH: ~/miniconda - CONDA_ENV_NAME: testenv - OMP_NUM_THREADS: 1 - PYTHON_VERSION: 3 - NUMPY_VERSION: 'latest' - SCIPY_VERSION: 'latest' - SKLEARN_VERSION: 'latest' - MATPLOTLIB_VERSION: 'latest' - SPHINX_VERSION: 'min' - PANDAS_VERSION: 'latest' - SPHINX_GALLERY_VERSION: 'latest' - NUMPYDOC_VERSION: 'latest' - SPHINXCONTRIB_BIBTEX_VERSION: 'latest' - PYDATA_SPHINX_THEME_VERSION: 'latest' steps: - add_ssh_keys: fingerprints: - "34:ea:b1:d9:b1:e2:5d:79:81:c4:d0:39:ca:85:e1:ef" - checkout - run: ./build_tools/circle/checkout_merge_commit.sh - run: ./build_tools/circle/build_doc.sh - store_artifacts: path: doc/_build/html destination: doc - store_artifacts: path: ~/log.txt - persist_to_workspace: root: doc/_build/html paths: . 
- attach_workspace: at: doc/_build/html - run: ls -ltrh doc/_build/html - deploy: command: | if [[ "${CIRCLE_BRANCH}" =~ ^master$|^[0-9]+\.[0-9]+\.X$ ]]; then bash ./build_tools/circle/push_doc.sh doc/_build/html fi filters: branches: ignore: gh-pages workflows: version: 2 build-doc-and-deploy: jobs: - doc imbalanced-learn-0.12.2/.coveragerc000066400000000000000000000005451460233407600171040ustar00rootroot00000000000000# Configuration for coverage.py [run] branch = True source = imblearn include = */imblearn/* omit = */setup.py [report] exclude_lines = pragma: no cover def __repr__ if self.debug: if settings.DEBUG raise AssertionError raise NotImplementedError if 0: if __name__ == .__main__.: if self.verbose: show_missing = Trueimbalanced-learn-0.12.2/.flake8000066400000000000000000000003571460233407600161370ustar00rootroot00000000000000[flake8] max-line-length = 88 # Default flake8 3.5 ignored flags ignore=E121,E123,E126,E226,E24,E704,W503,W504,E203 # It's fine not to put the import at the top of the file in the examples # folder. per-file-ignores = examples/*: E402 imbalanced-learn-0.12.2/.github/000077500000000000000000000000001460233407600163175ustar00rootroot00000000000000imbalanced-learn-0.12.2/.github/ISSUE_TEMPLATE.md000066400000000000000000000031711460233407600210260ustar00rootroot00000000000000 #### Description #### Steps/Code to Reproduce #### Expected Results #### Actual Results #### Versions imbalanced-learn-0.12.2/.github/ISSUE_TEMPLATE/000077500000000000000000000000001460233407600205025ustar00rootroot00000000000000imbalanced-learn-0.12.2/.github/ISSUE_TEMPLATE/bug_report.md000066400000000000000000000030641460233407600231770ustar00rootroot00000000000000--- name: Bug report about: Create a report to help us reproduce and correct the bug title: "[BUG]" labels: bug assignees: '' --- #### Describe the bug A clear and concise description of what the bug is. 
#### Steps/Code to Reproduce ``` Sample code to reproduce the problem ``` #### Expected Results #### Actual Results #### Versions imbalanced-learn-0.12.2/.github/ISSUE_TEMPLATE/documentation-improvement.md000066400000000000000000000006231460233407600262410ustar00rootroot00000000000000--- name: Documentation improvement about: Create a report to help us improve the documentation title: "[DOC]" labels: Documentation, help wanted, good first issue assignees: '' --- #### Describe the issue linked to the documentation Tell us about the confusion introduce in the documentation. #### Suggest a potential alternative/fix Tell us how we could improve the documentation in this regard. imbalanced-learn-0.12.2/.github/ISSUE_TEMPLATE/feature_request.md000066400000000000000000000010271460233407600242270ustar00rootroot00000000000000--- name: Feature request about: Suggest an new algorithm, enhancement to an existing algorithm, etc. title: "[ENH]" labels: enhancement assignees: '' --- <-- If you want to propose a new algorithm, please refer first to the scikit-learn inclusion criterion: https://scikit-learn.org/stable/faq.html#what-are-the-inclusion-criteria-for-new-algorithms --> #### Is your feature request related to a problem? Please describe #### Describe the solution you'd like #### Describe alternatives you've considered #### Additional context imbalanced-learn-0.12.2/.github/ISSUE_TEMPLATE/other--blank-template-.md000066400000000000000000000002011460233407600251660ustar00rootroot00000000000000--- name: Other (blank template) about: For all other issues to reach the community... 
title: '' labels: '' assignees: '' --- imbalanced-learn-0.12.2/.github/ISSUE_TEMPLATE/question.md000066400000000000000000000003701460233407600226730ustar00rootroot00000000000000--- name: Question about: If you have a usage question title: '' labels: '' assignees: '' --- ** If your issue is a usage question, submit it here instead: - The imbalanced learn gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn ** imbalanced-learn-0.12.2/.github/ISSUE_TEMPLATE/usage-question.md000066400000000000000000000007371460233407600240040ustar00rootroot00000000000000--- name: Usage question about: If you have a usage question title: "[SO]" labels: question assignees: '' --- ** If your issue is a usage question, submit it here instead:** - **The imbalanced learn gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn** - **StackOverflow with the imblearn (or imbalanced-learn) tag:https://stackoverflow.com/questions/tagged/imblearn** We are going to automatically close this issue if this is not link to a bug or an enhancement. imbalanced-learn-0.12.2/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000014631460233407600221240ustar00rootroot00000000000000 #### Reference Issue #### What does this implement/fix? Explain your changes. #### Any other comments? 
imbalanced-learn-0.12.2/.github/workflows/000077500000000000000000000000001460233407600203545ustar00rootroot00000000000000imbalanced-learn-0.12.2/.github/workflows/circleci-artifacts-redirector.yml000066400000000000000000000006421460233407600267740ustar00rootroot00000000000000name: circleci-artifacts-redirector on: [status] jobs: circleci_artifacts_redirector_job: runs-on: ubuntu-latest name: Run CircleCI artifacts redirector steps: - name: GitHub Action step uses: larsoner/circleci-artifacts-redirector-action@master with: repo-token: ${{ secrets.GITHUB_TOKEN }} artifact-path: doc/index.html circleci-jobs: documentation imbalanced-learn-0.12.2/.gitignore000066400000000000000000000024641460233407600167550ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg Pipfile Pipfile.lock # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ # vim *.swp # emacs *~ # Visual Studio *.sln *.pyproj *.suo *.vs .vscode/ # PyCharm .idea/ # Cython *.pyc *.pyo __pycache__ *.so *.o *.egg *.egg-info Cython/Compiler/*.c Cython/Plex/*.c Cython/Runtime/refnanny.c Cython/Tempita/*.c Cython/*.c Tools/*.elc /TEST_TMP/ /build/ /wheelhouse*/ !tests/build/ /dist/ .gitrev .coverage *.orig *.rej *.dep *.swp *~ .ipynb_checkpoints docs/build tags TAGS MANIFEST .tox cythonize.dat # build documentation doc/_build/ doc/auto_examples/ doc/generated/ doc/references/generated/ doc/bibtex/auto doc/min_dependency_table.rst # MacOS .DS_Store imbalanced-learn-0.12.2/.pre-commit-config.yaml000066400000000000000000000011211460233407600212330ustar00rootroot00000000000000repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/psf/black rev: 23.3.0 hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.0.272 hooks: - id: ruff args: ["--fix", "--show-source"] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.3.0 hooks: - id: mypy files: imblearn/ additional_dependencies: [pytest==6.2.4] imbalanced-learn-0.12.2/AUTHORS.rst000066400000000000000000000010631460233407600166360ustar00rootroot00000000000000History ------- Development lead ~~~~~~~~~~~~~~~~ The project started in August 2014 by Fernando Nogueira and focused on SMOTE implementation. Together with Guillaume Lemaitre, Dayvid Victor, and Christos Aridas, additional under-sampling and over-sampling methods have been implemented as well as major changes in the API to be fully compatible with scikit-learn_. 
Contributors ------------ Refers to GitHub contributors page_. .. _scikit-learn: http://scikit-learn.org .. _page: https://github.com/scikit-learn-contrib/imbalanced-learn/graphs/contributors imbalanced-learn-0.12.2/CONTRIBUTING.md000066400000000000000000000155201460233407600172130ustar00rootroot00000000000000Contributing code ================= This guide is adapted from [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md). How to contribute ----------------- The preferred way to contribute to imbalanced-learn is to fork the [main repository](https://github.com/scikit-learn-contrib/imbalanced-learn) on GitHub: 1. Fork the [project repository](https://github.com/scikit-learn-contrib/imbalanced-learn): click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. 2. Clone this copy to your local disk: $ git clone git@github.com:YourLogin/imbalanced-learn.git $ cd imblearn 3. Create a branch to hold your changes: $ git checkout -b my-feature and start making changes. Never work in the ``master`` branch! 4. Work on this copy on your computer using Git to do the version control. When you're done editing, do: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-feature Finally, go to the web page of your fork of the imbalanced-learn repo, and click 'Pull request' to send your changes to the maintainers for review. This will send an email to the committers. (If any of the above seems like magic to you, then look up the [Git documentation](https://git-scm.com/documentation) on the web.) Contributing Pull Requests -------------------------- It is recommended to check that your contribution complies with the following rules before submitting a pull request: - Follow the [coding-guidelines](http://scikit-learn.org/dev/developers/contributing.html#coding-guidelines) as for scikit-learn. 
- When applicable, use the validation tools and other code in the `sklearn.utils` submodule. A list of utility routines available for developers can be found in the [Utilities for Developers](http://scikit-learn.org/dev/developers/utilities.html#developers-utils) page. - If your pull request addresses an issue, please use the title to describe the issue and mention the issue number in the pull request description to ensure a link is created to the original issue. - All public methods should have informative docstrings with sample usage presented as doctests when appropriate. - Please prefix the title of your pull request with `[MRG]` if the contribution is complete and should be subjected to a detailed review. Incomplete contributions should be prefixed `[WIP]` to indicate a work in progress (and changed to `[MRG]` when it matures). WIPs may be useful to: indicate you are working on something to avoid duplicated work, request broad review of functionality or API, or seek collaborators. WIPs often benefit from the inclusion of a [task list](https://github.com/blog/1375-task-lists-in-gfm-issues-pulls-comments) in the PR description. - All other tests pass when everything is rebuilt from scratch. On Unix-like systems, check with (from the toplevel source folder): $ make - When adding additional functionality, provide at least one example script in the ``examples/`` folder. Have a look at other examples for reference. Examples should demonstrate why the new functionality is useful in practice and, if possible, compare it to other methods available in scikit-learn. - Documentation and high-coverage tests are necessary for enhancements to be accepted. - At least one paragraph of narrative documentation with links to references in the literature (with PDF links when possible) and the example. 
You can also check for common programming errors with the following tools: - Code with good unittest coverage (at least 80%), check with: $ pip install pytest pytest-cov $ pytest --cov=imblearn imblearn - No pyflakes warnings, check with: $ pip install pyflakes $ pyflakes path/to/module.py - No PEP8 warnings, check with: $ pip install pycodestyle $ pycodestyle path/to/module.py - AutoPEP8 can help you fix some of the easy redundant errors: $ pip install autopep8 $ autopep8 path/to/pep8.py Filing bugs ----------- We use Github issues to track all bugs and feature requests; feel free to open an issue if you have found a bug or wish to see a feature implemented. It is recommended to check that your issue complies with the following rules before submitting: - Verify that your issue is not being currently addressed by other [issues](https://github.com/scikit-learn-contrib/imbalanced-learn/issues) or [pull requests](https://github.com/scikit-learn-contrib/imbalanced-learn/pulls). - Please ensure all code snippets and error messages are formatted in appropriate code blocks. See [Creating and highlighting code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks). - Please include your operating system type and version number, as well as your Python, scikit-learn, numpy, and scipy versions. This information can be found by runnning the following code snippet: ```python import platform; print(platform.platform()) import sys; print("Python", sys.version) import numpy; print("NumPy", numpy.__version__) import scipy; print("SciPy", scipy.__version__) import sklearn; print("Scikit-Learn", sklearn.__version__) import imblearn; print("Imbalanced-Learn", imblearn.__version__) ``` - Please be specific about what estimators and/or functions are involved and the shape of the data, as appropriate; please include a [reproducible](https://stackoverflow.com/help/mcve) code snippet or link to a [gist](https://gist.github.com). 
If an exception is raised, please provide the traceback. Documentation ------------- We are glad to accept any sort of documentation: function docstrings, reStructuredText documents (like this one), tutorials, etc. reStructuredText documents live in the source code repository under the doc/ directory. You can edit the documentation using any text editor and then generate the HTML output by typing ``make html`` from the doc/ directory. Alternatively, ``make`` can be used to quickly generate the documentation without the example gallery. The resulting HTML files will be placed in _build/html/ and are viewable in a web browser. See the README file in the doc/ directory for more information. For building the documentation, you will need [sphinx](http://sphinx-doc.org), [matplotlib](https://matplotlib.org), and [pillow](https://pillow.readthedocs.io). When you are writing documentation, it is important to keep a good compromise between mathematical and algorithmic details, and give intuition to the reader on what the algorithm does. It is best to always start with a small paragraph with a hand-waving explanation of what the method does to the data and a figure (coming from an example) illustrating it. imbalanced-learn-0.12.2/LICENSE000066400000000000000000000021451460233407600157660ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2014-2020 The imbalanced-learn developers. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. imbalanced-learn-0.12.2/MANIFEST.in000066400000000000000000000002051460233407600165120ustar00rootroot00000000000000 recursive-include doc * recursive-include examples * include AUTHORS.rst include CONTRIBUTING.md include LICENSE include README.rst imbalanced-learn-0.12.2/Makefile000066400000000000000000000012671460233407600164250ustar00rootroot00000000000000.PHONY: all clean test clean: find . -name "*.so" -o -name "*.pyc" -o -name "*.md5" -o -name "*.pyd" -o -name "*~" | xargs rm -f find . -name "*.pyx" -exec ./tools/rm_pyx_c_file.sh {} \; rm -rf coverage rm -rf dist rm -rf build rm -rf doc/_build rm -rf doc/auto_examples rm -rf doc/generated rm -rf doc/modules rm -rf examples/.ipynb_checkpoints test-code: pytest imblearn test-doc: pytest doc/*.rst test-coverage: rm -rf coverage .coverage pytest --cov=imblearn imblearn test: test-coverage test-doc html: export SPHINXOPTS=-W; make -C doc html conda: conda-build conda-recipe code-analysis: flake8 imblearn | grep -v __init__ pylint -E imblearn/ -d E1103,E0611,E1101 imbalanced-learn-0.12.2/README.rst000066400000000000000000000124321460233407600164500ustar00rootroot00000000000000.. -*- mode: rst -*- .. _scikit-learn: http://scikit-learn.org/stable/ .. _scikit-learn-contrib: https://github.com/scikit-learn-contrib |Azure|_ |Codecov|_ |CircleCI|_ |PythonVersion|_ |Pypi|_ |Gitter|_ |Black|_ .. |Azure| image:: https://dev.azure.com/imbalanced-learn/imbalanced-learn/_apis/build/status/scikit-learn-contrib.imbalanced-learn?branchName=master .. 
_Azure: https://dev.azure.com/imbalanced-learn/imbalanced-learn/_build .. |Codecov| image:: https://codecov.io/gh/scikit-learn-contrib/imbalanced-learn/branch/master/graph/badge.svg .. _Codecov: https://codecov.io/gh/scikit-learn-contrib/imbalanced-learn .. |CircleCI| image:: https://circleci.com/gh/scikit-learn-contrib/imbalanced-learn.svg?style=shield .. _CircleCI: https://circleci.com/gh/scikit-learn-contrib/imbalanced-learn/tree/master .. |PythonVersion| image:: https://img.shields.io/pypi/pyversions/imbalanced-learn.svg .. _PythonVersion: https://img.shields.io/pypi/pyversions/imbalanced-learn.svg .. |Pypi| image:: https://badge.fury.io/py/imbalanced-learn.svg .. _Pypi: https://badge.fury.io/py/imbalanced-learn .. |Gitter| image:: https://badges.gitter.im/scikit-learn-contrib/imbalanced-learn.svg .. _Gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg .. _Black: :target: https://github.com/psf/black .. |PythonMinVersion| replace:: 3.8 .. |NumPyMinVersion| replace:: 1.17.3 .. |SciPyMinVersion| replace:: 1.5.0 .. |ScikitLearnMinVersion| replace:: 1.0.2 .. |MatplotlibMinVersion| replace:: 3.1.2 .. |PandasMinVersion| replace:: 1.0.5 .. |TensorflowMinVersion| replace:: 2.4.3 .. |KerasMinVersion| replace:: 2.4.3 .. |SeabornMinVersion| replace:: 0.9.0 .. |PytestMinVersion| replace:: 5.0.1 imbalanced-learn ================ imbalanced-learn is a python package offering a number of re-sampling techniques commonly used in datasets showing strong between-class imbalance. It is compatible with scikit-learn_ and is part of scikit-learn-contrib_ projects. Documentation ------------- Installation documentation, API documentation, and examples can be found on the documentation_. .. 
_documentation: https://imbalanced-learn.org/stable/ Installation ------------ Dependencies ~~~~~~~~~~~~ `imbalanced-learn` requires the following dependencies: - Python (>= |PythonMinVersion|) - NumPy (>= |NumPyMinVersion|) - SciPy (>= |SciPyMinVersion|) - Scikit-learn (>= |ScikitLearnMinVersion|) Additionally, `imbalanced-learn` requires the following optional dependencies: - Pandas (>= |PandasMinVersion|) for dealing with dataframes - Tensorflow (>= |TensorflowMinVersion|) for dealing with TensorFlow models - Keras (>= |KerasMinVersion|) for dealing with Keras models The examples will requires the following additional dependencies: - Matplotlib (>= |MatplotlibMinVersion|) - Seaborn (>= |SeabornMinVersion|) Installation ~~~~~~~~~~~~ From PyPi or conda-forge repositories ..................................... imbalanced-learn is currently available on the PyPi's repositories and you can install it via `pip`:: pip install -U imbalanced-learn The package is release also in Anaconda Cloud platform:: conda install -c conda-forge imbalanced-learn From source available on GitHub ............................... If you prefer, you can clone it and run the setup.py file. Use the following commands to get a copy from Github and install all dependencies:: git clone https://github.com/scikit-learn-contrib/imbalanced-learn.git cd imbalanced-learn pip install . Be aware that you can install in developer mode with:: pip install --no-build-isolation --editable . If you wish to make pull-requests on GitHub, we advise you to install pre-commit:: pip install pre-commit pre-commit install Testing ~~~~~~~ After installation, you can use `pytest` to run the test suite:: make coverage Development ----------- The development of this scikit-learn-contrib is in line with the one of the scikit-learn community. Therefore, you can refer to their `Development Guide `_. 
About ----- If you use imbalanced-learn in a scientific publication, we would appreciate citations to the following paper:: @article{JMLR:v18:16-365, author = {Guillaume Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas}, title = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning}, journal = {Journal of Machine Learning Research}, year = {2017}, volume = {18}, number = {17}, pages = {1-5}, url = {http://jmlr.org/papers/v18/16-365} } Most classification algorithms will only perform optimally when the number of samples of each class is roughly the same. Highly skewed datasets, where the minority is heavily outnumbered by one or more classes, have proven to be a challenge while at the same time becoming more and more common. One way of addressing this issue is by re-sampling the dataset as to offset this imbalance with the hope of arriving at a more robust and fair decision boundary than you would otherwise. You can refer to the `imbalanced-learn`_ documentation to find details about the implemented algorithms. .. _imbalanced-learn: https://imbalanced-learn.org/stable/user_guide.html imbalanced-learn-0.12.2/azure-pipelines.yml000066400000000000000000000235711460233407600206260ustar00rootroot00000000000000# Adapted from https://github.com/pandas-dev/pandas/blob/master/azure-pipelines.yml schedules: - cron: "30 2 * * *" displayName: Run nightly build branches: include: - main always: true jobs: - job: git_commit displayName: Get Git Commit pool: vmImage: ubuntu-22.04 steps: - bash: | set -ex if [[ $BUILD_REASON == "PullRequest" ]]; then # By default pull requests use refs/pull/PULL_ID/merge as the source branch # which has a "Merge ID into ID" as a commit message. 
The latest commit # message is the second to last commit COMMIT_ID=$(echo $BUILD_SOURCEVERSIONMESSAGE | awk '{print $2}') message=$(git log $COMMIT_ID -1 --pretty=%B) else message=$BUILD_SOURCEVERSIONMESSAGE fi echo "##vso[task.setvariable variable=message;isOutput=true]$message" name: commit displayName: Get source version message - job: linting dependsOn: [git_commit] condition: | and( succeeded(), not(contains(dependencies['git_commit']['outputs']['commit.message'], '[lint skip]')), not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) ) displayName: Linting pool: vmImage: ubuntu-22.04 steps: - task: UsePythonVersion@0 inputs: versionSpec: '3.9' - bash: | # Include pytest compatibility with mypy pip install flake8 pytest mypy==1.3.0 black==23.3 ruff==0.0.272 displayName: Install linters - bash: | black --check --diff . displayName: Run black - bash: | ruff check --show-source . displayName: Run ruff - bash: | ./build_tools/azure/linting.sh displayName: Run linting - bash: | mypy imblearn/ displayName: Run mypy - template: build_tools/azure/posix.yml parameters: name: Linux_Nightly vmImage: ubuntu-22.04 dependsOn: [git_commit, linting] condition: | and( succeeded(), not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), or(eq(variables['Build.Reason'], 'Schedule'), contains(dependencies['git_commit']['outputs']['commit.message'], '[scipy-dev]' ) ) ) matrix: pylatest_pip_scipy_dev: DISTRIB: 'conda-pip-scipy-dev' PYTHON_VERSION: '*' CHECK_WARNINGS: 'true' TEST_DOCS: 'true' TEST_DOCSTRINGS: 'true' # Will run all the time regardless of linting outcome. 
- template: build_tools/azure/posix.yml parameters: name: Linux_Runs vmImage: ubuntu-22.04 dependsOn: [git_commit] condition: | and( succeeded(), not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) ) matrix: pylatest_conda_forge_mkl: DISTRIB: 'conda' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '*' BLAS: 'mkl' COVERAGE: 'true' SHOW_SHORT_SUMMARY: 'true' # Check compilation with Ubuntu bionic 18.04 LTS and scipy from conda-forge - template: build_tools/azure/posix.yml parameters: name: Ubuntu_Jammy_Jellyfish vmImage: ubuntu-22.04 dependsOn: [git_commit, linting] condition: | and( succeeded(), not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), ne(variables['Build.Reason'], 'Schedule') ) matrix: py39_conda_forge_openblas_ubuntu_1804: DISTRIB: 'conda' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.9' BLAS: 'openblas' COVERAGE: 'false' - template: build_tools/azure/posix.yml parameters: name: Linux vmImage: ubuntu-22.04 dependsOn: [linting, git_commit] condition: | and( succeeded(), not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), ne(variables['Build.Reason'], 'Schedule') ) matrix: # Linux environment to test that scikit-learn can be built against # versions of numpy, scipy with ATLAS that comes with Ubuntu Focal 20.04 # i.e. numpy 1.17.4 and scipy 1.3.3 ubuntu_atlas: DISTRIB: 'ubuntu' JOBLIB_VERSION: 'min' PANDAS_VERSION: 'none' THREADPOOLCTL_VERSION: 'min' COVERAGE: 'false' # Linux + Python 3.8 build with OpenBLAS and without SITE_JOBLIB py38_conda_conda_forge_openblas: DISTRIB: 'conda' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.8' BLAS: 'openblas' NUMPY_VERSION: '1.21.0' # we cannot get an older version of the dependencies resolution SCIPY_VERSION: 'min' SKLEARN_VERSION: 'min' MATPLOTLIB_VERSION: 'none' PANDAS_VERSION: 'none' THREADPOOLCTL_VERSION: '2.2.0' # Linux environment to test the latest available dependencies and MKL. 
pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' PYTHON_VERSION: '*' TEST_DOCS: 'true' TEST_DOCSTRINGS: 'true' CHECK_WARNINGS: 'true' # Test the intermediate version of scikit-learn pylatest_pip_openblas_sklearn_intermediate: DISTRIB: 'conda-pip-latest' PYTHON_VERSION: '3.10' TEST_DOCS: 'true' TEST_DOCSTRINGS: 'true' CHECK_WARNINGS: 'false' SKLEARN_VERSION: '1.1.3' pylatest_pip_openblas_sklearn_intermediate_bis: DISTRIB: 'conda-pip-latest' PYTHON_VERSION: '3.10' TEST_DOCS: 'true' TEST_DOCSTRINGS: 'true' CHECK_WARNINGS: 'false' SKLEARN_VERSION: '1.2.2' pylatest_pip_tensorflow: DISTRIB: 'conda-pip-latest-tensorflow' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.9' TEST_DOCS: 'true' TEST_DOCSTRINGS: 'true' CHECK_WARNINGS: 'true' COVERAGE: 'true' pylatest_conda_tensorflow: DISTRIB: 'conda-latest-tensorflow' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.9' TEST_DOCS: 'true' TEST_DOCSTRINGS: 'true' CHECK_WARNINGS: 'true' conda_tensorflow_minimum: DISTRIB: 'conda-minimum-tensorflow' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.8' NUMPY_VERSION: '1.19.5' # This version is the minimum requrired by tensorflow SCIPY_VERSION: 'min' SKLEARN_VERSION: 'min' TENSORFLOW_VERSION: 'min' TEST_DOCS: 'true' TEST_DOCSTRINGS: 'false' # it is going to fail because of scikit-learn inheritance CHECK_WARNINGS: 'false' # in case the older version raise some FutureWarnings pylatest_pip_keras: DISTRIB: 'conda-pip-latest-keras' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.9' TEST_DOCS: 'true' TEST_DOCSTRINGS: 'true' CHECK_WARNINGS: 'true' COVERAGE: 'true' pylatest_conda_keras: DISTRIB: 'conda-latest-keras' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.9' TEST_DOCS: 'true' TEST_DOCSTRINGS: 'true' CHECK_WARNINGS: 'true' conda_keras_minimum: DISTRIB: 'conda-minimum-keras' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.8' NUMPY_VERSION: '1.19.5' # This version is the minimum requrired by tensorflow SCIPY_VERSION: 'min' SKLEARN_VERSION: 'min' KERAS_VERSION: 'min' TEST_DOCS: 
'true' TEST_DOCSTRINGS: 'false' # it is going to fail because of scikit-learn inheritance CHECK_WARNINGS: 'false' # in case the older version raise some FutureWarnings # Currently runs on Python 3.8 while only Python 3.7 available # - template: build_tools/azure/posix-docker.yml # parameters: # name: Linux_Docker # vmImage: ubuntu-20.04 # dependsOn: [linting, git_commit] # condition: | # and( # succeeded(), # not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), # ne(variables['Build.Reason'], 'Schedule') # ) # matrix: # debian_atlas_32bit: # DISTRIB: 'debian-32' # DOCKER_CONTAINER: 'i386/debian:10.9' # JOBLIB_VERSION: 'min' # # disable pytest xdist due to unknown bug with 32-bit container # PYTEST_XDIST_VERSION: 'none' # PYTEST_VERSION: 'min' # THREADPOOLCTL_VERSION: '2.2.0' - template: build_tools/azure/posix.yml parameters: name: macOS vmImage: macOS-11 dependsOn: [linting, git_commit] condition: | and( succeeded(), not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), ne(variables['Build.Reason'], 'Schedule') ) matrix: pylatest_conda_forge_mkl: DISTRIB: 'conda' BLAS: 'mkl' CONDA_CHANNEL: 'conda-forge' CPU_COUNT: '3' TEST_DOCS: 'true' # TODO: re-enable when we find out why MKL on defaults segfaults # It seems that scikit-learn from defaults channel is built with LLVM/CLANG OMP # while we use MKL OMP. This could be the cause of the segfaults. 
# pylatest_conda_mkl_no_openmp: # DISTRIB: 'conda' # BLAS: 'mkl' # SKLEARN_SKIP_OPENMP_TEST: 'true' # CPU_COUNT: '3' # TEST_DOCS: 'true' conda_conda_forge_openblas: DISTRIB: 'conda' CONDA_CHANNEL: 'conda-forge' BLAS: 'openblas' TEST_DOCS: 'true' CPU_COUNT: '3' - template: build_tools/azure/windows.yml parameters: name: Windows vmImage: windows-latest dependsOn: [linting, git_commit] condition: | and( succeeded(), not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), ne(variables['Build.Reason'], 'Schedule') ) matrix: py38_conda_forge_mkl: DISTRIB: 'conda' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.10' CHECK_WARNINGS: 'true' PYTHON_ARCH: '64' PYTEST_VERSION: '*' COVERAGE: 'true' imbalanced-learn-0.12.2/build_tools/000077500000000000000000000000001460233407600172765ustar00rootroot00000000000000imbalanced-learn-0.12.2/build_tools/azure/000077500000000000000000000000001460233407600204245ustar00rootroot00000000000000imbalanced-learn-0.12.2/build_tools/azure/install.sh000077500000000000000000000124621460233407600224360ustar00rootroot00000000000000#!/bin/bash set -e set -x UNAMESTR=`uname` make_conda() { conda update -yq conda TO_INSTALL="$@" if [[ "$DISTRIB" == *"mamba"* ]]; then mamba create -n $VIRTUALENV --yes $TO_INSTALL else conda config --show conda create -n $VIRTUALENV --yes $TO_INSTALL fi source activate $VIRTUALENV } # imports get_dep source build_tools/shared.sh if [[ "$DISTRIB" == "conda" || "$DISTRIB" == *"mamba"* ]]; then if [[ "$CONDA_CHANNEL" != "" ]]; then TO_INSTALL="--override-channels -c $CONDA_CHANNEL" else TO_INSTALL="" fi TO_INSTALL="$TO_INSTALL python=$PYTHON_VERSION" TO_INSTALL="$TO_INSTALL pip blas[build=$BLAS]" TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scikit-learn $SKLEARN_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep joblib $JOBLIB_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep pandas $PANDAS_VERSION)" 
TO_INSTALL="$TO_INSTALL $(get_dep Pillow $PILLOW_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep matplotlib $MATPLOTLIB_VERSION)" make_conda $TO_INSTALL elif [[ "$DISTRIB" == "ubuntu" ]]; then sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get update sudo apt-get install python3-scipy python3-sklearn python3-matplotlib \ libatlas3-base libatlas-base-dev python3-virtualenv python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate python -m pip install $(get_dep joblib $JOBLIB_VERSION) elif [[ "$DISTRIB" == "debian-32" ]]; then apt-get update apt-get install -y python3-dev python3-numpy python3-scipy python3-sklearn \ python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv \ python3-pandas python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate python -m pip install $(get_dep joblib $JOBLIB_VERSION) elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then # Since conda main channel usually lacks behind on the latest releases, # we use pypi to test against the latest releases of the dependencies. # conda is still used as a convenient way to install Python and pip. 
make_conda "python=$PYTHON_VERSION" python -m pip install -U pip python -m pip install pandas matplotlib python -m pip install scikit-learn elif [[ "$DISTRIB" == "conda-pip-latest-tensorflow" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip python -m pip install numpy scipy scikit-learn pandas tensorflow elif [[ "$DISTRIB" == "conda-latest-tensorflow" ]]; then make_conda "python=$PYTHON_VERSION numpy scipy scikit-learn pandas tensorflow" elif [[ "$DISTRIB" == "conda-minimum-tensorflow" ]]; then TO_INSTALL="python=$PYTHON_VERSION" TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scikit-learn $SKLEARN_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep pandas $PANDAS_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep tensorflow $TENSORFLOW_VERSION)" make_conda $TO_INSTALL elif [[ "$DISTRIB" == "conda-pip-latest-keras" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip python -m pip install numpy scipy scikit-learn pandas keras elif [[ "$DISTRIB" == "conda-latest-keras" ]]; then make_conda "python=$PYTHON_VERSION numpy scipy scikit-learn pandas keras" elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then TO_INSTALL="python=$PYTHON_VERSION" TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scikit-learn $SKLEARN_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep pandas $PANDAS_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep keras $KERAS_VERSION)" make_conda $TO_INSTALL elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip echo "Installing numpy and scipy master wheels" dev_anaconda_url=https://pypi.anaconda.org/scipy-wheels-nightly/simple pip install --pre --upgrade --timeout=60 --extra-index $dev_anaconda_url numpy pandas scipy scikit-learn echo "Installing joblib master" pip install 
https://github.com/joblib/joblib/archive/master.zip echo "Installing tensorflow master" pip install tf-nightly fi python -m pip install $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ $(get_dep pytest $PYTEST_VERSION) \ $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) if [[ "$COVERAGE" == "true" ]]; then python -m pip install codecov pytest-cov fi if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then python -m pip install pytest-xdist fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then # numpydoc requires sphinx python -m pip install sphinx python -m pip install numpydoc fi python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" python -c "\ try: import pandas print('pandas %s' % pandas.__version__) except ImportError: print('pandas not installed') " python -m pip list pip install --verbose --editable . imbalanced-learn-0.12.2/build_tools/azure/install_win.sh000077500000000000000000000013231460233407600233050ustar00rootroot00000000000000#!/bin/bash set -e set -x if [[ "$PYTHON_ARCH" == "64" ]]; then conda create -n $VIRTUALENV -q -y python=$PYTHON_VERSION numpy scipy scikit-learn matplotlib wheel pillow joblib source activate $VIRTUALENV pip install threadpoolctl if [[ "$PYTEST_VERSION" == "*" ]]; then pip install pytest else pip install pytest==$PYTEST_VERSION fi else pip install numpy scipy scikit-learn pytest wheel pillow joblib threadpoolctl fi if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then pip install pytest-xdist fi if [[ "$COVERAGE" == "true" ]]; then pip install coverage codecov pytest-cov fi python --version pip --version python -m pip list pip install --verbose --editable . 
imbalanced-learn-0.12.2/build_tools/azure/linting.sh000077500000000000000000000020071460233407600224260ustar00rootroot00000000000000#!/bin/bash set -e # pipefail is necessary to propagate exit codes set -o pipefail # For docstrings and warnings of deprecated attributes to be rendered # properly, the property decorator must come before the deprecated decorator # (else they are treated as functions) # do not error when grep -B1 "@property" finds nothing set +e bad_deprecation_property_order=`git grep -A 10 "@property" -- "*.py" | awk '/@property/,/def /' | grep -B1 "@deprecated"` if [ ! -z "$bad_deprecation_property_order" ] then echo "property decorator should come before deprecated decorator" echo "found the following occurrencies:" echo $bad_deprecation_property_order exit 1 fi # Check for default doctest directives ELLIPSIS and NORMALIZE_WHITESPACE doctest_directive="$(git grep -nw -E "# doctest\: \+(ELLIPSIS|NORMALIZE_WHITESPACE)")" if [ ! -z "$doctest_directive" ] then echo "ELLIPSIS and NORMALIZE_WHITESPACE doctest directives are enabled by default, but were found in:" echo "$doctest_directive" exit 1 fi imbalanced-learn-0.12.2/build_tools/azure/posix-docker.yml000066400000000000000000000056431460233407600235660ustar00rootroot00000000000000parameters: name: '' vmImage: '' matrix: [] dependsOn: [] condition: ne(variables['Build.Reason'], 'Schedule') jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} condition: ${{ parameters.condition }} pool: vmImage: ${{ parameters.vmImage }} variables: TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' JUNITXML: 'test-data.xml' OMP_NUM_THREADS: '2' OPENBLAS_NUM_THREADS: '2' CPU_COUNT: '2' NUMPY_VERSION: 'latest' SCIPY_VERSION: 'latest' SKLEARN_VERSION: 'latest' TENSORFLOW_VERSION: 'none' KERAS_VERSION: 'none' JOBLIB_VERSION: 'latest' PANDAS_VERSION: 'latest' MATPLOTLIB_VERSION: 'latest' PYTEST_VERSION: 'latest' PYTEST_XDIST_VERSION: 'latest' THREADPOOLCTL_VERSION: 'latest' COVERAGE: 'false' TEST_DOCSTRINGS: 
'false' CHECK_WARNINGS: 'false' BLAS: 'openblas' # Set in azure-pipelines.yml DISTRIB: '' DOCKER_CONTAINER: '' SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: # Container is detached and sleeping, allowing steps to run commands # in the container. The TEST_DIR is mapped allowing the host to access # the JUNITXML file - script: > docker container run --rm --volume $TEST_DIR:/temp_dir --volume $PWD:/io -w /io --detach --name skcontainer -e DISTRIB=$DISTRIB -e TEST_DIR=/temp_dir -e JUNITXML=$JUNITXML -e VIRTUALENV=testvenv -e NUMPY_VERSION=$NUMPY_VERSION -e SCIPY_VERSION=$SCIPY_VERSION -e SKLEARN_VERSION=$SKLEARN_VERSION -e TENSORFLOW_VERSION=$TENSORFLOW_VERSION -e KERAS_VERSION=$KERAS_VERSION -e JOBLIB_VERSION=$JOBLIB_VERSION -e PANDAS_VERSION=$PANDAS_VERSION -e PILLOW_VERSION=$PILLOW_VERSION -e MATPLOTLIB_VERSION=$MATPLOTLIB_VERSION -e PYTEST_VERSION=$PYTEST_VERSION -e PYTEST_XDIST_VERSION=$PYTEST_XDIST_VERSION -e THREADPOOLCTL_VERSION=$THREADPOOLCTL_VERSION -e OMP_NUM_THREADS=$OMP_NUM_THREADS -e OPENBLAS_NUM_THREADS=$OPENBLAS_NUM_THREADS -e SKLEARN_SKIP_NETWORK_TESTS=$SKLEARN_SKIP_NETWORK_TESTS -e BLAS=$BLAS -e CPU_COUNT=$CPU_COUNT $DOCKER_CONTAINER sleep 1000000 displayName: 'Start container' - script: > docker exec skcontainer ./build_tools/azure/install.sh displayName: 'Install' - script: > docker exec skcontainer ./build_tools/azure/test_script.sh displayName: 'Test Library' - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() - script: > docker container stop skcontainer displayName: 'Stop container' condition: always() imbalanced-learn-0.12.2/build_tools/azure/posix.yml000066400000000000000000000045341460233407600223170ustar00rootroot00000000000000parameters: name: '' vmImage: '' matrix: [] dependsOn: [] condition: '' jobs: - job: ${{ parameters.name }} 
dependsOn: ${{ parameters.dependsOn }} condition: ${{ parameters.condition }} timeoutInMinutes: 120 pool: vmImage: ${{ parameters.vmImage }} variables: TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' VIRTUALENV: 'testvenv' JUNITXML: 'test-data.xml' OMP_NUM_THREADS: '2' OPENBLAS_NUM_THREADS: '2' CPU_COUNT: '2' SKLEARN_SKIP_NETWORK_TESTS: '1' SKLEARN_DATA_DIR: $(System.DefaultWorkingDirectory)/scikit_learn_data NUMPY_VERSION: 'latest' SCIPY_VERSION: 'latest' SKLEARN_VERSION: 'latest' TENSORFLOW_VERSION: 'none' KERAS_VERSION: 'none' JOBLIB_VERSION: 'latest' PANDAS_VERSION: 'latest' PILLOW_VERSION: 'latest' MATPLOTLIB_VERSION: 'latest' PYTEST_VERSION: 'latest' PYTEST_XDIST_VERSION: 'latest' THREADPOOLCTL_VERSION: 'latest' COVERAGE: 'true' TEST_DOCS: 'false' TEST_DOCSTRINGS: 'false' CHECK_WARNINGS: 'false' SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: - bash: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH condition: startsWith(variables['DISTRIB'], 'conda') - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation condition: startsWith(variables['DISTRIB'], 'conda') - script: | build_tools/azure/install.sh displayName: 'Install' - script: | build_tools/azure/test_script.sh displayName: 'Test Library' - script: | build_tools/azure/test_docs.sh displayName: 'Test Docs' condition: eq(variables['TEST_DOCS'], 'true') - script: | build_tools/azure/test_docstring.sh displayName: "Numpydoc validation" condition: eq(variables['TEST_DOCSTRINGS'], 'true') - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() - script: | build_tools/azure/upload_codecov.sh condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) displayName: 'Upload To Codecov' env: CODECOV_TOKEN: $(CODECOV_TOKEN) 
imbalanced-learn-0.12.2/build_tools/azure/test_docs.sh000077500000000000000000000002711460233407600227520ustar00rootroot00000000000000#!/bin/bash set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV elif [[ "$DISTRIB" == "ubuntu" ]]; then source $VIRTUALENV/bin/activate fi make test-doc imbalanced-learn-0.12.2/build_tools/azure/test_docstring.sh000077500000000000000000000003251460233407600240160ustar00rootroot00000000000000#!/bin/bash set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV elif [[ "$DISTRIB" == "ubuntu" ]]; then source $VIRTUALENV/bin/activate fi pytest -vsl maint_tools/test_docstring.py imbalanced-learn-0.12.2/build_tools/azure/test_script.sh000077500000000000000000000042461460233407600233340ustar00rootroot00000000000000#!/bin/bash set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV elif [[ "$DISTRIB" == "ubuntu" ]] || [[ "$DISTRIB" == "debian-32" ]]; then source $VIRTUALENV/bin/activate fi mkdir -p $TEST_DIR cp setup.cfg $TEST_DIR cd $TEST_DIR # python -c "import joblib; print(f'Number of cores (physical): \ # {joblib.cpu_count()} ({joblib.cpu_count(only_physical_cores=True)})')" # python -c "import sklearn; sklearn.show_versions()" python -c "import imblearn; imblearn.show_versions()" if ! command -v conda &> /dev/null then pip list else # conda list provides more info than pip list (when available) conda list fi TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then # Note: --cov-report= is used to disable to long text output report in the # CI logs. The coverage data is consolidated by codecov to get an online # web report across all the platforms so there is no need for this text # report that otherwise hides the test failures and forces long scrolls in # the CI logs. 
export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" TEST_CMD="$TEST_CMD --cov-config='$COVERAGE_PROCESS_START' --cov imblearn --cov-report=" fi if [[ "$CHECK_WARNINGS" == "true" ]]; then # numpy's 1.19.0's tostring() deprecation is ignored until scipy and joblib removes its usage TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Wignore:tostring:DeprecationWarning" # numpy's 1.20's np.object deprecationg is ignored until tensorflow removes its usage TEST_CMD="$TEST_CMD -Wignore:\`np.object\`:DeprecationWarning" # Python 3.10 deprecates disutils and is imported by numpy interally during import time TEST_CMD="$TEST_CMD -Wignore:The\ distutils:DeprecationWarning" # Workaround for https://github.com/pypa/setuptools/issues/2885 TEST_CMD="$TEST_CMD -Wignore:Creating\ a\ LegacyVersion:DeprecationWarning" fi if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then TEST_CMD="$TEST_CMD -n$CPU_COUNT" fi if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then TEST_CMD="$TEST_CMD -ra" fi set -x eval "$TEST_CMD --pyargs imblearn" set +x imbalanced-learn-0.12.2/build_tools/azure/upload_codecov.sh000077500000000000000000000006671460233407600237620ustar00rootroot00000000000000#!/bin/bash set -e # called when COVERAGE=="true" and DISTRIB=="conda" export PATH=$HOME/miniconda3/bin:$PATH source activate $VIRTUALENV # Need to run codecov from a git checkout, so we copy .coverage # from TEST_DIR where pytest has been run pushd $TEST_DIR coverage combine --append popd cp $TEST_DIR/.coverage $BUILD_REPOSITORY_LOCALPATH codecov --root $BUILD_REPOSITORY_LOCALPATH -t $CODECOV_TOKEN || echo "codecov upload failed" imbalanced-learn-0.12.2/build_tools/azure/windows.yml000066400000000000000000000032131460233407600226400ustar00rootroot00000000000000 parameters: name: '' vmImage: '' matrix: [] dependsOn: [] condition: ne(variables['Build.Reason'], 'Schedule') jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} condition: ${{ parameters.condition }} pool: 
vmImage: ${{ parameters.vmImage }} variables: VIRTUALENV: 'testvenv' JUNITXML: 'test-data.xml' SKLEARN_SKIP_NETWORK_TESTS: '1' PYTEST_VERSION: '5.2.1' PYTEST_XDIST: 'true' PYTEST_XDIST_VERSION: 'latest' TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' CPU_COUNT: '2' CHECK_WARNINGS: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: - bash: echo "##vso[task.prependpath]$CONDA/Scripts" displayName: Add conda to PATH for 64 bit Python condition: eq(variables['PYTHON_ARCH'], '64') - task: UsePythonVersion@0 inputs: versionSpec: '$(PYTHON_VERSION)' addToPath: true architecture: 'x86' displayName: Use 32 bit System Python condition: eq(variables['PYTHON_ARCH'], '32') - bash: ./build_tools/azure/install_win.sh displayName: 'Install' - bash: ./build_tools/azure/test_script.sh displayName: 'Test Library' - bash: ./build_tools/azure/upload_codecov.sh condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) displayName: 'Upload To Codecov' env: CODECOV_TOKEN: $(CODECOV_TOKEN) - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() imbalanced-learn-0.12.2/build_tools/circle/000077500000000000000000000000001460233407600205375ustar00rootroot00000000000000imbalanced-learn-0.12.2/build_tools/circle/build_doc.sh000077500000000000000000000101361460233407600230230ustar00rootroot00000000000000#!/usr/bin/env bash set -x set -e # Decide what kind of documentation build to run, and run it. # # If the last commit message has a "[doc skip]" marker, do not build # the doc. On the contrary if a "[doc build]" marker is found, build the doc # instead of relying on the subsequent rules. # # We always build the documentation for jobs that are not related to a specific # PR (e.g. a merge to master or a maintenance branch). 
# # If this is a PR, do a full build if there are some files in this PR that are # under the "doc/" or "examples/" folders, otherwise perform a quick build. # # If the inspection of the current commit fails for any reason, the default # behavior is to quick build the documentation. get_build_type() { if [ -z "$CIRCLE_SHA1" ] then echo SKIP: undefined CIRCLE_SHA1 return fi commit_msg=$(git log --format=%B -n 1 $CIRCLE_SHA1) if [ -z "$commit_msg" ] then echo QUICK BUILD: failed to inspect commit $CIRCLE_SHA1 return fi if [[ "$commit_msg" =~ \[doc\ skip\] ]] then echo SKIP: [doc skip] marker found return fi if [[ "$commit_msg" =~ \[doc\ quick\] ]] then echo QUICK: [doc quick] marker found return fi if [[ "$commit_msg" =~ \[doc\ build\] ]] then echo BUILD: [doc build] marker found return fi if [ -z "$CI_PULL_REQUEST" ] then echo BUILD: not a pull request return fi git_range="origin/master...$CIRCLE_SHA1" git fetch origin master >&2 || (echo QUICK BUILD: failed to get changed filenames for $git_range; return) filenames=$(git diff --name-only $git_range) if [ -z "$filenames" ] then echo QUICK BUILD: no changed filenames for $git_range return fi if echo "$filenames" | grep -q -e ^examples/ then echo BUILD: detected examples/ filename modified in $git_range: $(echo "$filenames" | grep -e ^examples/ | head -n1) return fi echo QUICK BUILD: no examples/ filename modified in $git_range: echo "$filenames" } build_type=$(get_build_type) if [[ "$build_type" =~ ^SKIP ]] then exit 0 fi make_args=html make_args="SPHINXOPTS=-T $make_args" # show full traceback on exception # Installing required system packages to support the rendering of math # notation in the HTML documentation and to optimize the image files sudo -E apt-get -yq update --allow-releaseinfo-change sudo -E apt-get -yq remove texlive-binaries --purge sudo -E apt-get -yq --no-install-suggests --no-install-recommends \ install dvipng texlive-latex-base texlive-latex-extra \ texlive-latex-recommended 
texlive-fonts-recommended \ latexmk gsfonts zip optipng # deactivate circleci virtualenv and setup a miniconda env instead if [[ `type -t deactivate` ]]; then deactivate fi MAMBAFORGE_PATH=$HOME/mambaforge # Install dependencies with mamba wget -q https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \ -O mambaforge.sh chmod +x mambaforge.sh && ./mambaforge.sh -b -p $MAMBAFORGE_PATH export PATH="$MAMBAFORGE_PATH/bin:$PATH" mamba update --yes --quiet conda # imports get_dep source build_tools/shared.sh # packaging won't be needed once setuptools starts shipping packaging>=17.0 mamba create -n $CONDA_ENV_NAME --yes --quiet \ python="${PYTHON_VERSION:-*}" \ "$(get_dep numpy $NUMPY_VERSION)" \ "$(get_dep scipy $SCIPY_VERSION)" \ "$(get_dep scikit-learn $SKLEARN_VERSION)" \ "$(get_dep matplotlib $MATPLOTLIB_VERSION)" \ "$(get_dep sphinx $SPHINX_VERSION)" \ "$(get_dep pandas $PANDAS_VERSION)" \ "$(get_dep sphinx-gallery $SPHINX_GALLERY_VERSION)" \ "$(get_dep numpydoc $NUMPYDOC_VERSION)" \ "$(get_dep sphinxcontrib-bibtex $SPHINXCONTRIB_BIBTEX_VERSION)" \ "$(get_dep sphinx-copybutton $SPHINXCONTRIB_BIBTEX_VERSION)" \ "$(get_dep pydata-sphinx-theme $PYDATA_SPHINX_THEME_VERSION)" \ memory_profiler packaging seaborn pytest coverage compilers tensorflow source activate $CONDA_ENV_NAME # Build and install imbalanced-learn in dev mode ls -l pip install -e . --no-build-isolation # The pipefail is requested to propagate exit code set -o pipefail && cd doc && make $make_args 2>&1 | tee ~/log.txt cd - set +o pipefail imbalanced-learn-0.12.2/build_tools/circle/checkout_merge_commit.sh000077500000000000000000000016411460233407600254340ustar00rootroot00000000000000#!/bin/bash # Add `master` branch to the update list. # Otherwise CircleCI will give us a cached one. FETCH_REFS="+master:master" # Update PR refs for testing. 
if [[ -n "${CIRCLE_PR_NUMBER}" ]] then FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/head:pr/${CIRCLE_PR_NUMBER}/head" FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" fi # Retrieve the refs. git fetch -u origin ${FETCH_REFS} # Checkout the PR merge ref. if [[ -n "${CIRCLE_PR_NUMBER}" ]] then git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" || ( echo Could not fetch merge commit. >&2 echo There may be conflicts in merging PR \#${CIRCLE_PR_NUMBER} with master. >&2; exit 1) fi # Check for merge conflicts. if [[ -n "${CIRCLE_PR_NUMBER}" ]] then git branch --merged | grep master > /dev/null git branch --merged | grep "pr/${CIRCLE_PR_NUMBER}/head" > /dev/null fi imbalanced-learn-0.12.2/build_tools/circle/linting.sh000077500000000000000000000142621460233407600225470ustar00rootroot00000000000000#!/bin/bash # This script is used in CircleCI to check that PRs do not add obvious # flake8 violations. It relies on two things: # - find common ancestor between branch and # scikit-learn/scikit-learn remote # - run flake8 --diff on the diff between the branch and the common # ancestor # # Additional features: # - the line numbers in Travis match the local branch on the PR # author machine. # - ./build_tools/circle/flake8_diff.sh can be run locally for quick # turn-around set -e # pipefail is necessary to propagate exit codes set -o pipefail PROJECT=scikit-learn-contrib/imbalanced-learn PROJECT_URL=https://github.com/$PROJECT.git # Find the remote with the project name (upstream in most cases) REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '') # Add a temporary remote if needed. For example this is necessary when # Travis is configured to run in a fork. In this case 'origin' is the # fork and not the reference repo we want to diff against. 
if [[ -z "$REMOTE" ]]; then TMP_REMOTE=tmp_reference_upstream REMOTE=$TMP_REMOTE git remote add $REMOTE $PROJECT_URL fi echo "Remotes:" echo '--------------------------------------------------------------------------------' git remote --verbose # Travis does the git clone with a limited depth (50 at the time of # writing). This may not be enough to find the common ancestor with # $REMOTE/master so we unshallow the git checkout if [[ -a .git/shallow ]]; then echo -e '\nTrying to unshallow the repo:' echo '--------------------------------------------------------------------------------' git fetch --unshallow fi if [[ "$TRAVIS" == "true" ]]; then if [[ "$TRAVIS_PULL_REQUEST" == "false" ]] then # In main repo, using TRAVIS_COMMIT_RANGE to test the commits # that were pushed into a branch if [[ "$PROJECT" == "$TRAVIS_REPO_SLUG" ]]; then if [[ -z "$TRAVIS_COMMIT_RANGE" ]]; then echo "New branch, no commit range from Travis so passing this test by convention" exit 0 fi COMMIT_RANGE=$TRAVIS_COMMIT_RANGE fi else # We want to fetch the code as it is in the PR branch and not # the result of the merge into master. This way line numbers # reported by Travis will match with the local code. 
LOCAL_BRANCH_REF=travis_pr_$TRAVIS_PULL_REQUEST # In Travis the PR target is always origin git fetch origin pull/$TRAVIS_PULL_REQUEST/head:refs/$LOCAL_BRANCH_REF fi fi # If not using the commit range from Travis we need to find the common # ancestor between $LOCAL_BRANCH_REF and $REMOTE/master if [[ -z "$COMMIT_RANGE" ]]; then if [[ -z "$LOCAL_BRANCH_REF" ]]; then LOCAL_BRANCH_REF=$(git rev-parse --abbrev-ref HEAD) fi echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:" echo '--------------------------------------------------------------------------------' git --no-pager log -2 $LOCAL_BRANCH_REF REMOTE_MASTER_REF="$REMOTE/master" # Make sure that $REMOTE_MASTER_REF is a valid reference echo -e "\nFetching $REMOTE_MASTER_REF" echo '--------------------------------------------------------------------------------' git fetch $REMOTE master:refs/remotes/$REMOTE_MASTER_REF LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short $LOCAL_BRANCH_REF) REMOTE_MASTER_SHORT_HASH=$(git rev-parse --short $REMOTE_MASTER_REF) COMMIT=$(git merge-base $LOCAL_BRANCH_REF $REMOTE_MASTER_REF) || \ echo "No common ancestor found for $(git show $LOCAL_BRANCH_REF -q) and $(git show $REMOTE_MASTER_REF -q)" if [ -z "$COMMIT" ]; then exit 1 fi COMMIT_SHORT_HASH=$(git rev-parse --short $COMMIT) echo -e "\nCommon ancestor between $LOCAL_BRANCH_REF ($LOCAL_BRANCH_SHORT_HASH)"\ "and $REMOTE_MASTER_REF ($REMOTE_MASTER_SHORT_HASH) is $COMMIT_SHORT_HASH:" echo '--------------------------------------------------------------------------------' git --no-pager show --no-patch $COMMIT_SHORT_HASH COMMIT_RANGE="$COMMIT_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH" if [[ -n "$TMP_REMOTE" ]]; then git remote remove $TMP_REMOTE fi else echo "Got the commit range from Travis: $COMMIT_RANGE" fi echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \ "($(git rev-list $COMMIT_RANGE | wc -l) commit(s)):" echo '--------------------------------------------------------------------------------' # We ignore files from 
sklearn/externals. Unfortunately there is no # way to do it with flake8 directly (the --exclude does not seem to # work with --diff). We could use the exclude magic in the git pathspec # ':!sklearn/externals' but it is only available on git 1.9 and Travis # uses git 1.8. # We need the following command to exit with 0 hence the echo in case # there is no match MODIFIED_FILES="$(git diff --name-only $COMMIT_RANGE | grep -v 'sklearn/externals' | \ grep -v 'doc/sphinxext' || echo "no_match")" check_files() { files="$1" shift options="$*" if [ -n "$files" ]; then # Conservative approach: diff without context (--unified=0) so that code # that was not changed does not create failures git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --max-line-length=88 --show-source $options fi } if [[ "$MODIFIED_FILES" == "no_match" ]]; then echo "No file outside sklearn/externals and doc/sphinxext has been modified" else check_files "$(echo "$MODIFIED_FILES" | grep -v ^examples)" check_files "$(echo "$MODIFIED_FILES" | grep ^examples)" \ --config ./setup.cfg fi echo -e "No problem detected by flake8\n" # For docstrings and warnings of deprecated attributes to be rendered # properly, the property decorator must come before the deprecated decorator # (else they are treated as functions) # do not error when grep -B1 "@property" finds nothing set +e bad_deprecation_property_order=`git grep -A 10 "@property" -- "*.py" | awk '/@property/,/def /' | grep -B1 "@deprecated"` if [ ! -z "$bad_deprecation_property_order" ] then echo "property decorator should come before deprecated decorator" echo "found the following occurrencies:" echo $bad_deprecation_property_order exit 1 fi imbalanced-learn-0.12.2/build_tools/circle/push_doc.sh000077500000000000000000000024621460233407600227060ustar00rootroot00000000000000#!/bin/bash # This script is meant to be called in the "deploy" step defined in # circle.yml. See https://circleci.com/docs/ for more details. 
# The behavior of the script is controlled by environment variable defined # in the circle.yml in the top level folder of the project. GENERATED_DOC_DIR=$1 if [[ -z "$GENERATED_DOC_DIR" ]]; then echo "Need to pass directory of the generated doc as argument" echo "Usage: $0 " exit 1 fi # Absolute path needed because we use cd further down in this script GENERATED_DOC_DIR=$(readlink -f $GENERATED_DOC_DIR) if [ "$CIRCLE_BRANCH" = "master" ] then dir=dev else # Strip off .X dir="${CIRCLE_BRANCH::-2}" fi MSG="Pushing the docs to $dir/ for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" cd $HOME if [ ! -d $DOC_REPO ]; then git clone --depth 1 --no-checkout -b master "git@github.com:"$ORGANIZATION"/"$DOC_REPO".git"; fi cd $DOC_REPO git config core.sparseCheckout true echo $dir > .git/info/sparse-checkout git checkout master git reset --hard origin/master git rm -rf $dir/ && rm -rf $dir/ cp -R $GENERATED_DOC_DIR $dir touch $dir/.nojekyll git config --global user.email $EMAIL git config --global user.name $USERNAME git config --global push.default matching git add -f $dir/ git commit -m "$MSG" $dir git push origin master echo $MSG imbalanced-learn-0.12.2/build_tools/shared.sh000066400000000000000000000007271460233407600211060ustar00rootroot00000000000000get_dep() { package="$1" version="$2" if [[ "$version" == "none" ]]; then # do not install with none echo elif [[ "${version%%[^0-9.]*}" ]]; then # version number is explicitly passed echo "$package==$version" elif [[ "$version" == "latest" ]]; then # use latest echo "$package" elif [[ "$version" == "min" ]]; then echo "$package==$(python imblearn/_min_dependencies.py $package)" fi } imbalanced-learn-0.12.2/conftest.py000066400000000000000000000014361460233407600171620ustar00rootroot00000000000000# This file is here so that when running from the root folder # ./imblearn is added to sys.path by pytest. # See https://docs.pytest.org/en/latest/pythonpath.html for more details. 
# For example, this allows to build extensions in place and run pytest # doc/modules/clustering.rst and use imblearn from the local folder # rather than the one from site-packages. import os import pytest def pytest_runtest_setup(item): fname = item.fspath.strpath if ( fname.endswith(os.path.join("keras", "_generator.py")) or fname.endswith(os.path.join("tensorflow", "_generator.py")) or fname.endswith("miscellaneous.rst") ): try: import tensorflow # noqa except ImportError: pytest.skip("The tensorflow package is not installed.") imbalanced-learn-0.12.2/doc/000077500000000000000000000000001460233407600155245ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/Makefile000066400000000000000000000156421460233407600171740ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = -v SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: -rm -rf $(BUILDDIR)/* -rm -rf auto_examples/ -rm -rf generated/* -rm -rf modules/generated/* html: # These two lines make the build a bit more lengthy, and the # the embedding of images more robust rm -rf $(BUILDDIR)/html/_images #rm -rf _build/doctrees/ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html touch $(BUILDDIR)/html/.nojekyll @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/imbalanced-learn.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/imbalanced-learn.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/imbalanced-learn" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/imbalanced-learn" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 
latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
imbalanced-learn-0.12.2/doc/_static/000077500000000000000000000000001460233407600171525ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/_static/css/000077500000000000000000000000001460233407600177425ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/_static/css/imbalanced-learn.css000066400000000000000000000016701460233407600236360ustar00rootroot00000000000000@import url("theme.css"); .highlight a { text-decoration: underline; } .deprecated p { padding: 10px 7px 10px 10px; color: #b94a48; background-color: #f3e5e5; border: 1px solid #eed3d7; } .deprecated p span.versionmodified { font-weight: bold; } .wy-nav-content { max-width: 1200px !important; } /* Override some aspects of the pydata-sphinx-theme */ /* Getting started index page */ .intro-card { background: #fff; border-radius: 0; padding: 30px 10px 10px 10px; margin: 10px 0px; } .intro-card .card-text { margin: 20px 0px; /*min-height: 150px; */ } .custom-button { background-color: #dcdcdc; border: none; color: #484848; text-align: center; text-decoration: none; display: inline-block; font-size: 0.9rem; border-radius: 0.5rem; max-width: 220px; padding: 0.5rem 0rem; } .custom-button a { color: #484848; } .custom-button p { margin-top: 0; margin-bottom: 0rem; color: #484848; } imbalanced-learn-0.12.2/doc/_static/img/000077500000000000000000000000001460233407600177265ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/_static/img/favicon.ico000066400000000000000000000021761460233407600220550ustar00rootroot00000000000000 h(    
ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿþþþÿþþþÿþþþÿþþþÿþþþÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿóòòÿàÞÝÿÛØ×ÿÞÛÚÿðïîÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿüüüÿÿÿÿÿ¡›—ÿ‚ytÿºµ³ÿwrÿœ•‘ÿÿÿÿÿùùùÿþþþÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿýýýÿüüüÿûûûÿÿÿÿÿåãâÿ—“ÿÿÿÿÿ¤ž›ÿëêéÿÿÿÿÿþþþÿÿÿÿÿùøøÿûûúÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿýüüÿÿÿÿÿ¡š—ÿ}toÿ€xsÿź±ÿ¦—Šÿ—‡zÿˆynÿpgaÿÓÑÐÿÿÿÿÿûûûÿ¿º·ÿŸ™—ÿÿƒ~€ÿ{tsÿypjÿulfÿ”ÿl…žÿD~¶ÿ`™Ïÿ¯Íêÿúûúÿÿÿÿÿþþÿÿÿÿÿÿ±­¬ÿ¬¤›ÿ¯Tÿ½˜OÿãÓ³ÿýÿÿÿÿÿÿÿµÑçÿ:Äÿ9~Àÿ5|Áÿ4s°ÿw¥Ñÿ·´²ÿÀ»¸ÿÿ½ÿÿÿÿÿëɃÿÕœ)ÿЗ#ÿÞ¹mÿÿÿÿÿõþÿÿE„Àÿ1^Œÿ:lžÿ8e“ÿ9`ˆÿ#YÿbuŠÿŒ‚{ÿ‰…ÿùöïÿÍ›6ÿÌ—-ÿÈÿáÈÿÿÿÿÿçóþÿ=˜îÿ8f“ÿ;•éÿ@œðÿC›îÿA˜ëÿEïÿÚçòÿöòïÿÿÿÿÿãÆ‹ÿΚ1ÿÛ¶iÿúöìÿÿÿÿÿüýýÿQ©üÿ.åÿY¨ôÿp·ýÿq·üÿkµüÿ]®ûÿÇãýÿÿÿÿÿÿÿÿÿÿþýÿüøñÿÿþýÿÿÿÿÿüýÿÿÿÿÿÿšÌûÿ ‘ûÿ:œùÿ5™÷ÿ4™÷ÿ7š÷ÿ(“÷ÿ¿ßüÿÿÿÿÿÿÿÿÿÿþþÿÿÿÿÿÿþýÿÿþþÿÿÿÿÿÿÿÿÿûýÿÿvºùÿ(“÷ÿ.–øÿ0—øÿ-•øÿ/–øÿßïþÿÿÿÿÿÿÿÿÿÿÿÿÿÿþþÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿûýÿÿ±Øüÿp·úÿ`¯ùÿi³úÿ«Õüÿÿÿÿÿÿÿÿÿimbalanced-learn-0.12.2/doc/_static/img/logo.png000066400000000000000000001046061460233407600214030ustar00rootroot00000000000000‰PNG  IHDR°1¢÷WƒiCCPICC profile(‘}‘=HÃ@Å_S¥E*ì â¡:YqÔ*¡B¨Zu0¹ô š4$).Ž‚kÁÁŪƒ‹³®®‚ øâææ¤è"%þ/)´ˆñà¸ïî=îÞB³Ê4«gÐtṲ̂’b.¿*†^! 
Š0âefs’”†ïøºG€¯w žåîÏѯ,DâYf˜6ññô¦mpÞ'ޱ²¬Ÿ™tAâG®+¿q.¹,ð̘™ÍÌLjÅR+]ÌʦF‡Hš¹sëùÝsî9çÊä{—k„b<­É©z-à+tFì¬N¼?‚£þžù‹1z–%xñG—?nÕÿüÕI_9Ê!DºkÆ (¨f¦@#H}qvpÊãï=0báºnFÊÜuö¨*"’ð·îž‰jaWÉU‹¦i.VPUãäUÚ+w&b'^„‹ÖY¾É뜑Ú"y,YŸØ“†Â1-XñÙ©ê¥M^‚4LtOëa8>éɲxªG=y ƤUfGÏlöûnžUðäÕlŸ•Óµ«ÞÑl‡ÑaQ:'…y ÿXî¿íÇöZü[­#<ønâ”Õ7© Ø-Н 7= ©1iÅ5¸°o³.KPð¼úþ7b§°.•î5X¯ÇÏŽ³d'ƒóƒ#;ŠúåN¹ØÞßfX¼Ý?Ðò"Z¹UØmøyj¥_¹Gˆ€¤#B­M™l ‚W÷rHZOfªNݸ‚Š©'7/Óãñ³°Ô%æuÎ ¶É›EÛÁ£³â•5þ§È ƒ”Þ]Z–†/¬²~>{|(9NÄ”7uÆj˜Ç^ÈX9ºY<ÿ2G=Ç3¶Ïò¦åÚ¡«s.øª¨pÿ[þ§È‰£Zî§UÂï&.ÊɃCíËŠ‹L\ ¥0ëIËí¼ó8‰¬~¬ŒgMçÝ`çuwÄXuR ìÍ †=' l™æ8ÕŽ¨ ù^õ“º¶"‡àÅç€GZy¦ó®#;ïjÜë±³ž=öŽÿýÍàRÜ#ùàD]¸oI¢ùð܃Còj –\ v>1b‹{½å™™ÐŠT¥‘x:TE‡  *€ „¾(½Þ@ 4 ’e7°£É¿­›Q6‰È&D6 ºAD£žG´½džÔÆ‘–7$)jè!uÔÆ´µÁÌ´öÕ 7±[v Oä_ó<{Œb™b¿>4¬ÙÓðsï"8thH`-Ê+\OLÑx XÌ‹åúí RÕ Àغ#=Á ýë…°¶ûÆU€îuÿF$û¶jC`œ*›ÀzTX!ð.°®(bo«:nŒÇ}µÀSTÓÎ[SU,+3¡öÕË6³.ÒÂy–ãÁ½‹§ÙÕ‡yt) M[-m˜Ä¯0²·.,›bdª“T™  LTÕŠ,hOíÞ4``Ý¿©ûc½ëF-ZuvkÅ o"ºÜXòžçèÞôå¼$sÄÁédDfçså]×ÅŽDpb±¬ “Îê°õÔŠ`ç~ôM.D__gX²½ñ"ƒãǸ„h²#6®ãø²*»EDíêæN ±g[㺇¡ ÌWO{绌IE@ÿºÓ÷Ûs=GkUÀ<ùbda¤”5µû‚3ާ§Ð&oº„&ļƒãd1 Q'UË·ì6¼¼Ù¿6¥wr÷ùdÚ×7¦zôêj_‰û$Á‰9¾ŸM',c0R"ÊÁªœz¬qÜ@I8š-jl%ÀD`¢ÂWÕSj÷±I„W^aŽk­ëפ¤`,ƒçz âÊêR„»/Úi:6ÿwGªˆ-¨“ùÝYs5¶ä>¿h• }•Ûpú„öѾ6ïV}l±v;¬Ø&lÚ#|¸zÁ®Êè>0a€ÇøþÊÀî^GM]ÄõsÞUwøÑ‘^…‚ TÑ“QNÇÓ£ú†Ü”ªrp‘*¸¸«yT,Àóv·:’xzU±NÕ¡Þgƒ¼LNE$±dO­öüÍRUTøíÜDÏÃoáÑ£,{í¨Š ó×^fxm[rhe Öí^Ý ¼¯ßù£•K§º i'·þ}µÂÂu†ƺmfÿ).2DcG\‚ŒRÑsQÎUtJQÈ;YÃh`´¢_ÅÓ]udö7ëYO}É6@˜ÄÌÍhT0öiA2è:,ÌãªNÉ_›wI ó!À°^Í+÷èr;Áóp\78y|v´¯¨#<¿ÂpëkÛjƒ¿ÿÐjáÑ÷m~qœÇ‰ã²g’Þ^XaqÇ"à WÆÚ6u1`í ƒ òÐ Ï):¥ó¦zè8tÎUô\“×?E¸Gż†×Heo¦…)j ât&+ËdÜ‹´¯ÂヿÚ/ªgùF+Ðó‡ô¢Ù™Ö†JáW 5 fº”F2ߊ÷¶nzÙN0U¦‚Zn˜cøZ|êÀÌ­ëÁü,nzÕâ£*XvmëgX©Ä€¥MZ]Ô“3Uõ2ýû†\Cà ª|õVŠp+È_Uë¼›øÖ‹ÛyÔ°‚ò2Æàe *}~Ž¹Î‹˜vs•ž³&˜Ü:nDb½<…{ç'N«óF)S‡ež{ËæÇs“+ß=\9v´C¿r%êÀÛ›-~ø‚Ŧ6®˜ûÖ‹cúy ï!°e·á¶W,žª‹©[vMëä4,ý¹%Tõ*ÏåbÐ>!Wä<ƪr7èÀn¯376ÞÍw”i ä•Î==YÓ.ÛiBVVóëƒظŠÄʽ¼Êâѵ e”ÛpÕ 7£Ù¸ªcÂoæD’’Wÿxø<‡‹‹1 »b(-‚©Ã\~}Š?ýÃk™Ù×-øÀpÁßíDòjCõ—vlAÎ^TÕeÀ, $¯üB_à§x¬A¸F‹#¦‰j`ùˆt2tâÜtî‰ocZ {oúûºŸ÷ŸÃ‰"ʾNøú^¡w×ÔæQuL¸ÿ5‹»—7‘ÖÈ+H X:ˆ˜HqLcŸGù¶¢£C(ôA¹SjcŸ‘‹Uu-Ä͈A+¼ àIOž‡—¨ƒ·TVÓßk£»•ˆg Q…éܽÿ’ä’SH x# öÕ·*ÊãmŽºpÓ‹†ªFJΧGjFƒ–«¢Âž±™³1y=o>Ém‘¼öï3ަmÀÚí©Øžá§ÏÚ<·¡á˯‰¶(L‚Ä€¥¾©3]½2æÅn…2¿ 
q„ª.álU^UB'Žüä¯6sÜ€²Ÿ´²îž\w§øtîDë-à9[–K\[ÌìpÊ­·Tük©à½Ø·fÍt2fɨu„_½Ð2y}~œ2cdÛdÙ×')UV ˜æ¸}Ÿð?OÙ,ø¸¡ŽË®¶¼§2>cÀR…E)®\©ê}Êø‚GOUž4"Ç{ªo˜ˆ…+ì¬7i(µ,+©™f?Ÿt\0è~ϲ8¡M—{¼š]§° ;…-5ÁÞ™Ð?.à­³øÕÂÄ©ô‹Ý”MpÉö¿ŸŸx¶–0¶_8ÜñuÎæ÷ê¤Ú€nÈÛö×ÿ§(¼–_kqÓ!æGß(v‹DD.Åeèm!yu*”{ªÿAèç¹NÁ7¶ ó”y²±ìjÙ6Ó¤*™ãšY\¤•E˜½¼#0V~|/3¼¬Ûn¸þ™Ä Ã×§xLž¹ßsïÙü¾•‹5¯™ìùÖ¬öÔúc°òbÿõÛ[+üà›·*5¼Z»ú"[›(9ºÖŠÞ21”åƒP~oŒõi¯ m‰©)éÖ¥‘Æ•µ®ŠF™Î½Iÿ)ÎîׯîÀQQnxÒN8÷:n rÑa™#¯Õ¾ýRëÓô”ñþU™ÕþžÐÝߤ‰¹Âo_L4.¿&Ú y‘ )2xPU_¼Bp–zÞE¡–oŒ,%(µÄvïËWý‘ér¢Š16s½+ˆÖf…ž\ìÞE0{®•mc` |ïD—b;3u­‰Á/_lX/,—áBŽ*K`XoeÞÿšÅ#46:­óS†‡±¸ÈXј^­ª?ÊC¹¢~ª)·`ÌmåT 5°\Ai/<­ÉÿkÔ5îCäz3Ÿ“eélS¥aO@ ëö(Í*~{š“Q—ù/‹°x{ë>j„ÿï}²WÚ,àŠ)E>Ò×?´¸óÍF®òׯÚ÷n.a|mÔ{MUoÉ+DTàyß.䚎ý¸I”Ì-Â7ý}ÝÏR÷žˆÄã«wà.ªÎáƒ{²–ÛgÝŽôK¾é8—ñ2'¼7í2ÍÒQ%ÃØ ÿ›?i²—§ƵmÝY%üxŽÕˆ¼¢íf«¶ŒØßBY Êé­`"¤Þî&È(“qWõI8D(Ì£=eºÞ•’WoK¯¿fªÏø·7Ú&›£*”>]ý‘fÔ…{µÝΟœàP^¢m)ÈÜ;ßfCÝyZké¡2½éP‘a®§/ Üaføm¢Õ¯‡–2a%’VûÄ]y âC²!F:ŠÂâÁÑ3‚¸ßûÀëR÷¼QÊ¥ÓœŒ¦ŠZ·Ýðg7B>Ôÿ\ze•ÅŠ6N¾?Ýå°amââõ[é#=T¦/ô9WT—Ç„r9D|ÅéXäZÜUƒvVD–à~Ÿª£$xÐÁQÊ×uˆX™ß§ßõ7-ùô\û‰á»/µ®Ñ}kšrî䶵Ȩ#Ü>×bÙ5ÑVÉK$sìeGl[„[Q}èÊãÑËEÏ Ì·Æ•»qW·ÇeEùm™4: >Þ›Ú˜Ðn<Õ¡KqfÉkg•ð‡·üÕ©¿·…m{…<¡µ¸ä_ërÉa1_AÎó?0,ÛÙÖÈ\œ—ˆôsbÎ ª|ðŠ“©‹¾«­.E72F`ùw•ÏçeZ×áéØÖÝÁçôÀ¸ùŒX³{À2e ~“`ônãükÃNÃÕDŒcL9?Åaâ /%_ž=5ÂwNŒµ¨.eÌn.|Õ_†Â6DÐK\=x¬ÓXÞk]m·0O)¬ÁKqm_¨¹4ÄÒ,ù¾¼ÿ‰ƒ@±Ê*aéÃïJÂU.ñ¹±pÆAã*¼´œP[ÕÖ27InDù~(gCdQ¬ß© ,_…zjcÝHÛÌCÂnK«ŽÆSBE¦r6ÅšOü3L­'<Ü2¦.™ä1c¤K¿ò¼·ßß %lˆ,ã$,S‚ëÕBc˜vÂÏWm3®‰ÝÓâå™Õ±ÔT’â,¥€^õIêïö-†‹'xLá1ºŸ‡!çÒvÚ°ý*$¯í„~âê4…ÿv.ëLªWk] ’/qõ[Ž–Ì‰U±'•ùö»¼ñI0BUŸ9P9t°Ë¨~þðæÚ†MàF…ï„r5DûIý B`&gVr^ £ª¿#L ¢Ã„_ÐfG"¸®t[$óòJKujk8ÚþC|ƒ’"÷nß—>i×:ÂìW#üjaêeÕzñK%óмàXU}¼ó'Dþ؉M`Žã„ƒì[³É/-5æÄ7&%)ÆäoÜ™aWVß¶ùãÛéÿŸ—¢y²Ï1)ü(WMˆ–Z3 –ÀÄÎ7O»\Pù¢‰)3̽”¤ÏõßRW>Üa¸ú‘"žÊÐùÕâíÂóïeN {w³ÉÖäè¯ê= ôWJˆÀ"&ïojNºZÕ É+uM,?HL=O÷¼+¥w^#l˜X^]cqáCvÒ„¸¿=Ñå‹ãS;”ûÙ\‹Õ›4ÇžÏbü€†:dl$…‘¨> WIˆXGIx®œ¶v“7ð„ƒ{¤6Þy=‚糩»«…;þO°[ÕÄÜWlà¾3]Në2¾j}WåµÛRQ5Æ®já–—"|óE+#9åwÀaáÚ‘[’Jgi𕙎•<8£Wå£KîMéÕ¿¬„/³[%±ZGxa…Í‹$=ïê_9×aÊÐ8«5Ö~‚bK œÿÍÓïXÔø ЮŽÂ3ïZ\ø·ÿ÷ž´zÏWŠ8x89\!r‡ç{2qcð‰pÎD—Á=•" ª¢ñ;Æ–m´xàMaC ÅOí£üô'á^±AÝ•cx-&æm ŽÂw_¶¼ÐâòC=¬ðØ])-Šg®¹qmpã.Ãâõ†¿¾-)å‚ô¹1œ \AîÜ "K˜˜ï É÷. 
‡1›;„âœÚÜΛ)ûîœÕnß;´rý1]‹›³ûkk-¾üTû»ÆgXûú"p_8ûCä8\DÆ¡º&Ÿ*]rÕ¢„b Ͼ²§¶†ˆë›ðöûoOóøÎ‰±¤ä0u˜ËQymÆî ÜÎüyKà |n@ƒ­&t›ÏöÇ]å*T•Ò¯ÝÕoô/‰;k\2Õi5S¼eà†có:ñV`P8ëCäTõ‚ °¾²:M“ÛN‹f­ìÓ†*½ Vï¬ÑFöQn>¾ýH?ƒæÃ/Ÿç{ˆ<„¼ªm“ü³â9Cd™Â<r¼›»^w{FË+³àWǺÜxzŒ>]ƒm‘N:Ðå¦gס¨Ød”¼î$L"¿p`~ ÒÄ­ à…9Û^n_ªæ©0²+|°7ý²>{€ÇU3\*º¥ÞÞóuèVbñ—2Ïü§ U®™±y?ø'PNòy†Æ˜®žçíÍýª6—Ÿ6„æÃöÛ=(JÓr]Ïvýî9'ÆOžµ™»55ÇŽO V¾8ÍM+¦«1Nïr@å·¯X,ø8}g“ã*—Os9x —©kÝú/ƒÃ "ÑSUûy@`Í™Ê$ ^nGä,yÕ¡_¹r˧c<ó®Íoöøð§(6pÙAÊq£]ÆVx¿ðrt?;ÎQ­3üu©a^@rí[ Oð˜1ÂcLë' wÙ"o÷Õq ÂùX÷0¬#:=ÇœðöE…·6ÞÜ(¼µEXY)x tW†tƒñʘ~#ûxtm§üꪰq—°j«Åªm°f»áýJØR ="0¸\éWÃ{)£ú(Cz*C{*EvV:ü›ÀoÂ"Ïqðû|©lã8°0K@ÇPù`¸íR¤1Â刹Eþƒ{(ƒ{8?¶C«rð“p.‡(ŒÈ׊‡S¡²«¢‘¢œ­ßt¹'¤¶q=¡ÓFˆÀÚÛ˜À 4YŒ»J»nÔ."éšp#„ zCòç ·ÉÂ5aŽTsVÿ-ðMjºˆç÷> ¢ÐÏcO—{˜Î½yUq;ôAì8xš›ga…<#Œžb£ê€2»©÷±Ua_ÎÝõ’Wî¨Ã!Ú£ËÓîó?Þ Q@è.â5ÝÆÆ÷±qBëøsñä&;OâB´3¦Ë=,Ð/\»Ò䯾À1áìQx0ý€Ý­-œ¸f'“²¢s¢—·çÊM^ë¼B™0ù½FÓ.á| K87B< `UPª¢Ñ8/“$XˆɦJ!nlÒËnÕøq83Bè‚ï|Ïê1]îAT1Æf®wEöùK3½' ‘É=¯?Ž.úk86 øÐ'솅Uz¤ú¢®ç0#ÛçdªÍ´°Àrbó“›êŽÛ[8}\·IH1ÌN gjˆ–B=Ò-AëÎɦ“M"ÓÀrpòä讬€ìˆûÛ’Z Ød`T8OC°Ö#ƒe1ƒ»Û¥Ö!åÂÔÉY‡ Á2’“&Î@êS$‚K±§¯#fyuÌ<ÌÔi_!Bt´•¡Î¤x¤dæþÌð ,‡ô¯HQyŽÕ©eVU7žZLn[¡5õĽc€ááÌ ÑIPÚžóÔåÈHú™, ~ô çcˆíƒT51“{b3䯜Ù¥ónLJŽkÁþ0ßC ÒO•+ÃÉ"D{kbÁIÌP‘‚¿\Ñ`zzx=Az‚BdªÃ‰'hí”R—,o°ØŠÈzT7 ²IÑJÙ%¢•žGmF1·ø4SwÅí·äÙ‚¸Ú®½lU¯¡[(RB„hÿ çtîf_ F`…J^"2YU/Žõðz=A»-gtmúûºŸëaªîRe'°‘ÇQy¼3Æ94™®Š8 :%☺od¾Ížm!8ˆã_ý3F"ž§W„‚$DˆŽÃ‘r/óô*ŸŠIƒ°/¢\D&³*U]| 8t%=ƒV PŒNAu6xë€ ¿C8 $å³éDÇ‘vLêœVöKFç åz 1`þÍŸ"tÑ9Q›+ñÔc†øsì0oþ È]Ü£ÊrU]\MûÆ5ôý*Êë Ë€Wùbpº;Ôxš0 $PiÕÀ1`ªl!Bjr©2ª0³ø~ÿ–ÇE!ÂMNÌÙ\•#»çþÀѨÞW§^üdxYQQÞupGfÚððê/ËkT¡ºÀé¶=U& ÓF…輨ε 9µ5mÿÁ'å‘éÖŨò¦*ßÊñêþô½ªhôu0§çÓŒÎ9P÷N72=¶ôh$`Ý=¾@' ì—¾ÇS|åBН\ˆ :;ß!js±R1ÇõI`y …)9!¶{_5p)ä·X 0¼'Y)0Ã2vQs'çT°<•΂DsÑB–YØGþšâ+RtáSí3ìåëIÙ ¿¸S÷½5í'ñ¾¿ø9¿k½:7[¢Ì0÷ú$°\ÖÂJ{OkžSÕ耴'™£0ÏõœÅ"rn#·x Oä$`oŠÈaÀØrBtZ(U9[5Ïcº•Ü©ÃN¶{ͽx$™¨Õ;æå4eRÕƒÌ]—{]ŸŸ*X`‡$Õ édñüºíEjx(¸Cì_é»sº‚žøÐÀrS »FUy5Æ™ +~Ÿk]¯yª‚y¶ÿ|ŒV$b)œа[“]9¾+åhðA`¹µ+¸ ¸“,_¶–(V¸¨‘p[œ&,×ÿå_^Ì™Œ{-DçÖÀtG®×1–ä˜ÎN®–™¸sÇâ…k;Ù<ꮪó‡ Ãe•ÂN Ø­õ?wí”Ú—ôœNÑgÿ€èS¿@?yƒâK‰ÿüÐÕHßIX‡œ…éÞŒ…ÖVámýwá=hå" kâ X£DÊ{ÅE`U%îû¯ã.þh¬¹\|Öij‘>C‘H1Z³oÛ:ÜåÿA·$qô0eX_‹>éÞ)*ÏC«÷Äß{ët«'… åJè 
Šºcþ%Lß¡`C÷Uâ~¸÷›ÀÝ“ø¹Ñ_ÂwÒk`¼ÑtÇ&ÜU/ã­j~‘£ôž‰uÈE˜ŠÒrptïÜßÄ]z8;›·)Òkò,¬a“ãý¯ ûvâ~´ ÷ߦ ±=æm<ÀùÐÖ ,N^™J„—Òvà«h§#¯ÆZñǨÈѨ®î@»BÞ!UÔçÔU[-qäÌö›h°]€5þ˜Ä%Y\†5ô Lů‰=õì#®À øL·¾Ø“OCÊûâ¼”¸|­C¾}Ø9 žÎRÖkØ$¬!á¼>wùÍ /” 'rú-˜^ƒšTÎ ]{buí‰5l"±¹#ðV¶’6Ír¤b2‘_€H#¿1;‚tï‹=ñS˜ŠÑÄž¸ ¼ ûÄßa ?¤YßÉ€Q˜£pGóüuàÅ}&Ì„YD¿¬F¢ØXHÏØ=`:‚ØÓ?Œoö—×u‘3~ƒ”÷NüN÷~ØŸ„5tÞ¦e®lËõ.mk`ñötØqØ×QnéìBD¡?ªï ò}To ŪÏ~ðk<°<3ÜCLJ½Ö¤_ƃî«ÄYøW¼žFºŒÀšrÖÈ)HqEgý,wõBÜ7ÿ‚V­Ç ={úç’.XLÁ}cbƒ,)-Ç>ì3hÍ>œÅà­})ê‰÷ÔžÚ IDATuÖÁ'!E¥ØÓÎEw|€·!®ùØ3ÿ_=é¸kƵ¤ÊeHQdø)ØS>ƒ”tÁ>üB¢«ˆH¤[Ž=ýs`Y8ËžÁ[ù(Z³ ÓÿH¬©bzÀTŒÀŒù"ÞŠ»°gÞ\O^oÿ­| é{ö!b†ŒÇz:ùÜÅ7b¸”Èô‹@ ÞŽ¸‹ÆÛü_¤l(fܙ؃tíIäÔŸýç%Û¦„È©¿¬'/wõk¸o=Œî^‰ô›=åBLÅH¬&äæƒ¶äÅúnBJ¶ßÛ…[E¦©ê¯CÒ`( ÞG±.öÔ­içñÈ?-,@}ÕÓ3Â)–N”ØSßG+_÷Så6œ9‹0½Gº÷‹“×û¯ã¼<«Áj³æ>c9æ ƒôÔHÊ#öôÑí¯ÆË¬Y‡»d)Þ¦¥ú=°#ØÓ/#úð#`•c ™Ð.Çy©á;êìDß]ãF‰}y\»é{trSbÊ‘¢RbsÀ{ïw mýðïx[_£ø¢?ƒ]„p ºý8¬1Óãßz÷œyßnøÖ–g‰=ó,‘³þŽ©5n&îÒ2ìÃ/1èÎ-ÄýR½©Pk7â.X€nù‘®kŠ“gá.úÖÁ³ñazëyÜ×¾ßðÛü,‘ÓïÃô? èÂù8_ÍU­üµý|<,Ëꪪs€H(Ašá3žºo˜ˆ©hÏÎ×/ç]Gñ…W85œZÍá®]ZO^á}òÑþÝ-šÿ}ýœ†Š†ÝÕ ëÉ+a ¶<…»j~|ìzT '!=+.¼Õ¯´ nßlóH×äs!åèÎÍ äUšux;7×m3˰>DÐh5΢ß$ïו/Å¿SÖkÒ,¤K8½ö@Òs.oí_p?Z—cŽŒÿwôŒx½öUâ.úYóx58óî äð•h>ÎU»Õ¿zíãÈaG"–‹½H<[|ˆä˜àż÷Edªª¾vGz–-Ò"‰ÊÑ…×éŸ]ë'´ ™Å³ iõtï;Íÿ^»¥Å­„»ú…$Ó:Áþdý™›< wñÏÛŽQ+éÓØT¼Û_M»wÃÛ­jªõ{ýŠQq‘¹y5Ķ%§+ï¡¶îœÍ>á®z"Úo6MúÎÚ…XÃ&!¥åH¿ýãõZ·´E³©î˜W¹Ós€ß!ÿX,‰áÕ©hÙ¦1Çq.¦†â·Mt©‹‰»øwØiˆù¨9´´ðZ–*y5oµ‘UÈS…]Ýòt‹uÔO^X DJn­ìžHŸé˜þ±<6 © œ½ÛÚnª‰;„ÄIs¿Þ®#"­lýèIw¯oÅCލ'Yýxeëïmßþ lëáîïœ|œím˜u—gç0Dl颎þ5©¾Ñø—ˆœ¬ªÏgO …}»‹_8llâ¤ÖÍð`7*Nk«âul% ³ÖV!‘ˆ4ì)¤× ÌØ30}G å}â.æ)äkM§ù8v..kx¾Ú_<°ÅÛi£øÊ…þÞéÖ¿á;{6¶þpíÞ ]´>_§¾ík³èСŽ>rRð5©ªÏÔ¥ š,ÈÃó¯âbC­ÿ°ã ‡‹÷‡½t`èK뤕3I$¼:íΚúc쉟Ó(«ŠE÷îÀÛ¹ݱ{ÊYm~:Så´ ÓHŒz>íp©øXÜL$sÛ‘µùºYµý·1 WÌ#)zBÈG©-Ð;€÷gÂîˆÇ€ùg[¥Æq'˜F™»»­¢’ˆ«A#“’.ušÃ>Ì—arjF³÷½—ñÖ½’à"ݧ@Ä“©rÚD#-MJ|^‹Ÿ'ºk—â¼àïUé2Öˆø´•²Þ­Ž¸”ɼ§´¶•ù%§7Røi¢²*€§˜ó=õÎ\©ùi>ôÀl¹Þ$…²pú´"ÅHï™I½¤Ï±‰‹oÇGXcãʱF«‰ýûZtߊäZ[[ãÜf9šwÑš¸ ´¸ é9¤åeÕ(û‰Öì‹ÿ®k/ÿßÙþF\“ƒTߺ¶DÞ½‡Q$Öäï.^„@)3tg˜1f4ðépu§O½DÈ\@®æ§=ÁwîaÕ¼Î7Y­³øåsEœù§"^báåø™1§¶¨1ZcO«ŸwúѤKCÄÖµ-Ž"½Ûžîm—ƒ¯r|M©­q%Æ &ùÞÈ 
kpzuWÍ‹ÿ®Ï¤Û¤–5Œcþ7~¯×ç_„Øv¼íñ³/kØ!`JZhÓÌúX1Ÿ«=o50V^ÃÎFZÙáˆ`2$Ü<Ï»-¤žŒ¡T•—íâ’Ìh’§7‹øu¡ŽÈ·¦¹ž°r«áómNº/•OZü}5|T?[`ñâ*+ÝAÏêøÛcBúÓ|ïßçX¬±ñØ&oûè¾ Ž¥åÉëé‹=弶ǹÕrð]ޝñYñl¼veݱ¦þ¿$ÐkBü´ÄÛ¾÷Í»Ñh ˆÁ>òëÉG¤ß X#«ï›8ñý·^s³ÿa’—,ì_ Rõ"æ“ü%°¦»"m´;j:‘Uë½Ó“RL y'£èëÔÖü7bÛvºå£G1ù|¼9ÿÚ¼Kxt¹ÍçþáüÚܱ԰'‰Ãó­ó,¢iÅñhÖ4poûF0‘S„™0 JG!ec±þ‘Ó~V$¼ð÷ Þ¦÷êµkÚO¡¨N›°Ê1£/§èœû‘î}äÉÀC’j=­—ó%ßåøjã‡á~¸,ÎU?…}Ìÿ"½f@Qfð9DÎþ#RÖ=¾ìQ¨ÝˆózÜêoHä¬1C> EHùÁXƒÈÉÿvbµ¸¯ÇãÆ¼wgãmû0þƒNÀ>á.¤ïñé‹ô?•Èi÷cúÂÛ´ÒoÕ7áÍ®ÊW¡gûßÑ eEª¢QT5=§Õé@¿s2Ž)1ǽ¸5ûð‘è?7^©–SûçϦÔNçå 'ߊéÖ˜éõ©¥êá,}ïýÿ«#£;qºöÅžx2¦b$æ”oßJ|¥¶ŠØ ·4Ž«‹3çgDNùÒ£kä¡X#Ml÷š×ñÖ¼„øm? þ|—‘É÷.IAiK{‘ÌŽ ù&KƒjÌXõ¼U)i_|¥°ûFäPU]œkõйÂ{[ /¯1<ðŽà¤ TnœéqÖD'ÝÉ4 ‘ÿ¢ž…=ù³Hï!ñkFª÷àmZ»ä>t÷²Ä‡#}±¦Þ€5tbæâáíþoÝÜ心Øv̈K°ø €Ë€?åR…öÖ —<aݾôʹò`kNƒÀÄÔk ¢,¿j1ÁÜ^s4Þ-?wtgN×»žÈWK-[¯* ¸žƒ±‚{®{n8k²Žƒñäì6W`üLÖø6~Nȵ¼¸ÒòM^çŒT^½<Æ“; lâ„¶iWàL‘—˜DžÉŽÐ Wxfà€¾“Ï H;ݼçºI ¯ñ<~!² Õ"”–z§K÷|á,<Ûb÷§Ÿ`ç™ÿ¡ê³Ï`.}ÕïÛcsKû‚Ûù[†cÊáÇ9t+U÷ô¸zZ"Ùì¥ÚŸÒ„¼Ò ˆú¬ ´pH.IÊ׶ɰÙ7²É{¦ëÝkó¹vfЉùÊ—(HWE‡…ó¦]P£úOÀEþ6Άù{/ʹFˆÄÿg×Y§ß!ÅÆÔÖz#s©}/­²ÙVëïÙïëR^Ò°Æè%ìAkS²&#¯LkâÒ¨è@¶5¼/’D; ͈ibq¾7ÀÎTAªŠ‰àÆb­h_ÚèΛvÃi" Pesë$!Ì÷®Ê²Š º]g?ž•ò£Q-æÊàì«îô©}°ÿZîÙy@ËÎ îH$°>]C á“¿Ä´I`­;Ê8wÌbìÄ™)óJ\sËy³ŠŠ÷³~¯pêt.»€]· ­õçÇt ep–ÅÃk%–Ó7$°þ°Ò³©-í¨ëÕw¶øçšß}¨!À%|-j‚¹O`:%t¡ï@œüŸ _éä]ÐáóïÙ÷ü[íÏßòúîaîÖ†Õ?¡t/ ,„/¼xÐm¦UõJZa†š;¯ÃËÐ ªéúô™vî8+œ;Š‹:yû;”ÀvV ÷,÷¿åœ4¸e[ýqâÒ=z˜Îî~•ž§ÛzÆi!ßdÕ³ðDÒ×¼2Ä`¦ýû.D–ÝØ™Í¸¥ùñEë,ßפœ3RéYÖòÃKšÜ6q`¨}…ð…½bɼ¦¿\9f®‰ç©Àò”}³¯Ox¦zöuþ< ç~§‚íØ%„ö&°êpþt(Šj€µ¹J:ªfžÂ?h_ÇnY£ªu„ßM\º£ú†Â޹±t‡ŠW*ÀØU›±¼xèr²°ªÙ×5"¦VVŸ6£­*ű$!¨1= ko ïëp /Çv^þê8öûÛ ‹·û_¬­Ý¬¼r«°=ÚðóÔ>J¿òЄÂ×y´¸»ƒ¨©³Þµ°oöu-..Ç5õQŸ|¤^±©—_öI8ƒ:Š^P¸kSsm¾×ã•5þ?} m5«Æ «yøìñ¡öÂ\#ò˜¿ÅÛ7{VŽÖ´$aå²¹D,/­°}ÒÛS¶÷‚^ΡÇ‘K`b|.›öEUT¸ÿ-ÿKíÄQ-Wsg•ðÀ»‰eòàPû á o|GäC?z¸ì»ãºæÔ¥`;.E¢iÇ€ °ëâôr"¶3IH`9€“Ýx`Aj—m¯&§C¤ÆÞ_ž0°eBšÿAâ’=n`ürË!|˜(þá[;rí&gUu¿®*Ç‹Dr"¬Ý L`E8‹rBÔŸXˆ­*ºbA[Ä:¢^½ã™ .¥Åàå¨ ÷-I4¹œ{pH^!üZ(äA_Ï0Ækú2åv5‘²Ý9Öî¦èá4ʉ¬œÜI›^ÕÞܲ[xú#ÿ[ͳÇ(–I¾²hXÓ(sï"8thH`!ü¯N¼#²ÁϳŽH#ÇŒøÿ³c;Ø+Í©°v'0`#®¸ŽÖ¿`ØOí*̘¼Ö[µ·½«³lC°%vH çYŽ÷.JŒ¡¹ú0.E¡G_ø³ß-/Ñ•~_™‹k÷Èxo&ŠkoÛì çR‡£»ÛÛ½0Ù¹ÕeÑîsï©Á–é}“Øëë K¶7ül 
?&¼•8„/ìS5ÿô·|7Dï½9.U¦Í°Ž¢°v%0#²ØΧG7UíÖù0v´g]¶ì6¼¼Ùÿ›Ò;¹û|2íëS=z…X†ð‡Ç'Ͷ}ÉÝ„¸,ÆL:’ÖbÀ\7uƒZº1`Ð÷5‚§ºØ çT‡¢ ¤K!6,R ÑÚÜ ° æÃ–Üç­³´¯rNŸÐ>Ú׿]ª-Ön‡Û„M{„÷B"ÒUÝ& ðß_ؽcNTaã.aåVû[„w>ÖÔYÈGtSé¯2H™4Ø£kqpÒ¯¬YfqÛ’øxžöµÌYaóëù†ÊF|uü¨àíó^Xañ£W,ªšì=^Ù,¼²Yx»ÍuÇ‹ôX´Îâ®e‰ùùÑ<àWuñ^ñÇ,ÂØIGQ»Ë.)ê=Ø`&«ÊdA']µy"u¹DýÒCõçØ!p†*?ÑÃꩬþo: ákŽí|ѽëú³K¿z›¯:·{fy8ä€jE!6«èŠ-Û…ÍíUÍ»$ù`X¯æBàÑåv‚çá¸npòøìh_QGxòm‹³þ/ /X-’BKxhµpîßm^X‘Ý}ñ¾¨ðèr›³ÿáëê¹= ßyÉâ–‹ˆº­“Í¢u_üGÿï•Dò˜0 ØæÁSøã›o½Øœ¼ão¶îößçUQá7¯6WÙ¦6¾™@¸gܯ‹Ûd\µí>ÆèÀòâîÎQ³H•{@¿¢p8Íao|ïÍysºÚÕm’—Àà¿ #ÖFUº«§OVÏž5&'50TW?LïëHˆô.ئµ ƒ º®½ô¯åƒÙ‚éE³3­ •¯& ´fº”F2ߊ÷¶nzÙN0U¦‚Zn˜cøZ|êÀÌ­ëÁü,nzÕâ£4"þ²ÊŠl®žéÐôÖ5Û ÷.°xv}r"ßõ¦=þ–ÍKýmf>Þ+Ttó7¾_l%lnNªßߧƺÏךqb_UäÒòãÁ±“Žòöº-[O“–®7õgr ÖøT‘¹Ýªl“œEŽÝóê@Z%î_ ¦E‹|‘GϱgþEÑÓRÒs„«gÏSzÍí«Zﯱ^éC éX(Ú¿PÛæ8J2;¢*ÕĈ²ŠÊjZÜÁ·„q‰’àåU®m(£Ü†«f¸H½™«cÂoæD’’Wÿxø<‡‹‹1 »b(-‚©Ã\~}Š?ýÃk™Ù/øÀpÁßíVɫ̂;OvùÅ1¦ ué^ïR¤LâqÛÙ1¦öIìãÛæÙì¬yÓâô?GÚ$/€Cù'dUøãÂ`}Ð¥mkÛ÷ ß¾y¹Å& v8~ëkéu1Ð?€ðxϨy³XË’j_FÌè¨á5$5òjd29¶mÂïl5°…‰«]Ö©˜ÍŠW |˜íï¯Ü<ÃöÐ^ ‚kc¥á^J,ãGǸ½2eOð“gl\%IÉளcŒé—ü{t›‘A2<¾Nؾ7uÆ­Ž w½jó•§ífçPÑ»þòY‡™£Üµ£n¥Ê G'¶çJ8ñþ?™gQë£kËmÙ×ÿ¼½ÉðØÚ`íïׯÁJU~òœÍ†$7+~aB\ë¬[ÿštgÄWæ#UýZ õex &Ib£z„çéâç^i*: ÉMSÝF˜±£Ñ³P)&ùq¬Îö÷ߨ}õ-†Šò8!D]¸éE“pØÿ鑚Ѡ媨ð£gìS\Ý|’ËÈVJŽæOY»=5ÛS#üð)›»—·.¢Š Üs¶Ã}Ú&–qý]¦41i:,œçSŠmÿ/<²<ØFff…RÖJf•¨ ·½\Ä+›“÷é1¼#Eô&Ÿä5˜ÈÂ!ò®`š¦E=<õ3¹iòNnÙmxäƒ`ä}B+×èÔÄàæ9þÞÂökxÛ`†~zâEoøëóå€Ý°¢8꼋ñÐFYo=áQ ,ƒ2ªÍàk»Ã„Œê#„Þˆ×ýP\Ð ¬»o¨©,Ïf,؆–š`ïL茋ÖYüja¢Àÿʼn.½»f.û÷ïç'ž­%/îø:góëÈPë«û¶=Â×+â­Ê¶ß»|‚rô¨`šiß4úr\…{kcpÍsâ äåïØ7¿áÉVÎ/™¨D,ðù™ŸïÙŽÛ˱­sÍ!ÑGÛŠïê—ž¯È_3Ï'².w5°8ý/¤’ŽêzŠ œ¿’ã!­ÙK(½òãàKjx/X·Ýpý3‰&§¯Oñ˜6wѧtÿÏŸä/@ãI¿W~`[ÞzË2ï+¼–B—Wèc½Û¦ÙÁrf=°8&¤”vG¬g.›ó‡éÍ¢š£µêK™þ¦ãÁ“kƒ½Ó»fÏM HXß;Ñ ä0ÐjbðË['Ö‹ÇËe¸Ð‡€XÃzû+óþ×,ßgFÓû)GUESëà ¼@ó h õA=>Üax›áÉ üàgê.5øûÄ;"oúzÉ•‰XzP°]¯<3vÒQ*€çé©bäï ¢¼b»î9Žm%ä•Þ)Ü!öÚæŽÓÖšÝá‚s‡÷vÈ'í§à[(ŠAÝ&‹GxåôLnS¥aOÀ^Ý¥™CÅoOs2ê2ÿïe·á xÔÿßûd¯´YÀS<_Éi_ÿÐâÎ7ýƒ¾|„‹¢íèãÜú'ô–}ÃM!Ëú¹¥.ŠÇ~U+þµ/±ô¢ ”bŒ>WƒMÄõŽ6Æû4C(<áU•–.{Riwµ«Ëö:¥‡¶!/‹³jÓÑ2ÆÃ{äéOÚY¶#û:A›“W|§:7ß[·#}«ìMǹŒ9òÚ´Ë4KG• c+ü‹3?i²—§ƵmÝY%üx޳ëÌŠxVùTñ~ ctæX @^l+‡ôjŸ9~áhèÕ~;qvä#?ï¬\6W½0ðz²ä¥RÏ=Ôïq§†y¬Ü®>ÇjB^{ݲ“HÅaLܧýtºÉ A#|ˆ¢Ý 
hÁŸ=]ñZe5‚e™×ÉÂÙëêmé-§Y‡jÆóþí¶Éᨠ¥OW¤uážEm·ó''8”—´NŠZ—.kCµÿö\6-X*§Æˆ¹Â+›‚¿|ÈààcrÞAísÝÍ ñøÀ ˆùµñi¦;p⑇ƒ ø©UF.žèÓ@㻟±c;ÎÛë”ÆZ4ç—Ml]õæü7Ê®¾-?LUW CZiWõdw§h¦Ô-‰Fp/ 2/Óßz}Cêïž7J¹tš“ÑTQë¶þìãFèÇú×¾^Ye±¢™óýé.‡ k›¯·øÛJÿ × & J]ûÚº[èÕO¹ ô ~yâ8—£*R;Ã,6qÔSÛ—2+žEDn8øNÙ«ê¯/]cÎJa%­s\ëY _£_ÎÝWæžãØ=“5C3™ IDAT*ÅZÖôìºàÑ1“ŽòÕy&g$1W„¬ÒŽ‚]ù¸sðtR% DŸÊäwª£øö K¦}ýXgOÆðô»þ–÷ Ÿž‚k?1|÷¥Ö5ºoMSÎܶöu„ÛçóؼìP7­>úà“àãsÁ8¢œiJ"ð³ÓN@›¤üïI.Ï_ã¸Ñn‹™Röã y”ñüÁ·›‡+ÀšÐÎõSÀÈF¿x+f—œÑ¥ÊjQ®¡ê\RH˜`‰þÓïHÙ9#ióñÞÔö‚zÀ§:t)Î,yí¬þð–¿:õ÷‘J`Û^áÏFp´åzþêX—SÆûK8<ÿòÁÚ4mxzgƒK6'°iCS—žeÊ/ψqòjïY¬krê<½ŸrøåÀ et?Þu×é¸üüùH›å9‹ŠÈ—Elüç»j¶p&¦9½6ºjŸqjvµ®“è—üj…§ZEÈK›wúó†29%k`6)½…¸ŸÝÜÚ¹|.qòj´–Tu1ï× îÇÀ¸ùŒX³{À2eŒosYï6ο6ì4\ýH¤Åã1åðà9§NðG^1W¸÷õ`Ú×£µ^À§¤!Ç„¬ .îF÷Kï,+bų… ï–ØÇßœêq÷y1¾x„Ã#Ü„¶½¶Öjó6ƒ%0²¯þ`⑵A\Ô î)iN­=j8ݧuƒ¹êXU9:°\Bþ±y§ãøoOÁƒ"òÍ^ÚÛ:KCeÿ©é*Â#™úƦ]Á ì{Ǹ ìž´Vϯò¿´ËZØì{ ¯®±¸ì_6«Zpy¹v²Ç}çÇyN¾³ÙðNe°ös@zÚת­ÒêmÈI¿9@éÕ%ý±øËë‘„ü–CËà3“܉ö–ym“ûYc¼OJKôæªsTMqQÎem3KjGBªú—€JOŽiªχü’ulï, -ºüµz‰Ü˜Ãþ‘©o¬¯ þN¯²ì´wwµðćþ 5’ä ace<ĵÏZl«mþ÷Ó†*ÿ<ÏáÊm{6ų+‚‹tC 欞!娑é‡3ÌÿÀâö%‰cñÝ£]º¶`2~aEó[–“ao-?œ|GQà Šèô4¤óµÏ´õTW»:‚ê¥)|`…‘Öù¤É&ÔÎE#—TyèEˆl eW¡4Æó<ÇÁ²,Œ1Í&y=]¡‰™9Ldnl phºuXW\kíêŒtðÑÎ`uiì±n»ðŸå6z7y' †ÏOq˜8ÈKÉcrOò<8²BéY–z_íªþú^ðÊØ?½ñÙ´Ëðíç‰óÜ”é-dûß]-ܼ mrï¡öžõzPåPŠ£½µ¶(¥{º½U‘»}±Sz†$z+úüªOœdr‰óõË És’ÀTYOÜ¡ãÎk²‚}À>Ïóò¾!¶mcŒAUëI,és—ÏùoêQŸIÛscÂßÐô li ˆ¥YJ§üþ'þ5œb•UÂÒ †'Þ•SWc|n,œqø /-Wÿ[ƒk_GHožÎ{ß ì>0¬Wêß­ŽÂÏž·2³ .…ëŽvZŒc{~¥ÕêÅûQ“ÿt‰5òþÓº ý6δ¦hjŠPõïùvNJϧÐÝŽ³þϱP¡ìšÛØ7{]®‰ûYYv„¹NóÔvÕìY‰»ÂknÏÙ4‘ÃP½,䛌cˆì±m» ãº.555õ»4iA–î×¾yÒ‰‘ÕÕŸ0+wSaµ=…0üLå:lŠ5ÜÅk=8áä‡`cÊãIfgŒtéWž™º¾½9¸™ÆõˆQþ¸$8ižØ¯VìÅ꜖‚Öò¤c;[Ê®½êß]¿Q¢Nëj)ñ]3 Ö”Ð:ŒÜDĨÎòà8`XÈ9Åî¹—Œ-¢ã8õÿ\×M07Ä×EÂíÚÈ•~ÿuFêê&àYàìTëPKm[[œ¥ýê4üxûÃÅ<¦ðÝÏK9ç`Køïºàöï–º&ôú:gJM1mHj„í)ü~¾ÍC«çÄ·¦µžë¿kŒ/í  È‹¾³â›G=ÄO¥µ[È÷³Ä¸àr˜gJ¯»}qõìY¾¨¯XsHí®ÁûösÌÞß]©[·OžÑŸ<ÚäÑ-XkhWrSU/®)œ¢ªï…œ“Q|P\\÷Y–––R[[KMM ±X UMÐÀØÎ3Ÿ¤Û£g€Ö)aõçarèYš¥*żþ¶•y Ìõà€»£Êá3*‡vÕÏ_Þ”ú) KRpꙢ³‹ãÁ½‹RkÌÈÞ©‘æ?G¸gybÿŸ7J¹pJ¬Uí¾%þê©Èš˜ÙŠ*]#5ìuËPÏß<2¢ã‚Æe ü\+|6…ûb· èyj×Å·P5û:L£eØÒ‘@J”Ü2AjªºB„óUy(äŒaŽ1¦`³¿-Ú$ÈV“Ý6¾IV ñC1¬Æ¥òýZ'5 
ÌdáF¶VØÏ{¾:ÍãÄqÙ¿œ`ÃÎàsÎRòžÿÅ›;R«ëÀÁß¹ÿ5›['ê}•¯kU“]¾ÑøÖè|UX¹|ã&YO-A[aÅ„Zã²Ïþï~kkºº%±nVMäĺûÁ‡zÈ9ê¤s|h‘qŒ™tÔ7V-›{KVA2¥±©ò0ð L7• ˆ¼¬ªÒÁuÝzâþ30UÅó<šuåÒóñ³âæy Z˜p3ÊRÒzRŒu¹‚m27O¿cñÝ—ƒ“Ä¢„Çewœª¢Âyÿ .nú—¤FôÕQ¸u^jÚ—-šÞ]#ܺ8±ï–ÀOOqèÚ†±ãùÖªòáÊ1ýÑ$ÜIÛxcÖôÏ'Æz.…®ØU~í­õÅ`vuÑI* ú£úàÌ?›Ô9(™£Ù¸Cf^¦ª?né°Sü4¶+¾¤qV"ŽÅ_šôV!µÇƒ1˲Œ1õ?© ¢±+}Ý-£v]¹‘Æö¦ªÔ47¥4’~û]Zbó«…©iÕÿy_˜UK›‚6Uì©~úlj¢¦[ŠŽ¿móÁÞÔÞÔKå\Ü]-|ÿéÄö•ÛpÇYÝZ/§²Jxp•†VÑ­cWmjÎ&.I£zÅxÓQûqÐT\aV>÷™x2Ç™wI›¦J=5Åü½åóŽtTRkIÓõ;ã¤sN7ª¿j!ÇÜèÛÒØŒe]à¹îÀA„HK3šò¼ƒá¡¨€±-D=<ÏÃóâ*‘X†d&ÿ]Ÿ~’nÿ>£~Ç*pcE¸ åÖ u°R´ÆnßgèQšž‹x­#Ü3Ïæo§>¦µ^üRÉãÆdþ û„›Bœ lÖlTÕŒ¿L^xCa3„A2‘"a§°­dnK&ZH%[IØ©>·+WÝ¢D¦£2%òZ‰Xw~—‚×£ã…þ\,„uR5×ð|ïáÂïzð޼q:ïÜœ-à^Š”fiJW´Î%‹vš¶=ø«f¤Š° !ÞBÙ|‡ÂÍ:d2™Ü–Ífs[ôûÂíí o/ÌP‚_¯TÒ©Õ¼Þð²`k…I€e`÷oþ\œ~xŽÇeÇT§–üö“—vÄ›¤„eÏ›|ôf‹méÚ¼´7¬3(å)îúð‹5¿{¡ç»“ÆI.;ÅÁ•-0Önxýtá›» >w‹Ý+$à¼C$‹/.?¦bw—¨¦æ Äyag·[ݬº÷f2™ †%:úÅ-KbQm»bKt¼H`Óú5Å3~ørVWè°„±£Gƒï¹1oŒžþî‹>„ðïÚ 4I?kZØÿãÏî6„8xQóRÙxõ±Kg¿åº0T6%‰ „01 +o+ö]á†È–™È ÄW+éÔ)ÉŒÑÕIB¿{ܦÌ0öu ¯R v 3«' øï‹<Î9Úã˜*søuy°ð«¢TTQìíüçr›/=Ø[’;c‚ä†Uçªçk‚»6Zýªò¾y¯Í/#õÏ’üŸù-‰Ê½·¥UÆü¢’êë&¿±wvþ!ùÆùNŸIz‹agGuïŒDÞò-Ï?`ïÞñÉsÎ_pìÙ|蛾+Ÿqv‘^R|­òň³%:V$pô¬3R`"•Û~OêÂ#«¸•Ô GdúJVÞwkîZOš(Î~Ï%_Cò§Bµ!Rüø…u«¿ Ê0Ì5<Ú®¸öAYq˜AÀ?u^wåtOäcýÂl~¤¦Š[œ^î9~¶Úâ窛ø¯9ÝçâY}çÈ˸‚Õ/›üð!ƒ-EVìS°ø"—£Æ+±áÍ=‚ _½{£%àÛgyÌ;Ê+˾ׅ•/›üä³hûΙ,ùúy.íIÉ%¿MT•`álŸ÷L÷™0ÒÇó;ö žxÝà‡y9~vÇiAÒÜ·»ðÑßžïñÎ R ^ß%¸}£ÉoŠ$þØÑ*Ö«ÒÌ*O¿apéíîG÷’eqŽçq†”ü¾"õ^Êe¦í}Qéºoÿ/ù@Qò,CÊwIÃ|TÊ‘ײ8ÔuÅõAèB\ûÂú5Wåö3aÌR~GJyµžŽûQ½jò*ŠYK¬¿"’ÌPúÁç¤ä)(¯Vûô‰6Twýo>dðòÛ6 fzL#I˜*kÅ–½‚õ›M~½Nðf…ÛO'ùæùn^]±ƒGIæä÷™˜·\ _Ya2ù1“Ë÷™6ÁgÒ(IKBe®w<% nÞkðä¿ß(úÌùáwJ¾0×ÍeÞ?{ŠÏËÏT×®%ë –¬+}쿜ÚC^í©Ê%ÒW:àc·˜%ÿ§gø|út» ó¡ã ¸7ðZWúá;$Ò•%šñ£ä`˜}»Ò DUåF}!®—¸—°2ÏvfÛŽ”Âû{× ¡¨çä÷¼9~HXçâ…ð•Ö…‹ŸBÊ_£Ñ(xÃÄOu'ôÓ=vàØ³ÚÜ€àÇH¾XÎñ³'ûX¢º¬çxAð‡*Ž~§äª³Ü^j+!à£ÇIVĬ¹ýf·"×j- W/ùøÉn^LÕi‡ûyª¾ZãcGK>t\¾~uÄÅ·}ùdÉGOp«£0 ¶¤qRÞÖn§?Öá¶¢ÒtÓ½–Lý¹ÒË"û—‰c…4Öv¸-`øô--%|IÀöêá4 u-Yôga˜G+õ”œ¿ØùsÝ }cæ‹K„q e:tŒl‘\~lýÊÓüóÉ>WŸÓ·Íå¤Ã<Θ0xÙV¾{–Çå§:½‚$™Ò60×¼ðPÉ•gõ&”)Éô/i¿s¦ÇßžèTM^Àw>Ë¢êþk_ØðЂòQiD]¯2¦GÏ:£OWz »è^²ù÷ÅÈkÈ€›îÚÝuÝ•sCüÛh@™…U§Œ•\}ÊÀIaï9Tòû8œphyDyÄ8ÉÎöêÖw*ùÍG¦OêÿšçLõ8ùÀÚ½Žï>XÙûKú;{rüç2)ø 
›g_«|0r=B^4ó'v‡í¦÷"Y]›^¯Rœ.¥øuÑŸ=öWÖLú\ýs¥7L±¢¦/“–8AJÿáþv3†òdÔµdѶ®ë®üG!Œ™ÀC@–á)„¸¤sñÂ-ššÊ‡mçv ›«²$Ø‹gyœ0¶¶k…V¾7×ã[:Œk¯ìÜó§y|í]«ÚLJ¥ö­÷¸e%ÁMX’/Ubýð;%ß~ÃÈ–þ¯;k²Ï¤Tõ×y×xɯ>äpô¿ú±_õögf_—tÂo„ 8Ž;ÌçX-³}!ŸRÕ™sR–»GRYH¹ì[ëPÌ•~ÓÓ«WQ›&Ä·Úí®³|W–´â 7ú²&+–X ÈHxï0¸eG1§sÉ¢Ç4%UŽõ Ý ÔJ¸ÔSñåTÁ¶}‚ÏÞbWL6ŠKŽôùôi^ɤ°¥pïs&W/¯}¡¯÷*Y8ÇãàÑ~ÝÛôÅ“|>z‚Wvâ]•­¿òë]>]ò§9U¬]â3®³o,: K–IA5eHÞBŠÏ!ä¥v´\«ÝµÜJ¤°×€)•ImrDÜãuÚˆ!/ÃOô·Ó ë× ?‹¢ýŠ%GúøßFÅŽMb··¸Û´ìOìÿÉg4ªÆ3‹Ü¼rBˆ¿•Rþ®œcwì|ã¾ê“Øž;YrÙÉ^Ù9õÊÁK; ~¸Òä‘ñÝ·çM’\~²ÇŒI~¬ÜÐwm4ùêÊÊHå#àëïv™qpe}ãúpÍ]6w•™~kR ¾v–ËéGƺÇÍRðþÙKOö#ì,0Ö€(—0\’µZ¾a»éJ*«g€D¿r< bÀ_&7Wz³–H¸¢Âö ä×75鿎~qkIÑ<6å6¹¥–kJÕ\ë‹DÈ„ä* ç7ù|›‚k@ü¡sÉ¢74ýÔ:……1ÿøBY³… ÷>gñéh[ I.=V2ïGOð¤à¥ã Ö¾jðû§ ª\LÂǦûœv¸ÏQ5l߯-ß[nóÌžþ%©)mpÅ)sÞá÷é¬Q YÁ’Ux¡ïÆ·š°ðŸ÷Íð‘Š%ù®4 ã#3[ÛKî)Ã…ðn’ˆãúÙ«į-Óý¡ë™/WÑž-ä— JOXã ±ÆÁ|()ÝXÙ+_X¿F5ûôo )þ™aZžñsÓ5ÿǵ܎ ®QMfGGГT1Ø_‘s*Éî~`{@l@Ã;$-gIÿ³Rò^à°&‘ÌÞ^ñ«®ëýZÓÍÀ`ýB'WzŶ–ãdïέdÒ|f³ÁºÍ‚g¶ ^Ø#ð%9JrÈH8f‚ä¨ñ>GŒó¬&W¯ùRÂæ½‚·›¼ø¼¼Óà/{`[7Œ¶aòÉøV˜r€äã$‡Œ‘:F’°f(g=Xÿ¦ÉÚ×k7üeLiLí3c¢dÆ$Ÿ#”$kp}χMÛL–¿,ظ]ðâÁ”’YáøÉ>³'ûq‰ ?2¤õ•×e/îcYÛö¸—KÉß!Å1Ùl‘°NÀ]"™½Mf;«è¬‚ «båìzÜµÆ ˆða"§»ð!Å9LÛQa)+|)–‚ªÌåØè€¸g‡­%»H¹w¶X*Q±å2ú"Êyäµà˜ŽFš¬Æ]õs£ËËb<ð¤ü»€¸Ï‚ø#È;×»®»rObÑL‰òŽš®{F£BˆËf.±oÔ]10(‡ÀF.ާHû*á£Ü3;€­ÁmDÅk=ß+è¹5 ÄNýÈ4ƒÄœœ2Á0ŒC}ß8X÷ŒF€§„¹ÄÖ Æ‘ÀBòú<0«ÎmóPt;{3ÔÞþ‚ÒáîfØß4š”Ä"êD!ıRÊÀXÝ3Ã×Z†¸zúb;­»¢~VÌÈ6øÄ ¨Ì™f°ª=8±àwåI³# ·Wƒíµ€Üvj‚ÓHÌZb³~‘ ¾”r#B\€”÷#uï Kl‚Kg.IÜ«»¢þ($°)Àû㸽Êw80'ò› ® åL²eoÛ<ü¶)|éh"#±Å®È  åã(‡ŽeÀÝ;à · ƒOÍ\œÐ)ë„ÀfsQMx/åÞ?*ØN‰üÚßöä¶x6ØÖITAŒÖ@¢Óö7>1óº$ëyßäc Þ ò^”—®ÆÐÆ.‹f]gÿAwEãI`S†è½¨ ¾±Á6 8«€à”êñ-ThÀËÔ¶!ø}w°ïV´zrØcöb¥ñìb#Ä<¤¼˜¤{gˆBp“bÑÌÅöVÝE`£QñOõ––Ha‚íØÈoe{ë@yLî ¤´W'Pö¸.”íMÛ߆f-±Ù°ÈAúr†q:¾/*nRcè`«b¡•JÜr̤6;4(éB}­» lã‚ïNüîEˆkW@r›—€çƒÿï $ 1­Jl,dÂÚÙN”ƒÉK¨P]hû[CbýB!˜)}ùg´]¬Ñщà{Âä‡3’èÖÝÑøVè…øj°ÍÖÝÔPHÛÀÔ‚ß$*©ò~TÌ[¡ýM;˜ "f© Ï6.rNð$×"åe€Ð=Óh Dñ_|cÖbû-Ý̓B[¬ŽDe6 Ðlã‹ü^è`²7²PYROnÞí`2@8v±Ý |rýÂìmH~N~Y Áƒà·†ß±Øþ‹îŽ&œü¢*D€®ë®œ‹R#ÎÕÝ3,à„zOîFx¿<H~o ¼k†WfÇøß—ð)݃D\ð;ÿ>ë:û%ÝÍ…>m`…É|¿„.1œ!é‰ë $¸´ö*ÀûEt€wÕX¿Ð9 )ÌнQtƒø•!Ä÷Äe¼î ‰!ú$H‰id2Yº»»Éd2˜féJÖ"RJZJ™Û|•Ë3WqÜ0 „†‘û,„È«HÞ(?~\ßV@bÿ€*§’Òï¾F‚óPö·Î€Äv¡ªüx ¥’Ôö·~°áÊŒ‰g|Z"ÿæ(¢ÚŒØü 
C\—üä£oK”Þ½Ñÿ†¤’Pgg'ÝÝÝø¾"ï·j‘N§sÄýž7ü+(v{üøñýX„ÄfÎG¥Ç1õXÐ(aîÉ.TüÛ”åkôöʧ¼ ‹ÜQRúÿ„äóèÌöµÂ~" ã¦ä'iJ¯Â&0Ïóò$³¨fFCWE!²)‘ÍNC9xŒB%ÎÕÞTq ®´½= ³g)îÙ`Ÿ0ÀûÙáÐ)Ï,ÌèK¾,¢v…d‡:üQ"~š´õ\úpSÇq 4™¦‰çy¸®‹çy9©Ë4M Ãè! Ñ8S}EVD*òvHeÇ jw ˆrõNhiM#θ ¶P" +xïÖ£<(72„¼7,òFKßû\@dõ+Qr1ô8‚_Z†¸Á¼üÑýCf 0Y–…ïûxž—'…ç5M³×5›Ìª&°~ˆ-Vè|hðÿc€C€#PåM ÐF%ÏÕШ–à|zìoaý·m(Ç’—PYLš>À{Ãç„tø0È+€“õ¸É{žþ ñçÄ'’Þ„M`Q{WèÔ’™ïû$‰<ÒŠî?d¬ ‚kC©CÙDf¢¼ÛÎÆ ¼ÇÛˆ`E­ªFœÉ-Œë¤´Bû[è`Úçö4úM­_˜äR࣠ß8²';„7ÙŸ|dc#©¶š‘À|ßÏó<”RâºnN¥hÛvq5mß¾=ǼŽãàº.BR©---$“ÉÜ…¬lšfNG*¥¤££ƒÎÎN<ÏË-ËÂ4M\×ÍÓ§FÝ6MÓäôw@mÄzaM Hì¤@}rT@n â‘,´ýM#žÊÉ ÈmW@po¡Ô’/kƒ…Ôî`¿-$¹­»ÊI Ožä£¨ êã†ð³êîBÜ%[Íד{hØ8ùÔƒÀ ‰É÷ýœ›}ô…$¶u—× IDATaP lË–-9Bñ}Ÿl6‹çyX–E*•"•Jå58$#×uÉf³$“IvíÚÅþýû1M“T*•có0NÁ²,,ËÊ‘`ÔXX sæ/Õ“£QÙ&ÆÐc;Œž"œ“’KÒ;ȆF%ðÂÚض€àÖ¢‚ºÞ·7ƒý_4qäóÓvÓ¥ä½ /ÞÑäï•èA!Äý–a>t¢·¤úÅC°øï¦Ö¥ñ'\ÿôÀºÐ åhßû'ׄÀBŠQ”ÌÇ铼‚ÀÞz뭼ඨGŠïûŒ;Ó4s·ÏÎ;óˆ-*ÕCÔìƒà&¡¼% Hnr r‹²¿iªÇ| ÁuÑ£zÜHikQAÞ¯ïᎀ·Ô«që9“…dŽ”òL”çðÁ¸hD8(ç›§…à S>9ã'‰ŽO,/ê@Ь8ñúuøjËÅ{…ö'@H‰D DH(Ó2xü²™&•B1/Ũ†-tüˆJ‹Ñx²„Î#°;vä.ŠŽ!yªû¢jÀpŸ£±á‹×/ßœù BûÛA(·íp@¨^ŽE©.¢Çþ¦Õ“qÎE©'÷£Òsí G=¹.2 ö·g®pô…óBݵE%,@‚•÷ÝT|µUÄK1ªR µm…)¨BþˆC=ë ý«GئþI¡T”Ç¢B&«ÚÁ™Z‚ÓˆIp~D:Û‡²·íV¡ìI›‚÷îÍ€àöR’†FI¬^vs/+Ì‹h2™L/R!‘ik~‚L:¢T’'D71øn*<ÀÔÒ›FL‚óP&ûQž“ÛPjÉõ¨]¯£+kT@býåJìêêê!“‚ß ÃÈið4 MLB©'£TB³²›ÛÁt72î´zR#|”Ý* ì~åXñÊÖ¶e—Ó7LgB Õ†Q‹nQ‡¿PZ‹:ýi lø¢Ðþ¶•Jèààïä«'µýM#®ôVØ…RA¾Hp/ßm öÕ•»‡2‰ R‘X_ÎáçD"çy¹$¡Ý+ §h/DðÛ¸ÈÒ;@öùÈçþìo…&#Ñù'5J/¾m”·í(”«ýœ"׉²µm ˆm7*½“RS¶hÛ[s®dd>aõ&¸›X6›E‘KV†X9ŽC&“¡½½½ÏóÔä¥ÕØE1‚›òN;‚žŠcÐö7øËãìÜ‹(/ɧQjÉW‚•†2 ía½zA*ªÂX⨗bar‹jIMK`ÃÅd×F>‡ö·ñÁÊû¨”…‰•Cõ¤Ð§ÑŒà=90Øf¢ á†ðP^‘;Q6¸WIí9”=n ÚþÖ8S€W¿Ös]7÷]a ªZ¤¢Ò6|¦7z5øûPÇ®EÙßÚQ*#]=@£Ì`;8ØfG~‹zOî¤Çþ’\è`ÆÆíAÛßVœ–Qåz5”´Âüº¡dÚÇ ]í5iÔç£bÜ`2HnG ‚gÇÓ£¾lÕ§Q"xGBÛbö·°ôN ÿEçÑ!G`5!¯Æ­ìþ†!°¨ñ4|aB2«“ä°Xã5ÇÝ{]™*¥"œ ,ÙÖ_èo‡•÷ÝH6›¥¥¥)%sæ/(G‚üïq¨”JS"ÿ?€ž89 |„±k_ AT‹Ij¡ÄÍ¥Õ%‰^5'L˜  ¬pRwº:°ZÚ©…óP#XÔK¨u"°1ô=Wù|øÂ¶×Öþ|¡'§ËØý×ÀÇc¶ù¨ø >±âÞ°,«¬\nsæ/è+À{2J5ù΀äÆÐ“¢KÛ߆·4¶¬‘Hl È-ZC,¬M\¬Ùĉ‡ÕBú2…OÂôXzÏ5SEnZ·º{Ígêì³êu±ß…3ËÉsËÜ»ø^^ÿ\Þß-¿-ˆcñ93ª}‡ .¬pT@lÓPÎ/ãP¶¹PÚ ,þµ™o¢Á… À¨>Z@3“Éä~÷9è ƒ†Õ†l–·Ÿå.­á9ˆÀ$¸Ž‡eÚ*á6¯W?x–ª2Cú{Óûö½/1jôª 
{?pk̶NïzlyXùàMX–Aç¾ý´··“ͺ´¶¶cY[6oc̘1øžS‹÷¼-Æ¢G=)“)n&ÊþvÊþÖŽ‘ivH”·â¿PÇŽF•Ø¢åYBiìCÑ–{KœÆŽÅm·ÞPcbl ¤?ðô&Ì8霿¬”R®N­¾í’Q‰${¬¨ðÚ–”rW0©W‹µäçÛ+ŠUËoÄq²Œ9š={öáy’öövº:Ó´¶¶â:™zŒƒ¨ƒI6 283 ¸(ÕäØ@ÂÓ•»›û€ÿ¦„Mv¨“Z4w¢”’É“'kS²—ÄÏv²fåÒ’îŸÄ¤"Ðl¥»»I¶$˜vÜÜz\z*jÕžxÂ󞑦9³ÊÃ/þ_Ì{x/pW©Xú¿Œ=–½{÷²o_£GFB§ôÝÁ#¡ƒÉhz’áŽEyK†RQݨl)º¸iãHak‚…Ȱ@!™…v±¨wbT…Ø´j†šØ¾ ‰þ~!<ÏÃ2ÔdšJ¥ˆlwÆ!/¤4ª&/‚•ë5Á$]-~LðûúÛÉu}Òé4¶m“J¥‚ºG‚t:MÂ\GÃUKo’Bˆ £˜#Ïç\\Ìþ6 es;&Xˆ!·$Ú{²^Ás˜Jƒ;sÔr^’X¡G/¯êf”ÀjA^Ÿ[rßÝÅ3Í=ï<¿6ä6˜R˜ï‘ìB%Õ¬“çáã1¯ó7›ýrLÏÐO×Ǽ—O•#É=pÏŸÔTZaá8¶5¸¡`…%ߣƒ”’l6›ç¦\XpðŒs..´¿ÜL”p\ ½ Ž«%^>ŠJK5l’X!Œ7n¸˜ÀÝ,_z÷€å`²Ž‹iŠzØ+ÁäV]“¥ÏÁ㘶{÷[›ž~bM‡“d ‚91”#qÑPŒðs*•¢­­é'Ì- ð>%ø<•Þkz EDOõn­žÔV‰EÝì‹XÓ©k"} Éæ×6”Õµ¸ÞÔÙsÄ„šƒM3Q/òZ‡¼2éô;¶½¹iã3a.ÔêOü3ð`ŒsÀÿÔßN©d+=D!|„a¨ˆžBÔ«Ë÷},Ëê5AË?×Ïâ, Yx6øûjä½/ ð¥ó‘(»Î‹(mhkÕ× õa±¹~õ²›‹Æ°6-ÕD(ñ3¼øÂ›Ãâ^=Ï#›Íb'ê64•O°úg#%vÂ|fäÈ‘Iry)l9p?Çõò À¿¡þÅég_ÄÒ;þˆiØX–“ëwS4Ž6-j  ¬¥¥¥bÒª@ó é‰É ƒÙ¿!¸Êîvd@^Yz;˜„1pD8œìo¨Óözôœù xøÁÛŠÖk:«…4dHŸô¾îŠÄØf– ÓD˜Óê'}Mˆ3ÁŽÕ~îž··o~ø¡yÒצu+ã__ˆrĈ“å~!ÄiRJ¯oÉ^`Û6é´ëfhkkÃÉ NÝÃèª5”®<ÏÃqœ\úžÂD©EßÃÈΣyëúšP*$¸°0äS}Œ™J%91Øn IG„v4Ê&7e—³†Áu0 U‡ÅáM/yîk±eD"Åʵ·V<ÔreZoœð®óêq™‰À7bMºø^gÇþe"çJ/±i²¨¢——Æ8ljRÊó€> §óßûž|xÙl¶æÒL5‹˲òrË !°,+Wh°‘\¸ê»!Bl¡õìà÷ÐÁäX”ÇÏq¨€îã‚ÕAôw7‹ƒI'°•ØW£¯9£Yœ8jbûòöÜúÇ#3yM?~^\ÏÐ=ÁÄV-^B)¥ì³Ëî¼Ñ£G²oÿÚÚZp2ƒ˜mÛd22™ RJ‰¶m—½(+”ÀÂXœÐ‰Ã²k ŽŠë¯¸)• Ì2캓VáÿÃZK¡Š3*õ§žiF+AnSè ÜþoJxOA9˜ŒEÙèêEp¾‚ ¢¶ö¯r¬)Þ¾š‡ï »‡>­~Ò×âžÀˤwlß^Ú34Ž-Ìâ§¾”_ &©jqm);mîûXóà͸®‹•°ëúÌ 12™ –e‘L&óÜèANØ´nõ«(/ÉÁW†Â ˆimd÷RÞ!ÉüV‹ï4ÊéèWš¼ÊððB”Òçà‰ê§=`Iì[5 éABܳ{×Àæ ôÕ¬~6ð|Œ•ôhà;”ð¶<ãì<ùÈ]tudá}ïq°ãhBwùh™Šf¶ëÖÔ:PÎ[ ~ÊN=gR@Rãè˜_,À»•5㔺²Üox$ Ö_Û5y•9ÿ4º ±ÒWº»‹Ç×ÜÛ8Ò ¯F¬‡íË4 áyþc¨¬èUO·†”FÂ6ĺ'V™Qý>î/–*qʃ­Z¼rï~»¿ž~ì:*ðx­jñ ŒJaѯÂ8¯ðø~CH…X£qÕŽR7ŽA¹¸÷…ÂïY¨¸Ä“PêöÀèq•ï@yÊî@£šR…X“ló*®¨¦9(nõTŒ–Ô'¿£çù§ÇÇœ‚¿2"‘ä±ÇèC0Š’XLÏÐ91'ŠqÀŸìo§ãN¹€U÷ß´?ñ=Ð))£3\×ÅqTFÐþfønjĔآä–^ ¾. 
ð¾7Ém‘‘­¥­*a•Z53‚¸"1=Q§w&Ï?½ª4qû,ò%”îLšTRåâÛµoW=šŸ~F<½ÿfáyßlÝ%$ƒÞš—çŸ^GÊ| åž§ÐæåÀO€~ wR SÐÝ&‘H0zôhöíÛGGPC,¼·BB2¸g£wJ§Â¾ SB…ÒQ45T輑(dÕŠ¥Íf1 #§‚ŒVÉÕä×'¹õµhŒÜsQu¾4*Ð4ý~ûöí ÙàÚÄ}y¬Zvkã¶/˜ˆû}p¾(ºd—Á×™L7©T Oúd2’©$ÓëS¬òÃÀŸb½”Rþˆ'–/ªâ*¬ÊZj5í¸ª+MLžFeX¯§—ÚéÞ{~Ë„ ؾ};–aç<…²óƒ\T…ýPHfQ5_H`†aàºnM&°Í!Q¶¶¶2bĆò¸ Xå"€-ºGúƪ¥7å>O˜Ð“+aH+°Ó]iæÎ¿Ëî­‰süYý“˜ÑKä &õ×¶•—›!Œ€ÀZêÕ½‹c­-|¤ÿ½pò5M3o2,Ìž> fTZ£oÇ8ÇiÀÅ”¨˜ëd=¤/˜¹{Í÷éë礫RÙä^YäÃcãÂ4Í^vµrVÆeIlET‘Zb뛼 a5bDþç\\“ó<ñÈÒ†8q=ÁLSMŠÂ01bÓã97”‹+Çjáµ$ÄüƘçxPXÎcÀÕð2òFì—À.zÿ¥Ü~ëÿ0zôhZR)Òé4é´ªØì1ѽ%°àù ò1¢…ý¢Yâ£RWô½ªÅø.T_Öó s‚[Œ‹åRÊa+±õ÷[CÕ•ÖˤÜMØ¿×èo¥lœ>üo¨B ½Ä|ßöêstiÌã×ïÙŸ^¾î‰;r¹ùÂ’á¡m%t6(%ElZ·ºj[˜Tîϳ€õ1îå”#ËSýí”É8Øv%x \ÇGúGIÑ%¤¼¢Éx‰DŽPú#¹8ä;””§QzòB°þ±û½îîî­¡CNáø˜3Áx”Mú(`åPéƒþÞa±mÛ¶†jìÜó.!.%ÒcùÒ[êÒ޳λ¤V«­>gپćp1 ƒŒ“eÖIï®Ç-_B$/]uÄaX¾×í½ôLOÜh˜«/˜…*Åþàº.ÇžXÝ϶Lá¸Þ:Tê¡j±_’> N+—ߎaÀÞÝûhmMå¤&Üs_˜0{—T’S í«ä¤”±- ÃÈ=—è3I&“´´´ ;7úÁD:¦»»›l6›sÖ)wrÖy—LAÅ´ü²Yîyå}½ëâNœ8±gœ4RØÜó/Áóâ®è$«—ÝÂü °ìž›^Òó<æžÿ¡šœë™Ç{—­2 ÞO?'¹ÉÊu]LËbö)gôí쌹šúóèQíY}ÿí¹I1ªFŒN¾á]j‚Íd2Z¬„ÅwÆì›Ï”šV­¸ƒý{;hkk!™LªjÈ~ ,Èš!rRP¡ª5ZI9ºZnq &$°¨×ahÇK$½¼5j?¿„Ï×q2™L^ÒR©ÀúC£KlÑØ¯(òr!îÞ½»a|êÜ‹jržGWÜQWñö]óÞW“s3V–"°ÖÖV¶oßÎ…üx=n÷‡”(ìXJXjIÚ§.»ëÏO ¶ 'Çl6‹ã8¸®ËÈ‘#û— l;Gx1m§Ë¹1Žï@¥z£¿{hY.þÊu³9q_æ#ó*(‡îïÑx®Â÷1Jhq&¸ð|ÑÔSRÊœg£eY¹ø2A¸pû>\È„9-ªÿO{Q(±ËB2àèo3fLdœ4ˆ ¬V™#êí=ì¿îÜý®:Ì‚Äç~à–˜“õ8ûÂÖãVOyŽkÓWµ¶š¬¼ÿî²Ø·o–e岨G¥€Ðª³‰Ç_µËl”[}|2¼Åž|l9©¤’ŒBó´èµDQÈfÝÜDVl«Çû m?§Ói:;;µ l€áû>¶mãy–eå&ïÐtPç9ºÜ,$:‡Gßû!¥ÀÃ0·[È à[qÇ"pÍãßWÑA¡ 1\ù{žG±¸±`ðÅ@ʬ_AåIìèo'Ë´<l£'$"8"Ã}˜†F=QE’@† IßcÓ†‡óáÖDŠœ3AŸºßÞ7]·Øàü˜üuŽ—Éî­ô¨d2™“´BÕU4ÓDáäÓ3ôg¨¢œí1nô6 _ošY'ÎáÙ~j¨*ò<ËNöºG &·¾$·r[—´ŽÀj1ñ·$`  n2 4˜sîûêuµcõ‹`›ðåòÕ«î®øØ¨Ê0œÜ£Ÿ 'ôç⥘ê¾lU Ũ²ýÆéxXfïô–!yEÓBShh4¹­ˆsC¥“Ö=ñР·áùÙßz' T;ÃQV=×J®^yÿmÌ¿ ò$È¡ãBºrGóÿ…“}-VtÀ¿ÓÓx(Xmö‰™'Ì)ì#\×ÍÝ“aË?¨%/ %°ZH_²xضj“àôÌs 6q…¸xUh_~TÖ•DBï»Äйß×qí}S€¿©äÏó{ÅÂY–††FƒXMÈKJ^X×8! 
O=²¬&ç)eÆ©“úðãJ_ ÿ©l›^9/l "$³Â¸±p+™å¿!®^ÙÔ—Úaúì9¸.8Ž—#ß¼4MZêÒÐhl ,.ZSHùÈòÛkrž9ó/Á¡Táã @ÔM* |5& ï’X™” ³ÓGÓMÙ¶Ý+KEaQÇ*E¸&fÿ¡Œ<‹ŽãR—Rš¦‰a¢˜‘9)¼XÑJ]QYCXK_O?öà~@­­-dÜ,YÏÁ¶U0ãç.¨Ç¥¿‚*^-„˜³êþIéÍÔÚŠÇ!Nã8Bˆ¸¶°?/ÅlæçKípü»æ‘H%p}¬ë`˜ÊÞÕÑÕ‘§·¬80 M`ÃÅrxUƒãO¿ˆÖ¶–eÑÙÙY/GC•)î¶Lã¥nkÔ>f'r1c1¦ø 7µeÔL{ëí´¤T ¶ç+X{k;7«’†F£Xm¤/¿îY7 ¾†¡¤¯3ÏýÐÀ_OÊ ‚«—¾à²å÷Þ8à|ªC³m!®ë’Édxö©XöÑ­@\Þ%”H<÷œ*À9“Éä¾÷\åBCcÈI`BÀ¦u5|;û+ÄV Þuö%†A[Ûˆz4{$ð…˜çø/‰Q—$›ÑšXáßh¦Çqby† Uy:îû%b.÷íë ’É$àù>-õ+Nª¡¡ ¬^ÒW3Ù®kåw꼜:ïõhòU±€_Z³¬~ù?£ÁÍ <ùlÛÎeñx|ͽ՟[ÕLþbÌ&ž ô›©ú”3/À4LBçÔЪØpQ&ߌE&pU[ÒîªwËy)F]Ò×?vœÓÿ س‰ÿ^j‡i³çàù0°­$ªÒŠ6Skh4ÕÆöÕ|®ÃMä-ö§˜Çï~µôÎ? JóÎK$9/ÅØÄ­Ws4ð©R;™†™#1ÏÓnò A`5!/)Ù´nMÓur­laŒ§Å<Ç¿Ö@R©Xòú,-•ÆÖ=+ÈübdÙð!°KIa™´ƒãxØ–­g( F‘Àj°ÌÖOl` ŒÌ%ðWÃKj™u£¬8R3JjÑ‚™ ò ¦R©8—òQ•›ã`¼”|²ÔN­­)„0óîE2kh S£b–Ù¼ö€zOìâDà°˜ç¸rå}75üŒÓ~ú,poÌ&ü¬T_O›=+糨mƒJ`~VŠŸ\ÙÔÝÀ$ö@Ìã—w6êÍE%)%ÏÅ‹ û×4éG¥­J®hhh "Õ*eÔPñ ü[ƒÇ¶õ±(R¹=Ï£»»;nÍ0PqpP9Ïcúì³xíMÊ/IDATH&øK“H’ñ=ƒihkD ™„½ÂGŠ?JŸA÷ªœ@9ùJ`Íò¢ÙêÃâ—¾ï“N§Ù¹sg\ÏЀûc6±,ô‚ûägÑÐÐV#ÔJ}8PÊV!<²üÖz7ëW1ï‚¥5‰ôÚ¾B»—"¯ðeXv%&þHÇ8~$e$RîîîV$‚t6MÂJèA¦¡ LK_BÉ+$7ðêÙŒCócžã“RÒTº«ÐæyRÊi% Z[[I¥RlX[}2Û2_âz„,F÷·Ã©§]ˆ¤»³h'b M`Zúpâê]uÓÄ¥€ÇcžãyàÍfû*ô@B`YV®ÌŠçy¤ÓÕ PŽëIT¹•8Úa ¸¯¤TümiÑÕš54jF`Úm¾<ëqƒ6Y¯f|óßm6ÏÐÐöeYV.s}Hd®ëæ6Ïóâ–½q)#»F œ Ìïo‡cƒqfÛaÁK M`u@¡MLg_pI=.=øzÌs¬~·ò¾›šªÏ]×Í}6MµXð<Çqp‡;vÐÑÑišŒ;6îê ;f“Hcý"›uuu! 
M`ZúXdœ4®ï`X‚dÒÆ4MÏÅõ¼>‹0>N ûJXÔˆªÃ0ÃF˜¨7ü.ü>‘Hàº.ÙlÃ0°m›l6Kgg'Žã0aÂ&NœÈÈ‘#sÿóø™”p«Ÿ>{NàÌY å<•µÕu3 t2`¡ Q WÜáL`ÑIÀq:::Èf³Áw>–e!¥$›Í"…A"‘À4l\é3gÞEõhæVTÖ‡jñ:ªdJèþ¼ ìÌš‚ž7ù,‚ÿï¶{V/»¹»V7æû>Éd’––‰DÑ$½Ñb—Ùl–îîn„$“IR©TÎ>Je¾ïsÒœªã¼[€Wbö÷T¸C¿xjí L–e P*QÃPêÒZ­Q£Ùû£ŸÓé4y}«Q{ø¾mÛxž‡eYŒ3&÷}À ¹h­ wé«…Šë{ÁÄiÒÚÚŠ&§Ÿua=šø}àËCiLGSd!‰x¨¤ÃN@À28ÞˆœÇö#rƒž|áfDÚ`\3*µ£bÞçw¯•Úéù`Ü„7ÉtçÂ\ÇÁ²“±ßoM`šÀ•À¬ÆhÐÐ~]×CÐÒÒ‚ï+’s<·—|vˆu©Q¡XaÍ0õU”=lwéw ””ј6×uc˜†F£Oƒ.}=ÿôÐ-—bÚŽ—Åó<¥:´m2ŽÃ9ç/èóÛª™gâeÄKØ«1¸¸¦ÔÓfÏ ¤¡î­€OJ»ÚkhXòÚ4¤k}ùy±FÙl6§èŽ[³Zò?ѯxSãóÀ¥vjmMåœUBÉ+›É0´JÁjhÔXÓ(l6K"‘À°-:Óݸ®Ë¼ùÔ£Q.®.G ³mÏS ˲jQïLCch˜–¾ÊïÞÎÎNlÛέ5yiTˆã€I¥v’@:›A¢²íËŠM…ZÓ(À¸qãØ³g&&¾ïºC4*Å(àÈr¤0òðÇsi1B—ÓòT/Äá ÉuwwçÙjhTíRe«/‰––V<_bš :º»hmIÕ/Q™†F3X#&ìÕI„5†*¤r£ß]öþR…¦©¬"ZÁ¢1´QÑ®‰¢¹`êùk(à-`S9;;{–iàz>É„ŽÿÒÐÖ38N8K§m2øž÷aÝ MûPi¸ÊÂô . Àõ´½†&0<ß×êôfƒàÝ Mÿ¨ô€SN?OIàZ×Ц %gëîj.HÌ]€0à7º7šlí!DÆ0ŒÍÕ¯_444…8JwWs‡O´'M|ϽGJ™F;X7ôºC _‚Ù¾ï?YíI¦ÏžƒãèÎÔÚ¨Ä ÑÖÝÕœ;¢EìÜß- ÓBÑ.¥*û‘¤²LëÇ´@ÎC»XVøb‚€#úŠ•`‰^s* 7¼·ª ²ÎëF®A侯¡J›ŒÚPYç½à}v‚ýÍೌ´Ñöó#[8^ÜàsØF“ž¼Maÿµ£ÜßSAûý‚þ ï5 ì^–wûÏ_¦ã>÷ÎÎNR£ÛrŸÛÚÔç½{÷2jÔ¨aÑ\cøØÝ]͉û»{È@ÊTM/…¨‘„|úÜóyî©Õ€O[[[.ç¨QªÒ‹ã8yÙë54š •,¿ÑÝÕüÐÊÃa6À9¯Äl6› ¨w‡D"¡;Hcxئu«7i§& æÂÔãTì¦ïû´··Ó”XI§ÓE«Wkh U ßótih4ZZZr¥VºººrD¦¡1lLûåjh4¦w&‰D"§>´m˲Èd2ºs4†=¿îa6­[­3¬5!„Úãl˜#“Éàºn® ªeYºS4šÿ6'@ŠùÃåâIEND®B`‚imbalanced-learn-0.12.2/doc/_static/img/logo.xcf000066400000000000000000001542471460233407600214050ustar00rootroot00000000000000gimp xcf v011 1–C•ÿìC•ÿìó gimp-commentCreated with GIMPgimp-image-grid¬(style solid) (fgcolor (color-rgba 0 0 0 1)) (bgcolor (color-rgba 1 1 1 1)) (xspacing 10) (yspacing 10) (spacing-unit inches) (xoffset 0) (yoffset 0) (offset-unit inches) ‚#|BÔauÚlearnÿ!?€ "     •Y%$ÿÿÿÿ#ÿÿÿÿdgimp-text-layerH(markup "learn") (font ".SF Compact Ultra-Bold #4") (font-size 62) (font-size-unit pixels) (antialias yes) (language "en-fr") (base-direction ltr) (color (color-rgb 0 0 0)) (justify left) (box-mode dynamic) (box-unit pixels) (hinting yes) ìuÚ #X#d#puÚðÜìü , 1 :,@ÁiÇ Ð ¼"ø###(#8#H ñú,‰×óÀ56ýoÍÿþ÷82üS²ûÿþà/ý8—î ÿÿo,ý{ÙÿÿÖ)ü `¾þÿÿ.%üE£õÿÿo#ý*ˆäÿÿ§!þOËÿÿÓ 
þ ìÿÿð ÿxÿüË.œÿÿ þ êÿúÐ Áÿÿÿuÿþô!ÿDÿÿ$þ èÿÿoþíÿÿ-ÿrÿþÓÿ·ÿÿ*þ çÿÿLÿ–ÿÿ"ÿoÿÿÅÿˆÿÿ @ @ @ @þåÿÿKÿƒÿÿÿlÿÿÐÿÿÿþãÿÿ\ÿ—ÿÿðÿ^ÿþèÿ©ÿÿÚÿÍÿÿ€ ÿÆÿÿÃÿ=ÿþû ÿâÿÿ¬ÿ­ÿÿ® þ ýÿÿ‰þüÿÿH ÿ3ÿÿcÿŒÿþà ÿ`ÿÿ=þ ïÿÿ ÿ’ÿÿÿkÿÿ# ÿÏÿÿêþÙÿÿÄ þ üÿÿµÿKÿÿf ÿIÿÿÿºÿþù ÿ“ÿÿJÿ'ÿÿµ ÿÝÿþþÿŠÿÿ` ÿ(ÿÿÍþçÿþú ÿ|ÿÿˆÿNÿÿ¶ ÿÔÿÿCÿ°ÿÿe ÿ*ÿþõþúÿþþ ÿ…ÿÿ«ÿtÿÿÉ þåÿÿUÿÕÿÿ| ÿLÿþõ ÿ8ÿÿ. ÿ¯ÿÿ«ÿšÿÿâ þûÿÿLþ ñÿÿ› ÿˆÿþãÿ^ÿÿT þ ìÿÿÿ½ÿþü ÿeÿþýþýÿÿÅ þÙÿÿ°ÿjÿÿ~ ÿVÿÿ;ÿ¿ÿÿ: ÿÎÿÿÅþýÿþóÿJÿÿOÿjÿÿ¶þÎÿþÓÿÀÿÿsÿUÿÿPþýÿÿ1þ×ÿÿÉÿkÿþíÿeÿÿDÿÀÿÿ±þéÿÿ¼þýÿÿsÿŠÿþü-ÿkÿÿ5þ#ùÿÿ”ÿÀÿþóÿ¹ÿþìþýÿÿºÿZÿÿiÿ^ÿÿ|þèÿþÈÿ©ÿÿAÿÿþú,þðÿþü ÿKÿÿ‚ÿAÿÿÎþæÿþ×ÿÿÿ”ÿ¨ÿþü9ÿ×ÿÿZÿbÿÿ„ÿÿÿ þ)õÿþÌÿaÿÿçþ Óÿþö(ÿ¥ÿÿ±ÿ¡ÿÿlÿçÿü{oÿÿ®ÿ-ÿüE@ûÿþßÿpÿüþ"êÿþù5ÿ®ÿýÙÙÿÿoÿéÿþªÂÿÿ©ÿ&ÿÿôÿþÒ ÿbÿþî" ÿžÿþýF!ÿ×ÿÿv!þ þÿÿŸ"ÿ>ÿþ¾"ÿrÿþØ#ÿ¥ÿþì$ ÿ/ÿØÿþ÷÷ ÿþú7 ÿ$ÿÿ¿þ0ð ÿÿs þúÿÿãþ$è ÿþÅÿíÿÿõþÜ ÿþù*ÿãÿÿûþ¶ ÿÿÿëÿÿëÿ‚ ÿþòÿ÷ÿÿØ þMþ ÿÿ†ÿÿÿª þí ÿþûÿ8ÿÿu ÿ¯ ÿÿ©ÿoÿÿ' ÿZ ÿÿLÿ«ÿÿÍ þì ÿþîþóÿÿ^ ÿž ÿÿ¡ÿWÿþÚ þ,ý ÿÿ]ÿÇÿÿJ ÿµÿÿÿFÿÿŸ ÿ@ÿÿÝþÔ ÿþÕ ÿ¿ÿÿ«ÿ„ ÿþë! ÿÿyþDý ÿþï5 ÿÿNþ,ð ÿþî3ÿÿ- þ-è ÿþÜ)dóZ‘¹ÝîûôäÏ n"/ý-‚Ö ÿýÆW)ý_ÈÿþË1&ý tçÿþ÷_#ýpìÿÿm!þRÜÿþþYþ!¶ÿþó"ÿýtöÿ÷ä’E"6„éÿþÂýYáþ ÁÿýêkþªÿùR9ÇÿÿþWñÿþ° þ¦ÿýÒ!¨ÿþÿýþy þ áÿÿüÿþÁÿÿiÿ[ ÿþ"Ùÿÿlþí ÿ þ2éÿÿÿ§ ÿ þ6ïÿþ¿ÿt ÿ þ9ñÿþèÿN ÿ þ.ðÿÿRÿ? 
ÿ þ æÿÿ¨ÿ3 ÿþÙÿþíÿ; ÿþ¹ÿÿnÿE ÿÿŒÿþØÿP ÿÿQÿÿLÿi ÿþíÿÿÆÿ… ÿþÀÿÿFÿ¡ ÿÿnÿÿÆÿÉ ÿþóÿÿGþõ ÿÿ¶ÿÿÓÿ& ÿÿMÿÿbÿY ÿþØÿþéÿ™ ÿÿlÿÿ„ÿØ ÿýæÿÿ#ÿ ÿJþ$®ÿÿÕö6ˆ½äõøå´aýùÿÿˆþQÔÿþÏýlìÿÿ<þ´ ÿþÍ þL×ÿþìþ%á ÿÿZ þ.» ÿÿ£þ$çÿÿ® ýšý ÿÿVþÕÿÿáþzò ÿþúÿ£ÿÿõ ÿÿ½ÿIÿÿú ÿÿqþÜÿÿî ÿÿ$ÿnÿÿÓ ÿÿØþ èÿÿ° ÿÿ‹ÿmÿÿ„ ÿÿ?þÜÿÿP ÿþîÿHÿÿ ÿÿ¦ÿ¥ÿÿÕ ÿÿYþóÿÿ ÿþûÿEÿÿE ÿÿÀÿˆÿþñÿs ÿÿtÿ½ÿÿ¥þ­ ÿÿ'ÿêÿÿNþTå ÿÿÛÿ ÿþïþ8¾ ÿÿŽÿÿÿ™ý UÁÿÿAÿ(ÿûL8m¬óÿþðÿ)ÿÿ¨þ ü(ÿÿ\ÿÎ'ÿþüÿ{'ÿÿÃþñ&ÿÿvÿiÿýü±ú ÿÿ*ÿœÿüô” ‰ ÿÿÞþ“ÿùþÀa +úÿL÷j®ºÅÑÜèóV%û@q¢Ó þ÷ÿþò ûI{¬ÝÿÿÕ ÿŸÿÿžú"S…¶çÿÿw þ3ýÿÿAú,]ŽÀð ÿþý ÿÅÿþâþ9÷ÿÿº ÿ^ÿÿ‡ÿ«ÿÿ_þìÿÿ*þüÿþö ÿ¦ÿÿÍÿÿÿ©ÿVÿÿpþ ðÿÿOþíÿþûÿnÿþðþÄÿÿµþÜÿÿ ÿ ÿÿXÿOÿÿJÿd ÿþò ÿ¸ÿþîþFú ÿÿžþýÿÿ¡þ<õ ÿÿAÿƒÿÿOþ>ò ÿþâþäÿþôþMö ÿÿ‡ÿMÿÿ¬ÿý ÿÿ*ÿ³ÿÿ_ ÿÿÍþüÿþý ÿÿoÿxÿÿÅÿ5 ÿþûÿÑÿÿ}þ0î ÿÿµÿ*ÿÿ5þ'ê ÿÿYÿƒÿúìã ÿþó ÿÛÿû¬Ú ÿÿ¢ÿ4ÿülÐÿÿFÿÿý,Áÿþèÿßÿýð¯ÿÿ’ÿ*ÿþºšÿÿ7ÿuÿÿìÿÿ²ÿÿÝÿ¿ÿþÄÿÿ„þúÿý× @ @ @Õÿ0ö.t®ÙðúåÈ('ýx×ÿýý—#ýõ ÿþÙ ý{òÿþàþ?ØÿÿµýŽþÿÿ[þ'ÓÿÿÒþUòÿÿ;þƒÿÿ|þ «ÿÿ·þÇÿÿÒþ Øÿÿåþ+æÿÿéþ3ìÿÿÛþ;ñ ÿÿÊþ>ô!ÿÿ©þ?ô"ÿÿ† þ@õÿûÏ[žÿÿ[ þ;ôÿý÷lþ õÿÿ( ÿñÿþÙ.ÿÕÿþó ÿþ¾ÿÝÿÿº ÿþŸþ ýÿÿz ÿþ‘ÿEÿÿ: ÿÿ‡ ÿŒÿþñ ÿÿ‡ ÿØÿÿ«ÿÿ ÿ-ÿÿ`þÿ— ÿ„ÿþþþ¥ ÿÞÿÿÄÿ ÿ;ÿÿpÿœÿÿþ òÿÿÈÿ0ÿÿg"ûsÿYÿÿ#ûÉÿ‚ÿÿ™#úÿÿ¬ÿÿ #úrÿÿÓÿÿö$úÃÿÿòÿÿâ#úùÿÿÿÿÐ#ÿ<ÿþ1ÿÿ¾#ÿwÿþPÿÿ¬#ÿ±ÿþmÿÿœ#ÿÞÿþƒÿÿŒ"þüÿþ–ÿÿ~"ÿ#ÿþªÿÿq"ÿEÿþ½ÿÿg"ÿcÿþÎÿÿa"ÿÛÿþ×ÿÿd!ÿzÿþÞÿÿq ÿ‹ÿþåÿÿ~þŸÿþëÿÿŽþ ²ÿþíÿÿµþÆÿþæÿÿàþ!Ù ÿþßÿþþþ8é ÿþÑÿÿXþW÷ ÿþ¹ÿÿ«þ„ ÿþ¡ÿþõþ² ÿþ þÿþÿÿ‹þ2Ý ÿýÖÿþTÿþ÷ýsùÿûþm ÿþ(ÿþ¿þ+Ä ÿþùWÿZÿýìÿþ  ýšü ÿþð@ùúÿÿ®ÿþ¹ý3™õ ÿþá,ú·ÿÿfÿöö–D'OÒ ÿþÊùNÿÿû.ÿþ¬ ýßÿÿµ,ÿýþ þcÿÿH+ÿþòP þÓÿÓ)ÿþÕ( ÿ?ÿL(ÿþ¦ ÿº%ÿþôcþ ð#ÿþÍ'ÿY!ÿýû€ÿ„ÿþÊ0ÿ™ÿýïmÿ‡ÿýú“"þVïÿýù™%ý£þÿýå€)ý-™ó ÿüå›D-òFˆ´Úíúûñݾ—e,£ÿÿ þ>í ÿþ¾ÿÿî þlú ÿýûÿÿÛ þ¶ ÿþÛ8ÿÿÈýrñ ÿýùƒÿÿ¶ý^Û ÿþ·'ÿ"ÿÿ¯ýoÜ ÿþËBÿ}ÿù©@œô ÿýÍSÿÑÿü¨D“ã ÿþÂIÿ$ÿýø¡3ÿnÿýÜzÿ¯ÿüõ¡?þìÿüù´\ "ÿ(ÿüø¶b%ÿVÿüî£T (ÿƒÿÿq*ÿ ÿÿ­)þÆÿþí'þÇÿÿ<&þËÿÿ’%þÑÿþë#þ×ÿÿ^"þ%áÿþÓ þ5êÿÿ_þIôÿþèþiüÿÿ¬þ’ÿÿñÿÿ|þÀÿýð7×ÿÿpþEçÿüï5´ÿþý˜þÿûé0†ÿþÍ4ý yî ÿþã'ÿTÿüþ­A ý/‹ê ÿþÙþþÿóåœe9  4d™Ö ÿþÉÿÏ5ÿþµ ÿ{4ÿþ”ÿ"2ÿþüo þ¸þ/ÿþîF þE.ÿþÒ# þÉ,ÿþŸ þß)ÿþò]þ(æ'ÿþÄ!þ#ß$ÿýõsþÁ"ÿþ´ ý†üÿþÑEþ.ÃÿýÉP þI¾ÿýñš6%ý+ƒØÿüÙ—I*ïU‰¬Îæñûüõë×¾¡xM¡þjÿÿÁÿf ÿþàÿÿ_ÿ¶ ÿÿSÿþúþ ù ÿÿºÿÿ¸ÿ\ÿÿþÿÿfÿ½#ÿÿþþ"ÿÿØÿ}#ÿÿ—þä#ÿÿWÿY$ÿÿ$ÿÈ#ÿþóÿ:$ÿÿÆÿ¸$ÿÿ 
ÿ5%ÿÿƒÿ´%ÿÿfÿ9&ÿÿPÿÇ&ÿÿFÿW'ÿÿ<þ ß'ÿÿ7ÿ(ÿÿFþ7ý(ÿÿXþ Õ)ÿÿˆÿ£ÿÿå#ÿÿÁÿmÿþýG#ÿþý þLúÿþ„ $ÿÿþHøÿýÇ$ÿþû/ þX÷ÿüí %ÿþÚ þÿûþM&ÿþØ1þQÚÿÿˆÿ÷&ÿ÷ûŸF:‚Öÿþºÿå7ÿþßÿÒ6ÿþò-ÿ¶5ÿþüJÿ–5ÿÿkÿp4ÿÿ…ÿA3ÿÿ› þýÿÿÂ!ÿþ£ ÿËÿþ5þÿþ§ ÿƒÿþ’ÿþ› ÿ+ÿý ØÿÿˆÿÇ ÿþ)ñÿþúdÿO ÿþIúÿþã;ÿÁ ÿþPúÿþ®þ(ó ÿþ>êÿþãTþOû ÿþ³ÿýévþOò ÿþJÅ ÿüýÀaþ"²ÿ ô)z³ÞòýôãÀ–Z ÷+¾çøûìËš€ÿÿ•þYì ÿüöµlþà ÿÿMý‹öÿûã®xBÿb ÿþù þæÿÿ§ þì ÿÿ½þÁÿþó ÿ ÿÿxÿŒÿÿu þ$ú ÿÿ4ÿSÿþË ÿ¯ ÿþïþ2ôÿþù, ÿ? ÿÿ®þäÿÿv ÿÉ ÿÿpþ Íÿþº ÿLÿÿ3þ²ÿþé ÿÌ ÿþóþ¡ÿþüB ÿNÿÿ¼ÿÿÿt ÿÍÿÿ†ÿ}ÿÿ¨ ÿBÿÿQÿzÿþÉ ÿ²ÿÿþöÿþá þ$ýÿÿñÿÿþï, ÿ‘ÿÿÆþ«ÿþ÷> þïÿÿŸþ©ÿþüP ÿTÿÿ€þ«ÿþþaÿ±ÿÿcþ®ÿÿjþúÿÿTþ±ÿÿtÿ_ÿÿJþ ¸ÿÿwÿ¦ÿÿR þ Àÿÿrþëÿÿd þÈÿÿnÿ2ÿÿŽ þÕÿþùNÿfÿÿÓ þ0áÿþöGÿ”ÿÿ?þYõÿþôAÿÂÿþÛþ®ÿþð;ÿäÿøåhI¥÷ÿþë2ÿö$ÿþå*ÿ$ÿþÞ"ÿ#ÿþÓþý!ÿþÇÿí ÿþµ ÿÕÿþŸÿŸÿÿ…ÿcÿþûeþ þÿþñG!ÿµÿþÜ)"ÿFÿþº$ÿ½ÿýþ…%þòÿþæD(ÿ]ÿýþ*ÿzÿþÀ5-þgø ÿýùª=0þ.¿ÿþ[4÷5ÊîüôÜ´{€ÿÿ+ÿTÿüç ÿÿÓÿœÿþô2ÿÿ{ÿÚÿþýMÿÿ$ÿÿÿnÿÿÍÿMÿÿÿÿwÿ‡ÿþ±ÿÿ"ÿÀÿþÌ ÿÿÎþõÿþïÿÿ|ÿ+ÿÿRÿÿ+ÿRÿÿ° ÿÿÞÿzÿþú% ÿÿ—ÿ¢ÿÿ¥ ÿÿTÿÊÿÿ= ÿÿÿñÿÿæ ÿÿâþŒÿÿ¥ ÿÿ°ýÏÞÿÿk ÿÿ‹üÖÿöÿÿA ÿÿkþ!Ýÿÿ ÿÿWþ)äÿÿí ÿÿLþ3ëÿÿÆ ÿÿPþ@ñÿÿ¢ ÿÿbþPøÿÿ~ ÿÿŽþfüÿÿ\ ÿÿÏ ÿÿÿ> ÿÿ5 þ¢ÿÿðÿÿ ÿþÇþÇÿýî5Ûÿþþ ÿþ¥ýgðÿüì1âÿÿì ÿøÕa8}à ÿûæ+êÿÿÖ ÿþß"ÿïÿÿÀ ÿþÓÿíÿÿ° ÿþÃÿéÿÿ¤ ÿþ®ÿâÿÿ™ ÿþÿÙÿÿ‘ ÿþülÿÎÿÿŠ ÿþïE ÿÂÿÿ„ ÿþÖ% ÿ´ÿÿ€ ÿþ© ÿ¦ÿÿ| ÿþùo ÿ—ÿÿx ÿþÙ1ÿ‡ÿÿv ÿýþ” ÿvÿÿu ÿþÜBÿdÿÿt ÿýø‚ÿRÿýåª9 ÿýþ¦ÿ? 
ÿûþÖš^"ÿýü©-ÿ+ÿûùÇ‹Oüÿá…ÿÿûð¸{?þ6úâ¨l0 @ @ @ ÿbÿÿoÿÆÿþüÿ*ÿÿ¸ÿÿÿ\þëÿþò ÿWÿÿ›ÿºÿÿ9þýÿÿ×ÿtÿÿuÿÑÿþûÿ,ÿÿ®ÿ…ÿÿKÿØÿþçÿ-ÿÿÿ€ÿÿ/þ76ÿÏÿÿÔý è@ÿÿÿ{üÕÿ@ÿdÿÿ$ûÁÿÿ@ÿ­ÿÿÒþ«ÿÿ@þíÿÿÿ˜ÿÿ@ÿ.ÿÿ6ÿˆÿÿ@ÿmÿþìÿ}ÿÿ@ÿ¦ÿÿ¯ÿyÿÿ@ÿØÿÿwÿ~ÿÿ@þ ýÿÿL ÿŠÿþâÿ5ÿÿ5 þ ÿþç%ÿWÿÿ9 þ½ÿþì,ÿxÿÿX þ1Þÿþí0ÿÿÿžýsùÿþì0ÿ›ÿþù/þ:Êÿþé/ÿ¤ÿùìg\¾ ÿþã' ÿ™&ÿþÛ ÿˆ%ÿþÊ ÿv$ÿþµ ÿO#ÿþ’ ÿ!ÿþúgÿÚÿþå:ÿ‚ÿþ»þúÿýúwÿ ÿþÐ1þæÿýö|þHþÿþ® þhþÿþ¿9þTñÿýü®:!ý¤ü ÿýÏx%ôr²ÞöýôâÀ•^×€€€€€€€€€€€€€€€€€€€€bbbbºm]6.†X Imbalancedÿ!?€ "     ’%$ÿÿÿÿ#ÿÿÿÿ‡gimp-text-layerk(markup "Imbalanced") (font ".SF Compact Ultra-Bold #4") (font-size 62) (font-size-unit pixels) (antialias yes) (language "en-fr") (base-direction ltr) (color (color-rgb 0 0 0)) (justify left) (box-mode dynamic) (box-unit pixels) (hinting yes) &†X&JBlBxB„†X&Ê*›.D25X8n;y9?c@\A;B3†ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿG7ÿ×ÿÿGÿÿô>‰ºáòüñÛ°r$ û6„¹âÿ×ÿÿGÿ3ÿÿËýpà ÿýþ¶0ý^Õÿÿ×ÿÿGÿ3ÿÿËþ7× ÿþûrþ)Çÿÿ×ÿÿGÿ3ÿûË[øÿÿmþJñÿÿ×ÿÿGÿ3ÿüËYýÿûø)Gùÿÿ×ÿÿGÿ3ÿýË7úÿý§%óÿÿ×ÿÿGÿ3ÿþÞßÿ÷Æk/0vÝÿþúÑÿûë’Q#ÿ×ÿÿGÿ3 ÿþèNþš ÿýùx ÿ×ÿÿGÿ3ÿþÅ þ²ÿþë: ÿ×ÿÿGÿ3ÿþË þ#üÿþú? ÿ×ÿÿGÿ3ÿþó! 
ÿÁÿÿ† ÿ×ÿÿGÿ3ÿÿsÿÿþó ÿ×ÿÿGÿ3ÿþð ÿWÿÿ«ÿ×ÿÿGÿ3ÿÿžÿAÿÿiÿ×ÿÿGÿ3ÿÿ[ÿ6ÿÿGÿ×ÿÿGÿ3ÿÿ7ÿ3ÿÿ3ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+˜ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg8ÿ÷ÿÿg øóûíÞÀZ ÿ÷ÿÿgò<~²×íúøëÔ­z3 ÿýô  ÿ÷ÿÿgý‚ä ÿýÙn ÿþóf ÿ÷ÿûg\îÿþßO ÿÿo ÿ÷ÿýg…ÿþ— ÿÿF ÿ÷ÿþghÿþ¸  ÿþÖ ÿ÷ÿþ…ôÿþ²û7âÿÿT ÿ÷ÿÿùÿöÓ{=;wÍÿÿ‡ þ ÿÿ  ÿ÷ ÿþêS þGáÿþý< þ®ÿÿç ÿ÷ÿþ¼ þ ¥ÿþÑ þùÿÿÿ÷ÿþÆ þªÿÿ] ÿ¸ÿÿÿ÷ÿþáþÉÿÿÑ ÿxÿÿ*ÿ÷ÿÿWþ3ýÿÿ6 ÿTÿÿ+ÿ÷ÿÿÉÿžÿÿ ÿ@ÿÿ+ÿ÷ÿÿcÿ8ÿÿÙ ÿ6ÿÿ+ÿ÷ÿþúÿÙÿÿ ÿ3ÿÿ+ÿ÷ÿÿÂÿ’ÿÿL ÿ3ÿÿ+ÿ÷ÿÿ’ÿ]ÿÿz ÿ3ÿÿ+ÿ÷ÿÿdÿ+ÿÿ˜ ÿ3ÿÿ+ÿ÷ÿÿIÿÿÿ± ÿ3ÿÿ+ÿ÷ÿÿ:ÿÿÿ½ ÿ3ÿÿ+ÿ÷ÿÿ/ÿüÿÿÅ ÿ3ÿÿ+ÿ÷ÿÿ;ÿ ÿÿ¿ ÿ3ÿÿ+ÿ÷ÿÿJÿÿÿ· ÿ3ÿÿ+ÿ÷ÿÿdÿ7ÿÿ  ÿ3ÿÿ+ÿ÷ÿÿ’ÿbÿÿ† ÿ3ÿÿ+ÿ÷ÿÿÃÿÿÿ] ÿ3ÿÿ+ÿ÷ÿþùÿ×ÿÿ/ ÿ3ÿÿ+ÿ÷ÿÿeÿ&ÿþð ÿ3ÿÿ+ÿ÷ÿþÊÿ‰ÿÿ° ÿ3ÿÿ+ÿ÷ÿÿ]þõÿÿ\ ÿ3ÿÿ+ÿ÷ÿþçÿ¢ÿþò ÿ3ÿÿ+ÿ÷ÿþÎÿqÿÿެÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹8ÿÓÿÿ‹ð N~ªÑâðü÷îßĨw<ÿÓÿÿ‹ýL­ö ÿýì… ÿÓÿÿ‹þ4ÁÿþòZ ÿÓÿÿ‹þeûÿÿ‡ ÿÓÿÿ‹ÿ€ÿÿY ÿÓÿÿ‹ ÿ€ÿNÿþô ÿÓÿÿ‹ þNÿþêÿôô«d8 -J…áÿÿ~ ÿÓÿÿ‹ ýêÿÿÿýû ývûÿÿÎ ÿÓÿÿ‹ ýÿÿþòÿþûA ÿdÿþúÿÓÿÿ‹üòÿÿÿHÿÿÿÂÿÿÿÓÿÿ‹ÿHÿÿ€ÿþûÿaÿÿ(ÿÓÿÿ‹ÿ€ÿÿ©ÿÿÉÿ:ÿÿ,ÿÓÿÿ‹ÿ©ÿÿÁÿÿ¥ÿ%ÿÿ,ÿÓÿÿ‹ÿÁÿÿ?ÿÿ,ÿÓÿÿ‹'ÿ¡ÿÿ,ÿÓÿÿ‹%þ ‘ÿÿ,ÿÓÿÿ‹"ü <‹çÿÿ,ÿÓÿÿ‹ùFjŒ°Õû ÿÿ,ÿÓÿÿ‹û C»íÿÿ,ÿÓÿÿ‹ü}Êûÿÿýÿÿ,ÿÓÿÿ‹ý!­þÿýë–Mÿÿ,ÿÓÿÿ‹ ÿ!þh÷ ÿ÷üݵ‹a44ÿÿ,ÿÓÿÿ‹ þh÷ÿ~ÿúÓlH%ÿ4ÿÿ,ÿÓÿÿ‹ ý~ÿÿÿRÿýï}! ÿ4ÿÿ,ÿÓÿÿ‹ÿRÿþêÿþ°ÿ4ÿÿ,ÿÓÿÿ‹þêÿÿ€ÿþ¯ÿ4ÿÿ,ÿÓÿÿ‹ÿ€ÿÿÖÿþõÿ;ÿÿ,ÿÓÿÿ‹ÿÖÿþÿÿ­ÿZÿÿ,ÿÓÿÿ‹ÿÿþ4ÿÿ‚ÿ¤ÿÿ,ÿÓÿÿ‹ÿ4ÿþEÿÿzþõÿÿ,ÿÓÿÿ‹ÿEÿþHÿÿ•ÿ«ÿÿ,ÿÓÿÿ‹ÿHÿþ:ÿþáÿ†ÿÿ/ÿÓÿÿ‹ÿ:ÿð N~ªÑâðü÷îßĨw<ÿÿó;‡·àðüöéÌ¢gýL­ö ÿýì… ÿ3ÿÿËýjß ÿúüž4ÁÿþòZ ÿ3ÿÿËþ-Ïÿý÷eûÿÿ‡ ÿ3ÿûËDò)ÿÿY ÿ3ÿüË9ø*ÿþôÿ3ÿýËéÿôô«d8 -J…áÿÿ~ÿ3ÿþ˨ÿöí£Y.,jÌÿýû ývûÿÿÎÿ3ÿÿúÿýô€ þ_öÿþûA ÿdÿþúÿ3ÿþä+ þDüÿÿÿÂÿÿÿ3ÿþç ÿˆÿþûÿaÿÿ(ÿ3ÿÿIþùÿÿÉÿ:ÿÿ,ÿ3ÿÿ©ÿÀÿÿ¥ÿ%ÿÿ,ÿ3ÿÿ8þ‘ÿÿ?ÿÿ,ÿ3ÿÿ×þ~ÿÿ¡ÿÿ,ÿ3ÿÿ™þwÿþ ‘ÿÿ,ÿ3ÿÿbþwÿü <‹çÿÿ,ÿ3ÿÿHþwÿùFjŒ°Õû ÿÿ,ÿ3ÿÿ6þwÿû C»íÿÿ,ÿ3ÿÿ,ùwÿ}Êûÿÿýÿÿ,ÿ3ÿÿ+üwÿ­þÿýë–Mÿÿ,ÿ3ÿÿ+ÿw ÿ÷üݵ‹a44ÿÿ,ÿ3ÿÿ+ÿwÿúÓlH%ÿ4ÿÿ,ÿ3ÿÿ+ÿwÿýï}! 
ÿ4ÿÿ,ÿ3ÿÿ+ÿwÿþ°ÿ4ÿÿ,ÿ3ÿÿ+ÿwÿþ¯ÿ4ÿÿ,ÿ3ÿÿ+ûwÿÿõÿ;ÿÿ,ÿ3ÿÿ+üwÿÿ­ÿZÿÿ,ÿ3ÿÿ+üwÿÿ‚ÿ¤ÿÿ,ÿ3ÿÿ+üwÿÿzþõÿÿ,ÿ3ÿÿ+üwÿÿ•ÿ«ÿÿ,ÿ3ÿÿ+ûwÿÿáÿ†ÿÿ/ÿ3ÿÿ+þwÿòD‡´Ùìù÷èØ¿Ž]/ý!‘î ÿüþÂfýjlý ŽùÿþßQûQáÿÿþ2Ûÿþ¦þ £ÿÿ^þHôÿþ¼ þÌÿþí þ>÷ÿÿ° þÑÿÿ‹ þîÿõçM$ $T˜ðÿÿi þ¼ÿþä þÀÿý÷rý Žýÿþöÿ ÿÿ= ÿ[ÿþÔ* þWúÿÿŽþ-úÿÿˆÿÿoþßÿþáÿrÿþëÿ¹ÿþ|ÿÿœÿWÿþ÷,þÐÿÿ=ÿ:ÿý°ÿÿÁÿ»ÿÿŒÿZÿÿyÿ§ÿü÷ÿÿÏþüÿþòþ ùÿÿ£þùÿÿŽÿÿÜÿYÿÿ¢ÿYÿÿ.ÿÿçÿ™ÿÿRÿžÿÿßÿÿçÿËÿþý ÿÔÿÿ«ÿÿçþõÿÿÝþüÿÿ†ÿÿçÿÿÿ³ÿ ÿÿçÿ0ÿÿ•ÿ9ÿÿçÿ?ÿÿ‡ÿCÿÿçÿGÿÿzÿGÿÿçÿHÿÿ{ÿ2ÿÿçÿ?ÿÿŠÿÿÿçÿ1ÿÿœþýÿÿxÿÿçÿÿÿÄ ÿèÿÿ„ÿÿçþôÿþòÿ·ÿÿšÿÿçÿÈÿÿ/ÿ¯ÿÿ×ÿ}ÿÿÏÿÿçÿ“ÿÿ†ÿåÿÿ ÿBÿþüÿÿçÿOÿþßÿ1ÿÿiþ úÿÿtÿÿçþ ôÿÿhÿ”ÿÿ+ÿ´ÿüâ ÿÿçÿ¢ÿþéþ&øÿÿÍÿVÿý˜ÿÿçÿ3ÿþà þÅÿÿkþÞÿþh»ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿ:ÿëÿó/t­ÔíúöèÔ§v'óaŸÇçóüòÙ¶~<ÿëÿÿÓ ÿþÍOýG·þ ÿýáy ÿëÿþÁþ%¼ÿþí\ÿëÿþïGþ]ôÿü£ëÿþúXÿ{ÿý´ëÿþûE ÿoÿþ~ëÿõþ¾o10l½þÿþð þAýÿõ×~@5jºþÿþýþÿþÁ3þ0»ÿþ¾þ äÿýïZþ+Å ÿÿ þzþÿÿVÿˆÿþÉ ýtýÿ ÿdÿþÛþ÷ÿþÚÿhÿÿ’ÿÿSÿÿþô%ÿ†ÿþ æÿÿ½ÿÝÿÿ†þÞÿÿjÿþýÿ3ÿþñÿTÿþùÿÿcÿvÿÿŸþëÿÿ¶ÿÿ¡ÿ³ÿÿOÿ•ÿÿ‚ÿÿÙÿàÿþý ÿLÿÿ[ÿþúÿ ÿÿÚÿÿÿ"ÿ$ÿÿ°ÿìÿÿFÿ;ÿÿ“ÿÓÿÿcÿDÿÿ…ÿÄÿÿiÿIÿÿxÿ¸ÿÿnÿBÿÿ~ÿÃÿÿtÿ7ÿÿÿÒÿÿÿÿ¥ÿìÿþûÿÿÕÿÿÿÕÿþý ÿJÿÿ¤ÿÿQÿ”ÿÿcÿÿ°þèÿÿ•ÿÿ»þþÿþúÿQÿþåÿÿuÿÃÿÿ¨þÚÿÿcÿÿ/ÿ`ÿÿGÿ{ÿþéÿÿÓþåÿþñ2þWþÿ€€€kþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ÿ×ÿÿGÿ3ÿÿ+ÿ3ÿÿ+ ÿ3ÿÿ+ÿ÷ÿþÈ þküÿþó ÿ3ÿÿ+ÿ÷ ÿýògþ®ÿÿx ÿ3ÿÿ+ÿ÷ÿÿýÿõá‰I  (X£÷ÿþÄ ÿ3ÿÿ+ÿ÷ÿþ…íÿþá ÿ3ÿÿ+ÿ÷ÿýgKúÿþá'ÿ3ÿÿ+ÿ÷ÿügKõÿþÆÿ3ÿÿ+ÿ÷ÿûg%Âÿýózÿ3ÿÿ+ÿ÷ÿÿgýJ¾ý ÿýê‰#ÿ*+ÿóbÈèùûïÛ¶†CÎþÿÿw þ Ÿÿÿ<ÿÓÿÿ‹ÿÿÿáÿýýuýdÞ ÿÿZÿÓÿÿ‹ÿáÿÿŽÿõØz; /^ ç ÿþÁÿÓÿÿ‹ÿŽÿþóÿþ²Ñÿÿ(ÿÓÿÿ‹þóÿÿoÿý¸·ÿÿ(ÿÓÿÿ‹ÿoÿþšÿü–‚ÿÿ(ÿÓÿÿ‹üšÿÿþúÿþßJþ$ûÿÿ(ÿÓÿÿ‹ þúý(­ù ÿýÊbþjýÿÿ(ÿÓÿÿ‹ ÿ(ó`•ÄâñüõçÇŸf"÷:¬èüöÜ©SÙÿÿw þ Ÿÿÿ<ÿ3ÿÿ+ÿwÿýýuýdÞ ÿÿZÿ3ÿÿ+ÿwÿõØz; /^ ç ÿþÁÿ3ÿÿ+ÿwÿþ²Ñÿÿ(ÿ3ÿÿ+ÿwÿý¸·ÿÿ(ÿ3ÿÿ+ÿwÿü–‚ÿÿ(ÿ3ÿÿ+ÿwÿþßJþ$ûÿÿ(ÿ3ÿÿ+üwÿ­ù ÿýÊbþjýÿÿ(ÿ3ÿÿ+ðwÿ`•ÄâñüõçÇŸf"÷:¬èüöÜ©Sàÿÿç ÿ³ÿþ´ þ ³ÿþîÿ_ÿÿ„ÿÿç þ#÷ÿþãE þ@ÙÿÿiþÄ ÿÿç ÿlÿöÇp5 AzÏÿþÍ þ!î ÿÿç ÿŸÿþè# þEùÿÿç þ§ÿþò2 þNöÿÿç ý€ýÿþÚ2þ4àÿÿçþ7Ïÿýþž ý “úÿÿçüQ½ý ÿýõ•%þ!”ò]˜ÀáñüôãÑ·~B Úÿ 
þÒÿÿ]ÿdÿþê? þ_øÿþÑG ý^èÿþßþÉÿýþ’þ© ÿõÙŽI* K‰ßÿþý> þ%óÿõò›S%  (X£÷ÿþûýÿÿ„ þKüÿþ‰ëÿþ“ þWùÿý× ëÿþý~þ@çÿüã!ëÿþÍ5ý¥þÿûºëÿÿí ÿüþºUý6ªû ÿýý¼HÿëÿòD‚²Õí÷ûìÛÅ“\óZšÃåóýõæÅ`ÿ(+Àþÿsþÿsþÿsþÿsþÿsþÿsþÿsþÿsþ+]Ã,a0  6 Backgroundÿ!?€ "     ÿÿÿû%$ÿÿÿÿ#ÿÿÿÿC“ 6CÏÔ1Ô=ÔIÔU 6EçFqF½GÉKòMQS…S•S¥SµSÅSÕSåWø\ ]¯a…ccàhUhehuh…h•h¥hµmn»q€w¦z …qŽt«»ËÛëû;o’ ”•Ñ–ó›Õ¢À¦¦«n¯¶µèµø¶€··‰¸E¸U¸eºý¾ãÃ[È„ÎÍÔ!$1þ231<1'1ÿ21<1)1ÿ21ÿ1­$š›š&šÿ£š'šÿ›š<š<šÿ£­$ÎþÏÖÎ&ÎÿàÎ'ÎÿÏÎ<Î<Îÿà­Rÿ­ 11ø 1ÿ21ø 11ø 12ø 1ÿ22øÀšÿ›}šÿ›2šÀ Î16 Î16 ÎÿÏ16 Î26 ÎÿÏ26À øþç+ 'ÿçøÿÅ'ÿ'øÿ± 'ÿyøÿ“'ÿ¾&øþöZ 'ÿ»øþñ:'ÿâ&øÿÓ 'ÿåøÿÊ'ÿ\'øÿ” 'ÿjøÿ†'ÿ«øÀ šþ0 /ÿšÿ|/ÿ^'šÿp /ÿQšÿ_/ÿx&šþ™B /ÿvšþ–5/ÿ&šÿ„ /ÿšÿ/ÿC'šÿ` /ÿJšÿY/ÿmšÀ 6ÿ7 <ÿ76ÿ9<ÿ:'6ÿ: <ÿ;6ÿ:<ÿ9'6ÿ; <ÿ96ÿ7<ÿ8&6ÿ8 <ÿ76ÿ8<ÿ;'6ÿ: <ÿ;6ÿ;<ÿ:6À4øÿùzøþÿù<øÿùKøýúùø~þýø;ÿùø9ø7þÿùø6ÿùø4 ø2þÿù ø1 ø/ÿûø.ÿ÷ø-ø+þüùø*ÿùø)ø(ÿùø'ø&ø%ø$ø#ø"ÿùø!ø ÿùøÿùøÿùøÿù ø"ø#ø$ø%ø'ø'ø(ø(ø)ø*ø+ø,øÿù,ø-ø.ø.ø/ø/ø0ø0ø4š›;šÿ<šþ—›<šÿ›<šÿ› šý›š~þœ›;›š9ÿš7þ—›š6ÿ›š4ÿ› š2þ › š1ÿ› š/þœ› š.ÿ›š-ÿ›š+þŸ›š*ÿ›š)ÿœš(š'ÿ›š&ÿ›š%š$š#š"š!ÿ›š ÿ›šÿ›š šÿ› šÿ›!šÿ›"š$š%š'š'š(š(š)š*š+š,š-š-š.š.š/š/š0š0š46þ87<6ÿ7;6þ<7<6ÿ7<6ÿ76þ76~þ;6;ü87669þ6767þ<766ÿ764ÿ7 62þ87 61 6/ÿ76.ÿ76-ÿ76+ÿ:6*ÿ76)6(6'ÿ76&6%6$6#6"6!6 ÿ76ÿ76 6ÿ8 6ÿ7!6#6$6%6'6'6(6(6)6*6+6,6-6-6.6.6/6/60606¾þd;ü qäÿ9ý næÿ7ý^áÿ6þ>Ñÿ4þ§ÿ2ýmò ÿ1þ*Ë ÿ/ýuù ÿ.þÀÿ-þSîÿ+ýþÿ*þÀÿ)þ.àÿ(þKòÿ'þbûÿ&ÿxÿ%ÿƒÿ$ÿ‹ÿ#ÿˆÿ"ÿƒÿ!ÿwÿ þgþÿþQýÿþ@øÿþ,ïÿþá ÿþ"Ë!ÿþ+¸"ÿþ>æ#ÿý—û$ÿÿa&ÿþä&ÿÿ 'ÿþaý'ÿþ7õ(ÿþ"ã)ÿþà*ÿþ¬+ÿÿt,ÿþ4õ,ÿÿ£-ÿþê-ÿÿX.ÿþå.ÿÿµ/ÿ øÿù:øÿù;øÿù;øÿú~ø¥ÿúùøùøù#øÿö)øþøù,ø ÿù0ø ÿù3øÿú6øþúù8øÿù;øþù ¾øšþœ›;šÿ›;šÿ›»š¥ûœ›œš›šÿ›š›šþœ›#šþž›(šþœ›,š þš›/š þš›2š7šþ›8šÿ›;šÿ ¿š6796ÿ7;6ÿ7;6ÿ7~6¥ü6787676ÿ7$6ÿ9)67+6 ÿ706 ÿ736ÿ766þ6786ÿ7;6þ7 ¾6åô 'Eb}”«ÀÓæöÿù0[‚©ËçÿúN±ßöÿú9w±èû$ÿû5}½÷)ÿ üa¬ê-ÿ ü$wÊù0ÿüsÏý3ÿü X¸þ6ÿý+‘ë9ÿüM¾ü;ÿÿÚ ÿøüùøùûøýùêéêÿù'øÿùøþúéê,øþùú øêÿü/øÿù øþúý2øùøù ø€ øùÿú-øûùøùùÿ&øÿú!øýùúÿ"øüùøùû'øþùø*øþùú-øþùø/øù 3ø 4øÿ÷ 5øÿù6øÿù7øÿú9ø9øÿú9øÿù;ø;ø<ø<ø<øýü<øýù=øþû=øþù>øÿ>øÿ÷€øš›þœšý›êéêÿ›(šÿœšþ›éêÿ›,šÿ› šü›êê/šþ›œ šþœ2š›šÿ› š€ šú›š›œ›œ-š›ÿ &šþ›ž! 
šþœ¥"š›þœ(šÿœ+šÿ›-šþ›œ/š› 2šÿœ 3šþ›¡ 5šÿ›6šÿ›7šÿ›8šÿ›9šÿ9šÿ›:šÿœ;š;šÿ›<š<šý›<šý›=šþ¥=šþ›>šÿ>šÿž>šÿ›@š6ÿ96þêéêÿ7'6þ786þ7éêÿ8+6þ7: 6êÿ906ÿ8 6þ8536ÿ7 6ÿ7 6€ 6ü7876-6ÿ:&6ý767! 67%6ÿ9'6þ78*6þ7:.6ÿ806ÿ7 36 46ÿ; 56ÿ77676ÿ79696ÿ8:6:6ÿ8;6;6ÿ7<6<6ý7=6=6þ<=6þ7=6þ7>6ÿ7€6ÀÿóþðÞ̺¨‘x^D( -ÿ÷þåÊ­‹c9&ÿùùçÆ“c5 !ÿùþ÷Ñ–\%!ÿûå¦g$%ÿüÚ–G(ÿüð¬R +ÿüô 9.ÿýØg 0ÿýê} 2ÿýëk 4ÿþÇ!5ÿþî=6ÿþò47ÿþä8ÿþ±9ÿÿZ9ÿþè:ÿÿ„:ÿþò;ÿÿ…;ÿüí<ÿýd<ÿýÎ=ÿþ7=ÿþ˜=ÿþí>ÿÿN>ÿÿ¤>ÿÿéÀÿ/ø/ø/ø/ø/øÐÿø>ÿù>ÿø>þøú=ø=ø=ø=ø=øÿù<øÿù<ø<ø<ø<ø;øÿù;ø;ø;ø;øÿú:øÿù:ø:ø:ø:ø:ø:øÿú9ø9øÿù9/š/š/š/š/šÐÿ>ÿš>ÿš>þšŸ=þšœ=þš›=š=š=šÿ›<š<š<š<š<šÿ›;šÿ›;š;š;š;šÿœ:šÿ›:šÿ›:š:š:š:š:šÿœ9šÿ›9šÿœ9/6/6/6/6/6Ðÿ8>ÿ7>ÿ6>þ69=þ65=6=þ67=6=6ÿ8<6ÿ7<6<6<6<6ÿ7;6ÿ7;6;6;6;6ÿ8:6ÿ7:6ÿ7:6:6:6:6:6ÿ896ÿ8969 ÿ>ÿ>>ÿ>þÒ=þÿ=þÿ[=þÿŸ=þÿÜ=ÿÿ<ÿÿP<ÿÿŠ<ÿÿÁ<ÿÿï<ÿþü$;ÿÿP;ÿÿ€;ÿÿ«;ÿÿÔ;ÿþî :ÿÿ":ÿÿD:ÿÿe:ÿÿ†:ÿÿ§:ÿÿÆ:ÿþÜ9ÿþð9ÿÿ9=ý3119ÿ215213ÿ5 112 1/ÿ61-þ321,1+ÿ21*1)ÿ21(1'1&1%ÿ21$ÿ41#ÿ51#1"ÿ21"1!ÿ21!1 ÿ31 1ÿ21ÿ211ÿ41ÿ21 1ÿ2 1!1!1ÿ?!1ÿ2 1!1ÿ2 1!1!1!1!1!1!1!1!1!1!1ÿ2 1=š9ÿ›š5›š3ÿŸ š1› š/ÿ¨š-›š,ÿ£š+ÿ›š*š)š(š'ÿ›š&š%ÿ›š$ÿš#ÿŸš#ÿ›š"ÿžš"š!š!š ÿœš šÿ›šššÿš š š!š!š!šÿ·!šÿ› š!š!š!šÿ› š!š!š!š!š!š!š!š!š!š=ýÐÎÏ9ÿÏÎ5þÎÏÎ3ÿÒ Î1 Î/ÿÙÎ-þÖÏÎ,ÿàÎ+ÿÏÎ*Î)Î(Î'Î&Î%ÿÏÎ$ÿÑÎ#ÿÓÎ#ÿÏÎ"ÿÏÎ"Î!ÿÏÎ!Î ÿÐÎ ÎÿÏÎÎÎÿÑÎ Î ÎÿÏ Î!Î!Îÿì!ÎÿÏ ÎÿÏ Î!Î!Î!Î!Î!Î!Î!Î!Î!Î!ÎÿÏ ÎÿÏ Î=ý"39ú1Ïâóþ5ü .•öÿ3ý ™ßÿ1þ!Š ÿ/ý¡ñ ÿ-ýCùÿ,þ»ÿ+þ.òÿ*þkøÿ)ÿfÿ(ÿ“ÿ'ÿiÿ&ÿqÿ%þ6úÿ$þ óÿ#þÊÿ#ÿZÿ"ÿ ÿ"ÿµÿ!þ,øÿ!ÿ¯ÿ ÿ%ÿ ÿ¶ÿþéÿÿPÿÿÄÿÿ ÿÿ`ÿÿ¾ÿþ Ýÿþ ñÿþ3þÿÿÿR ÿÿ~ ÿÿ  ÿÿÁ ÿÿÙ ÿÿë ÿÿø ÿÿû ÿÿü ÿÿó ÿÿå ÿÿÑ ÿÿ´ ÿÿ— ÿÿo ÿ 1ü2123.1+1'1%1ÿ;!1 1þ24!1ÿ2$1$1ÿ2&1ÿ2(1(1þ21+1,1ÿ3-1ÿ3.1ÿ20101ÿ2 21 31 31ÿ3 41ÿ2 51ÿ561ÿ67171ÿ2919191ÿ6:1:1ÿ2:1ÿ2;1ÿ0<1<1ý3=1=1=1>1ÿ>1ÿ~1ÿ2>1ÿ21þš›šÿ›šÿ›šþ›œ.šþ›š+šþ›š'šÿ›%šÿ !š šý›šœ!šÿ›$š%š&šÿ›(š)šÿ›*šÿ›,šÿ›-šÿŸ.šÿ›0š0šÿ› 2š 3š 3šÿ› 4šÿ› 5šÿ¡6šÿž7š7šÿœ8šÿ8šÿ›:š:š:šÿ›:šÿ›;šÿŸ<š<šýœ<šý›=š=š>šÿ>šÿ>šÿ›€šþÎÏÎÿÏÎúÏÎÏÎÏÎ.ÎþÏÐ+ÎýÏÎÏ'ÎÿÏ%ÎþÏÔ!ÎÿÐ ÎýÏÎÒ!ÎÿÏ#ÎÿÏ%Î&ÎÿÏ(Î)ÎÿÏ*ÎÿÏ,ÎÿÑ-ÎÿÐ.ÎÿÏ0Î0ÎÿÏ 1ÎÿÏ 3Î 3ÎÿÐ 5Î 5ÎÿÖ6ÎÿÒ6ÎÿÏ7ÎÿÐ9Î8ÎÿÏ9Îÿ×:Î;Î:ÎÿÏ;ÎÿÔ<Î=Î<ÎýÏ=Î=Î>Îÿ>Îÿ>ÎÿÏ~ÎÿÏÎïNg‡›«µ¶·¨†iP6&. 
ÿú÷è×·j+ÿüî–4'ÿýçÁa%ÿý !ÿý÷ÌK ÿýÙ: ÿþÓS"ÿþ² #ÿþø›%ÿþð7&ÿþý¨(ÿþç%)ÿþúv+ÿþ®,ÿþÔ-ÿþô).ÿþóQ/ÿþþE 0ÿþýk 1ÿþþH 3ÿÿU 3ÿþö* 4ÿþê 5ÿþÌ6ÿÿq7ÿÿ&7ÿþÒ8ÿÿW8ÿþø9ÿÿ§9ÿþñ:ÿÿƒ;ÿÿ ;ÿÿ‹;ÿüâ <ÿý6<ÿý=ÿ=ÿþh=ÿþÁ=ÿþä=ÿþþ->ÿÿm>ÿÿÄÀÿ ÿø>ÿù>ÿø>ÿø>ÿø=þÿø=þùø=ø=þùø=þùø=ø=ø=ø=ø=ø=ø=ø=ø=ø=ø=øÿ1<øÿ1<ø ÿ›>ÿ›>ÿš>ÿš>ÿš=þ¡š=þ›š=þœš=þ›š=š=þ›š=š=š=š=š=š=š=š=š=š=šÿ›<šÿ›<š ÿ7>ÿ7>ÿ6>ÿ6>ÿ6=þ86=þ76=þ76=þ76=6=6=6=6=6=6=6=6=6=6=6=6ÿÏ<6ÿÎ<6 ÿ>ÿ2>ÿ`>ÿ‹>ÿ²=þÔ=þî=þÿ=þ1ÿ=þMÿ=þfÿ=þ|ÿ=þÿ=þ ÿ=þ­ÿ=þºÿ=þÃÿ=þËÿ=þÎÿ=þÐÿ=ýÏÿ)<ýËÿm<þÆÿ 1ø 1ø 2ø 2ø 3ø 3ø 3ø 4ø 5ø 5ø6ø6ø6ø7ø7øþù÷6øÿù7øÿú8øÿù8ø9ø:ø:ø:øÿù:øÿù:øÿû;øÿù;ø<øÿ÷<øÿú<øÿù<ø=øþ÷=øÿ>øÿ>øÿ>øÿö>øÿù>øÿù.øÿïøÿö6øÿõGøÿò0øÿë;øÿó=øÿì}øÿì}øÿí}øÿî}øÿõÕøÿéføÿì}øÿöøÿ÷¦øÿî£ø 1š 1š 2š 2š 3š 3š 3š 4š 5š 5š6š6š6š7š7šþ›™6š8šÿœ8šÿ›8š9šÿœ9š:š:šÿ›:š;šÿŸ;š<š<šÿž<šÿœ<š=š=šþ =šÿ>šÿ2š› šÿ>šÿž>šÿ›nšÿ•šÿ™ šÿ›+šÿ˜Gšÿ—0šÿ’;šÿ—=šÿ“}šÿ“}šÿ“}šÿ”}šÿ˜Õšÿ‘fšÿ“}šÿ™šÿ™¦šÿ”£š 16 16 26 26 36 36 36 46 56 566666667676ÿ77686ÿ7869696:6:6:6ÿ7:6;6ÿ;;6ÿ7;6<6ÿ5<6ÿ7<6=6=6þ546ÿ76þ7=6ÿ26þ87 6ÿ3676ÿ9>6ÿ7n6ÿ7;6ÿ7y6ÿ7;6ÿ7=6ÿ7ý6ÿ7}6ÿ7T6ÿ7f6ÿ7=6ÿ7£6 þô/ÿ ÿk0ÿ þë0ÿ ÿ1ÿ þ×1ÿ þ4û1ÿ ÿ¡2ÿ þ/÷2ÿ þµ3ÿ ÿV4ÿþÜ4ÿþ(ü4ÿÿx5ÿþé5ÿÿŒ6ÿþñ6ÿÿ{7ÿþß7ÿÿK8ÿÿ³8ÿþý8ÿÿx9ÿÿ×9ÿþ4þ9ÿÿŒ:ÿþÞ:ÿÿ6;ÿÿˆ;ÿþÕ;ÿÿ#<ÿÿo<ÿÿ¸<ÿýö<ÿþA=ÿþƒ=ÿþÄ=ÿþ õ=ÿÿ8>ÿÿo>ÿÿ§>ÿÿØ>ÿÿõ¿ÿøÑšÿ›šÿ›šÿœšÿœMšÿ›'šÿœ2šÿšÿZšÿœ>šÿœ>šÿœ>šÿœ>šÿœ>šÿœ>šÿœ>šÿœšÿ›-šÿœšÿœšÿœšÿœšÿœ šÿ› šÿ›šÿœ šÿ› š›<š›šü›šš›šÿ›š›#šÿ›šÿ›šÿ› šÿ›šÿ›šÿœ šÿ›šÑ6ÿ96ÿ;6ÿ@6ÿ@M6ÿ;'6ÿ>26ÿC6ÿCZ6ÿ@>6ÿ@>6ÿ@>6ÿ@>6ÿ@>6ÿ@>6ÿ@>6ÿ@6ÿ8-6ÿ@6ÿ@6ÿ@6ÿ@6ÿ@ 6ÿ8 6ÿ;6ÿ@ 6ÿ; 686ÿ76ÿ7 6ÿ7686ü86676ÿ7 6ÿ86û786676ÿ7676ÿ86ÿ7676ÿ76ÿ767 6þ78676ÿ86ÿ: 6ÿ96û76677ù6ÿ?øÿú>øÿ>øÿ=øþùÓšÿšÿ› 
šÿœšýœš›šÿœ$šÿœ'šÿœ0šÿœešÿœ&šÿ›Qšÿœšÿ›gšÿ›>šÿœ>šÿ=šþ›=šþœÓ6ÿC6ÿ>6ÿ86ÿ@6ý>696ÿ@$6ÿ>%6ý86>06ÿ@Ì6ÿ8˜6ÿ@&6ÿ9Q6ÿ@6ÿ96ÿ86ÿ7>6ÿ>6ÿ=6þ7ÿÿó>ÿÿ¼>ÿÿm=ÿþü=ÿþÌ=ÿþu=ÿþøÿù9øÿù9øÿù9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9ø9øÿù9ø9øÿù9ø9ø9øÿù9øÿù9øÿù9øÿú9ø:ø:ø:ø:øÿù:ø:øÿù:ø;ø;øÿù;ø;ø;ø<ø<ø<ø<øÿ÷<ø=ø=ø=þøù=þøû=ÿø>ÿù>ÿú¿šÿœ9š9šÿ›9š9š9š9š9šÿ›9š9š9š9š9š9š9š9š9š9š9š9š9š9š9šÿ›9š9š9š9š9š9šÿ›9šÿ›9šÿ›9š9šÿ9š:š:šÿ›:šÿ›:šÿ›:šÿ›:šÿœ:š;šÿ›;š;šÿ›;šÿ›;š<š<š<š<šÿœ<š=š=þš›=þšœ=þšž=ÿš>ÿš>ÿ¿6ÿ896ÿ7969696ÿ7969696969696969696969696969696969696969696ÿ7969696ÿ796ÿ79696ÿ796ÿ896:6:6ÿ7:6:6ÿ7:6ÿ7:6ÿ9:6;6;6;6ÿ7;6;6<6<6<6<6ÿ8<6=6=6=þ68=þ6:=ÿ6>ÿ6>ÿ7¿ÿÿ$9ÿÿ>9ÿÿS9ÿÿc9ÿÿs9ÿÿƒ9ÿÿ“9ÿÿ 9ÿÿ¨9ÿÿ¯9ÿÿ·9ÿÿÀ9ÿÿÆ9ÿÿÆ9ÿÿÆ9ÿÿÆ9ÿÿÆ9ÿÿÆ9ÿÿ¿9ÿÿ·9ÿÿ®9ÿÿ§9ÿÿž9ÿÿ”9ÿÿ†9ÿÿw9ÿÿe9ÿÿR9ÿÿ=9ÿÿ%9ÿÿ9ÿþñ 9ÿþÛ9ÿÿÄ:ÿÿ§:ÿÿ†:ÿÿd:ÿÿ@:ÿþþ:ÿþí :ÿÿÒ;ÿÿ§;ÿÿz;ÿÿL;ÿþû!;ÿÿî<ÿÿ»<ÿÿ…<ÿÿO<ÿÿ<þÿÚ=þÿ=þÿ]=þÿ=þØ=ÿ–>ÿM>ÿ¿!1!1!1ÿ3 1 1ÿ21 11ÿ2111 1 1!1!1!1"1"ÿ31#1#ÿ21$1$1%1%ÿ31&ÿ21'1'ÿ21(1)1(þ131*ÿ21+ÿ21,1-1-ÿ31.1/ÿ210ÿ4 11ÿ3 13ÿ2 132 15 16ÿ216þ.191;ÿ21<1ÿ2ÿ'<!š!š!šÿ› š š šÿ›ššÿ›ššÿ›š š ÿ›š!š!ÿ›š!š"š"š#š#ÿ›š$š$ÿœš%š%š&ÿ›š'š'ÿ›š(š)ÿ›š(ÿ›š*ÿ›š+ÿ›š,š-š-ÿš.š/ÿ›š0ÿŸ š1ÿ› š3 š3þœ› š5ÿ›š6ÿœš6ý”›š9ÿ›š;ÿ›š<šÿ/<!Î!Î!ÎÿÏ Î Î ÎÿÏÎÎÿÏÎÎÿÏÎ Î ÿÏÎ!Î!ÿÏÎ!Î"Î"ÿÐÎ#ÿÏÎ#ÿÏÎ$Î$ÿÐÎ%Î%Î&ÿÏÎ'Î'Î(Î)ÿÏÎ(ÿÏÎ*ÿÏÎ+Î,Î-Î-ÿÑÎ.ÿÓÎ/ÿÏÎ0ÿÓ Î1ÿÏ Î3 Î3þÐÏ Î5ÿÏÎ6ÿÑÎ6ýÇÏÎ9ÿÏÎ;ÿÐÎ<ýÐÏÎÿ<<ÿM ÿþ2ýÿþðÿþßÿÿÊÿÿ}ÿÿ(ÿÿòÿÿÿÿ:ÿþçÿ ÿ¿ÿ ÿRÿ!ÿòÿ!ÿlÿ!þòÿ"ÿÀÿ"ÿ+ÿ#ÿ¶ÿ#þ'ùÿ$ÿÂÿ$ÿ&ÿ%ÿ}ÿ%þëÿ&ÿ_ÿ'ÿºÿ'þíÿ(ÿkÿ)ÿŸÿ(ý àÿ*ÿ(ÿ+ÿUÿ,ÿ›ÿ-ÿ°ÿ-þÓÿ.þÕÿ/þä ÿ0þÞ ÿ1þÙ ÿ3ÿ¾ ÿ3þˆ ÿ5þVïÿ6þÀÿ6üyëÿ9þ˜ÿ;ü-Äùÿ<ý'³Mÿ6ÿ,ÿ$"ÿ+ÿ3ÿ 9ÿ9ÿ9ÿ:ÿ:ÿþ*:ÿ:ÿ9ÿ9ÿ9ÿ ¿1ÿ=1þ2=1=1=1=1;1ÿ2:1ÿ291ÿ2ÿ371ÿ41ÿ2012 þ.2-1ÿ;*1Mÿšÿ›~šÿ›>šÿ>šÿ=š<šý›<šý›<šý¦;šÿœ:šÿ›9šÿ›8šÿžþš›1š› þœ›-šÿž›'šMÿÎÿÏ~ÎÿÏ>Îÿ>Îÿ=Î<ÎýÏ=Î<ÎýÝ;ÎÿÐ:ÎÿÏ9ÎÿÏÿÐ7ÎÿÓÿÏ3ÎÿÏ ÿÐ-ÎÿÏüÔÎÏÏ$ÎýÏÎÎM¿ÿÿÎ>ÿÿw>ÿÿ2=ÿþç=ÿþÇ=ÿþq=ÿ<ÿý•;ÿüú-;ÿüÂ;ÿÿ9ÿþþS8ÿþú7ý#§è3ÿýð¦ýH·/ÿýùw ü}Éè(ÿýã¿_û-}Ù"ÿüÔ|0Íÿ< ÿ5ÿ-ÿ%!ÿ(ÿ0ÿ8ÿ€ÿÿ2<øþ16;øþ12;ø1;ø1;ø1;ø1;ø1;üùø12;üúø11;üùø11<ýø11<ýø12<ýø11<ýø12<ýù11<ýþ11=1=þ12=1=1=þ12=þ12=1=1=1=1=1=þ12=ÿ1>ÿ1>ÿ1>ÿ1ÿÿš<šÿ 
;šÿ›;š;š;šÿ›;š;ü›šš›;ÿ›š;ÿœš;ÿ›š<š<š<ý›šš<šÿ›<š<ý¡šš=š=š=š=þš›=š=š=þš›=þš›=þš›=š=š=þš›=ÿš>ÿš>ÿš>ÿ›ÿÿÎ<6þÎÛ;6þÎÏ;6Î;6þÎÏ;6þÎÏ;ü76ÎÎ;ü76ÎÏ;6þÎÏ;ü86ÎÎ;ü86ÎÎ<ý6ÎÎ<ý6ÎÎ<ý6ÎÎ<ý7ÎÎ<ý7ÎÎ<ý7ÎÎ=Î=þÎÏ=Î=Î=Î=þÎÏ=Î=Î=Î=Î=Î=þÎÏ=ÿÎ>ÿÎ>ÿÎ>ÿÏÿÿ´<ü¼ÿÓ;ü°ÿâ;ü£ÿð;ü‘ÿú);üÿÿ:;ügÿÿQ;üNÿÿd;ü.ÿÿ;üÿÿ—;üêÿ§<ýËÿ¸<ý¦ÿÄ<ýyÿË<ýKÿÑ<ýÿÒ<ýÿÍ=þÿÉ=þÿ¾=þÿ±=þÿ£=þÿ=þÿw=þÿ^=þÿI=þÿ6=þø'=þí=þà=ÿÒ>ÿ°>ÿm>ÿ-ÿÿ>ÿ62øÿí}øÿí}øÿê}øÿï}øÿ÷'øÿêøÿ÷}øÿõ(øÿõøÿïøÿñ=øÿ÷jøÿñ*øÿôøÿ÷øÿðBøÿù>øÿü%øÿ÷øÿ$øÿóøÿøÿïøÿíøþù!øÿíøÿìø=øÿù<øÿú4øÿêøÿú<ø<øÿùøÿï øÿìøøÿí øÿîøøÿòøÿõøøÿôøøÿïøÿûøÿêøÿõ ø,øÿñ ø+øÿê øÿù7øÿù øÿëøÿ÷øøÿöøÿêøøÿöøÿñøÿùøýïõë øÿ÷ø øÿêøÿì ø 'øÿôøÿõø 4ø "øÿòø ÿù øÿïøÿôø 2ø ÿù øÿ÷%ø øÿñ øÿöøÿóÿý øÿíøÿìø/øÿú-øÿù,ø,ø+ø*ø)ø(øÿù&øÿ÷%ø$øÿù"øÿû!ø ø2šÿ”}šÿ”}šÿ‘}šÿ•}šÿ™'šÿ’šÿ™}šÿ˜(šÿ˜šÿ•šÿ–=šÿ™jšÿ–*šÿ˜šÿ™šÿ•šÿ›>šÿ›>šÿž%šÿ™šÿ$šÿ—šþ›šÿ”šÿ“šþ›!šÿ”šÿ“š=šÿ›<šÿ›4šÿ‘šÿ<šÿ›;šÿšÿ” šÿ“ššÿ” šÿ”ššÿ—šÿ˜ššÿ˜šÿ›šÿ•šÿšÿ’šÿ˜ šÿ›+šÿ– šÿœ*šÿ’ šÿ›7šÿœ šÿ’šÿ™šÿ›šÿ™šÿ‘ššÿ™šÿ–šÿ›šý•˜’ šÿ™š ÿ› šÿ‘šÿ“ š ÿœ&šÿ˜šÿ˜š ÿ›3š "šÿ–š ÿŸ šÿ•šÿ˜š ÿœ1š ÿ› šÿ™%š šÿ– šÿ™šÿ—ÿŸ šÿ”šÿ“šÿ›.šÿœ-šÿ›,šÿ›+š+š*šÿ›(šÿœ'š'šÿœ%šÿ›#šÿœ"šþœ› šÿ›š26ÿ7}6ÿ7}6ÿ7}6ÿ7Ž6ÿ7ª6ÿ7Õ6ÿ8>6þ7#6ÿ76ÿ#6ÿ76þ7!6ÿ76ÿ76=6=656ÿ76ÿ;<6<6ÿ76ÿ766ÿ7 6ÿ76ÿ7:6:66ÿ76ÿ596,6ÿ7 6ÿ78686 6ÿ7*6ÿ7$6ÿ76"6ÿ76676 6ÿ76ÿ7 6 ÿ846 46 36 6ÿ7'6 ÿ716 ÿ706/6ÿ7ÿ< 6ÿ7"6ÿ8.6ÿ8-6ÿ7,6ÿ7+6+6ÿ7)6ÿ7(6ÿ8'6ÿ5&6þ97$6ÿ7#6ÿ7"67 6 6€ÿÿý>ÿÿå>ÿÿ´>ÿÿw>ÿÿ;>ÿþò=ÿþ·=ÿþm=ÿþ =ÿÿÓ<ÿÿ<ÿÿ*<ÿþÍ;ÿÿs;ÿþ÷:ÿÿ°:ÿþCþ9ÿÿ×9ÿÿc9ÿþæ8ÿÿp8ÿþ ç7ÿÿm7ÿþÝ6ÿÿW6ÿÿÄ5ÿþ.ü4ÿ ÿŒ4ÿ þ Ý3ÿ þCü2ÿ ÿ2ÿ þÓ1ÿ þ(ô0ÿ ÿ[0ÿÿŽ/ÿþ¹.ÿþÔ-ÿþä,ÿþ,ì+ÿþ5ñ*ÿþ7ò)ÿþ5í(ÿþ.å'ÿþ Ø&ÿþÀ%ÿþ•$ÿþbø"ÿþ.Ø!ÿý þÿþNæÿ4øÿ÷øÿìøÿî(øÿíøÿò(øÿí<øÿõøÿõ=øÿï=øÿëøÿï øÿïøÿî'øÿë=øÿï=øÿô=øÿì=øÿó4øÿò}øÿëøÿì=øÿ÷tøÿöøÿò4øÿ÷Gøÿó2øÿë{øÿ÷ øÿë.øÿ÷;øÿ÷<øÿô;øÿòøÿî}øÿî&øÿìøÿë=øÿí)øÿòøÿ÷"øÿïøÿé2øÿðøÿïøþë÷-øûêñöñô øÿî=øÿóøÿõøÿëøÿ÷øÿóøÿï=øÿíøÿïøÿòøÿêøÿóøÿîøÿñ4øÿóøÿ÷)øÿôøÿöøÿï øýíø÷ øÿòøÿòøÿñ‘ø4šÿ™šÿ“šÿ”(šÿ“šÿ–(šÿ”<šÿ˜šÿ˜=šÿ•=šÿ’šÿ• šÿ•šÿ”'šÿ’=šÿ•=šÿ˜=šÿ“=šÿ—4šÿ—}šÿ’šÿ“=šÿ™tšÿ™šÿ–4šÿ™Gšÿ—2šÿ’{šÿ™ šÿ’.šÿ™;šÿ™<šÿ˜;šÿ—šÿ”}šÿ”&šÿ“šÿ’=šÿ”)šÿ—šÿ™"šÿ”šÿ‘2šÿ•šÿ”šþ’™-šû‘–™–˜ šÿ”=šÿ—šÿ˜šÿ’šÿ™šÿ—šÿ•=šÿ“šÿ•šÿ–šÿ‘šÿ—šÿ”šÿ–4šÿ—šÿ™)šÿ˜šÿ™šÿ• šý”š™ 
šÿ–šÿ—šÿ–‘š>6ÿ76ÿ7(6ÿ76ÿ7(6ÿ7‘6ÿ7=6ÿ7'6ÿ76ÿ7'6ÿ7¼6ÿ7=6ÿ7´6ÿ76ÿ7½6ÿ7}6ÿ726ÿ7Š6ÿ7}6ÿ7=6ÿ7=6ÿ7ú6ÿ7E6ÿ7.67 6ÿ7=6ÿ76ÿ7!6ÿ76ÿ7=6ÿ7(6ÿ76ÿ76ÿ7%6ÿ746ÿ7{6ÿ7 6ÿ7 6ÿ76ÿ7‘6ÿ<øýù<øýù=ø<ø øÿðøÿïøÿò øÿöøüöøøùÿõøÿñøÿöøÿ÷øÿÿþøòøÿöøÿöøÿîøÿ÷øøÿ÷ øÿö)øÿ÷:øøÿóøÿ÷øÿöøÿìøøÿñøÿîøÿî ø-øÿòøü÷øøù,øÿðøýóøø øÿñøÿö ø%øÿõ øý÷øø7øøÿôøÿù øÿ÷ øÿñøýîøø4øÿù 2øýöøù øÿò øÿðøþïù øÿô øÿéøþðø øÿóøü÷øøõøþêù ÿ$ øÿ÷øÿñøþìò ÿ& øÿìøÿïøþÎ÷ ÿ&0øÿ¨ ÿ'øÿöøÿêøþÖø þ(' øÿî øÿïøÿóø 'øÿó øÿë ø þ&'øÿöøÿ, þ&'øÿôøÿó øÿ& þ('øÿìøþ'( ý&''øÿó øÿêø' ÿ&'øÿê øÿìø'ÿ& ÿ('øÿéø' 'øÿôø'ÿ(ÿ('!øÿöø'ÿ&'øÿõ øÿì øÿòø'ÿ(ÿ)'øÿïø 'ý+$)'ý('' øþòñøÿñø'ü+(''øÿó øÿöø'û#&''øÿõ øþßø'ú3&''øÿ÷øÿx'ý('' øþù"'ÿ(ý('' ø'ÿ)ý(''øþùõÿ('ÿ"ý#'' øÿîøÿúÿ('ý+'' øÿ÷øÿ÷ÿ& 'þ(" ' øÿëøÿúþ'('ÿ+ 'ÿõøÿëøÿú þ@('ý&'(þ'(øÿù&øÿù'øÿü(øÿù*øÿù+ø-øÿù. øþùý/ øÿú1 ø3 øÿÿ4øÿù6ø8øÿú:<šý›<šý›=š<š šÿ•šÿ”šÿ— šÿ™šü™šš›ÿ˜šÿ–šÿ™šÿ™šÿ¦þš—šÿ™šÿ™šÿ”šü™šš›šÿ™ šÿ™)šÿ9šÿ›šÿ—šÿ™šÿ™šü“šš›šÿ–šÿ”šÿ” š-šÿ— šÿ›,šÿ•šý—šš šÿ–šÿ™ šÿ›%šÿ˜šÿ›6šÿœšÿ˜š šÿ™ šÿ–šý”šš5š 2šý™š› šÿ– šÿ•šþ”› šÿ˜ šÿ‘šþ–š šÿ— šÿ˜šþ‘› ÿ. šÿ™šÿ–šþ“— ÿ. šÿ“šÿ•šþ› ÿ.0šÿk ÿ/šÿ™šÿ‘šþ†š þ6/ šÿ” šÿ”šÿ—š þ./šÿ— šÿ’ š /šÿ™šÿ, /šÿ˜šÿ— šÿ/ þ0/šÿ“šþ/0 /šÿ— šÿ’š/ ÿ-/šÿ‘ šÿ’š/ÿ0 þ1./šÿ‘š/ ÿ0/šÿ˜š/ÿ./!šÿ™š/ÿU/šÿ˜ šÿ“ šÿ–š/ÿ3/šÿ”š /ý+$././ šþ—–šÿ–š/ü30//šÿ— šÿ™š/þ./šÿ˜ šþ‹š/ü.3/šÿ™šÿQ// šþž3/ÿ0ý0//š›ÿ./ÿ.ý-//šÿ¡ÿ./ÿ3ý,// šÿ”šÿ. /ÿ.ý+// šÿ™šÿ› /ÿ3 / šÿ’šÿœÿ. /ÿ2 /ÿ˜šÿ’šþ›š ü@0/./ÿ6þ./šÿ›&šÿ›'šÿœ(šÿ›*šÿœ+š-šþ›œ. šþ›£/ šÿœ1 šÿ›3šþ›Ÿ4šþ›ž6šÿœ8šþ›œ:=6=6<6ý8<6 6ÿ76ÿ76ÿ7;6ÿ516ÿ76:6ÿ996ÿ76ÿ756ÿ76ÿ76ÿ76ÿ79656ý766 6ÿ7(6ÿ77666ÿ7666ÿ76ÿ756 46ÿ8 6ÿ7&6ÿ7 6ÿ7 6ÿ76þ76 6ÿ767 ÿ@067 ÿ= 6ÿ76ÿ76þ86 ÿ=06ÿ9 ÿ<%6ÿ76þ86 þ6<6ÿ76 <6ÿ7 6ÿ7 6 þ=<<6ý5<þ;=ý><<6ÿ;ÿ=<þ;Dý><<6ÿ7<ÿ=ý@<<6þ9; <þ;D < 6ÿ76ÿ8þ>;<þ=9 <6ÿ76ÿ: ü@<<=<ÿ6<6ÿ7&6ÿ7'6ÿ7(6*6+6ÿ7-6ÿ7. 6þ7=/ 6þ781 6ÿ73674666786ÿ7:<ÿýÂ<ÿýe;ÿüô;ÿÿ§;ÿÿA:ÿþÙ:ÿÿr9ÿþô9ÿÿ•8ÿþý"8ÿÿª8ÿÿ47ÿÿ¶6ÿþý:6ÿÿ·5ÿþý05ÿÿ¥4ÿþö4ÿÿƒ 3ÿþã 1ÿýýÿV 2ÿÿº 0ÿýÿ! 1ÿÿ 0ÿþõ /ÿþÇy .ÿýëV .ÿþˆ-ÿý×',ÿýùd+ÿýý¶*ÿýþÒ6*ÿþíi)ÿýö¢(ÿýúÁ3'ÿýùÆM&ÿýùÆM ÿ%ÿýùÆM$ÿýó­M"ÿüùÓ3"ÿý­S!ÿþò3 ÿþËUÿÿþ¸ ÿþšÿýþyÿþõT!ÿþä3"ÿþÀ#ÿþŽ ÿÿþóT&ÿþÍ&'ÿþ‘(ÿþèH*ÿþ°+ÿþíZ-ÿþ¬. 
ÿýæR/ ÿýû“ 1 ÿþÂ13ÿýâY4ÿýô| 6ÿýø”8ÿýú#:û$'('& ø&''&'&'+û &('('ÿ) ÿ('ÿ(þ&( ' ÿ( 'ÿ( 'ÿ( 'ÿ(ÿ+ 'ÿ%ÿ-'ÿ(ÿ( 'ÿ)'ÿ( 'ÿ(ÿ)'ÿ& 'ÿ$' 'ÿ('ÿ(ÿ+ 'ÿ&'ÿ( 'ÿ%ÿ&'þ#$' 'ÿ&ÿ&'ÿ& 'ÿ@ 'ü&&'ÿ+ 'ÿ( 'ý(%'þ&$ÿ3 'ÿ& ')'ÿ3 'ÿ&ÿ) 'ÿ+'ÿ+ 'ÿ&ÿ&'ÿÿ. 'ÿ&'ÿ"ÿ& 'ÿ)'ÿ$ÿ( 'ÿ@'ÿ& 'ÿ&' 'ÿ%'ÿ3 ' 'ÿ( ÿ& ' 'ÿ@ ÿ( 'ÿ+ 'ÿ( ÿ ' ' ' 'ÿ( 'ÿ@ 'ÿ$ ÿ$ ' ÿ& ' ÿ( 'ÿ) þ(' ' ' ý%'' ' ' ' 'ÿ( ÿ& 'ÿ&ÿ&' ' ÿ& 'ÿ#' ' ÿ& '' 'ÿ& '' 'ÿ$ 'ÿ)ÿ+' ' ÿ( 'ý&3(' ' 'ÿ& ' ÿ&''ÿ& ÿ&'ÿ( ' ÿ('ÿ) ' 'ÿ3 ' ÿ(''ÿ( ÿ( 'ÿ('ý(% ÿ&'(ÿ+'ÿ+þ*&'ÿ%ÿ':û,+*(':û,+*('7'û)*)('5'û()(('5 '5ÿ('3ü@F/)'31þ-)'3*þ)('2ÿ( '1ÿ( '1 ' þU1/ /þ.+ý 00/ÿ0 þ/./ÿ1þ30 / / /ÿ0 /ÿ.ÿ+ /ÿ0þ-./ÿ0 /ÿ.ÿ0/ÿ0 /ÿ0./ÿ. /ÿ1ÿ./ÿ. //ÿ.ÿ+ /ÿ0/ÿ.ÿ0 /ÿ./, /ÿ, //ÿ, /ÿ@ÿ, /ü.-/þ.+ÿ. / /ý0-/ÿ$ÿ3 / /þ3./þ03ÿ0 /ÿ0ÿ. /ÿ2/ÿ+ /ÿ0/ÿ9ÿ. /ÿ0/ÿ3ÿ. /ÿ0/ÿ.ÿ0 /ÿ@/ÿ1ÿ. //ÿ.ÿ0 /ÿ./ÿ3 / / /ÿ0 /ÿ@ /ÿ+ /ÿ- ÿ0 /ÿ. /ÿ0 /ÿ. /ÿ0 ÿ0 /ÿ@ /ÿ. ÿ7 / ÿ/ / ÿ. /ÿ. þ0/ / / ý0// / /ÿ0 / /ÿ. /ÿ0/ / ÿ0 /ÿ.ÿ./ /ÿ. ÿ. /ÿ0/ÿ0 /ÿ- /ÿUÿ0/ÿ. /ÿ7 /ÿ.ÿ3/ / /ü03-./ / / / /ÿ0 / ÿ./ÿ0 / ÿ1/þ.3/ÿ. /ÿ3 / /ÿ1 / /ÿU/0ÿ0 /ÿ+ý/0+/ÿ./ÿ.ÿ/:0/:0/7/ÿ0/5 /5 /5 /3ü8;20/32þ10/30/2 /1 /1 / úU==<<= ø=<=<<==@ü@:<=<ÿ; <ÿ=ÿ@ < <ÿ; <ÿ=<ÿ+ <ÿ:þ<=<ÿ= <ÿ>< <þ;=<ÿ; <ÿ=<ÿ;ÿ= <ÿ;<ÿ;ÿ9 <ÿ;ÿ;<ÿ; <ÿ;<þ>: <ÿ= <<ÿ= <ÿ@ÿ= <ü=;<ÿ9ÿ= < <þ><ÿIÿ3 <ÿ=ÿ; <=<þ=3ÿ= <ÿ:ÿ; <ÿ9<ÿ+ÿ= <ÿ= <ÿ=<ÿ9ÿF <<ÿD <ÿ;<ÿ@ÿ; <ÿ@<ÿ; <ÿ=<ÿ= < <ÿ;ÿ3 < <ÿ; <ÿ= <ÿ@ <ÿ@ < ÿ@ <ÿ; < <ÿ; < <ÿ@ <ÿ@ ÿ7 < ÿ< < ÿ= <ÿ; þ=< < < ý=<< < < ÿ;< <ÿ= ÿ9 <ÿ=< <ÿ= ÿ= <ÿ:ÿ>< <ÿ> <<ÿ= <ÿ9 <ÿU<ÿ= <ÿ7 ÿ; <ÿ;þ<=< < <ü:3>;< < < < <ÿ; < <ÿ> < ÿ=<= < <þ=3 < ÿ;<ÿ;<þ=; <þ;U<þ=@ÿ; <þ=@ý<=+;<;ÿ<ÿ<:<:<7<5 <5 <5 <3 <3 <3 <2 <1 <1 < ÿ„ÿFÿ(ÿÊÿÿ•ÿRÿÿ¾ÿ$ÿ×ÿ¾ÿðÿÁÿ)ÿ½ Dø('%-32,('.ý)('û(+-*('-'ý()('+'+'+'+'+'+'+'+''+''+'ÿ('+'ÿ('+'ÿ)'+ Dù0/.1321/.0ÿ/0/-/+/+/+/+/+/+/+/+//+//+//+//+/ÿ0/+ D <.<ÿ <-<+<+<+<+<+<+<+<+<<+<<+<<+<<+<<+8ÿ8ÿ7ÿ6ÿ 4ÿ 1ÿ+ÿ#ÿ"ÿ*ÿ2 ÿ:ÿ ÿ<ÿ 5ÿ-ÿ%ÿ!ÿ(ÿ0ÿ8ÿ ðÿ&<ÿ}ÿ=ÿ ýÿA½ÿ}ÿ/Úðÿ'<ÿ}ÿ=ÿýÿD½ÿ}ÿ3Úðÿ*<ÿ}ÿ=ÿ ýÿL½ÿ}ÿ<Úÿ/ÿ'ÿ'ÿ,ÿ6ÿ>ÿÿ@ÿÿ>ÿ/ÿ/ÿ)ÿ!ÿ ÿ!ÿÿÿ– ÿÿ ÿÿ ÿÿ ÿÿ ÿÿÿÿÿ ÿÿÿà ÿÿ ÿÿ ÿÿÿÿÿÿÿ(ÿÿÿÿÿÿÿòÿÿÿÿÿÿÿÿÿÿÿÿÿþ+ÿ,ÿ-ÿ-ÿ-ÿ-ÿ-ÿ,ÿ-ÿ,ÿ-ÿ,ÿ-ÿ-ÿ-ÿ#þûùø!ø"ùø$ÿùø&ÿúø(ÿùø*ÿùø-ø/ÿ÷ø2þôù ø6þøùø:ûúøùøø ÿš!ÿ›š"ÿš$þœ›š&ÿ›š(ÿœš*ÿŸš-ÿ›š/þž› 
š2þŸ› š6þœ›š:þš›š 6!ÿ76"þ576$ÿ86&6(ÿ96*76-6/ÿ762ý577 6676:þ876 þ›ÿ!þ:Òÿ"ýièÿ$ýñÿ&ý†óÿ(ýzçÿ*ü\Çýÿ(ý-æÿ"üCœëÿû>Š×û#ÿûY—Ôö'ÿû;pŸÉ+ÿ;ÿ;ÿÈÿ:ÿ 4ÿ,ÿ%ÿ!ÿ)ÿ/ÿ3 ÿ3 ÿ3 ÿ2 ÿ1ÿ0ÿ0ÿ/ÿ/ÿ/ÿ.ÿ-ÿ.ÿ-ÿ.ÿ-ÿ.ÿ-ÿ-ÿ.ÿ-ÿ.ÿ-ÿ.ÿ.ÿ-ÿ%¿øÿ:øþùø7øýùøÿ6øÿü3øÿù 0øÿù -øþùø)øþùý$øùÿúøüùøùÿþøùøúùøùúùø ⾚þœ;šÿœ9šÿŸ5šþ›3šÿœ 0šÿ› -šþ›¥)šþ›$š›ÿžš›ÿ©ý›œ›š›ÿœ â~6ÿ7=6þ7;6ÿ886þ7966ÿ736ÿ7 06ÿ7 -6þ7:)6þ7:%6þ76 67ÿBü687767ÿ8 â}ÿþö–;ÿüè9ÿýË\ 6ÿüó›13ÿüýÈb 0ÿüþÕ{% .ÿýÌ, *ÿûö´j&ÿû÷Ç…A!ÿúúá°w? ÿêÿùäÄoAâ7Xw§»ÌÜçòøýÿþùóìàÒiL*¢ÿ< ÿ4ÿ,ÿ%!ÿ)ÿ1ÿ 8ÿ€ÿ<ÿ 4ÿ,ÿ%ÿ!ÿ)ÿ/ÿ6ÿ=ÿ€ø<ÿú '‰'D'@'D'C'š<ÿ› /‰/D/@/D/C/6ÿ7<ÿ8 <‰ÿ6ÿ.ÿ&ÿ#ÿ&ÿ(ÿ)ÿ*ÿ*ÿ+ÿ+ÿ+ÿ,ÿ,ÿ,ÿ,ÿ+ÿ+ÿ+ÿ*ÿ*ÿÿ ÿÿÿ%ÿ ÿ.ý ÿ9ÿ@ýDý@A‡@ýDý?Cý @f1 '0'0'0'/'.ÿ)'.'.'-'-',þ.('+,þ)('++þ*('+)ÿ('+'*ÿ)'*ÿ('*þ &'*þ0)')ü74/)')û431+('*.ý-*('*)ÿ(')'ÿ()'+'+'+')ÿ('(ÿ('('(')ÿ('(ÿ('('('ÿ'#'ÿ'#'ÿ'"'ÿ(þ%'"'ù(33+("'ù(21-)"'(ü0/-*"'ü),//ü)**("'û).=:' 'û(*$' 'ü3+(' 'ü@1/)' 'û30/+(' ÿ(',ÿ)'ü71+('û()**'('ü42.)'ü)-0-'û0/.,)'ü(,,)' ü+*)('(' ÿ(' ÿ('ÿ*!ÿ(''' ''' '''$'&'''(' 1 /0/0/0///./././-/-/,ÿ1/+0/+0/+0/+/*ÿ0/*/*ÿ-/*þ20/)ý431/)ü3220/*ý210/*0/)/)/+/+/+/)/(/(/(/)/(/(/(/ÿ/#/ÿ/#/ÿ/"//"/ü230/"/2þ10"/ü2110"/ý011ÿ0/"/ü165/ /ü0./ /ý30/ /ý721/ /ü3210/ /ÿ10/ý420/0/ü3210/ý121/2þ10/þ01/ 0/ ÿ0/ ÿ0/ÿ0!/// /// ///$/&/'/(/ 1 <0<0<09+('5û:97/)'5û100,)'5ü*))('5'5 '5 '5 '5 '4ÿ('(ÿ)4ÿ('û(+,,-4'þ*063'ü).<<4'(þ):5'ü()**6'ý)*,5'û(*++-,ÿH9'ü+.02&'ø.452'''ü+046"þ4+'ø)./,''('ü(,4:!ýA3+ '( 'ÿ)!û(+81*'ÿ& ú('')*)''ÿ('ÿ('û(*+*)'þ*.1'(''û*/455ÿ('ú)+,+'''ü(*45ÿ( 'ü+.01 ' 'ü)-22 'û()**" 'þ)4 'ü(-27 '(þ'$ 'ü(.;_ÿ) 'ü()** 'þ(* 'ü)*++ ' 'ü(),, ''ÿ('''ÿ(''''''''''ÿ('ÿ( ÿ(''ÿ( ÿ('' ,þ*(',ÿ('ÿ( +þ*('+þ)('ÿ( )ÿ('û+**)('ÿ( 'ÿ('þ*)('ÿ( þ=.''ÿ(û(H9-(''ÿ(ú(&-0,('//+//+//+//+//+//+//+//+//+//+ÿ/ /+ÿ//,ÿ//-ÿ//-ÿ0ý750/5û54410/5ü2120/50/5/5 /5 /5 /5 /4/04ÿ0/ÿ014/þ0133/ý1554/ÿ55/06/05/0,ÿ:4/ü0122&/ø1332///ü0233"þ30 /1/ÿ/ý035!ý830/ÿ0!û/0420/ /0///0ÿ//þ012//þ013 /0//ý033 /ü0122 / /ü0123 /0ÿ. 
/þ03 /ý124/ /ý16E/ /ÿ0 /0 //0 //////////////// // // 0/0/ 0/0/ 0/0/ // þ61//ü0;40///1ÿ0/<<+<<+<<+<<+<<+<<+<<+<<+<<+<<+ÿ< <+ÿ<<,ÿ<<-ÿ<<-ÿ<<5 <5 <5 <5<5 <5 <5 <5 <4 <4 <4 <3 <4 <5<6<5 <,<<& < <"<ÿ 99>/./û02332/ý011'ý610/0/ÿ01%ý861/ü0110$ü0241/0/"/0/!/þ42!/ü5430 /ÿ0/û.1320/0ü10////ý011/þ012ÿ0/ /ú013422/ /0ý137//ý012 //ý011 // // // // / / / / /ÿ0 /ÿ0 / / / /ÿ0 / / / / / / / / / ////// // ///¼ <.<'<%<$<"<!<#<%<<<<<<<<<< << << << << << < < < < < < < < < < < < < < < < < < < <<<<<< << <<<'öD65*'')0<;þ<=< <ý@?>< << <;< <;< <ÿ>< <ÿ=< << <;< <;< <=<<ý=<=<<þ:;<<:ÿ;<<;<<<<-<=,<.<.<;+<;)<;,<=,<.<ÿ=<<þD><<ý@>=<<þ>=<<=<<=<ƒù#&'((''&þ%&'ù&'&&'(*(&'&ÿ('ÿ&')(( 'ÿ& û(''(('ÿ&'(('(ÿ) ('ÿ('( 'þ() ('( 'þ(* ('( ' ÿ('('þ&'ý+,('ÿ( '%ý*)('ÿ( 'ÿ#ý*(('ÿ( ')ÿ('ÿ( '('(' 'ÿ&#' 'þ%#(!'ÿ( 'ÿ&#'( '&#'( '&ÿ&$'ÿ( 'ü&%%& '&' '&%& 'ü&$&&' '&ÿ& 'ü%$&&''& 'ÿ(''& 'þ)*þ*(''ü&'',ý*)('&'( '&('&'( 'þ+''&'('þ(/'& 'þ&'ÿ('& 'ý&%&' 'þ%!'( 'þ%"(')'þ%$'('þ(*'þ&"'&'þ&% '&'ÿ& 'ý&%%ƒþ20/ /ý.-*/ÿ0/ý/.. /.ü-.14 / /. /û.//11 /0 /ÿ. /0 /0 / /ÿ0 /./0 ÿ././0ÿ/./ /0û20//./ÿ0 /ý./1ÿ./ÿ0 /./ÿ0 /0ÿ-./ /þ01./. /þ02!/. /ÿ0"/. /$/. /0ÿ%/. /ÿ /./ý.-, /þ.- /ú.//.-./ý.-% /ý-. /û.-.--/ÿ./.þ,./ù.//-*.//./.þ-,//./ý.,(ý300//0 /-þ10//0/0ÿ.ÿ0/0/ÿ0 /þ02/ /þ0// /ý010ÿ./ /þ14/ /þ13þ/0/0 /ÿ1.0/ÿ0/ÿ-.//þ01 //ÿ0 0/.ƒ=<=ý>=<=<=þ>@ <= <=>þ=> <= <= <; <ý=<< <; < <ÿ; <<= ÿ=< <þ=<=< <;û?=<<=< <ÿ8ý<==< <;=< <<;þ<=< <ÿ= <; <ÿ>;<; <; <; <;"<ÿ; <;ÿ;$< <ü<;;$<ÿ= <ÿ; <ÿ=<ÿ? 
<þ; <þ=><<=ÿ; <ú><<=<<= <þ:9ÿ:<<=<=ÿ9ý=;;<=<ÿ= <?ÿ=< <ý=:>=<ý;:9<ÿ; <þ;6=<ÿ;<ÿ= <ÿ;=< <ÿ:þ?=< <=ÿ>ÿ=<<ý=AD;<;<ý=?A;<ÿ;<þ=Cþ<;<=<ÿ= ÿ=<=< <ÿ=<ÿ= ÿ+>ÿ(>ÿ'>ÿ'>ÿ'>ÿ'>ÿ'r ÿ2>ÿ0>ÿ/>ÿ/>ÿ/>ÿ/>ÿ/r ÿ>>ÿ<>ÿ<>ÿ<>ÿ<>ÿ<>ÿ<r ÿ>ÿ>ÿF>ÿl>ÿq>ÿa>ÿ(r ¼ÿ:à ¼ÿ<à ¼ÿEÃ?ÿÿ>ÿÿ=ÿ<ÿ<ÿ;ÿ;ÿ;ÿ:ÿ9ÿ9ÿ9ÿ9ÿ9ÿ9ÿ9ÿ9ÿ9ÿ9ÿ9ÿ;ÿ<ÿ;üÿÀ € € € ÿ,ÿ-ÿ,ÿ,ÿ,ÿ-ÿ,ÿ-ÿ,ÿ,ÿ,ÿ-ÿ-ÿ-ÿ-ÿ,ÿ-ÿ,ÿ-ÿ-ÿ.ÿ.ÿ/ÿ0ÿ0 ÿ1 ÿ1ÿÀÿÀ € € €ÿ.ÿ-ÿ.ÿ.ÿ-ÿ-ÿ-ÿ.ÿ-ÿ-ÿ.ÿ-ÿ.ÿ-ÿ.ÿ.ÿ.ÿ.ÿ-ÿ.ÿ.ÿ-ÿ-ÿ.ÿ.ÿ-ÿ-ÿ)ÿ*ÿ*ÿ+ÿ+ÿ,ÿ,ÿ,ÿ,ÿ,ÿ+ÿ+ÿ*ÿ)ÿ(ÿ× € € € € € € € €'(ÿ#'(þ1('(ý6+('(ý2,('(þ+)'(ÿ('('('('('('('('('('('('ÿ(('ÿ(''''''&'&'%!'ÿ)('+þ)('ü,+)('ý-,('''ÿ(' '!'"'#'$'ü)+*( 'ý())'%ú'&./-('('ü)+,)'(ô53/)''(*-.,('ü)-./+ô53/)'&-58870'ü)-//„/(ÿ./(þ20/(þ40/(þ20/(ÿ0/(/(/(/(/(/(/(/(/(/(/(/(/(/'/'/'/&/&/%!/!/0/0/þ10//// /!/"/#/$/0/%/1 /0/(3ÿ1/ü0110/1+3ÿ1/ÿ14ÿ1/1„<(<(<(<(<(<(<(<(<(<(<(<(<(<(<(<(<(<(<'<'<'<&<&<%!<!<!<!<!<<<< /ÿ0&/0/ý03&/0/ý03%/ÿ./ÿ!/.þ/0/ÿ!/.þ//ÿ"/ü./0/ÿ"/ÿ0/ÿ/0ÿ1ÿ./ÿ/ý./0-ÿ./ÿ/þ./,ÿ-/.ÿ/ý0/.þ,-/.þ./ÿ1 þ+./þ./. þ,./ÿ/0. //0þ/0/ÿ,/ý013 /0ÿ1ø./00//../û02101./ÿ0/0‰<<<ÿ;þ);<<ÿ:ý86;<<ÿ;:ÿ;<<=<<==<<<<ý;9:<<;<ÿ<=ü9:;;<ÿ<ý7:;<þ=><ý8:;<ÿ><=þ9;.< ú>=:;<=,< =/<ÿ= ÿ;0<= ;0<=;<=<ÿ<;<=<þ=;ú<=><;;<=ÿ><ù:9B@;==<=<;=ÿ;"<;'<ÿ=<ý;96<<ý;9'<ÿ;<þ;%<ý:;;<ÿ!<;þ:;<ÿ!<ü;:8=<ÿ"<ú:6>==<ÿ"<>ÿ=<ÿ<;ÿ:><ÿ<;ü<=<;><ÿ<ú;<<==<><ÿ<þ;<=þ>=<ÿ<ÿ> þ?=<ü<<;<=ÿ> þ?=<þ:;<ú;<>?== <õ=<;;6;<<;<;þ<><ü=<;;üA=<; <ü=<:8 <ü=>BFÿ=<=<û;:;<:<ø=;:;<<=>‰ €' ('þ&%' &'' ''( ''ÿ( ý'(('þ)+ý'(' þ%'ÿ)ý'(( þ%&'ÿ(þ') 'ÿ''&ÿ( ÿ$'&ÿ% þ&%'ü)'&&ÿ('( ('('ý()* þ$&''ÿ( &'' '' þ%&'ÿ&' 'ÿ&'& '&'% '&'% 'ÿ&'ÿ& ''ÿ(þ+( '')ý.*( ''(ú*6)''( '('ý()('('ÿ('('ÿ&'&%'&&%'&%'ÿ&#'(ÿ("'(ÿ*'(ÿ,'ý)*(('ÿ+('!('('"('('(&&'(þ)*)ú%')('&'&ü'(()™/. /ÿ./ÿ- // /ÿ0/ /ÿ0/ /ý.-,ý/.. ÿ2/.ÿ,ý/-- /.ÿ-þ/( /.ÿ0ÿ0/ÿ-/ÿ0 þ70//ÿ/ ý830/ÿ1/ÿ0/ 0/./ÿ. ý100// 0//0 //0 /ÿ1/0 ÿ-/ÿ0/0ý-../ÿ0/ý0/./ÿ./þ./0 /ÿ./ý100 /ÿ0/ü30/0 /ÿ0/.ý620 /./1ü-./0 /ÿ./. //. 
/,/*/ÿ.'/ÿ.$/þ01$/0þ10/û./010þ50/ü./2; /þ03/0þ/1/þ02!/þ03"/&0/.)ÿ0/ÿ././ÿ.™< ;<<;ÿ= <<; <<; <=<ÿ; <þ?A< þF><;ÿ>< =<;ÿ=< <;ÿ<<ÿ:<ÿ= ÿ6<<ÿ@ ý79;<ÿ><< =<=<ÿ; ý>==<< =<< ;<ÿ;< <þ:;< ÿ><;<=ý@>=<ÿ=<;ý<==<<; <<ÿ; <ÿ=<= <=ý<;;<;ÿ<= <=ý<;;<÷;74=<;<<= <ÿ=<ý;:;<<ÿ;<,<*<(<%<þ;:#<ý=>?ÿ=<=ÿ>ÿ@<=ÿ><ü=<<>;<þ=:;<ÿ;!ÿ:<;<ÿ;"<;&;<ý=>>);<;<þ=>™ €ÀÀÀÀ›ÈMd&2 1Selection Mask!? " & Õ 1Õ[ØwØƒØØ› 1×s×w×{×׃ׇ׋×דחכןףק׫ׯ׳׷׻׿×Ã×Ç×Ë×Ï×Ó×××Û×ß×ã×ç×ë×ï×ó×÷×û×ÿØØØ ØØØØØØ#Ø'Ø+Ø/Ø3Ø7Ø;Ø?ØCØGØKØOØSØWØ[Ø_ØcØgØkØoØsÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ @ÿ @ÿ @ÿ @ÿ @ÿ @ÿ @ÿ @ÿ @ÿ @ÿ @ÿ @ÿ ÿ˜ÈLd&2imbalanced-learn-0.12.2/doc/_static/img/logo_wide.png000066400000000000000000000566431460233407600224220ustar00rootroot00000000000000‰PNG  IHDR 1„eÆ…iCCPICC profile(‘}‘=HÃ@Å_S¥*•vQÈP,HÑM«P„ ¥VhÕÁäÒ¡IC’ââ(¸üX¬:¸8ëêà*‚ nnNŠ.Râÿ’B‹ŽûñîÞãî ÔËL5;ÆU³Œt".fs+bàÝBÓˆIÌÔgS©$<Ç×=||½‹ò,ïsŽ^%o2À'Ï0ݰˆ×‰'7-ó>q˜•$…øœxÔ  ?r]vùsÑag†LzŽ8L,ÛXncV2Tâ ∢j”/d]V8oqVËUÖ¼'a0¯-/qæ XÀ"R!£Š ”a!J«FЉ4íÇ=üŽ?E.™\`ä˜G*$Çþ¿»5 ã17):_lûcìšmÛvãð?WZË_©SŸ¤×ZZäm×-MÞ.w€þ']2$GòÓ àýŒ¾)ôÝ=«noÍ}œ>ê*y#EÊ^óxwW{oÿžiö÷üjrÞ¡2ÌbKGDÿÿÿ ½§“ pHYs.#.#x¥?vtIMEå  ÄšBAtEXtCommentCreated with GIMPW IDATxÚìwœ]Eùÿßs·¦P„Þ„¶ $¡ˆÁ¨ ¢ -Dì"~UTl bd74 JU@P@D `— ½×„@êö;¿?æäg{Ïm3çž{ïçýzdï½sfž™sÎ|Î<Ï3 „B!„B!„BQk™@‘&fMi¶Ö´`̆dí¦ÖØM°l¬¬‡a<–µµ5Qž«`ÅÀÂUŽ×1¼ŠåcÌ+óÖ¾dŒhž1g@='„BH€!RLçÔñÀFÖÚ‰À„èØØX¿JšaW— Ýee!„ B‘‡¾YS[eGkÙìôHt´Ê2EñŠ1ÜiáNc¸-cžm:áÞa™E!„ˆBýS7´Æî‡å£ÀÞÀxYÅ+Oÿ4˜ëlÆÜÛ:cöb™D!„ˆ¢ÞDÇVÖØC± ì 4Ë*‰°(#—Ópssû=Ëd!„ Bˆšd cêFYì'£pãY¥¢,®6† ­ÉÌn1[q#B!$@„U.:.š2ÆfÍÖÚc}uŸH-ù`þØÒ®¬ZB!$@„UFçÔ‰ÖÚ™À§€ud‘ªa>ðk2üªuÆœE2‡B !Dº…GÇÔ-ö‹¸`òY¤jY€áLÛÜtÁ¨cîÎÊB!$@„©axÖž-ƒvðh,_¶–EjŠÙƘOµ´w=+S!„BT”þÎic-¶k¿ l$‹Ô,oÃA-ísî’)„BH€!§ï¢)£6í`O6Eê‚%cÞßÜÞu¿L!„BD‘ö‚½Ì@ãàÑÖÚï›Ê"uÇË&µ¶ÏyC¦B‘”Ó_ˆ¥¿sêÞý Zk/“ø¨[6ÂÒ9pÑ4½lB‘e!j‹¾Î©›`íÙÖÚOÊø˜Íf.—)„B¤½¢F°—Nk´'[k¬&‹ˆ•xLf›Ö³ËB!*\°„¨ú:§lß?m­ý•ćõÈf¿&3!„HZ¢Šœ5µqØÚ/b9 h–ED K0fËÖö®2…BˆJ¢!ª”ÞΩ› gí?±œ-ñ! 
`5¬ý¢Ì „¢ÒhDˆ*¤¯sê¡XÛ Œ“5D,l4f‹Æö.Å‚!„¨Ê‚%D1tÉÃCCçXkOC/Dñ¬5Œý0K¦BQ)4¢Jè®µöJà}²†(ƒž†1Í“›>ý+S!„¨Š¢ è›5e'kí‰á¶ìòÁ)2ƒB !ĈôwLù(Yþl&kXËq²‚Bˆª m¿Nî\Bx¦¯sêñþŠöö~%Èá½Om’„BTµé9zsù áU|L9k/BÉ"„Ö2ÃöÃ2ƒBˆª Bâ£cÊ÷±üH–Á°|BFBQ ä6%DúÄÇ9ÀWd ˜7hÈlÖzÂì>™B!D’hDˆt‰K|ˆ„X× [eÃB!"D½Òß1åûÀ×e ‘{ˆ¬ „BDˆ:¤¯sÊ×-üŸ,!æ—ì®ç€B !ê‰þΩ'*à\Tˆ³ÃCëÉ B!$@„¨ñqˆµö|”BT kÞ/#!„¢ÄGÇ”éÖÚ?é: ”„BH€Qëâ£sÚþ´È¢Â ä½²Bˆ$‘Û‡ Ó×9u}¬í6•5D*&3¾¥}öYB!DhDˆDÅÇ”-±ö‰‘*¬ÝKFB!"D­‰Ž)±œì*kˆTé첂Bˆ¤h” „HD|ì üØ@Ö)dªL „BDˆÚà<`†®7‘bv’ „B$…\°„Ë1ÀI"å¬Ö×9u«å>ØÇ~2{Áôæ°á‰2MI<’ÞçÈ4BÄ2?ǵóe !RB_Ç”qÀ÷e Q4ØAfB‘ B„ã\`#™ATÖÚ]d!„ BT)}SŽŽ–%D1Q&B!"DuŠ©Àot}‰*c;™@!„ˆÕ'>¶®FË¢ÊØp`Ö´±2ƒB !ªG|Œn6–5D2ÎZ»®Ì „BDˆ* ¿cŠ.6“5DµbaKYA!„ˆÕ1qû2p¨,!ª{ Û-d!„ B¤œ¾Ž)»ß•%D "„BDˆ*à4t.$@„B !BÓ×1¥ ØO–5‚b˜„BH€‘rÆË ¢FXw`Ö´F™A!„ˆ)¤¯cÊZÀÏd QC¬“µH€!„Š4B”Îï eQC¬aL¶èKÙsjíþ>¼™ã7 ÀžÀG6`0«µ x˜ ü¸xªˆú`7àcÑ·‹Ên‹€€ྨü¥ Û,ì¼ØØXX ÈK€×Ç€›nÀ&T¿1À€]€VêŸÕ±À ÐÙòeà¥ÈžwG6í¯‚k©’m,åšÉ»DuÞX3ªë@4f^þÕñ¯ÑJ‚ད=wŒÆóÚ+çEÀÛÀ«‘ {¢kûùû{gàC‘ 'DõŠî €G¢º]]{I±°/°7°-°yÔ¯£€å‘Ýžfãö2»;‰{ÑóVˆâ‰6|6zÈQ3˜Lfë–³ŸòTÜcÑo$ö&¾ù˜ Ü?ÂßïÀÊ4'_ ðx ü8x0Ϥþ(àëÀöEØ`p5ð­hW ½‘ÀY•“€ Wú÷šÀ—€ã£IG1<œt ÇŽ&?¥ðfÔæó€WJ,ã‘H¬ÊO¯Ö@‹¹fš€ö¨Ý›qŽaàwÀ·#ñ‚-¢kùèH¼ƒþü!ªgˆ1Ý Ì>›c<ÅÑ |/#¡Ø/²ß~EÎ÷_~ ü&ž+3Xg„ß|…"=Bä‚%Di|BâCÔ$Y[­«zS‡€s).˜ÞD“Åû¢ÉûHl‡{+xY‘âƒè>q îÍñÌ€í? 
x&:•ðûm€_F¢ñ ÏuÛ¸¸øxspo¾ÏæŸJÑø«Æ6î̉„ÎæEþ¶!ZOŸó\¯1ÀÏ£ö¾ñ±âº~o$äñ\ÇC¢¶ŸW‚ø˜„[EºØÔsݶþ½XÙŸâ6~<ÝW‚ "D‘ôuLY8S–5‰±Õ˜TáÓу|Û2ÊhĽÁ;u•¿çš0­Ì:Ž&C§hÿp.¡ã<”µy41º÷v¼\Ú{qî>Y ø#𣌿jlã#Q½K™å´¿Æß^X“p+‘_Ä­0àiL_ \î¡ÌQѵv­'á°wôâÄW6Í£qn^ðPÖZQ[ïé^ "D™|•‘— …¨z¬eÍ*«ò‘8‹Oåƒ[ñ šüç·ï‹_àü±}qîm¹oN½Í/g£3q.]!ãMOÆ@¥¨Æ6îü-8¾ø6ð™2ËØ»±UÀ{ÅMe´{à.ü¯ ¬]kŸ,³œoF÷BßÞGE÷A¯šADˆ"èë˜Ò€{#*DbªI€LŹEù|–µD“©­q±Íêý OÖMqq ¡Øø3ÎݦX>|'¡qÐIeÒ¡Wc×Å­pµ¨ã¹Àz%þvïh’Úµù¸7úŲp.¾&MQ½>TâïOÎ h·ÿ'"Dåh#ÜÛ!R€­²v49É5à^œ_ú.ÀÑwÆG×ï×q±q‰±#|¶çròñè^°f4™Ûç¦u&ùƒÍ·ö`ƒ¯ÄLØ& {ëã\eZ¢ â^¸·êÝœãÀ&6ë³ò|gEp~;îüúQ[ZpÙwŠ&=?ÄÅÆ1œ$ÕÚÆ_Eu_•…¸€íOá2:m‰ïq¸X‘SpÓòMÒ¿RB6‰DQkžï½†‹SúXô‚`õHȯýûpœ›ã¢<åT¤-€+£þÊyãþ… úžõoKÔçm8×ÎÛ‰Ï.Õ\Eñ®]ûàbfò1/ÌûDõõñ:À”¨o©ãw9»ZI( –EÐ×1å2Ê_f"Íü¦uæœS=•2 ÖH¼Ž ô¾>Ï÷Z£‰X{mˆÀ¿Ž&–ù&‹¿Âe£ÊÅeÀ±ž»·€ÉÙ æG“«k üþ~8¢¸IOx.«P!ü‚ܱ.CÑ„ûÀ[EL¿Œ‹uɵrÔMü ÁG¬4·1×538‚`_µûœÆ5¸„ ;Èþåh,e‹˜‡þ+_9½8÷¢ßD×a>Æâ2LFîí½Ñ „B²Œ}·*š‹ãXvÆe¿¢v1^ý“äžhRy}!—3p"ÎïºàÞþ¸ÀIÚ2\zÎ¿ä™øûæ6ÜÛêkŠøÍÍ‘Ý.Ï3W8¯À9CKŒðÂÅ¿|§ˆ‰9¸”¯gÿÖz’ÛÛ¬ZÛØ4‚X˜Š[µ[V`—áV †s|¾nžBùTñ17T?/P|€ÛwçKQÙƒ9¾3ŠÂVkv¾óùO€éŠpöŠU.1°n5§¾™G|ÌÁ­h]Ná{{ÌÅeûjB²h$@„(L|Ü­VYCÔ4¶*Çøý¸Tº Šj©Ë´3œç{ƒÀa‘À¡Èò¿3iZ¿™eîŹK•²9Ü\öœ«b¾³.ÛX>ö!wïqowKe.8×äzý„Æ[-´ñ5Üî¹%üöŸÄǺÕ@¼{ß Ñdý¿%¶ñ œ«a.Ž.àüiÌw¾•_ʦ}?$>-÷È{5žø¬zOF÷ÅJ¨_6jû) BT–ÃQð¹¨Zª¬¾Kq+“‹KøíÓ¸T¤ù& w–X·òLF}eÓ{!½e”‘Åe»™óBÞçJW<œï¡­—Ä|66¡1W m<¾Ä‰é Ή×;XÆaäÞ¤ój™í<—æ6×õ—^{rg¬». 
å0 çÒ9[‘?Nìdr¿]Ùoa™uü-R]K€‘‡¾Ž)™èm…õ@µ­€œ <[ÆïoÏ#nÊÍ2uGÌgã<Ùàk¸Ý³Ëe šÔär»Ø ¬Çv9þÞMi«3«òxÌgI¹`U{¯þ^foâVÝFbë˘óÙy¸D åbq®m¹Ø+æ³ScÄÑ)”¶ò±*§Çˆ¬“óü6.õW¸|¯Ì{¬ˆ%² Åï+Dµ2ªŠêº—õ¦æÅ|öʃøTÌg>ÁÌÁ¹šøâœŸ.òí̽~ v(†ù)wÕÞÆs=•“ke¡}sÆã\ÙFb ~ߺß@îÕšísü} \¦»‘¸?ãƒåävC›Nî´Æ“-s|¶·B勾è%‡ˆ s0¥åÁ¢i®¢º^‹[¥(‡7b>»ÝC¶AGÂe~8Ïoß «UO{ªÛ†)wÕÜÆyÀlOeåZu+$‘Åcž«ÅÏŠÞÊ‚¦%GŸã7‰¹^ê¹Oþ˜C e€rüæyî‹oy®ã_}ßË$@„ˆ“ýSÖÀª !ÒÇmʈËþóŒ‡ò—lÿ ñ™¶Je6ðDŽÏ¶!>ëNhޝƒq²ÿöXVŽ¿²™àô<“ÝJ“kCÀ瀇=ŸkQ̽lïoÂö¹îI€‘ßbäM›„•LJËK\¬Ë÷!*;n–¶ ôõ(Ü&tµ¼SmìJI[seÊÊ·¤ ~¹bît¾\åæ ’ß1Æ~·ªãß}ÖˆbDú;§®›ÍÚv£í:…H+K—ß—òö?°ìûb>Û ø[Bòɸ«"¹4»I‹Ž$ÛøJ ÚlÈ{ñ\×u!sã\¨>蜹lÕgh¥¿6Ëñý§ ·êú o#ד;—%;Ì8°aÌX»9.%ÛšQçŒ1X»·Dÿ:Ƽˆµ¯Ì+û¶1f‘1öífLê×< ±¶Ý˜‚‚é„¢<°ìî˜Ï6õ|®p»/o… ¬Ý —Ii›𧤥 S`‹ñÑv‹HçìkÖ” ÈòYBÔ)½2AÕ06`Ù£Éí?PÀïOÂmx×Zäy‡çqûUü7:ÁŤd]SdÿzhcL‰Ÿ%Åü”ÙkUÁÑ_„XñÍ_U½™òçÇ3ƒ‹—]b-Cê†Ï{úò¹m~§gæ.7êù™B²ƒRT‹úE.XÕÃËŽ{žæ[%û6ðÝÎñ(n'÷\Jå§qY«ÀöõÐÆPôVhLÊ«)³×ª¢"nås\µÜsªV€Lîü¯±ô¿pñ²›¨®{[Éý[[gÏÆÚã2™ÆûÓ"-#ˆºÅh¤Š¹cöf1Ÿ½óÙÇóLÌß~üx¶Jí^m I\œB‚õç“;çÀ+\¿×b> ½o™·{NU¾åmûýs™¬í»ÅZûÏ*ïÄÚm,Ü=œz`RgÏ¡º'UžþΩ»d Q·ØTdÉ…±SÀ²'Æ|–k/‰u€ßÆüî:\êÙïWñļÚš¸ìmÛ¦ ~CävÛ‚ú½Â;÷Y™]‹µ}VudRgÏNô.|ø` ]Œ;Xk¯nëè¾¾­£gsÝ›** ?I:|P…¨&YhDa„Ü‘|ǘÏËñ÷Ïáöx‰¿‡ágwùJRm Í|r»í”’gpOŽ¿¯›‚º OĸP"É«¸©*ÒÖÑ}еv°Z^”‚7©£»S÷§ \Ñ—ìÙ`ÝÒºõ«Á1‹d…ªa á\Vöù,׈ŸÈñ÷åÀ ä~k[ -¶y=´1ümÆÅÆŒÄj„YÙûKtÞU\búžß-%6쉙×(Ð9½&ç©2©£{&ð›:¸0[,Ìhëè~{RgÏT=_“#;84—RQˆºÅ`Ê UC3Â}wr»Â,Ê1yl—ã7ÿÀßÜ›UÐÞõÐÆ¤¸/æ³#‰õ‘èÊñ÷»sü}GÂz¿b> ÇšÁ­îÕ—iëèþµ… ëìÁ²†µöž¶Žî?ë› «Õ’f¬ 1Ÿ5yºSNW9#ÑxaÖ´/¡½ÞÔ ®*Úñï²rBÌgwàÒÈ®Ê{b~3×cÝ*™"½Ú˜ÿŒùìHü&Ij#wðt.2‡‘3•5ä¹>Jåó”i2»<]åŒÄðÈ´æe¥ ›+óiÖo{™¯A]Uìå±¼õóËU‡—ȽJÓœã±~MÀOBLpÓ+>:º¿ˆåüºŸß¹Í ç¶uö|MÏÚ04d3›ÛËqÓ¦ô×1ÓZÙr²}ùEYÙ‚(ã·­«vs|ŠG‘NÎÆ_pìÙ¸MGb .ËÓˆ·Ñ<“jü"¦n‰<*ê I± ¸6æóïâ'¾i›1ÓMüÊÕïrü}Ü”¾8†Ü+4·ÄüώöóT¿¯á2»Õ‡™ÔÙ3%„âªbš°ö'mÝ×Nî|¨Uæð,ò²öY!Ÿ‘*°oV±nMÙ¾°åMÅÎ5²ëhßÐ`¯:ÖÇÅ ”»IØ£c>Ÿ‰‘ˆËžv€‡ùÆÉïƒÞØÎõÐÆ$ùeÌgëFb·œùN+p ¹c‡/ÎóûkÈJùDà$6Ø$ê÷‘xx(æ·—“;%6À•À¤2ëw.¥´ÿG_Gä®=4ÖZ{ïÞ~^À!Y;|ÿäK\O¦ð8å‚Ë )dxIºÊ A6UÛn,ÆmŠZ´áüÂ7/ñ÷§ï¶±øiÌçq{^l |µÄz‰&‘ß(໡3dÖC“äào1Ÿï…sù[³D›^ì‘ãó·¢ |쓃ø '/Î*c.½1p#¹W/ÏÎóûà;1Ÿ¯½˜x_‰õ;¸Œ@i‘S'@v»ä‘†áááÛIÇf/iebv0ûô¤ÎžídŠòé½t·f`ïÚkYS©gFk`U'o˜3(3T-Sq)rO¤ð7åëã\a~“ç7? 
þmë«À 1Ÿÿ0š Z¯fœëÌ#À±þfÓÀö­‡6&͈ŸùhdŸbRËNǹWÅy6|3!ùø¹c-V”ÓCqîNà3ÀýäÞoç9 $Dãæs1—1ëW‘ )„u+€K ˜y6udhhèÒ“g9ÍŒ±ÖÎiëè>D¦(sš>Ù¿e)ÁªsE±¼Þ|Âì!™¡j)Áj¸ºŸÁ¹1—ÖuTôÌ_ —b÷“Àïqoõó=G ÖkòÌ7ÎÆùÜŸìMŽš¢ºm ìMÌ.ž:).5ú‰äö¥÷E=´1Iž&ÿÊÑÆ¸Xˆ{#Á².>¤·Ò±°Odóžh½MLyÝžYÕâÜãÄÊN¸•†ÇŸG×ÜöQß7â^¨o‰©ŸDm¾ŒÜY¥²¸Õ‡ÁïGáV(sNs€Sqë¸ÔÝ[G¶kˆê·ò=áyFÞïfˆØ¬'Åј¦Q8éâž1vÈþQÏ”‚ \3©³g¿îö¶[eŽ’'êï¯5•q÷K+ñ!JâE™ ªø1Î}¤`ÝMqÁ£å&/y!šŒ’Yá\\Zà8† ¸Õ”R¹!T#mV·7ðòJÿÞ˜çÙæõÐÆ¤ù 0™ü«@Ó¢£¬—,Ñx.f"ý,.âo¸U«\l_,³Ž?"wv®‘x2׿ú¶nÅ­Ô c?ŽÖñÑé©Z±CöJ=OŠŸrZkÿÑÖÑsŠLQ2ûÔŽ–ÊF F½*J¸›˜ge„ªâÜÛôÁ€åïK¼ëÕªöÿ¶÷æhòøJm^m¬3(Ì娖àöÈxº„ßÞ ô& ƾUÂïn>‹ Á<<£§F€LêèÙø€ž'¥ö£ýu[G÷þ2Eq,¿x÷Q¸àÍbÅþEB=vž‘ ªŽq.T¾³Ü ìŽs+)†_ãÜ<|sΧ¿ø]´“ Ú˜4ÃÀ§pAßÙå?ÉÿbCJåzÜ Ôóê—Å”ŸZÆü \,ŠïTê/EÂmiM ‹ý>ƒ]êü}r烇Ë…Ó0œÝ™úȹ.jíC›ˆyJ¶­Z² p—‡²–âkßKioá³ÀLà{øY™yç;2Î\`p%c•ê¡•™ ™¸ÌUz,ó\:Únå­ÿ þV»q®eßóPÖ8×½Ëñó&òVœ{œ÷ÕñTɳܚܻ@ŠbîŠ6{Ù¤Înm¨Wð­ÉN­æê÷šÆ€ºµ®ô0Wƒ \ÕÉ“¸·³Åá;Hæã¨·Âes.³>߉&B×Rš[È}¸¬·nZå³§€ÓB]jcÅéŠÃÇÛK´Árà|\<Îñø}{¿·R18XX¢0ºçn7¿+^óqîXÓp©ˆ‹²Yà6\0ý¾@½¡Rá(ÞÖÑ}Ú‡Á'ó7¿ï˜í—ËñôuNù–OVS‡³††LÊUFftùû[˜Œ‹iI±Ðd6i™q¯îécûÏëWÈgƒè¹º'0—…gîÅãR`A$ZŒ&wÎ|p.{Ñθù5pYxúqü=‡ó3¿ø'…‰á=€/áö‹X'ªÿ+¸ý%®þ’à¾ÚX)6>åmqYÃÖŒlÛ] #»Ü ü'1˪_ nÅpoÜJä{pé­Gã’=-Šêønµ£ dþBBõKc<—´`K\JÞÑ‘ýáV8æE÷[Ƚú¹-#º¿¼YUdRgÏzÖڇɽ‹(š§Í9~'¥ÔŒ SæEo1jŠah®ä~¹>H¥Ï]É6t÷n;ì³ÄGZgÎÙIw!„ÁÕ¯µ»K|aòàÐð©2CŒøèœº6°Išë¸¤¯¥âHS IDAT´w¾ÅD±Œ$|•“&üŠ0ÌÕ]A!D] _Q7³îÏ'ÍzpÙ!×|‹ÍHaúàðÿDÇj­tµ2eÜ|MÞ³ËÁ4•/P|•S”ý’R~nãÆòˆî B!j^€ìzÑC«á|úD( ’Í^ÝvQw«,1‚m¬M»ÉÒ~CSC â;Œï[ƒ)ý{6&ÑH¶ˆ”ì±åXíð½B‘»ò~ªkÌú+!„¨y’>T]œÉšƒd†™˜¶ mI‘ø(7|e*ÚÌhÃEO • –BˆÚ Öíà*‚Ú^6¥ã1­‚¼›TŸ/íOA%VLèÿ¿ø(c‚o³ÿû½IX™Ê·ùÅÌ»ÎRáÔ KZÛçht!„µ-@&uôŒÅùà‹ð´ Ò{‰Ì°Ò¼õ²i\*ºÔ0¶âÛpŽ$>l€s˜Hè±cûÊuQŒhó“2y@&BQóÄb×Áå´Éð‘IÝÈ ŽÛ l˜–ú,ë¯ôüUV*LÆÓ¤ÞŽðoûî‘Lcáå­êf4 ËgŽL „¢æ°6°šº 1Vs²ÌMc cqG¥‚1iˆýøÿbÁäp™ H¶Œíj ö–B!ªB€¼ïï r½ÚÝ 9n';½´öÿv¹äÑQ²›žý?–¤á2°«ü¿\šêêÞÉüGVBQ3äί3âLƳƒÌ_‰÷àÀ÷dÀÚÓR•Ñ͚싊ò䨳ߔ„BÔŒÉ=ÿKWpqÄäYÝuïúf ©ˆ‡Îj@ŠŠs—L „¢.ˆÁn!óW„M²ÖlW÷V°¬›†j4d4 E…1Ü!#!„¨ ba}™¿RÆ·'ÊÊÀ&€1 !„õ!@0¬%óWŒãeÖ¬åÆ-êU~Qg2™Wd!„õ!@¬RðV’¶Žî“êÜ5=þÖ¥ÀvQ·7ò !„¨Ú=¬²)R9–ök…BT—„BÔ“Ñ ¬²¼§­³§žÝà*ºÊØ­PˆŠ³Ô4˜»e!„õ$@zeþв¡±v£:nk¥Nœ ¬=¬´(Œ®æºÊ B!êI€,–ù+Š3½ŽÛ_1ÀLàµ?£µEQØà:A!D½ 2e±Ø#4ö…¨K†3Æ\/3!„¨·IØ‹2ÅÙ³®õWX> å ‘ lžÑõ¼Ì 
„¢Îˆ‘Im=õº+úP%N:ºY"®„BÔ10OæOöƒuÚðAõ½¨[ýaÌŸd!„u'@,ö™?Ë~uÚôåµÖ erï‰îj™Ñõ’,!„¢îð2UT ›M¹ôÉzœ¹.­µ‘{—(ŒßËB!êU€¼ ,QTœ5†—®Q‡í®©±§ÕQèP±6sµÌ „¢.HƘ·€·ÔgukíêuØîšÚ€M«¢@nh9[÷]!„õ)@ho[ ÌWTœÑ`ÆH€T/‹{µú±‚Bw_Ú_šÍúýÙº7fÕj`¸øò²6ÿß1eWüðÃ4Ò„BT¥0pƒº â4aWÍ6¯ÕJKVnõ£Zö-Y ŠBwÛâl6\`Z”´¹µ©|[¯£bV­šK˜âgÌÈ‚l¥¿?Ñ<&{gÙ ¸êªa„Bˆj Ö˜«Ô)ÀÚõêOðj-4£w0Œ@rå¦}ß’±/cZJ«gC&~µa…XhiôS߬Y($" šOÝ*ñG!„¢=ímóP z &ãfíúÓö¹ZhǨ¦0¡¹1ÝÂcÅÊ…Ø—æ†Ü®[ùÄB¶ÈÓg-(åŠ+­-³™†‹vI6ï%'„B¤A€¸¤VA*Þ–êOse^¨ö6XÏa¸Š’b7d|‡o ¦jüe£N¸÷í|_›8qb¨g‚²$!„H±Ö^¤n¨p`ׯ»6[zI Ââ¾pâÃxœüöï“újcpØ„oAˆ[YÚÿ¿¯?+¤¼¹sçjo&!„µ/@0æIà5uEEY³ÞÜÒ>»x>ôyVo uÙø+kIŸ æÊõN‘Sž@ MSCq6è2Aû+_\Çòø˜±-ÿÿì×´¶w=£ÛœB ˆžö¶ùÀç~YM!„È*ô´·]‹²aU -õÙpópÈâCÄT”ºŠð®¦ÛðoðËe(믽¾)ÆýÊ'Y ­Ø$r'ËbÌYºÃ•ÄÀO€Ç¢gÓÀn2‹BÔÀ˜3Ô•2=ÍuÙnx„üéCkµÏSÍÒ~CcŠƒâC‹·\«g…dݲÿà·´¶wÝ£;\ÁŒNÀݾl‹[!~ðo`šÌ$„µ$@¬½­‚TÈôõ™£¿©…—©’•zã^ ‡Íÿß©¼^)gõ,)YŒùFx Ÿ Ó§Oo Tt p(p=ð ð `RŽï¶¦^WŒ…¢HÏÌI/Ýê’ÊÌùêrðÓ5 ô„({È㺊Ow©¾ ötÜŽãï† µ#>†*´¾fàÏ­3º |šÄ:êŽ;îò\ä4ààUàjà@ ‘³p’BQ#ÄU&sŠº¤2ó¤ºm¹á®Åútòé.ÕZÁ˜Š|ÓÕB1{z g“]¹Z>@¯%øêG5²ðMàqàÞHH”’ðD™R!jH€<0s—¹`þ®nIzn–ÕoÛùF@x–ô埈/í¯7³baÒþ£›ÁÀÏZgv½Pø%PÓŒŽnžÎ¶)³Ìí€tU !DèáþE`@]“[·±7 ™ûHiì‘­¡0ˆ|©t­¥îã>Ê%Úý%Læ'E]þ™8qbˆgJ±")|ø=ð:p °g±µFœBÔéno{èR×$©úX\¯Mo<~ö˜»ÓV¯Áa“úLUž_éE’e¼óÍ_]Ò|ÂÝY —k8/¡nóÊm2ÁÿhÙuðxZ&«åÐPæÕÝÜ®½£›ës|-ísyVL,n·ï3[gΪUåqñj6ôpš‡€ ªqÝ©;¦BÔq"dÒUÀ,u'Œ¹CFXÕ&ü4éSVÓ†|å²| ºÚZlß,މáÛRVUÞ.h9gN^™ÍÀ×=•Uè5žK€,B+ BQ? gæ¤và:uŸ[¶·="+¼“†Œý=èîðƒÃ~\°Êu»÷–>Kú £›«+l¡˜8ŽÅ½†ÕKÌîµ4>ø¼÷ƾÖï{qFø$°±‡s¼\Yàw×Êñ÷7t§Bˆ: ™††#p)EúC&aÂyÂ}Ý[au[šõ™²Ý® ì·ô9Y6`jz£ÂE½¦¬}WÆæ¶Í pðËÖ™s^©ñË/΀§y:Çoqü…°IŒˆBQoäv]ÿªKĘ+d„œ¶é^«–ê.F“ûáÍcškW| ea0›>÷?m9ç:¾"ß LòÑUÀ…E|c !„y=3wì7Æì ¼¬®,ko•F¦uF×2c Ê’“ |¬ ¬LÈôZ¤Ü,a9vD~Ô:sÎ EzøáµÔ‹_ðTÎ î^9 Ø(ÇgOkÔ !D €îö¶7ššv$~Ã(ñn^Øe† ™¦¸ÀMä<éÇ´ëÛtÕ§ÔM"K«)fsÉ¡aÏâફj%MïfÀAžÊê(â»ÛÇ<#ŸÐ R!êX€ÜÜŽoeŒÙU…¢x®§½m¾Ì›–sú1æŒb~SnŒC¶ÀŸ/î5©ÞpQ ë7ªÄÔ¿¥ÄÕôÀØ–ÂÇBó;w¾XŠÛ ¯tñQ[œ øXÍy¸¥ˆïïó™ž5BQïàö¶…=3'M0ð/uk~LáY`êšÖö®+€»» M¼ér³úÕG%»¼Ð7XøÄâã!–ô™Ô¯ÞŒÄÀðÿìGÿPÉbg—X£ø¡Äc€<•u n#ÇB‰‹9yR]#„ +µ(ó!cÌOÔµñtÏœt®¬P X3œ $æÎ²bSÂÞAÃpּí«Ø@s_ýÖ¦ÿýÿâ^Cÿ!kÝd{å½óM¼—G™°Ln{ÒÜßþƒÃ†–ÆâËŽDç߀3ŸÕyÀùÊ ¬é©¬?ùý÷æøû|ª(A…B¤rnU« këìùÖ^èñáUK3êó{ÚÛN‘! 
§¯sÊO±|¹ï..3k©,í7E¹ý$Íà°)jO  gßéBµ¤/\*àUÏU ¯.Ê,Ù`ì>ïz>ͶñPÖÝÀ^E|5ÜÆ#¹~Ý|TÝ#„¥“©Õ†õ´·]i2 ïîT7¿sö·2C±S¡Ì·)0 ½âc(›;æÀÇæ„åbí»7ô ½û{ïà»÷E %>¬¥œ=Xî¿ðsB@ñQ­÷ùx¿+òû{‘;î¤^w¢B ‚&e}Ëßê™9izH¼@Ý ÀlƬ¡ ‹¤uÆìåf’ +V1Ä¥mHÁU>’ËUÈÕeý†QM¶¢í+›M£Ùçǹ÷ª€ÕËVéeçkãÁ Xû~$æ³ûtGBˆ2Ÿ›õÒжΞõ±öLàĺîðLf£î»¼¢¡_}S~áqbTÞ¬jè]Ù“REï ŒjÊ×Cs£_¡P)¸î¾`Oim¿oPWÖ»˜€s¿òA).SO[Žðw ŒÞT !Dédꥡ=ím¯õÌœt’1™pþÀuÖ×Ös˜ÄG™sÆ&¾ŽËTTÀ<œ¾ʦ[| ›¼âð.>–äw+$“UðkÎnn:±FŇŸà±>Å®~l›C|ôH|!„HÑt·ïòd¦ƒÙɸ¬3õÀ 1f÷îö¶k4äË£å¸9ýdÌáÀ²|ß éÔ˜ò+7é`ó¢gtéo[Ê=ËË{u±<ƒùDËÌ9?sÌݶF/“rÛÕíkX×ù›Ãb>»UwA!„(Sï˜Üñà{²dÏöÖ«±æ-njhl:æþãw”›‡Gú;§~ÚZ[PZOk©Ê´³EÏ:«¨CÙÒD\™m|ÙÕ®LWyøX ¢!'>žc$&æøìƒÀmê"!„¨œ±µ$jÚ:zÆcìdc9ÍÂþUÞ¯Æðm0—w··½¨a†¾Ž)?¾X`¢¤ý!Àí‘ pUùJ‹ª~…Ð;Xz°ù²ØæÂ~ë!}ï™Læˆæ³_¯³çK)Fû+p§:Ì:‹øþöÀÜŸ-Öúu÷Bˆdˆ­À9g§Y®Óh³Ÿµ–€Í¨Ž•‘—€§À\Ú3³í2 íð _²WãààÀÀ¾¾'½KûalK2í(å­~ˆàñÿ_ö°Ûô/^Й²]© ÿÞ^ö 1œ›±§7ϼ§`ç­éÓ§7ÞqÇCIå‰'fæÎ›†lYëF÷²&CØxµˆßü8#ÇgW‡ëÎ'„ɉ›²ú$ž?šY><0̺ÀÁX{þòÒû`.˜?ƒýðBÏÌI 5¤“¥¯sÚêØì=ävÙõËRcÌq-í]WËóàOeͦñýà`ß‚âÚ…B”8á·UV_!e`Ö´M³Ùì=¸·­BtcŽliïzB¦(й87(| 8«ˆïÜã³å¸Õ™eê"!„(Ÿ|a˜iËÒbóB$NóŒÙ/cöGé9…ãW³§ÄGÑLó(>®/òûq©o’øBä[Q°uÖ^!J¦¯sênXûO`uY£.yÝŽmiŸó™¢$:€vOe½lZÄ÷7žÁ¥‰©Ÿ´íBœL‰mÒ ŠBk{×}¸€ô%²FÝqɰƒÄGÉŒŽðXÞÍE~ÿ”ññ.¯BˆH="q"Ê!3çtù „õÁB0Ÿn9çà–sÈ%s~W‹ £€1Ÿ_ «‹„B$mâDE¬$BºîØ}€WdÆpɘZgv].c”Í1ËþYÄ÷ÖÎñY¸HÝ#„ÉM¨%PD9"¤½ëa2™=ÇešãUcÌ¡M­-‡·ÌèzUæ(›õ€éËëÞ.âø•˜ÏoÀ¥æB‘€(¢\2cös; ¸SÖ¨ †Àü&“ÉLhiﺶáè»tÝúáPÏÏ¢bܯ#~o§sÔ=BX€LØy¯íe’T Q¥´Ì¼ïm›iØÌù²FUsÆLiÙujóŒÙJ2àß»‹ã~uzÌg÷w«{„Â?&f2,ª¿EŠèëœz<Öž´ÈUËóõ–F®0Çuee﬇‹•òµ² ðÝ7Æ|~0pºH!’¸J„H ÏôÏšº“ÍÚ+ ²FªY†áǦŸµ?§WæÆÉÀyËû{$, ¹'>´åøüQ`'=…" Š©’Zft=Ü1“1æbÙ>ÌydÌ­ísÎ’øÎ'<—w[ß;,F||K×§B„C+ ¢Üq"J¤¯sÊǰüØ@Ö¨8ƒ~Ÿ1æ‡Í3ºž–9Á·ûÀ$ 'Ïwp+Ûæø|0µÊl9>T»DÇ{€5€5££èÅíOô20/²Ó¿€¹5>Î6öŽút[`+œ›Þjî…oó‡€nÜŽ÷Ï$P¯­€³CVùû‰ Ùf ðq`7ܪü¸h¬¼k€?ëp2ðA`,0f„ÿž\™Ò±µFtÝ­| <ðœ;á6<Þ#'ëá6r¼ˆ[¾"ºÆ«zb)"$NBÎz/š2.;ÌÙ6~#4RxÀÀü¨ufד2G¢œüÆcy u xnµ1ŸˆâÙ+ŶÑäñãÀä2ÊyøpðZŒ­qÉ ¶+á÷÷¿Œ&¾¾çAkâVØ>4ðùOˆOŽàƒC€ï(t"L:êM€'€ÖŸ/‰&ØiX‰^?“V[&$×fâöKÚ¶€ïg£{ÜÀ[ BE䤯sêXûÛèÁ)ÂÓ æÒŒ1?nnŸ­}*ÃÀû<–w}4QŠcuàI`ݟ߆{›Vš€#/;û¾ çgF¿j£8ø,þV°î&~=ÊjNŠì»vÌ÷Þ ü'Ö.>Zäïº#›y®Ï%À±y>?¾s–-FëðÛÓ#é‹ñÀ7£—&£KøýÑýìÅjJ„ ”è¿h·†33-ö;¸·>Â?¯1çµÎèZ sTŒîW_~–ç;g_ÍñÙp4Ñx$…öM¬¿lø\/Ÿ¤zR&ö_.p’XŠ0; ¸¬Œ2Àí)“ïíõ›Ñµ1 
[ãVö6-ñ÷Ÿ.÷XŸÉ8wǸ{Àûqn‚¡hŒú¤m%Á± εªX²‘m_öP¯æè>õuœ«`9kù"p*î-Ÿ(Ž¥þl1ç·4e2ÇÝ«}<ÒÃøu¿*dÿ›c&oâvD_˜"m‰´}+t~‹ó9ÿ} ÇÏÇq«YïIØŸþTÀw×ÃÅXœ@q«|Gà?àzu\LK¹M¿læ©NŸ!ÿŠÒoìÏ«"qï‹c)o•¬ø!n•3Äç(\Bš˜ÈIŒ •蟵ûš6;|r$DÖW·Æ’îÃÐј1W5žÐ¥]ËÓÇ:8W8ŸîWùÞ>~*ÏÃ÷d\vZ8·"3ºÂõÀeêJ‰]¶ˆìòáJ½ÔÀÅÞ<3‰ü. X×™AœÏÿ"Ïuþ}4ù,—å8WÀr‚[‘«é š„‰@ !D.™ÚlùØSpî&ÚÇç÷œÇ€ËMÆ\Ù2CÙ¬RN!o?‹å[ÀY9>[—’2W\UÎ$ +d££ ö±EüæqœËÆ-8ïA´‡àâ"Êõ!çßWA»d¢‰ý÷€Qe”ó2p-pC4}·º¼%ð1\°ïºyʸ ·zgWy–|÷Ö¾Ô‹Ûx¶Û‡qî7>$·ûb1\ŽK¤G7åevËÇÏ¢ñä‹+q«W¥p8p‘‡ë´ö"E±]•˜€I  “2é뜲#–c£y½î#òpƒ1æš–ö®G5l«†+ð¿a\°êEäΤ3ˆ ‹ßtî×áÞô—ÚŸw{ðÝ›2¸½1|&/¸Œâ\&׎^ŠL«À½&Uq Õ69ÑÆ‰¢ÚEr02ö¸û†GÇ×úfMÝØXÞk­Ý;šälIyþÓ!^zŒá^0³Mƒ} ùø9K5¤jŠ”yOŽ¿Ÿ#>†qnY•®¡ðT /ãV>Š};|¥²oBdmœ{ÓÞe–s ¥'XŠÛEú1ßiðØæ<Ûpþ÷’*ÇMñœÅ„ͺv þ3§“µlsÜ*Ä*ÃÚ¤ˆzÛcAET»È.™Žiã³Ænc¬ÝÍ:—íqÁªãnãËÀsÀãÆð°Å£ ¥d¤Ú ·|9Ü[Á É–Àßq)‘ËáTÊßg¦øábIDAT¤¾Vca:.VÇ×3äÉíC€¿–(†: üî’H8…Ú«éÀ~Ë{+ªï`߈ËÖWɘÍ7HÑ&ÇÚ…ZEÔ±@éëœÒbLfukí–X¶»Î}}\<É8\¡1ÑÑŒ nÌD×Ë .Ug?î­ábœOì`×,æcxÑÀ3^²ÆôµÎ˜-WªúccÜæb¾Ù"´+ظŸÜ)lÀVлDúbba Ùé=œKcSu~ Ø$ M&EâcÝ2Ëù.ˆÙÂe· É[Q›‡<•·dG߬Gñ»iï‹s7*tÜ]Bî„å².}²Ïì‘¿ÃÅåcrô²¡Ò+OábDŠ:ªý¢R`<¿ºÊ9Fáüœs}9þW`Še+àµ"Ûy·‡—o–iëem2%šˆ—;þ…_×ò¯$pO¾Ü³-ÿ ŽsK¨Ç^ј)æÆÁŸõÙäÀ÷Ô;<×÷Ç금Âc8¾ŒË0WÊy¶xíÝÀ.q±Ó)Ìý¬—Dc.ÆæâÀãíÓz ‰(¢žøB€ñ–DÅ–¸àØ8¦Ý*Üþ£úVêåxö¾Ò£=Æáö¾ð1Þ$ܬ{¾g~Ås}o PÇ?؟הqŽ^{áü}Úd˜Üî~{’å£çºªËÙŠ›)æØ!$PtHœˆ:âÚcêq\ŠÔ|ÙÓ*Üöõ)-|0š<ù`š{ÕS]p|ƒÏ컾/úÞâuÌ·Ìa¸Ôêåœ#ä5zz›Üã\»‘?žéqâ7œh¬-!\Œ(:$PD*y=À˜¹·éfÚƒ.ÿRbû|Öý4Ò3Yþ©Ç10òR çãà€÷¼'=×ut€:¾…Û%—¨½ÝÃ9 »7ÅCìò­γ °0Ïïn'ÌÏÓÆÛ-z :$Pê‰m‰|nã\*ɧÊhŸÏ]ã¯*ÓÖ¾\dñ<ÜŸ xOû¹çºn Ž®rŽQÀá¸M}ãÆ€ý·} ¾[Õ¥sùWŸ.)@,¡ôš|Çz ‰:$Pê‰ã+0âbC*ɺ¸ 9K­³§zøHÍúõؘò÷"Yù¸'>üAÀ1ê{Ï‹mÔñÜJÇ©¸¬O!Rü°ÿBìý±€wº2mJþ׿¤°}|Bºüí¡G‘Ñ¡C⤞¸8áþÆíÀ\Íí¾Ìc=Ž(ÓžOàâ6Ê!ƒËøä³Ÿ“èãËÑ·ñ»a"¸¾«íÞ¼ç:Š'Ôyåô»ëEׇ¯ôÂ? dçb2™ !a¢C‡JMðXÂýùµ´yÊs¥8Úc]î-ÓžñP‡Ï{îã{êÇ®@c4DÚàQøÏöúø]À¾Û5PWlfºð Gñ.¦)D¯ÒcH (õÄj”–~¶Ôã’”´ûÆ2Û±¡§zì›{nŠ×ýèà áR¢¨ÎÝUvo ¹Šõ³@uÞçÖ˜/æÜ"ë»c@;«G‘!:$Pꉽì‡ÛI‡›Áô2Ûñ´§z4–Q¹Àêêq“ç~~$¡~ •n©òÉÔQÆím›Äoš·بˆòÎ-Ó!3Ö­àýøóê{nÀú…Âßĺ©©õ®Jœ·Eœ‰Ð< ýËSÚÖ»<µï¦<çi>ƒ[½(¥ü¿ÚaK\À²ï~Ÿ™`_ÞèÚ=½Bcó§yêõÕ"ÊÚŠò¼ÜÖºÿÂC]›qû턨ß┾¨¢:'Í£ZÇV2&£®l­CÄ!2»¼Âv¶âÏeç/9α. 
ô©˽çךo÷&Ô—›PصcóÐÆE1ñ?)Ó_ÜÞ%p¾ŽÒ6\•#Ö±SÓN!Ò!”UâD$YÖ ÔþSØÖ½<¶ïŠ•Ê5Àt\PíÒ2Þ8••èóy öe(õ–§Ik1LÊ3núq{ÆÊû<ØaËÀm¾#ðý÷~`´§ºÞ°ž»kÚ)„¿‰oµ‹ õ4Ûµýö¶õëÛ÷gÜ.æ¿Æe%*µœNØ[êó ª#ðR 6\›p_lX@[ŠÝ‘½Ü,oÝ ôß²€÷Ý·xw:ârú'ÔIUË„£!ÂOjEu`ԗ›*wa Ûº‡Ç²ŽˆŽRèÇ­ üŠð¾ö#ñ‘@åÞ‘Pý§¸`ìbøw‚ý0¸!O[nÅ¥æ-”€)eÖ+´›€¿Õ‰‘8xÞSYŸ"ÜŠØÅzüñ¿IkYÇ><`Û4ÔCðÄl¬£úÇà©Ú>+…m}¦Âãá)à ÜNì•ä/Ú·UBõŸ°&'Ô†Fàúôè»'pÝn¨î‘ Ïwþç)-0y+ü¤Š ½yè8òïö^êñGÏu=3à8Û9€m§R~ö³œdÂÿ¤Ï¹8 ?ch¤C„eT rRÔÆ3“¡i¯*êçÕ ëßÿFàúÿ#`=}®Ä¶àbvV”==ÔMD.XÂçÍfçÉ{%õ¶×¤¬í"½Œä¾µ²@áîUÍîbÍÊ]³‚m…˳lçúDŠûv<ð/`×Î5¨Ü³p.&¡yÞcY+V>ò ´SÞæyG{¬÷Ï6Ý:ÝBJs]ËŇÕóÜf¤¾4Á€½õÈÕ&@ªñm¯VA„w&ì¼×Ø ;ïµÁ„÷Ú¬ÈñW+¼I˜7}7V¨=kãüÁ“J80hJa¿n <– ö І©ø ®.ä8ÁS›"ñ[È9Oö0Ö<Úà}79º6’è;Ÿñ&à=ñ»ëùÛÊŸ®'ºøPýEm^3µÈ+¶ÏU -[“|ֳϤ¬O·Æ½ÑOÒŸô܆Q¸¸Œ¤ê˜‡:7ã‚ 9Ÿ¸¤c<Ûà`O}7X”`ßýÎã¸Û)`=}e©úqŽò%@„ˆê/j• ;ïÕ­˜lX#M å_ŸÅ?'Å4œF%Ò.?BzܤwÃe¤JÚ?öÜŽßÇœ+DÜÒG<¦x®¯{²Ñåžmp¼‡:Œ ªOrìùŒ7;5PxºG|+æ ¢æˆD”¹Êô ;ïµÏ„÷úQ•Tù!ªç­x.ŽÀeשäÞ/Ǥ /?ŒÛ ±í÷¹]Ü$p·ºïú—“v,pGçù’'ü»8•›êx&.¨>é±w”DZwy :þÅCݾ‘ç ¢¦Å‡ˆÅ ’´¯˜ÜpbpUàº\r6Ÿ| —qªR|¶B“¿•…ÁæÚqñq gfCÅ%Öw-`vçø¼ÇþžÀ=eÔç»{>±çªc¹«^ß.à ¢¦ˆÄ”~ʺ‘@y_…ªpYÀ‰Á2\ lÖ®/¢.—âÜÂBN„®®@ÿ5¿*°~¡ýòVf[ö'Þ}çÞ¨½«¨{)oÿ7¦°8•~àcžû}f >Ü­Èz´àb0 )ûí@uÞÜ“MÇ.éA9þgPþ Àúz¢ŠZµ(@$BDšÅÉæ‘K×Ì€§ùNàIé9ê<‰âöX‘}æ¿„û ‘u)Üõçm\V©™±KlË'‰w£{Ø4úî(ÂÄñÃ.è¿ç2æ›ßêÃ{q›k*Àî-°Ü…‘¸yÒs}³øËB·[Àkc·êÓ½8ÉWö«¸ÄBH|¨]B%¾WL|nd6ÒÑ lâ©®à˸7Ê…NNN]é÷—’ŒKÈ Š&…Ôç­•&@ Üö'(îMlpnýxà*ã DÝ ½¦ŽÄ­îå+o)°O þÿwÀ>ü#ñû5'RøŠÆ -úí žëêsï’#Út÷"ë2¸­Àöï '£QÛ„¨6¶M`Bþ/ «š‹‰ÀŠ8g4¡X™O‘œ_ú¹8w!߬Û Pw²×qoëWðÃÚ>o¥ g‡D‚¥” [!’<‰[UÊÅf¾ÇÇÛÀž¯Û—÷á\à¸èåA3Εr. ÓsE”óJtí®Àw†¶ÿz´ééíYLBŽ6à™Ê|s•k[MÐÕ>!ª½E =)½´D²ÐAqÖo3r@æšøÝ¸-ßñþv!‹Ë‚³°ˆó¿LX¥œÃjû p nƒÂ ¢IìFÀÀ÷(Üî–ãæå@õ~8 çîÕŒs3ú0nU PÑ3ç&’å ŽãRgy÷.èoy>Çmzn@[\\Àù‹Yá] ñ!$@ÔF!ªkIfBrs4 -„ÉÀ…æê²òñ<ñ. Ox–®Á¹â˜úfg\M±oŽǽ±_•ñ„Æ÷9Í•Yìá”Öùe`ûÀ×ksôÝ#9®uß»ŒÿÕ£];Ú£Ÿøø¨=€9¾ª¹³[BsµSˆj'TF\A¹çŒ&Ã8ç ÀAÑ[ÈR'—sÈð± ONgí8Ÿð€ÑÑ›ÏÕ"Á°+.%ì…”4߬cƒž*˜À.vŒiÃM)¬ó3¼ûÖHyßÝ…[m ß›…þÁ£]ÿØ. 
p±3D÷½q{˜ÜNy«šBÔäÄ\bKˆÚgœËŒ­âãêh2Ÿ ávOÃq#.hßLy†È¿3ùù¤ïÿ ]¯†pébË=®Zcêî;ýBv½0å×Å“øK9\=#E‘òzÂÈnB”Ìœ¯}µÞëÎÇ­®ä#‹ÛÌ®9·Â³,Ï÷þ”òvœ†[áˆã‰Õw6n3¼Wóo§°ß~ÉÿkïîB÷,ë8€ç”Æ¢÷Ì—¬l†8–R‘AoZCÍI0 ‘:P„<¨£Ê¢:ˆÞ):± *bA±Ð*A”´—娑–4sa‘èÔ$š³r[êÖÁu‹FþŸçÝ÷}ÝÏóù;w]¿û¾öü~×Û]Îü?ÿÍÊçÁŠÿ¯} ‰])Û´þìçŠ)ü(/Ûj€U˜ßÅ™ÞlÿãI.£¯Ç¥ÌX/ʪÇS]Ò>‹›íËçVØþ·§§õ#Œ×ݽXa»ï¬üw©bL¯ÍtW5A"®ß0YSJÊw§\!<¯ó3ÃØÏô·‹æèÿ… öå[3´ÿyÒ¶ôsÕòJlkä™íëÞ¥•ªýý’š«™[×eõW˜ƒ$\ÿ¡iïœHÒ½½ÒŒàW&^|ÜäŒUôÿg õå»™}«ù®´¿RÓ—÷5ðÌîÉì_à®ý!ÌOUŒé‰ ‡¥X ¸þÃR¸¥á„û]âU˳RnŒšbñ±-«ßúsnÚXú~æ›åýÌm}2å³±>ò³»>åÛ4³Ú^¹Ÿ¬×?40ö¦œ÷€¥K¾ˆƒ"„¥uzÊ×Öî_§Ÿë'OÊ´nÅz<Éû]Æßv5ï%;›nëþ$ïhh¬Þ4Âó:”ÙÏíë•Ûó±Ê1ýôÈãáÖ$/ñ3„Ä[,`½·¡„ûÉ”m}îµ?+ÉC(>v$ye徯OrïHýùb…ö鶴÷%ÙØØ8½hàçµ'ÉkWÙæ/TnÓµ•czjÊö§1¶\}4õnô ÷Äã¡aY}¶„ûWI^3PÏH»+!ÿLò¡ôwÿÆ”Ùý!“­«+µ}ˆƒÃ·$yQ£ãôÇôÿp’¯¥Î¹«Ú7M}¤‡˜~gàñý› …(@ÄÆXæÚ—äƒþûW§$ùicÅÇ2ÌÇǶd˜™ß’¼¹rÛûzf‡S›·ü¶W¤lËësågsÅö^]¹}î!¦/Ì0+¢O¤œa9ÁO mqQ€À«½eâ™fÆ¿šqg›×&ù|Ê· Æ,<~›Ù®7­áÒž‹“¼¸‡vŸ•äï•ÛúP’ &2F¯èáYL9±®r[ß]©}{“Üä=Åôâô{ÈÿÆŒôUsh‹ LÅå)«}lýFÊln+ÎIòó _v…ÀX{ÁÏKòhê¯z\Ös»·¦œª±êñí$ÏŸØýxê­úlïq,ž?g›îIòÍ$W&Ù0PL¯é¡¹½Ç¢ $Ø EËî´”o5ÔüAÞ›rÀü”†û½%Ém=û»äê =ëVè×_Röü¯¨Ý[³ºíH?I¹šxªÞŸ²r1ï×̯OrvÏm\ÉÍe‡R.ør÷LÇ\½,«?õTÊŠ‡«u‘\/Yr-FPÏÙ)«ónyÙ›rõê™Ö~7¥|w¢ÖQ{R®$Ý’ò=’]’ä3öëé.‘¿j¤~mHòƒÌ¶úvC’·.Èø<'e–}¥ý¿?åœËËjßIÇhÃcInNò‰”’uÅôeI¾—Ù·eÞ•rMðK§þR¹š‹c%ÖÞ-±‚1¬KÙ®³9É«“œ™2Sùœnœ<‘ä¯]‚³'É)³š¿[€¾Ÿšrúõ)·gmHrrÊ-Aë»þ:*§lCú}’Ý)Û¬P7&yWÊÖ‘M)ç8žä@ÊÖ¼?¦l‘Ù‘²ZôH#ãå]a±1É º‚wÊmgw¥^¿©K€͹IÞÓõÿUIž›²:òHWDïLùæÄ® ;Á¶¶‹ÿÝû²#ÉÝ)+«­;­{§ÞÖý›wrWdHò·$:j|ß–äÁEy™$>ô‘T¯+ñ8–ã„€êe±FÜ ,FR€„f~Ü‚KüŠ‘’ñe-B Àä A€f™¹Åê‡X Æ  afìë,ma aá’\³þŠŠ‘ˆ €&Y„X$JLžÕ±`0Ç s2»€äAŠŒUI Šã m΀ àXÅ XLVA€P€ €ö M¯;ÏÑ`ư|ÈÓ‡¯:`äÈ‘lºÉY³ç‹ð –¯Ir¦ÐIbñ܆*@NºizÓ[.üÏJˆ„Vñ0¸ãçüs Ý4ݱãÖ#[Æ2ï ÈN¡€öo¬F·3âùŽIEND®B`‚imbalanced-learn-0.12.2/doc/_static/js/000077500000000000000000000000001460233407600175665ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/_static/js/copybutton.js000066400000000000000000000053631460233407600223410ustar00rootroot00000000000000$(document).ready(function() { /* Add a [>>>] button on the top-right corner of code samples to hide * the >>> and ... 
prompts and the output and thus make the code * copyable. */ var div = $('.highlight-python .highlight,' + '.highlight-python3 .highlight,' + '.highlight-pycon .highlight,' + '.highlight-default .highlight') var pre = div.find('pre'); // get the styles from the current theme pre.parent().parent().css('position', 'relative'); var hide_text = 'Hide the prompts and output'; var show_text = 'Show the prompts and output'; var border_width = pre.css('border-top-width'); var border_style = pre.css('border-top-style'); var border_color = pre.css('border-top-color'); var button_styles = { 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', 'border-color': border_color, 'border-style': border_style, 'border-width': border_width, 'color': border_color, 'text-size': '75%', 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '0.2em', 'border-radius': '0 3px 0 0' } // create and add the button to all the code blocks that contain >>> div.each(function(index) { var jthis = $(this); if (jthis.find('.gp').length > 0) { var button = $('>>>'); button.css(button_styles) button.attr('title', hide_text); button.data('hidden', 'false'); jthis.prepend(button); } // tracebacks (.gt) contain bare text elements that need to be // wrapped in a span to work with .nextUntil() (see later) jthis.find('pre:has(.gt)').contents().filter(function() { return ((this.nodeType == 3) && (this.data.trim().length > 0)); }).wrap(''); }); // define the behavior of the button when it's clicked $('.copybutton').click(function(e){ e.preventDefault(); var button = $(this); if (button.data('hidden') === 'false') { // hide the code output button.parent().find('.go, .gp, .gt').hide(); button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); button.css('text-decoration', 'line-through'); button.attr('title', show_text); button.data('hidden', 'true'); } else { // show the code output button.parent().find('.go, .gp, .gt').show(); 
button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); button.css('text-decoration', 'none'); button.attr('title', hide_text); button.data('hidden', 'false'); } }); }); imbalanced-learn-0.12.2/doc/_templates/000077500000000000000000000000001460233407600176615ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/_templates/class.rst000066400000000000000000000007111460233407600215170ustar00rootroot00000000000000{{objname}} {{ underline }}============== .. currentmodule:: {{ module }} .. autoclass:: {{ objname }} {% block methods %} {% if methods %} .. rubric:: Methods .. autosummary:: {% for item in methods %} {% if '__init__' not in item %} ~{{ name }}.{{ item }} {% endif %} {%- endfor %} {% endif %} {% endblock %} .. include:: {{module}}.{{objname}}.examples .. raw:: html
imbalanced-learn-0.12.2/doc/_templates/function.rst000066400000000000000000000003231460233407600222360ustar00rootroot00000000000000{{objname}} {{ underline }}==================== .. currentmodule:: {{ module }} .. autofunction:: {{ objname }} .. include:: {{module}}.{{objname}}.examples .. raw:: html
imbalanced-learn-0.12.2/doc/_templates/numpydoc_docstring.rst000066400000000000000000000003251460233407600243250ustar00rootroot00000000000000{{index}} {{summary}} {{extended_summary}} {{parameters}} {{returns}} {{yields}} {{other_parameters}} {{attributes}} {{raises}} {{warns}} {{warnings}} {{see_also}} {{notes}} {{references}} {{examples}} {{methods}}imbalanced-learn-0.12.2/doc/_templates/sidebar-search-bs.html000066400000000000000000000005631460233407600240310ustar00rootroot00000000000000 imbalanced-learn-0.12.2/doc/about.rst000066400000000000000000000012131460233407600173650ustar00rootroot00000000000000About us ======== .. include:: ../AUTHORS.rst .. _citing-imbalanced-learn: Citing imbalanced-learn ----------------------- If you use imbalanced-learn in a scientific publication, we would appreciate citations to the following paper:: @article{JMLR:v18:16-365, author = {Guillaume Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas}, title = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning}, journal = {Journal of Machine Learning Research}, year = {2017}, volume = {18}, number = {17}, pages = {1-5}, url = {http://jmlr.org/papers/v18/16-365.html} } imbalanced-learn-0.12.2/doc/bibtex/000077500000000000000000000000001460233407600170015ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/bibtex/refs.bib000066400000000000000000000201731460233407600204210ustar00rootroot00000000000000@inproceedings{mani2003knn, title={kNN approach to unbalanced data distributions: a case study involving information extraction}, author={Mani, Inderjeet and Zhang, I}, booktitle={Proceedings of workshop on learning from imbalanced datasets}, volume={126}, year={2003} } @article{batista2004study, title={A study of the behavior of several methods for balancing machine learning training data}, author={Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina}, journal={ACM SIGKDD explorations newsletter}, volume={6}, 
number={1}, pages={20--29}, year={2004}, publisher={ACM} } @inproceedings{batista2003balancing, title={Balancing Training Data for Automated Annotation of Keywords: a Case Study.}, author={Batista, Gustavo EAPA and Bazzan, Ana LC and Monard, Maria Carolina}, booktitle={WOB}, pages={10--18}, year={2003} } @article{chen2004using, title={Using random forest to learn imbalanced data}, author={Chen, Chao and Liaw, Andy and Breiman, Leo and others}, journal={University of California, Berkeley}, volume={110}, number={1-12}, pages={24}, year={2004} } @article{liu2008exploratory, title={Exploratory undersampling for class-imbalance learning}, author={Liu, Xu-Ying and Wu, Jianxin and Zhou, Zhi-Hua}, journal={IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics)}, volume={39}, number={2}, pages={539--550}, year={2008}, publisher={IEEE} } @article{seiffert2009rusboost, title={RUSBoost: A hybrid approach to alleviating class imbalance}, author={Seiffert, Chris and Khoshgoftaar, Taghi M and Van Hulse, Jason and Napolitano, Amri}, journal={IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans}, volume={40}, number={1}, pages={185--197}, year={2009}, publisher={IEEE} } @inproceedings{kubat1997addressing, title={Addressing the curse of imbalanced training sets: one-sided selection}, author={Kubat, Miroslav and Matwin, Stan and others}, booktitle={Icml}, volume={97}, pages={179--186}, year={1997}, organization={Nashville, USA} } @article{barandela2003strategies, title={Strategies for learning in class imbalance problems}, author={Barandela, Ricardo and S{\'a}nchez, Jos{\'e} Salvador and Garca, V and Rangel, Edgar}, journal={Pattern Recognition}, volume={36}, number={3}, pages={849--851}, year={2003}, publisher={Elsevier Science Publishing Company, Inc.} } @article{garcia2012effectiveness, title={On the effectiveness of preprocessing methods when dealing with different levels of class imbalance}, author={Garc{\'\i}a, Vicente and S{\'a}nchez, 
Jos{\'e} Salvador and Mollineda, Ram{\'o}n Alberto}, journal={Knowledge-Based Systems}, volume={25}, number={1}, pages={13--21}, year={2012}, publisher={Elsevier} } @inproceedings{he2008adasyn, title={ADASYN: Adaptive synthetic sampling approach for imbalanced learning}, author={He, Haibo and Bai, Yang and Garcia, Edwardo A and Li, Shutao}, booktitle={2008 IEEE International Joint Conference on Neural Networks (IEEE World Congress on Computational Intelligence)}, pages={1322--1328}, year={2008}, organization={IEEE} } @article{chawla2002smote, title={SMOTE: synthetic minority over-sampling technique}, author={Chawla, Nitesh V and Bowyer, Kevin W and Hall, Lawrence O and Kegelmeyer, W Philip}, journal={Journal of artificial intelligence research}, volume={16}, pages={321--357}, year={2002} } @inproceedings{han2005borderline, title={Borderline-SMOTE: a new over-sampling method in imbalanced data sets learning}, author={Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan}, booktitle={International conference on intelligent computing}, pages={878--887}, year={2005}, organization={Springer} } @inproceedings{nguyen2009borderline, title={Borderline over-sampling for imbalanced data classification}, author={Nguyen, Hien M and Cooper, Eric W and Kamei, Katsuari}, booktitle={Proceedings: Fifth International Workshop on Computational Intelligence \& Applications}, volume={2009}, number={1}, pages={24--29}, year={2009}, organization={IEEE SMC Hiroshima Chapter} } @article{last2017oversampling, title={Oversampling for Imbalanced Learning Based on K-Means and SMOTE}, author={Last, Felix and Douzas, Georgios and Bacao, Fernando}, journal={arXiv preprint arXiv:1711.00837}, year={2017} } @article{tomek1976two, title={Two modifications of CNN}, author={Tomek, Ivan}, journal={IEEE Trans. 
Systems, Man and Cybernetics}, volume={6}, pages={769--772}, year={1976} } @article{wilson1972asymptotic, title={Asymptotic properties of nearest neighbor rules using edited data}, author={Wilson, Dennis L}, journal={IEEE Transactions on Systems, Man, and Cybernetics}, number={3}, pages={408--421}, year={1972}, publisher={IEEE} } @article{tomek1976experiment, title={An experiment with the edited nearest-neighbor rule}, author={Tomek, Ivan}, journal={IEEE Transactions on systems, Man, and Cybernetics}, volume={6}, number={6}, pages={448--452}, year={1976} } @article{hart1968condensed, title={The condensed nearest neighbor rule (Corresp.)}, author={Hart, Peter}, journal={IEEE transactions on information theory}, volume={14}, number={3}, pages={515--516}, year={1968}, publisher={Citeseer} } @inproceedings{laurikkala2001improving, title={Improving identification of difficult small classes by balancing class distribution}, author={Laurikkala, Jorma}, booktitle={Conference on Artificial Intelligence in Medicine in Europe}, pages={63--66}, year={2001}, organization={Springer} } @article{smith2014instance, title={An instance level analysis of data complexity}, author={Smith, Michael R and Martinez, Tony and Giraud-Carrier, Christophe}, journal={Machine learning}, volume={95}, number={2}, pages={225--256}, year={2014}, publisher={Springer} } @article{torelli2014rose, author = {Menardi, Giovanna and Torelli, Nicola}, title={Training and assessing classification rules with imbalanced data}, journal={Data Mining and Knowledge Discovery}, volume={28}, pages={92-122}, year={2014}, publisher={Springer}, issue = {1}, issn = {1573-756X}, url = {https://doi.org/10.1007/s10618-012-0295-5}, doi = {10.1007/s10618-012-0295-5} } @article{esuli2009ordinal, author = {A. Esuli and S. Baccianella and F. 
Sebastiani}, title = {Evaluation Measures for Ordinal Regression}, journal = {Intelligent Systems Design and Applications, International Conference on}, year = {2009}, volume = {1}, issn = {}, pages = {283-287}, keywords = {ordinal regression;ordinal classification;evaluation measures;class imbalance;product reviews}, doi = {10.1109/ISDA.2009.230}, url = {https://doi.ieeecomputersociety.org/10.1109/ISDA.2009.230}, publisher = {IEEE Computer Society}, address = {Los Alamitos, CA, USA}, month = {dec} } @article{stanfill1986toward, title={Toward memory-based reasoning}, author={Stanfill, Craig and Waltz, David}, journal={Communications of the ACM}, volume={29}, number={12}, pages={1213--1228}, year={1986}, publisher={ACM New York, NY, USA} } @article{wilson1997improved, title={Improved heterogeneous distance functions}, author={Wilson, D Randall and Martinez, Tony R}, journal={Journal of artificial intelligence research}, volume={6}, pages={1--34}, year={1997} } @inproceedings{wang2009diversity, title={Diversity analysis on imbalanced data sets by using ensemble models}, author={Wang, Shuo and Yao, Xin}, booktitle={2009 IEEE symposium on computational intelligence and data mining}, pages={324--331}, year={2009}, organization={IEEE} } @article{hido2009roughly, title={Roughly balanced bagging for imbalanced data}, author={Hido, Shohei and Kashima, Hisashi and Takahashi, Yutaka}, journal={Statistical Analysis and Data Mining: The ASA Data Science Journal}, volume={2}, number={5-6}, pages={412--426}, year={2009}, publisher={Wiley Online Library} } @article{maclin1997empirical, title={An empirical evaluation of bagging and boosting}, author={Maclin, Richard and Opitz, David}, journal={AAAI/IAAI}, volume={1997}, pages={546--551}, year={1997} } imbalanced-learn-0.12.2/doc/combine.rst000066400000000000000000000044411460233407600176750ustar00rootroot00000000000000.. 
_combine: ======================================= Combination of over- and under-sampling ======================================= .. currentmodule:: imblearn.over_sampling We previously presented :class:`SMOTE` and showed that this method can generate noisy samples by interpolating new points between marginal outliers and inliers. This issue can be solved by cleaning the space resulting from over-sampling. .. currentmodule:: imblearn.combine In this regard, Tomek's link and edited nearest-neighbours are the two cleaning methods that have been added to the pipeline after applying SMOTE over-sampling to obtain a cleaner space. The two ready-to use classes imbalanced-learn implements for combining over- and undersampling methods are: (i) :class:`SMOTETomek` :cite:`batista2004study` and (ii) :class:`SMOTEENN` :cite:`batista2003balancing`. Those two classes can be used like any other sampler with parameters identical to their former samplers:: >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=3, ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... class_sep=0.8, random_state=0) >>> print(sorted(Counter(y).items())) [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.combine import SMOTEENN >>> smote_enn = SMOTEENN(random_state=0) >>> X_resampled, y_resampled = smote_enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4060), (1, 4381), (2, 3502)] >>> from imblearn.combine import SMOTETomek >>> smote_tomek = SMOTETomek(random_state=0) >>> X_resampled, y_resampled = smote_tomek.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4499), (1, 4566), (2, 4413)] We can also see in the example below that :class:`SMOTEENN` tends to clean more noisy samples than :class:`SMOTETomek`. .. 
image:: ./auto_examples/combine/images/sphx_glr_plot_comparison_combine_001.png :target: ./auto_examples/combine/plot_comparison_combine.html :scale: 60 :align: center .. topic:: Examples * :ref:`sphx_glr_auto_examples_combine_plot_comparison_combine.py` imbalanced-learn-0.12.2/doc/common_pitfalls.rst000066400000000000000000000170421460233407600214500ustar00rootroot00000000000000.. _common_pitfalls: ========================================= Common pitfalls and recommended practices ========================================= This section is a complement to the documentation given `[here] `_ in scikit-learn. Indeed, we will highlight the issue of misusing resampling, leading to a **data leakage**. Due to this leakage, the performance of a model reported will be over-optimistic. Data leakage ============ As mentioned in the scikit-learn documentation, data leakage occurs when information that would not be available at prediction time is used when building the model. In the resampling setting, there is a common pitfall that corresponds to resample the **entire** dataset before splitting it into a train and a test partitions. Note that it would be equivalent to resample the train and test partitions as well. Such of a processing leads to two issues: * the model will not be tested on a dataset with class distribution similar to the real use-case. Indeed, by resampling the entire dataset, both the training and testing set will be potentially balanced while the model should be tested on the natural imbalanced dataset to evaluate the potential bias of the model; * the resampling procedure might use information about samples in the dataset to either generate or select some of the samples. Therefore, we might use information of samples which will be later used as testing samples which is the typical data leakage issue. We will demonstrate the wrong and right ways to do some sampling and emphasize the tools that one should use, avoiding to fall in the trap. 
We will use the adult census dataset. For the sake of simplicity, we will only use the numerical features. Also, we will make the dataset more imbalanced to increase the effect of the wrongdoings:: >>> from sklearn.datasets import fetch_openml >>> from imblearn.datasets import make_imbalance >>> X, y = fetch_openml( ... data_id=1119, as_frame=True, return_X_y=True ... ) >>> X = X.select_dtypes(include="number") >>> X, y = make_imbalance( ... X, y, sampling_strategy={">50K": 300}, random_state=1 ... ) Let's first check the balancing ratio on this dataset:: >>> from collections import Counter >>> {key: value / len(y) for key, value in Counter(y).items()} {'<=50K': 0.988..., '>50K': 0.011...} To later highlight some of the issue, we will keep aside a left-out set that we will not use for the evaluation of the model:: >>> from sklearn.model_selection import train_test_split >>> X, X_left_out, y, y_left_out = train_test_split( ... X, y, stratify=y, random_state=0 ... ) We will use a :class:`sklearn.ensemble.HistGradientBoostingClassifier` as a baseline classifier. First, we will train and check the performance of this classifier, without any preprocessing to alleviate the bias toward the majority class. We evaluate the generalization performance of the classifier via cross-validation:: >>> from sklearn.ensemble import HistGradientBoostingClassifier >>> from sklearn.model_selection import cross_validate >>> model = HistGradientBoostingClassifier(random_state=0) >>> cv_results = cross_validate( ... model, X, y, scoring="balanced_accuracy", ... return_train_score=True, return_estimator=True, ... n_jobs=-1 ... ) >>> print( ... f"Balanced accuracy mean +/- std. dev.: " ... f"{cv_results['test_score'].mean():.3f} +/- " ... f"{cv_results['test_score'].std():.3f}" ... ) Balanced accuracy mean +/- std. dev.: 0.609 +/- 0.024 We see that the classifier does not give good performance in terms of balanced accuracy mainly due to the class imbalance issue. 
In the cross-validation, we stored the different classifiers of all folds. We will show that evaluating these classifiers on the left-out data will give close statistical performance:: >>> import numpy as np >>> from sklearn.metrics import balanced_accuracy_score >>> scores = [] >>> for fold_id, cv_model in enumerate(cv_results["estimator"]): ... scores.append( ... balanced_accuracy_score( ... y_left_out, cv_model.predict(X_left_out) ... ) ... ) >>> print( ... f"Balanced accuracy mean +/- std. dev.: " ... f"{np.mean(scores):.3f} +/- {np.std(scores):.3f}" ... ) Balanced accuracy mean +/- std. dev.: 0.628 +/- 0.009 Let's now show the **wrong** pattern to apply when it comes to resampling to alleviate the class imbalance issue. We will use a sampler to balance the **entire** dataset and check the statistical performance of our classifier via cross-validation:: >>> from imblearn.under_sampling import RandomUnderSampler >>> sampler = RandomUnderSampler(random_state=0) >>> X_resampled, y_resampled = sampler.fit_resample(X, y) >>> model = HistGradientBoostingClassifier(random_state=0) >>> cv_results = cross_validate( ... model, X_resampled, y_resampled, scoring="balanced_accuracy", ... return_train_score=True, return_estimator=True, ... n_jobs=-1 ... ) >>> print( ... f"Balanced accuracy mean +/- std. dev.: " ... f"{cv_results['test_score'].mean():.3f} +/- " ... f"{cv_results['test_score'].std():.3f}" ... ) Balanced accuracy mean +/- std. dev.: 0.724 +/- 0.042 The cross-validation performance looks good, but evaluating the classifiers on the left-out data shows a different picture:: >>> scores = [] >>> for fold_id, cv_model in enumerate(cv_results["estimator"]): ... scores.append( ... balanced_accuracy_score( ... y_left_out, cv_model.predict(X_left_out) ... ) ... ) >>> print( ... f"Balanced accuracy mean +/- std. dev.: " ... f"{np.mean(scores):.3f} +/- {np.std(scores):.3f}" ... ) Balanced accuracy mean +/- std. 
dev.: 0.698 +/- 0.014 We see that the performance is now worse than the cross-validated performance. Indeed, the data leakage gave us too optimistic results due to the reason stated earlier in this section. We will now illustrate the correct pattern to use. Indeed, as in scikit-learn, using a :class:`~imblearn.pipeline.Pipeline` avoids to make any data leakage because the resampling will be delegated to imbalanced-learn and does not require any manual steps:: >>> from imblearn.pipeline import make_pipeline >>> model = make_pipeline( ... RandomUnderSampler(random_state=0), ... HistGradientBoostingClassifier(random_state=0) ... ) >>> cv_results = cross_validate( ... model, X, y, scoring="balanced_accuracy", ... return_train_score=True, return_estimator=True, ... n_jobs=-1 ... ) >>> print( ... f"Balanced accuracy mean +/- std. dev.: " ... f"{cv_results['test_score'].mean():.3f} +/- " ... f"{cv_results['test_score'].std():.3f}" ... ) Balanced accuracy mean +/- std. dev.: 0.732 +/- 0.019 We observe that we get good statistical performance as well. However, now we can check the performance of the model from each cross-validation fold to ensure that we have similar performance:: >>> scores = [] >>> for fold_id, cv_model in enumerate(cv_results["estimator"]): ... scores.append( ... balanced_accuracy_score( ... y_left_out, cv_model.predict(X_left_out) ... ) ... ) >>> print( ... f"Balanced accuracy mean +/- std. dev.: " ... f"{np.mean(scores):.3f} +/- {np.std(scores):.3f}" ... ) Balanced accuracy mean +/- std. dev.: 0.727 +/- 0.008 We see that the statistical performance are very close to the cross-validation study that we perform, without any sign of over-optimistic results. imbalanced-learn-0.12.2/doc/conf.py000066400000000000000000000246331460233407600170330ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # imbalanced-learn documentation build configuration file, created by # sphinx-quickstart on Mon Jan 18 14:44:12 2016. 
# # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import os import sys from datetime import datetime from io import StringIO from pathlib import Path # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath("sphinxext")) from github_link import make_linkcode_resolve # noqa # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.doctest", "sphinx.ext.intersphinx", "sphinx.ext.linkcode", "sphinxcontrib.bibtex", "numpydoc", "sphinx_issues", "sphinx_gallery.gen_gallery", "sphinx_copybutton", ] # Specify how to identify the prompt when copying code snippets copybutton_prompt_text = r">>> |\.\.\. " copybutton_prompt_is_regexp = True # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The suffix of source filenames. source_suffix = ".rst" # The master toctree document. master_doc = "index" # General information about the project. project = "imbalanced-learn" copyright = f"2014-{datetime.now().year}, The imbalanced-learn developers" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. 
from imblearn import __version__ # noqa version = __version__ # The full version, including alpha/beta/rc tags. release = __version__ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = ["_build", "_templates"] # The reST default role (used for this markup: `text`) to use for all # documents. default_role = "literal" # If true, '()' will be appended to :func: etc. cross-reference text. add_function_parentheses = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = "pydata_sphinx_theme" html_title = f"Version {version}" html_favicon = "_static/img/favicon.ico" html_logo = "_static/img/logo_wide.png" html_style = "css/imbalanced-learn.css" html_css_files = [ "css/imbalanced-learn.css", ] html_sidebars = { "changelog": [], } html_theme_options = { "external_links": [], "github_url": "https://github.com/scikit-learn-contrib/imbalanced-learn", # "twitter_url": "https://twitter.com/pandas_dev", "use_edit_page_button": True, "show_toc_level": 1, # "navbar_align": "right", # For testing that the navbar items align properly } html_context = { "github_user": "scikit-learn-contrib", "github_repo": "imbalanced-learn", "github_version": "master", "doc_path": "doc", } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] # Output file base name for HTML help builder. 
htmlhelp_basename = "imbalanced-learndoc" # -- Options for autodoc ------------------------------------------------------ autodoc_default_options = { "members": True, "inherited-members": True, } # generate autosummary even if no references autosummary_generate = True # -- Options for numpydoc ----------------------------------------------------- # this is needed for some reason... # see https://github.com/numpy/numpydoc/issues/69 numpydoc_show_class_members = False # -- Options for sphinxcontrib-bibtex ----------------------------------------- # bibtex file bibtex_bibfiles = ["bibtex/refs.bib"] # -- Options for intersphinx -------------------------------------------------- # intersphinx configuration intersphinx_mapping = { "python": ("https://docs.python.org/{.major}".format(sys.version_info), None), "numpy": ("https://numpy.org/doc/stable", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), "matplotlib": ("https://matplotlib.org/", None), "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), "joblib": ("https://joblib.readthedocs.io/en/latest/", None), "seaborn": ("https://seaborn.pydata.org/", None), } # -- Options for sphinx-gallery ----------------------------------------------- # Generate the plot for the gallery plot_gallery = True # sphinx-gallery configuration sphinx_gallery_conf = { "doc_module": "imblearn", "backreferences_dir": os.path.join("references/generated"), "show_memory": True, "reference_url": {"imblearn": None}, } # -- Options for github link for what's new ----------------------------------- # Config for sphinx_issues issues_uri = "https://github.com/scikit-learn-contrib/imbalanced-learn/issues/{issue}" issues_github_path = "scikit-learn-contrib/imbalanced-learn" issues_user_uri = "https://github.com/{user}" # The following is used by sphinx.ext.linkcode to provide links to github linkcode_resolve = make_linkcode_resolve( "imblearn", "https://github.com/scikit-learn-contrib/" 
"imbalanced-learn/blob/{revision}/" "{package}/{path}#L{lineno}", ) # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # 'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ ( "index", "imbalanced-learn.tex", "imbalanced-learn Documentation", "The imbalanced-learn developers", "manual", ), ] # -- Options for manual page output --------------------------------------- # If false, no module index is generated. # latex_domain_indices = True # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ ( "index", "imbalanced-learn", "imbalanced-learn Documentation", ["The imbalanced-learn developers"], 1, ) ] # If true, show URL addresses after external links. # man_show_urls = False # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ( "index", "imbalanced-learn", "imbalanced-learn Documentation", "The imbalanced-learn developerss", "imbalanced-learn", "Toolbox for imbalanced dataset in machine learning.", "Miscellaneous", ), ] # -- Dependencies generation ---------------------------------------------- def generate_min_dependency_table(app): """Generate min dependency table for docs.""" from sklearn._min_dependencies import dependent_packages # get length of header package_header_len = max(len(package) for package in dependent_packages) + 4 version_header_len = len("Minimum Version") + 4 tags_header_len = max(len(tags) for _, tags in dependent_packages.values()) + 4 output = StringIO() output.write( " ".join( ["=" * package_header_len, "=" * version_header_len, "=" * tags_header_len] ) ) output.write("\n") dependency_title = "Dependency" version_title = "Minimum Version" tags_title = "Purpose" output.write( f"{dependency_title:<{package_header_len}} " f"{version_title:<{version_header_len}} " f"{tags_title}\n" ) output.write( " ".join( ["=" * package_header_len, "=" * version_header_len, "=" * tags_header_len] ) ) output.write("\n") for package, (version, tags) in dependent_packages.items(): output.write( f"{package:<{package_header_len}} {version:<{version_header_len}} {tags}\n" ) output.write( " ".join( ["=" * package_header_len, "=" * version_header_len, "=" * tags_header_len] ) ) output.write("\n") output = output.getvalue() with (Path(".") / "min_dependency_table.rst").open("w") as f: f.write(output) def generate_min_dependency_substitutions(app): """Generate min dependency substitutions for docs.""" from sklearn._min_dependencies import dependent_packages output = StringIO() for package, (version, _) in dependent_packages.items(): package = package.capitalize() output.write(f".. 
|{package}MinVersion| replace:: {version}") output.write("\n") output = output.getvalue() with (Path(".") / "min_dependency_substitutions.rst").open("w") as f: f.write(output) # -- Additional temporary hacks ----------------------------------------------- # Temporary work-around for spacing problem between parameter and parameter # type in the doc, see https://github.com/numpy/numpydoc/issues/215. The bug # has been fixed in sphinx (https://github.com/sphinx-doc/sphinx/pull/5976) but # through a change in sphinx basic.css except rtd_theme does not use basic.css. # In an ideal world, this would get fixed in this PR: # https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files def setup(app): app.connect("builder-inited", generate_min_dependency_table) app.connect("builder-inited", generate_min_dependency_substitutions) app.add_css_file("basic.css") imbalanced-learn-0.12.2/doc/datasets/000077500000000000000000000000001460233407600173345ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/datasets/index.rst000066400000000000000000000176061460233407600212070ustar00rootroot00000000000000.. _datasets: ========================= Dataset loading utilities ========================= .. currentmodule:: imblearn.datasets The :mod:`imblearn.datasets` package is complementing the :mod:`sklearn.datasets` package. The package provides both: (i) a set of imbalanced datasets to perform systematic benchmark and (ii) a utility to create an imbalanced dataset from an original balanced dataset. .. _zenodo: Imbalanced datasets for benchmark ================================= :func:`fetch_datasets` allows to fetch 27 datasets which are imbalanced and binarized. 
The following data sets are available: +--+--------------+-------------------------------+-------+---------+-----+ |ID|Name | Repository & Target | Ratio | #S | #F | +==+==============+===============================+=======+=========+=====+ |1 |ecoli | UCI, target: imU | 8.6:1 | 336 | 7 | +--+--------------+-------------------------------+-------+---------+-----+ |2 |optical_digits| UCI, target: 8 | 9.1:1 | 5,620 | 64 | +--+--------------+-------------------------------+-------+---------+-----+ |3 |satimage | UCI, target: 4 | 9.3:1 | 6,435 | 36 | +--+--------------+-------------------------------+-------+---------+-----+ |4 |pen_digits | UCI, target: 5 | 9.4:1 | 10,992 | 16 | +--+--------------+-------------------------------+-------+---------+-----+ |5 |abalone | UCI, target: 7 | 9.7:1 | 4,177 | 10 | +--+--------------+-------------------------------+-------+---------+-----+ |6 |sick_euthyroid| UCI, target: sick euthyroid | 9.8:1 | 3,163 | 42 | +--+--------------+-------------------------------+-------+---------+-----+ |7 |spectrometer | UCI, target: >=44 | 11:1 | 531 | 93 | +--+--------------+-------------------------------+-------+---------+-----+ |8 |car_eval_34 | UCI, target: good, v good | 12:1 | 1,728 | 21 | +--+--------------+-------------------------------+-------+---------+-----+ |9 |isolet | UCI, target: A, B | 12:1 | 7,797 | 617 | +--+--------------+-------------------------------+-------+---------+-----+ |10|us_crime | UCI, target: >0.65 | 12:1 | 1,994 | 100 | +--+--------------+-------------------------------+-------+---------+-----+ |11|yeast_ml8 | LIBSVM, target: 8 | 13:1 | 2,417 | 103 | +--+--------------+-------------------------------+-------+---------+-----+ |12|scene | LIBSVM, target: >one label | 13:1 | 2,407 | 294 | +--+--------------+-------------------------------+-------+---------+-----+ |13|libras_move | UCI, target: 1 | 14:1 | 360 | 90 | +--+--------------+-------------------------------+-------+---------+-----+ |14|thyroid_sick | UCI, 
target: sick | 15:1 | 3,772 | 52 | +--+--------------+-------------------------------+-------+---------+-----+ |15|coil_2000 | KDD, CoIL, target: minority | 16:1 | 9,822 | 85 | +--+--------------+-------------------------------+-------+---------+-----+ |16|arrhythmia | UCI, target: 06 | 17:1 | 452 | 278 | +--+--------------+-------------------------------+-------+---------+-----+ |17|solar_flare_m0| UCI, target: M->0 | 19:1 | 1,389 | 32 | +--+--------------+-------------------------------+-------+---------+-----+ |18|oil | UCI, target: minority | 22:1 | 937 | 49 | +--+--------------+-------------------------------+-------+---------+-----+ |19|car_eval_4 | UCI, target: vgood | 26:1 | 1,728 | 21 | +--+--------------+-------------------------------+-------+---------+-----+ |20|wine_quality | UCI, wine, target: <=4 | 26:1 | 4,898 | 11 | +--+--------------+-------------------------------+-------+---------+-----+ |21|letter_img | UCI, target: Z | 26:1 | 20,000 | 16 | +--+--------------+-------------------------------+-------+---------+-----+ |22|yeast_me2 | UCI, target: ME2 | 28:1 | 1,484 | 8 | +--+--------------+-------------------------------+-------+---------+-----+ |23|webpage | LIBSVM, w7a, target: minority | 33:1 | 34,780 | 300 | +--+--------------+-------------------------------+-------+---------+-----+ |24|ozone_level | UCI, ozone, data | 34:1 | 2,536 | 72 | +--+--------------+-------------------------------+-------+---------+-----+ |25|mammography | UCI, target: minority | 42:1 | 11,183 | 6 | +--+--------------+-------------------------------+-------+---------+-----+ |26|protein_homo | KDD CUP 2004, minority | 11:1 | 145,751 | 74 | +--+--------------+-------------------------------+-------+---------+-----+ |27|abalone_19 | UCI, target: 19 | 130:1 | 4,177 | 10 | +--+--------------+-------------------------------+-------+---------+-----+ A specific data set can be selected as:: >>> from collections import Counter >>> from imblearn.datasets import fetch_datasets 
>>> ecoli = fetch_datasets()['ecoli'] >>> ecoli.data.shape (336, 7) >>> print(sorted(Counter(ecoli.target).items())) [(-1, 301), (1, 35)] .. _make_imbalanced: Imbalanced generator ==================== :func:`make_imbalance` turns an original dataset into an imbalanced dataset. This behaviour is driven by the parameter ``sampling_strategy`` which behave similarly to other resampling algorithm. ``sampling_strategy`` can be given as a dictionary where the key corresponds to the class and the value is the number of samples in the class:: >>> from sklearn.datasets import load_iris >>> from imblearn.datasets import make_imbalance >>> iris = load_iris() >>> sampling_strategy = {0: 20, 1: 30, 2: 40} >>> X_imb, y_imb = make_imbalance(iris.data, iris.target, ... sampling_strategy=sampling_strategy) >>> sorted(Counter(y_imb).items()) [(0, 20), (1, 30), (2, 40)] Note that all samples of a class are passed-through if the class is not mentioned in the dictionary:: >>> sampling_strategy = {0: 10} >>> X_imb, y_imb = make_imbalance(iris.data, iris.target, ... sampling_strategy=sampling_strategy) >>> sorted(Counter(y_imb).items()) [(0, 10), (1, 50), (2, 50)] Instead of a dictionary, a function can be defined and directly pass to ``sampling_strategy``:: >>> def ratio_multiplier(y): ... multiplier = {0: 0.5, 1: 0.7, 2: 0.95} ... target_stats = Counter(y) ... for key, value in target_stats.items(): ... target_stats[key] = int(value * multiplier[key]) ... return target_stats >>> X_imb, y_imb = make_imbalance(iris.data, iris.target, ... sampling_strategy=ratio_multiplier) >>> sorted(Counter(y_imb).items()) [(0, 25), (1, 35), (2, 47)] It would also work with pandas dataframe:: >>> from sklearn.datasets import fetch_openml >>> df, y = fetch_openml( ... 'iris', version=1, return_X_y=True, as_frame=True) >>> df_resampled, y_resampled = make_imbalance( ... df, y, sampling_strategy={'Iris-setosa': 10, 'Iris-versicolor': 20}, ... 
random_state=42) >>> df_resampled.head() sepallength sepalwidth petallength petalwidth 13 4.3 3.0 1.1 0.1 39 5.1 3.4 1.5 0.2 30 4.8 3.1 1.6 0.2 45 4.8 3.0 1.4 0.3 17 5.1 3.5 1.4 0.3 >>> Counter(y_resampled) Counter({'Iris-virginica': 50, 'Iris-versicolor': 20, 'Iris-setosa': 10}) See :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py` and :ref:`sphx_glr_auto_examples_api_plot_sampling_strategy_usage.py`. imbalanced-learn-0.12.2/doc/developers_utils.rst000066400000000000000000000162711460233407600216550ustar00rootroot00000000000000.. _developers-utils: =================== Developer guideline =================== Developer utilities ------------------- Imbalanced-learn contains a number of utilities to help with development. These are located in :mod:`imblearn.utils`, and include tools in a number of categories. All the following functions and classes are in the module :mod:`imblearn.utils`. .. warning :: These utilities are meant to be used internally within the imbalanced-learn package. They are not guaranteed to be stable between versions of imbalanced-learn. Backports, in particular, will be removed as the imbalanced-learn dependencies evolve. Validation Tools ~~~~~~~~~~~~~~~~ .. currentmodule:: imblearn.utils These are tools used to check and validate input. When you write a function which accepts arrays, matrices, or sparse matrices as arguments, the following should be used when applicable. - :func:`check_neighbors_object`: Check the objects is consistent to be a NN. - :func:`check_target_type`: Check the target types to be conform to the current samplers. - :func:`check_sampling_strategy`: Checks that sampling target is consistent with the type and return a dictionary containing each targeted class with its corresponding number of pixel. Deprecation ~~~~~~~~~~~ .. currentmodule:: imblearn.utils.deprecation .. warning :: Apart from :func:`deprecate_parameter` the rest of this section is taken from scikit-learn. 
Please refer to their original documentation. If any publicly accessible method, function, attribute or parameter is renamed, we still support the old one for two releases and issue a deprecation warning when it is called/passed/accessed. E.g., if the function ``zero_one`` is renamed to ``zero_one_loss``, we add the decorator ``deprecated`` (from ``sklearn.utils``) to ``zero_one`` and call ``zero_one_loss`` from that function:: from ..utils import deprecated def zero_one_loss(y_true, y_pred, normalize=True): # actual implementation pass @deprecated("Function 'zero_one' was renamed to 'zero_one_loss' " "in version 0.13 and will be removed in release 0.15. " "Default behavior is changed from 'normalize=False' to " "'normalize=True'") def zero_one(y_true, y_pred, normalize=False): return zero_one_loss(y_true, y_pred, normalize) If an attribute is to be deprecated, use the decorator ``deprecated`` on a property. E.g., renaming an attribute ``labels_`` to ``classes_`` can be done as:: @property @deprecated("Attribute labels_ was deprecated in version 0.13 and " "will be removed in 0.15. Use 'classes_' instead") def labels_(self): return self.classes_ If a parameter has to be deprecated, use ``FutureWarning`` appropriately. In the following example, k is deprecated and renamed to n_clusters:: import warnings def example_function(n_clusters=8, k=None): if k is not None: warnings.warn("'k' was renamed to n_clusters in version 0.13 and " "will be removed in 0.15.", DeprecationWarning) n_clusters = k As in these examples, the warning message should always give both the version in which the deprecation happened and the version in which the old behavior will be removed. If the deprecation happened in version 0.x-dev, the message should say deprecation occurred in version 0.x and the removal will be in 0.(x+2). For example, if the deprecation happened in version 0.18-dev, the message should say it happened in version 0.18 and the old behavior will be removed in version 0.20. 
In addition, a deprecation note should be added in the docstring, recalling the same information as the deprecation warning as explained above. Use the ``.. deprecated::`` directive:: .. deprecated:: 0.13 ``k`` was renamed to ``n_clusters`` in version 0.13 and will be removed in 0.15. On the top of all the functionality provided by scikit-learn. imbalanced-learn provides :func:`deprecate_parameter`: which is used to deprecate a sampler's parameter (attribute) by another one. Making a release ---------------- This section document the different steps that are necessary to make a new imbalanced-learn release. Major release ~~~~~~~~~~~~~ * Update the release note `whats_new/v0..rst` by giving a date and removing the status "Under development" from the title. * Run `bumpversion release`. It will remove the `dev0` tag. * Commit the change `git commit -am "bumpversion 0..0"` (e.g., `git commit -am "bumpversion 0.5.0"`). * Create a branch for this version (e.g., `git checkout -b 0..X`). * Push the new branch into the upstream remote imbalanced-learn repository. * Change the `symlink` in the `imbalanced-learn website repository `_ such that stable points to the latest release version, i.e, `0.`. To do this, clone the repository, `run unlink stable`, followed by `ln -s 0. stable`. To check that this was performed correctly, ensure that stable has the new version number using `ls -l`. * Return to your imbalanced-learn repository, in the branch `0..X`. * Create the source distribution and wheel: `python setup.py sdist` and `python setup.py bdist_wheel`. * Upload these file to PyPI using `twine upload dist/*` * Switch to the `master` branch and run `bumpversion minor`, commit and push on upstream. We are officially at `0..0.dev0`. * Create a GitHub release by clicking on "Draft a new release" here. "Tag version" should be the latest version number (e.g., `0..0`), "Target" should be the branch for that the release (e.g., `0..X`) and "Release title" should be "Version ". 
Add the notes from the release notes there. * Add a new `v0..rst` file in `doc/whats_new/` and `.. include::` this new file in `doc/whats_new.rst`. Mark the version as the version under development. * Finally, go to the `conda-forge feedstock `_ and a new PR will be created when the feedstock will synchronizing with the PyPI repository. Merge this PR such that we have the binary for `conda` available. Bug fix release ~~~~~~~~~~~~~~~ * Find the commit(s) hash of the bug fix commit you wish to back port using `git log`. * Checkout the branch for the lastest release, e.g., `git checkout 0..X`. * Append the bug fix commit(s) to the branch using `git cherry-pick `. Alternatively, you can use interactive rebasing from the `master` branch. * Bump the version number with bumpversion patch. This will bump the patch version, for example from `0.X.0` to `0.X.* dev0`. * Mark the current version as a release version (as opposed to `dev` version) with `bumpversion release --allow-dirty`. It will bump the version, for example from `0.X.* dev0` to `0.X.1`. * Commit the changes with `git commit -am 'bumpversion '`. * Push the changes to the release branch in upstream, e.g. `git push `. * Use the same process as in a major release to upload on PyPI and conda-forge. imbalanced-learn-0.12.2/doc/ensemble.rst000066400000000000000000000114071460233407600200530ustar00rootroot00000000000000.. _ensemble: ==================== Ensemble of samplers ==================== .. currentmodule:: imblearn.ensemble .. _ensemble_meta_estimators: Classifier including inner balancing samplers ============================================= .. _bagging: Bagging classifier ------------------ In ensemble classifiers, bagging methods build several estimators on different randomly selected subset of data. In scikit-learn, this classifier is named :class:`~sklearn.ensemble.BaggingClassifier`. However, this classifier does not allow to balance each subset of data. 
Therefore, when training on imbalanced data set, this classifier will favor the majority classes:: >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=10000, n_features=2, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=3, ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], class_sep=0.8, ... random_state=0) >>> from sklearn.model_selection import train_test_split >>> from sklearn.metrics import balanced_accuracy_score >>> from sklearn.ensemble import BaggingClassifier >>> from sklearn.tree import DecisionTreeClassifier >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) >>> bc = BaggingClassifier(DecisionTreeClassifier(), random_state=0) >>> bc.fit(X_train, y_train) #doctest: BaggingClassifier(...) >>> y_pred = bc.predict(X_test) >>> balanced_accuracy_score(y_test, y_pred) 0.77... In :class:`BalancedBaggingClassifier`, each bootstrap sample will be further resampled to achieve the `sampling_strategy` desired. Therefore, :class:`BalancedBaggingClassifier` takes the same parameters as the scikit-learn :class:`~sklearn.ensemble.BaggingClassifier`. In addition, the sampling is controlled by the parameter `sampler` or the two parameters `sampling_strategy` and `replacement`, if one wants to use the :class:`~imblearn.under_sampling.RandomUnderSampler`:: >>> from imblearn.ensemble import BalancedBaggingClassifier >>> bbc = BalancedBaggingClassifier(DecisionTreeClassifier(), ... sampling_strategy='auto', ... replacement=False, ... random_state=0) >>> bbc.fit(X_train, y_train) BalancedBaggingClassifier(...) >>> y_pred = bbc.predict(X_test) >>> balanced_accuracy_score(y_test, y_pred) 0.8... Changing the `sampler` will give rise to different known implementation :cite:`maclin1997empirical`, :cite:`hido2009roughly`, :cite:`wang2009diversity`. 
You can refer to the following example shows in practice these different methods: :ref:`sphx_glr_auto_examples_ensemble_plot_bagging_classifier.py` .. _forest: Forest of randomized trees -------------------------- :class:`BalancedRandomForestClassifier` is another ensemble method in which each tree of the forest will be provided a balanced bootstrap sample :cite:`chen2004using`. This class provides all functionality of the :class:`~sklearn.ensemble.RandomForestClassifier`:: >>> from imblearn.ensemble import BalancedRandomForestClassifier >>> brf = BalancedRandomForestClassifier( ... n_estimators=100, random_state=0, sampling_strategy="all", replacement=True, ... bootstrap=False, ... ) >>> brf.fit(X_train, y_train) BalancedRandomForestClassifier(...) >>> y_pred = brf.predict(X_test) >>> balanced_accuracy_score(y_test, y_pred) 0.8... .. _boosting: Boosting -------- Several methods taking advantage of boosting have been designed. :class:`RUSBoostClassifier` randomly under-sample the dataset before to perform a boosting iteration :cite:`seiffert2009rusboost`:: >>> from imblearn.ensemble import RUSBoostClassifier >>> rusboost = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R', ... random_state=0) >>> rusboost.fit(X_train, y_train) RUSBoostClassifier(...) >>> y_pred = rusboost.predict(X_test) >>> balanced_accuracy_score(y_test, y_pred) 0... A specific method which uses :class:`~sklearn.ensemble.AdaBoostClassifier` as learners in the bagging classifier is called "EasyEnsemble". The :class:`EasyEnsembleClassifier` allows to bag AdaBoost learners which are trained on balanced bootstrap samples :cite:`liu2008exploratory`. Similarly to the :class:`BalancedBaggingClassifier` API, one can construct the ensemble as:: >>> from imblearn.ensemble import EasyEnsembleClassifier >>> eec = EasyEnsembleClassifier(random_state=0) >>> eec.fit(X_train, y_train) EasyEnsembleClassifier(...) >>> y_pred = eec.predict(X_test) >>> balanced_accuracy_score(y_test, y_pred) 0.6... .. 
topic:: Examples * :ref:`sphx_glr_auto_examples_ensemble_plot_comparison_ensemble_classifier.py` imbalanced-learn-0.12.2/doc/index.rst000066400000000000000000000073331460233407600173730ustar00rootroot00000000000000.. project-template documentation master file, created by sphinx-quickstart on Mon Jan 18 14:44:12 2016. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. :notoc: ############################## imbalanced-learn documentation ############################## **Date**: |today| **Version**: |version| **Useful links**: `Binary Installers `__ | `Source Repository `__ | `Issues & Ideas `__ | `Q&A Support `__ Imbalanced-learn (imported as :mod:`imblearn`) is an open source, MIT-licensed library relying on scikit-learn (imported as :mod:`sklearn`) and provides tools when dealing with classification with imbalanced classes. .. raw:: html
Getting started

Check out the getting started guides to install imbalanced-learn. Some extra information to get started with a new contribution is also provided.

.. container:: custom-button :ref:`To the installation guideline` .. raw:: html
User guide

The user guide provides in-depth information on the key concepts of imbalanced-learn with useful background information and explanation.

.. container:: custom-button :ref:`To the user guide` .. raw:: html
API reference

The reference guide contains a detailed description of the imbalanced-learn API. To known more about methods parameters.

.. container:: custom-button :ref:`To the reference guide` .. raw:: html
Examples

The gallery of examples is a good place to see imbalanced-learn in action. Select an example and dive in.

.. container:: custom-button :ref:`To the gallery of examples` .. raw:: html
.. toctree:: :maxdepth: 3 :hidden: :titlesonly: install user_guide references/index auto_examples/index whats_new about imbalanced-learn-0.12.2/doc/install.rst000066400000000000000000000045611460233407600177320ustar00rootroot00000000000000.. _getting_started: ############### Getting Started ############### Prerequisites ============= You can find the complete list of the dependencies in the following table: .. include:: min_dependency_table.rst Install ======= From PyPi or conda-forge repositories ------------------------------------- imbalanced-learn is currently available on the PyPi's repositories and you can install it via `pip`:: pip install imbalanced-learn The package is released also on the conda-forge repositories and you can install it with `conda` (or `mamba`):: conda install -c conda-forge imbalanced-learn Intel optimizations via scikit-learn-intelex -------------------------------------------- Imbalanced-learn relies entirely on scikit-learn algorithms. Intel provides an optimized version of scikit-learn for Intel hardwares, called scikit-learn-intelex. Installing scikit-learn-intelex and patching scikit-learn will activate the Intel optimizations. You can refer to the following `blog post `_ for some benchmarks. Refer to the following documentation for instructions: - `Installation guide `_. - `Patching guide `_. From source available on GitHub ------------------------------- If you prefer, you can clone it and run the setup.py file. Use the following commands to get a copy from Github and install all dependencies:: git clone https://github.com/scikit-learn-contrib/imbalanced-learn.git cd imbalanced-learn pip install . Be aware that you can install in developer mode with:: pip install --no-build-isolation --editable . 
If you wish to make pull-requests on GitHub, we advise you to install pre-commit:: pip install pre-commit pre-commit install Test and coverage ================= You want to test the code before to install:: $ make test You wish to test the coverage of your version:: $ make coverage You can also use `pytest`:: $ pytest imblearn -v Contribute ========== You can contribute to this code through Pull Request on GitHub_. Please, make sure that your code is coming with unit tests to ensure full coverage and continuous integration in the API. .. _GitHub: https://github.com/scikit-learn-contrib/imbalanced-learn/pulls imbalanced-learn-0.12.2/doc/introduction.rst000066400000000000000000000050141460233407600207770ustar00rootroot00000000000000.. _introduction: ============ Introduction ============ .. _api_imblearn: API's of imbalanced-learn samplers ---------------------------------- The available samplers follows the scikit-learn API using the base estimator and adding a sampling functionality through the ``sample`` method: :Estimator: The base object, implements a ``fit`` method to learn from data, either:: estimator = obj.fit(data, targets) :Resampler: To resample a data sets, each sampler implements:: data_resampled, targets_resampled = obj.fit_resample(data, targets) Imbalanced-learn samplers accept the same inputs that in scikit-learn: * `data`: * 2-D :class:`list`, * 2-D :class:`numpy.ndarray`, * :class:`pandas.DataFrame`, * :class:`scipy.sparse.csr_matrix` or :class:`scipy.sparse.csc_matrix`; * `targets`: * 1-D :class:`numpy.ndarray`, * :class:`pandas.Series`. The output will be of the following type: * `data_resampled`: * 2-D :class:`numpy.ndarray`, * :class:`pandas.DataFrame`, * :class:`scipy.sparse.csr_matrix` or :class:`scipy.sparse.csc_matrix`; * `targets_resampled`: * 1-D :class:`numpy.ndarray`, * :class:`pandas.Series`. .. topic:: Pandas in/out Unlike scikit-learn, imbalanced-learn provides support for pandas in/out. 
Therefore providing a dataframe, will output as well a dataframe. .. topic:: Sparse input For sparse input the data is **converted to the Compressed Sparse Rows representation** (see ``scipy.sparse.csr_matrix``) before being fed to the sampler. To avoid unnecessary memory copies, it is recommended to choose the CSR representation upstream. .. _problem_statement: Problem statement regarding imbalanced data sets ------------------------------------------------ The learning phase and the subsequent prediction of machine learning algorithms can be affected by the problem of imbalanced data set. The balancing issue corresponds to the difference of the number of samples in the different classes. We illustrate the effect of training a linear SVM classifier with different levels of class balancing. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center As expected, the decision function of the linear SVM varies greatly depending upon how imbalanced the data is. With a greater imbalanced ratio, the decision function favors the class with the larger number of samples, usually referred as the majority class. imbalanced-learn-0.12.2/doc/make.bat000066400000000000000000000151011460233407600171270ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=_build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . set I18NSPHINXOPTS=%SPHINXOPTS% . if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. 
pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. 
echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\imbalanced-learn.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\imbalanced-learn.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %BUILDDIR%/.. echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %BUILDDIR%/.. echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. 
goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. goto end ) :end imbalanced-learn-0.12.2/doc/metrics.rst000066400000000000000000000132101460233407600177210ustar00rootroot00000000000000.. _metrics: ======= Metrics ======= .. currentmodule:: imblearn.metrics Classification metrics ---------------------- Currently, scikit-learn only offers the ``sklearn.metrics.balanced_accuracy_score`` (in 0.20) as metric to deal with imbalanced datasets. 
The module :mod:`imblearn.metrics` offers a couple of other metrics which are used in the literature to evaluate the quality of classifiers. .. _sensitivity_specificity: Sensitivity and specificity metrics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Sensitivity and specificity are metrics which are well known in medical imaging. Sensitivity (also called true positive rate or recall) is the proportion of the positive samples which is well classified while specificity (also called true negative rate) is the proportion of the negative samples which are well classified. Therefore, depending of the field of application, either the sensitivity/specificity or the precision/recall pair of metrics are used. Currently, only the `precision and recall metrics `_ are implemented in scikit-learn. :func:`sensitivity_specificity_support`, :func:`sensitivity_score`, and :func:`specificity_score` add the possibility to use those metrics. .. _imbalanced_metrics: Additional metrics specific to imbalanced datasets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The :func:`geometric_mean_score` :cite:`barandela2003strategies,kubat1997addressing` is the root of the product of class-wise sensitivity. This measure tries to maximize the accuracy on each of the classes while keeping these accuracies balanced. The :func:`make_index_balanced_accuracy` :cite:`garcia2012effectiveness` can wrap any metric and give more importance to a specific class using the parameter ``alpha``. .. _macro_averaged_mean_absolute_error: Macro-Averaged Mean Absolute Error (MA-MAE) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Ordinal classification is used when there is a rank among classes, for example levels of functionality or movie ratings. The :func:`macro_averaged_mean_absolute_error` :cite:`esuli2009ordinal` is used for imbalanced ordinal classification. The mean absolute error is computed for each class and averaged over classes, giving an equal weight to each class. .. 
_classification_report: Summary of important metrics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The :func:`classification_report_imbalanced` will compute a set of metrics per class and summarize it in a table. The parameter `output_dict` allows to get a string or a Python dictionary. This dictionary can be reused to create a Pandas dataframe for instance. The bottom row (i.e "avg/total") contains the weighted average by the support (i.e column "sup") of each column. Note that the weighted average of the class recalls is also known as the classification accuracy. .. _pairwise_metrics: Pairwise metrics ---------------- The :mod:`imblearn.metrics.pairwise` submodule implements pairwise distances that are available in scikit-learn while used in some of the methods in imbalanced-learn. .. _vdm: Value Difference Metric ~~~~~~~~~~~~~~~~~~~~~~~ The class :class:`~imblearn.metrics.pairwise.ValueDifferenceMetric` is implementing the Value Difference Metric proposed in :cite:`stanfill1986toward`. This measure is used to compute the proximity of two samples composed of only categorical values. Given a single feature, categories with similar correlation with the target vector will be considered closer. Let's give an example to illustrate this behaviour as given in :cite:`wilson1997improved`. `X` will be represented by a single feature which will be some color and the target will be if a sample is whether or not an apple:: >>> import numpy as np >>> X = np.array(["green"] * 10 + ["red"] * 10 + ["blue"] * 10).reshape(-1, 1) >>> y = ["apple"] * 8 + ["not apple"] * 5 + ["apple"] * 7 + ["not apple"] * 9 + ["apple"] In this dataset, the categories "red" and "green" are more correlated to the target `y` and should have a smaller distance than with the category "blue". We should this behaviour. 
Be aware that we need to encode the `X` to work with numerical values:: >>> from sklearn.preprocessing import OrdinalEncoder >>> encoder = OrdinalEncoder(dtype=np.int32) >>> X_encoded = encoder.fit_transform(X) Now, we can compute the distance between three different samples representing the different categories:: >>> from imblearn.metrics.pairwise import ValueDifferenceMetric >>> vdm = ValueDifferenceMetric().fit(X_encoded, y) >>> X_test = np.array(["green", "red", "blue"]).reshape(-1, 1) >>> X_test_encoded = encoder.transform(X_test) >>> vdm.pairwise(X_test_encoded) array([[0. , 0.04, 1.96], [0.04, 0. , 1.44], [1.96, 1.44, 0. ]]) We see that the minimum distance happen when the categories "red" and "green" are compared. Whenever comparing with "blue", the distance is much larger. **Mathematical formulation** The distance between feature values of two samples is defined as: .. math:: \delta(x, y) = \sum_{c=1}^{C} |p(c|x_{f}) - p(c|y_{f})|^{k} \ , where :math:`x` and :math:`y` are two samples and :math:`f` a given feature, :math:`C` is the number of classes, :math:`p(c|x_{f})` is the conditional probability that the output class is :math:`c` given that the feature value :math:`f` has the value :math:`x` and :math:`k` an exponent usually defined to 1 or 2. The distance for the feature vectors :math:`X` and :math:`Y` is subsequently defined as: .. math:: \Delta(X, Y) = \sum_{f=1}^{F} \delta(X_{f}, Y_{f})^{r} \ , where :math:`F` is the number of feature and :math:`r` an exponent usually defined equal to 1 or 2. imbalanced-learn-0.12.2/doc/miscellaneous.rst000066400000000000000000000165211460233407600211260ustar00rootroot00000000000000.. _miscellaneous: ====================== Miscellaneous samplers ====================== .. currentmodule:: imblearn .. _function_sampler: Custom samplers --------------- A fully customized sampler, :class:`FunctionSampler`, is available in imbalanced-learn such that you can fast prototype your own sampler by defining a single function. 
Additional parameters can be added using the attribute ``kw_args`` which accepts a dictionary. The following example illustrates how to retain the 10 first elements of the array ``X`` and ``y``:: >>> import numpy as np >>> from imblearn import FunctionSampler >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=3, ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... class_sep=0.8, random_state=0) >>> def func(X, y): ... return X[:10], y[:10] >>> sampler = FunctionSampler(func=func) >>> X_res, y_res = sampler.fit_resample(X, y) >>> np.all(X_res == X[:10]) True >>> np.all(y_res == y[:10]) True In addition, the parameter ``validate`` controls input checking. For instance, turning ``validate=False`` allows to pass any type of target ``y`` and do some sampling for regression targets:: >>> from sklearn.datasets import make_regression >>> X_reg, y_reg = make_regression(n_samples=100, random_state=42) >>> rng = np.random.RandomState(42) >>> def dummy_sampler(X, y): ... indices = rng.choice(np.arange(X.shape[0]), size=10) ... return X[indices], y[indices] >>> sampler = FunctionSampler(func=dummy_sampler, validate=False) >>> X_res, y_res = sampler.fit_resample(X_reg, y_reg) >>> y_res array([ 41.49112498, -142.78526195, 85.55095317, 141.43321419, 75.46571114, -67.49177372, 159.72700509, -169.80498923, 211.95889757, 211.95889757]) We illustrated the use of such sampler to implement an outlier rejection estimator which can be easily used within a :class:`~imblearn.pipeline.Pipeline`: :ref:`sphx_glr_auto_examples_applications_plot_outlier_rejections.py` .. _generators: Custom generators ----------------- Imbalanced-learn provides specific generators for TensorFlow and Keras which will generate balanced mini-batches. .. 
_tensorflow_generator: TensorFlow generator ~~~~~~~~~~~~~~~~~~~~ The :func:`~imblearn.tensorflow.balanced_batch_generator` allows to generate balanced mini-batches using an imbalanced-learn sampler which returns indices. Let's first generate some data:: >>> n_features, n_classes = 10, 2 >>> X, y = make_classification( ... n_samples=10_000, n_features=n_features, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=n_classes, ... n_clusters_per_class=1, weights=[0.1, 0.9], ... class_sep=0.8, random_state=0 ... ) >>> X = X.astype(np.float32) Then, we can create the generator that will yield mini-batches that will be balanced:: >>> from imblearn.under_sampling import RandomUnderSampler >>> from imblearn.tensorflow import balanced_batch_generator >>> training_generator, steps_per_epoch = balanced_batch_generator( ... X, ... y, ... sample_weight=None, ... sampler=RandomUnderSampler(), ... batch_size=32, ... random_state=42, ... ) The ``generator`` and ``steps_per_epoch`` are used during the training of a Tensorflow model. We will illustrate how to use this generator. First, we can define a logistic regression model which will be optimized by a gradient descent:: >>> import tensorflow as tf >>> # initialize the weights and intercept >>> normal_initializer = tf.random_normal_initializer(mean=0, stddev=0.01) >>> coef = tf.Variable(normal_initializer( ... shape=[n_features, n_classes]), dtype="float32" ... ) >>> intercept = tf.Variable( ... normal_initializer(shape=[n_classes]), dtype="float32" ... ) >>> # define the model >>> def logistic_regression(X): ... return tf.nn.softmax(tf.matmul(X, coef) + intercept) >>> # define the loss function >>> def cross_entropy(y_true, y_pred): ... y_true = tf.one_hot(y_true, depth=n_classes) ... y_pred = tf.clip_by_value(y_pred, 1e-9, 1.) ... return tf.reduce_mean(-tf.reduce_sum(y_true * tf.math.log(y_pred))) >>> # define our metric >>> def balanced_accuracy(y_true, y_pred): ... 
cm = tf.math.confusion_matrix(tf.cast(y_true, tf.int64), tf.argmax(y_pred, 1)) ... per_class = np.diag(cm) / tf.math.reduce_sum(cm, axis=1) ... return np.mean(per_class) >>> # define the optimizer >>> optimizer = tf.optimizers.SGD(learning_rate=0.01) >>> # define the optimization step >>> def run_optimization(X, y): ... with tf.GradientTape() as g: ... y_pred = logistic_regression(X) ... loss = cross_entropy(y, y_pred) ... gradients = g.gradient(loss, [coef, intercept]) ... optimizer.apply_gradients(zip(gradients, [coef, intercept])) Once initialized, the model is trained by iterating on balanced mini-batches of data and minimizing the loss previously defined:: >>> epochs = 10 >>> for e in range(epochs): ... y_pred = logistic_regression(X) ... loss = cross_entropy(y, y_pred) ... bal_acc = balanced_accuracy(y, y_pred) ... print(f"epoch: {e}, loss: {loss:.3f}, accuracy: {bal_acc}") ... for i in range(steps_per_epoch): ... X_batch, y_batch = next(training_generator) ... run_optimization(X_batch, y_batch) epoch: 0, ... .. _keras_generator: Keras generator ~~~~~~~~~~~~~~~ Keras provides an higher level API in which a model can be defined and train by calling ``fit_generator`` method to train the model. To illustrate, we will define a logistic regression model:: >>> from tensorflow import keras >>> y = keras.utils.to_categorical(y, 3) >>> model = keras.Sequential() >>> model.add( ... keras.layers.Dense( ... y.shape[1], input_dim=X.shape[1], activation='softmax' ... ) ... ) >>> model.compile( ... optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'] ... ) :func:`~imblearn.keras.balanced_batch_generator` creates a balanced mini-batches generator with the associated number of mini-batches which will be generated:: >>> from imblearn.keras import balanced_batch_generator >>> training_generator, steps_per_epoch = balanced_batch_generator( ... X, y, sampler=RandomUnderSampler(), batch_size=10, random_state=42 ... 
) Then, ``fit`` can be called passing the generator and the step:: >>> callback_history = model.fit( ... training_generator, ... steps_per_epoch=steps_per_epoch, ... epochs=10, ... verbose=1, ... ) Epoch 1/10 ... The second possibility is to use :class:`~imblearn.keras.BalancedBatchGenerator`. Only an instance of this class will be passed to ``fit``:: >>> from imblearn.keras import BalancedBatchGenerator >>> training_generator = BalancedBatchGenerator( ... X, y, sampler=RandomUnderSampler(), batch_size=10, random_state=42 ... ) >>> callback_history = model.fit( ... training_generator, ... steps_per_epoch=steps_per_epoch, ... epochs=10, ... verbose=1, ... ) Epoch 1/10 ... .. topic:: References * :ref:`sphx_glr_auto_examples_applications_porto_seguro_keras_under_sampling.py` imbalanced-learn-0.12.2/doc/over_sampling.rst000066400000000000000000000343241460233407600211310ustar00rootroot00000000000000.. _over-sampling: ============= Over-sampling ============= .. currentmodule:: imblearn.over_sampling A practical guide ================= You can refer to :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`. .. _random_over_sampler: Naive random over-sampling -------------------------- One way to fight this issue is to generate new samples in the classes which are under-represented. The most naive strategy is to generate new samples by randomly sampling with replacement the current available samples. The :class:`RandomOverSampler` offers such scheme:: >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=3, ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... 
class_sep=0.8, random_state=0) >>> from imblearn.over_sampling import RandomOverSampler >>> ros = RandomOverSampler(random_state=0) >>> X_resampled, y_resampled = ros.fit_resample(X, y) >>> from collections import Counter >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] The augmented data set should be used instead of the original data set to train a classifier:: >>> from sklearn.linear_model import LogisticRegression >>> clf = LogisticRegression() >>> clf.fit(X_resampled, y_resampled) LogisticRegression(...) In the figure below, we compare the decision functions of a classifier trained using the over-sampled data set and the original data set. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_002.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center As a result, the majority class does not take over the other classes during the training process. Consequently, all classes are represented by the decision function. In addition, :class:`RandomOverSampler` allows to sample heterogeneous data (e.g. containing some strings):: >>> import numpy as np >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], ... dtype=object) >>> y_hetero = np.array([0, 0, 1]) >>> X_resampled, y_resampled = ros.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) [['xxx' 1 1.0] ['yyy' 2 2.0] ['zzz' 3 3.0] ['zzz' 3 3.0]] >>> print(y_resampled) [0 0 1 1] It would also work with pandas dataframe:: >>> from sklearn.datasets import fetch_openml >>> df_adult, y_adult = fetch_openml( ... 'adult', version=2, as_frame=True, return_X_y=True) >>> df_adult.head() # doctest: +SKIP >>> df_resampled, y_resampled = ros.fit_resample(df_adult, y_adult) >>> df_resampled.head() # doctest: +SKIP If repeating samples is an issue, the parameter `shrinkage` allows to create a smoothed bootstrap. However, the original data needs to be numerical. 
The `shrinkage` parameter controls the dispersion of the new generated samples. We show an example illustrate that the new samples are not overlapping anymore once using a smoothed bootstrap. This ways of generating smoothed bootstrap is also known a Random Over-Sampling Examples (ROSE) :cite:`torelli2014rose`. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_003.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center .. _smote_adasyn: From random over-sampling to SMOTE and ADASYN --------------------------------------------- Apart from the random sampling with replacement, there are two popular methods to over-sample minority classes: (i) the Synthetic Minority Oversampling Technique (SMOTE) :cite:`chawla2002smote` and (ii) the Adaptive Synthetic (ADASYN) :cite:`he2008adasyn` sampling method. These algorithms can be used in the same manner:: >>> from imblearn.over_sampling import SMOTE, ADASYN >>> X_resampled, y_resampled = SMOTE().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] >>> clf_smote = LogisticRegression().fit(X_resampled, y_resampled) >>> X_resampled, y_resampled = ADASYN().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4673), (1, 4662), (2, 4674)] >>> clf_adasyn = LogisticRegression().fit(X_resampled, y_resampled) The figure below illustrates the major difference of the different over-sampling methods. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_004.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center Ill-posed examples ------------------ While the :class:`RandomOverSampler` is over-sampling by duplicating some of the original samples of the minority class, :class:`SMOTE` and :class:`ADASYN` generate new samples in by interpolation. 
However, the samples used to interpolate/generate new synthetic samples differ. In fact, :class:`ADASYN` focuses on generating samples next to the original samples which are wrongly classified using a k-Nearest Neighbors classifier while the basic implementation of :class:`SMOTE` will not make any distinction between easy and hard samples to be classified using the nearest neighbors rule. Therefore, the decision function found during training will be different among the algorithms. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_005.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :align: center The sampling particularities of these two algorithms can lead to some peculiar behavior as shown below. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_006.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center SMOTE variants -------------- SMOTE might connect inliers and outliers while ADASYN might focus solely on outliers which, in both cases, might lead to a sub-optimal decision function. In this regard, SMOTE offers three additional options to generate samples. Those methods focus on samples near the border of the optimal decision function and will generate samples in the opposite direction of the nearest neighbors class. Those variants are presented in the figure below. .. 
image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_007.png :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html :scale: 60 :align: center The :class:`BorderlineSMOTE` :cite:`han2005borderline`, :class:`SVMSMOTE` :cite:`nguyen2009borderline`, and :class:`KMeansSMOTE` :cite:`last2017oversampling` offer some variant of the SMOTE algorithm:: >>> from imblearn.over_sampling import BorderlineSMOTE >>> X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] When dealing with mixed data type such as continuous and categorical features, none of the presented methods (apart of the class :class:`RandomOverSampler`) can deal with the categorical features. The :class:`SMOTENC` :cite:`chawla2002smote` is an extension of the :class:`SMOTE` algorithm for which categorical data are treated differently:: >>> # create a synthetic data set with continuous and categorical features >>> rng = np.random.RandomState(42) >>> n_samples = 50 >>> X = np.empty((n_samples, 3), dtype=object) >>> X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples).astype(object) >>> X[:, 1] = rng.randn(n_samples) >>> X[:, 2] = rng.randint(3, size=n_samples) >>> y = np.array([0] * 20 + [1] * 30) >>> print(sorted(Counter(y).items())) [(0, 20), (1, 30)] In this data set, the first and last features are considered as categorical features. 
One needs to provide this information to :class:`SMOTENC` via the parameters ``categorical_features`` either by passing the indices, the feature names when `X` is a pandas DataFrame, a boolean mask marking these features, or relying on `dtype` inference if the columns are using the :class:`pandas.CategoricalDtype`:: >>> from imblearn.over_sampling import SMOTENC >>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0) >>> X_resampled, y_resampled = smote_nc.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 30), (1, 30)] >>> print(X_resampled[-5:]) [['A' 0.19... 2] ['B' -0.36... 2] ['B' 0.87... 2] ['B' 0.37... 2] ['B' 0.33... 2]] Therefore, it can be seen that the samples generated in the first and last columns are belonging to the same categories originally presented without any other extra interpolation. However, :class:`SMOTENC` is only working when data is a mixed of numerical and categorical features. If data are made of only categorical data, one can use the :class:`SMOTEN` variant :cite:`chawla2002smote`. The algorithm changes in two ways: * the nearest neighbors search does not rely on the Euclidean distance. Indeed, the value difference metric (VDM) also implemented in the class :class:`~imblearn.metrics.ValueDifferenceMetric` is used. * a new sample is generated where each feature value corresponds to the most common category seen in the neighbors samples belonging to the same class. Let's take the following example:: >>> import numpy as np >>> X = np.array(["green"] * 5 + ["red"] * 10 + ["blue"] * 7, ... dtype=object).reshape(-1, 1) >>> y = np.array(["apple"] * 5 + ["not apple"] * 3 + ["apple"] * 7 + ... ["not apple"] * 5 + ["apple"] * 2, dtype=object) We generate a dataset associating a color to being an apple or not an apple. We strongly associated "green" and "red" to being an apple. 
The minority class being "not apple", we expect new data generated belonging to the category "blue":: >>> from imblearn.over_sampling import SMOTEN >>> sampler = SMOTEN(random_state=0) >>> X_res, y_res = sampler.fit_resample(X, y) >>> X_res[y.size:] array([['blue'], ['blue'], ['blue'], ['blue'], ['blue'], ['blue']], dtype=object) >>> y_res[y.size:] array(['not apple', 'not apple', 'not apple', 'not apple', 'not apple', 'not apple'], dtype=object) Mathematical formulation ======================== Sample generation ----------------- Both :class:`SMOTE` and :class:`ADASYN` use the same algorithm to generate new samples. Considering a sample :math:`x_i`, a new sample :math:`x_{new}` will be generated considering its k neareast-neighbors (corresponding to ``k_neighbors``). For instance, the 3 nearest-neighbors are included in the blue circle as illustrated in the figure below. Then, one of these nearest-neighbors :math:`x_{zi}` is selected and a sample is generated as follows: .. math:: x_{new} = x_i + \lambda \times (x_{zi} - x_i) where :math:`\lambda` is a random number in the range :math:`[0, 1]`. This interpolation will create a sample on the line between :math:`x_{i}` and :math:`x_{zi}` as illustrated in the image below: .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_illustration_generation_sample_001.png :target: ./auto_examples/over-sampling/plot_illustration_generation_sample.html :scale: 60 :align: center SMOTE-NC slightly change the way a new sample is generated by performing something specific for the categorical features. In fact, the categories of a new generated sample are decided by picking the most frequent category of the nearest neighbors present during the generation. .. warning:: Be aware that SMOTE-NC is not designed to work with only categorical data. The other SMOTE variants and ADASYN differ from each other by selecting the samples :math:`x_i` ahead of generating the new samples. The **regular** SMOTE algorithm --- cf. 
to the :class:`SMOTE` object --- does not impose any rule and will randomly pick-up all possible :math:`x_i` available. The **borderline** SMOTE --- cf. to the :class:`BorderlineSMOTE` with the parameters ``kind='borderline-1'`` and ``kind='borderline-2'`` --- will classify each sample :math:`x_i` to be (i) noise (i.e. all nearest-neighbors are from a different class than the one of :math:`x_i`), (ii) in danger (i.e. at least half of the nearest neighbors are from the same class than :math:`x_i`, or (iii) safe (i.e. all nearest neighbors are from the same class than :math:`x_i`). **Borderline-1** and **Borderline-2** SMOTE will use the samples *in danger* to generate new samples. In **Borderline-1** SMOTE, :math:`x_{zi}` will belong to the same class than the one of the sample :math:`x_i`. On the contrary, **Borderline-2** SMOTE will consider :math:`x_{zi}` which can be from any class. **SVM** SMOTE --- cf. to :class:`SVMSMOTE` --- uses an SVM classifier to find support vectors and generate samples considering them. Note that the ``C`` parameter of the SVM classifier allows to select more or less support vectors. For both borderline and SVM SMOTE, a neighborhood is defined using the parameter ``m_neighbors`` to decide if a sample is in danger, safe, or noise. **KMeans** SMOTE --- cf. to :class:`KMeansSMOTE` --- uses a KMeans clustering method before to apply SMOTE. The clustering will group samples together and generate new samples depending of the cluster density. ADASYN works similarly to the regular SMOTE. However, the number of samples generated for each :math:`x_i` is proportional to the number of samples which are not from the same class than :math:`x_i` in a given neighborhood. Therefore, more samples will be generated in the area that the nearest neighbor rule is not respected. The parameter ``m_neighbors`` is equivalent to ``k_neighbors`` in :class:`SMOTE`. 
Multi-class management ---------------------- All algorithms can be used with multiple classes as well as binary classes classification. :class:`RandomOverSampler` does not require any inter-class information during the sample generation. Therefore, each targeted class is resampled independently. In the contrary, both :class:`ADASYN` and :class:`SMOTE` need information regarding the neighbourhood of each sample used for sample generation. They are using a one-vs-rest approach by selecting each targeted class and computing the necessary statistics against the rest of the data set which are grouped in a single class. imbalanced-learn-0.12.2/doc/references/000077500000000000000000000000001460233407600176455ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/references/combine.rst000066400000000000000000000005001460233407600220060ustar00rootroot00000000000000.. _combine_ref: Combination of over- and under-sampling methods =============================================== .. automodule:: imblearn.combine :no-members: :no-inherited-members: .. currentmodule:: imblearn.combine .. autosummary:: :toctree: generated/ :template: class.rst SMOTEENN SMOTETomek imbalanced-learn-0.12.2/doc/references/datasets.rst000066400000000000000000000004041460233407600222050ustar00rootroot00000000000000.. _datasets_ref: Datasets ======== .. automodule:: imblearn.datasets :no-members: :no-inherited-members: .. currentmodule:: imblearn.datasets .. autosummary:: :toctree: generated/ :template: function.rst make_imbalance fetch_datasets imbalanced-learn-0.12.2/doc/references/ensemble.rst000066400000000000000000000007571460233407600222020ustar00rootroot00000000000000.. _ensemble_ref: Ensemble methods ================ .. automodule:: imblearn.ensemble :no-members: :no-inherited-members: .. currentmodule:: imblearn.ensemble Boosting algorithms ------------------- .. 
autosummary:: :toctree: generated/ :template: class.rst EasyEnsembleClassifier RUSBoostClassifier Bagging algorithms ------------------ .. autosummary:: :toctree: generated/ :template: class.rst BalancedBaggingClassifier BalancedRandomForestClassifier imbalanced-learn-0.12.2/doc/references/index.rst000066400000000000000000000004501460233407600215050ustar00rootroot00000000000000.. _api: ############# API reference ############# This is the full API documentation of the `imbalanced-learn` toolbox. .. toctree:: :maxdepth: 3 under_sampling over_sampling combine ensemble keras tensorflow miscellaneous pipeline metrics datasets utils imbalanced-learn-0.12.2/doc/references/keras.rst000066400000000000000000000005701460233407600215060ustar00rootroot00000000000000.. _keras_ref: Batch generator for Keras ========================= .. automodule:: imblearn.keras :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: class.rst keras.BalancedBatchGenerator .. autosummary:: :toctree: generated/ :template: function.rst keras.balanced_batch_generator imbalanced-learn-0.12.2/doc/references/metrics.rst000066400000000000000000000015561460233407600220540ustar00rootroot00000000000000.. _metrics_ref: Metrics ======= .. automodule:: imblearn.metrics :no-members: :no-inherited-members: Classification metrics ---------------------- See the :ref:`metrics` section of the user guide for further details. .. currentmodule:: imblearn.metrics .. autosummary:: :toctree: generated/ :template: function.rst classification_report_imbalanced sensitivity_specificity_support sensitivity_score specificity_score geometric_mean_score macro_averaged_mean_absolute_error make_index_balanced_accuracy Pairwise metrics ---------------- See the :ref:`pairwise_metrics` section of the user guide for further details. .. automodule:: imblearn.metrics.pairwise :no-members: :no-inherited-members: .. currentmodule:: imblearn.metrics.pairwise .. 
autosummary:: :toctree: generated/ :template: class.rst ValueDifferenceMetric imbalanced-learn-0.12.2/doc/references/miscellaneous.rst000066400000000000000000000003251460233407600232420ustar00rootroot00000000000000.. _misc_ref: Miscellaneous ============= Imbalance-learn provides some fast-prototyping tools. .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: class.rst FunctionSampler imbalanced-learn-0.12.2/doc/references/over_sampling.rst000066400000000000000000000010001460233407600232330ustar00rootroot00000000000000.. _over_sampling_ref: Over-sampling methods ===================== .. automodule:: imblearn.over_sampling :no-members: :no-inherited-members: .. currentmodule:: imblearn.over_sampling Basic over-sampling ------------------- .. autosummary:: :toctree: generated/ :template: class.rst RandomOverSampler SMOTE algorithms ---------------- .. autosummary:: :toctree: generated/ :template: class.rst SMOTE SMOTENC SMOTEN ADASYN BorderlineSMOTE KMeansSMOTE SVMSMOTE imbalanced-learn-0.12.2/doc/references/pipeline.rst000066400000000000000000000005001460233407600221770ustar00rootroot00000000000000.. _pipeline_ref: Pipeline ======== .. automodule:: imblearn.pipeline :no-members: :no-inherited-members: .. currentmodule:: imblearn.pipeline .. autosummary:: :toctree: generated/ :template: class.rst Pipeline .. autosummary:: :toctree: generated/ :template: function.rst make_pipeline imbalanced-learn-0.12.2/doc/references/tensorflow.rst000066400000000000000000000004561460233407600226060ustar00rootroot00000000000000.. _tensorflow_ref: Batch generator for TensorFlow ============================== .. automodule:: imblearn.tensorflow :no-members: :no-inherited-members: .. currentmodule:: imblearn .. autosummary:: :toctree: generated/ :template: function.rst tensorflow.balanced_batch_generator imbalanced-learn-0.12.2/doc/references/under_sampling.rst000066400000000000000000000016271460233407600234140ustar00rootroot00000000000000.. 
_under_sampling_ref: Under-sampling methods ====================== .. automodule:: imblearn.under_sampling :no-members: :no-inherited-members: Prototype generation -------------------- .. automodule:: imblearn.under_sampling._prototype_generation :no-members: :no-inherited-members: .. currentmodule:: imblearn.under_sampling .. autosummary:: :toctree: generated/ :template: class.rst ClusterCentroids Prototype selection ------------------- .. automodule:: imblearn.under_sampling._prototype_selection :no-members: :no-inherited-members: .. currentmodule:: imblearn.under_sampling .. autosummary:: :toctree: generated/ :template: class.rst CondensedNearestNeighbour EditedNearestNeighbours RepeatedEditedNearestNeighbours AllKNN InstanceHardnessThreshold NearMiss NeighbourhoodCleaningRule OneSidedSelection RandomUnderSampler TomekLinks imbalanced-learn-0.12.2/doc/references/utils.rst000066400000000000000000000013151460233407600215370ustar00rootroot00000000000000Utilities ========= .. automodule:: imblearn.utils :no-members: :no-inherited-members: .. currentmodule:: imblearn.utils Validation checks used in samplers ---------------------------------- .. autosummary:: :toctree: generated/ :template: function.rst estimator_checks.parametrize_with_checks check_neighbors_object check_sampling_strategy check_target_type Testing compatibility of your own sampler ----------------------------------------- .. automodule:: imblearn.utils.estimator_checks :no-members: :no-inherited-members: .. currentmodule:: imblearn.utils.estimator_checks .. 
autosummary:: :toctree: generated/ :template: function.rst parametrize_with_checks imbalanced-learn-0.12.2/doc/sphinxext/000077500000000000000000000000001460233407600175565ustar00rootroot00000000000000imbalanced-learn-0.12.2/doc/sphinxext/LICENSE.txt000066400000000000000000000136231460233407600214060ustar00rootroot00000000000000------------------------------------------------------------------------------- The files - numpydoc.py - autosummary.py - autosummary_generate.py - docscrape.py - docscrape_sphinx.py - phantom_import.py have the following license: Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
------------------------------------------------------------------------------- The files - compiler_unparse.py - comment_eater.py - traitsdoc.py have the following license: This software is OSI Certified Open Source Software. OSI Certified is a certification mark of the Open Source Initiative. Copyright (c) 2006, Enthought, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Enthought, Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
------------------------------------------------------------------------------- The files - only_directives.py - plot_directive.py originate from Matplotlib (http://matplotlib.sf.net/) which has the following license: Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. 1. This LICENSE AGREEMENT is between John D. Hunter (“JDHâ€), and the Individual or Organization (“Licenseeâ€) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. 2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved†are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. 3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. 4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS†basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. 5. 
JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. 7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. 8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. imbalanced-learn-0.12.2/doc/sphinxext/MANIFEST.in000066400000000000000000000000531460233407600213120ustar00rootroot00000000000000recursive-include tests *.py include *.txt imbalanced-learn-0.12.2/doc/sphinxext/README.txt000066400000000000000000000032401460233407600212530ustar00rootroot00000000000000===================================== numpydoc -- Numpy's Sphinx extensions ===================================== Numpy's documentation uses several custom extensions to Sphinx. These are shipped in this ``numpydoc`` package, in case you want to make use of them in third-party projects. The following extensions are available: - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add the code description directives ``np-function``, ``np-cfunction``, etc. that support the Numpy docstring syntax. - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::`` directive. Note that this implementation may still undergo severe changes or eventually be deprecated. 
- ``numpydoc.only_directives``: (DEPRECATED) - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive. Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``, and it the Sphinx 1.0 version is recommended over that included in Numpydoc. numpydoc ======== Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings following the Numpy/Scipy format to a form palatable to Sphinx. Options ------- The following options can be set in conf.py: - numpydoc_use_plots: bool Whether to produce ``plot::`` directives for Examples sections that contain ``import matplotlib``. - numpydoc_show_class_members: bool Whether to show all members of a class in the Methods and Attributes sections automatically. - numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) Whether to insert an edit link after docstrings. imbalanced-learn-0.12.2/doc/sphinxext/github_link.py000066400000000000000000000050701460233407600224310ustar00rootroot00000000000000import inspect import os import subprocess import sys from functools import partial from operator import attrgetter REVISION_CMD = "git rev-parse --short HEAD" def _get_git_revision(): try: revision = subprocess.check_output(REVISION_CMD.split()).strip() except (subprocess.CalledProcessError, OSError): print("Failed to execute git to get revision") return None return revision.decode("utf-8") def _linkcode_resolve(domain, info, package, url_fmt, revision): """Determine a link to online source for a class/method/function This is called by sphinx.ext.linkcode An example with a long-untouched module that everyone has >>> _linkcode_resolve('py', {'module': 'tty', ... 'fullname': 'setraw'}, ... package='tty', ... url_fmt='http://hg.python.org/cpython/file/' ... '{revision}/Lib/{package}/{path}#L{lineno}', ... 
revision='xxxx') 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' """ if revision is None: return if domain not in ("py", "pyx"): return if not info.get("module") or not info.get("fullname"): return class_name = info["fullname"].split(".")[0] if type(class_name) != str: # Python 2 only class_name = class_name.encode("utf-8") module = __import__(info["module"], fromlist=[class_name]) obj = attrgetter(info["fullname"])(module) try: fn = inspect.getsourcefile(obj) except Exception: fn = None if not fn: try: fn = inspect.getsourcefile(sys.modules[obj.__module__]) except Exception: fn = None if not fn: return fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) try: lineno = inspect.getsourcelines(obj)[1] except Exception: lineno = "" return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) def make_linkcode_resolve(package, url_fmt): """Returns a linkcode_resolve function for the given URL format revision is a git commit reference (hash or name) package is the name of the root module of the package url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 'blob/{revision}/{package}/' '{path}#L{lineno}') """ revision = _get_git_revision() return partial( _linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt ) imbalanced-learn-0.12.2/doc/sphinxext/sphinx_issues.py000066400000000000000000000177671460233407600230560ustar00rootroot00000000000000# -*- coding: utf-8 -*- """A Sphinx extension for linking to your project's issue tracker. 
Copyright 2014 Steven Loria Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import re from docutils import nodes, utils from sphinx.util.nodes import split_explicit_title __version__ = "1.2.0" __author__ = "Steven Loria" __license__ = "MIT" def user_role(name, rawtext, text, lineno, inliner, options=None, content=None): """Sphinx role for linking to a user profile. Defaults to linking to Github profiles, but the profile URIS can be configured via the ``issues_user_uri`` config value. 
Examples: :: :user:`sloria` Anchor text also works: :: :user:`Steven Loria ` """ options = options or {} content = content or [] has_explicit_title, title, target = split_explicit_title(text) target = utils.unescape(target).strip() title = utils.unescape(title).strip() config = inliner.document.settings.env.app.config if config.issues_user_uri: ref = config.issues_user_uri.format(user=target) else: ref = "https://github.com/{0}".format(target) if has_explicit_title: text = title else: text = "@{0}".format(target) link = nodes.reference(text=text, refuri=ref, **options) return [link], [] def cve_role(name, rawtext, text, lineno, inliner, options=None, content=None): """Sphinx role for linking to a CVE on https://cve.mitre.org. Examples: :: :cve:`CVE-2018-17175` """ options = options or {} content = content or [] has_explicit_title, title, target = split_explicit_title(text) target = utils.unescape(target).strip() title = utils.unescape(title).strip() ref = "https://cve.mitre.org/cgi-bin/cvename.cgi?name={0}".format(target) text = title if has_explicit_title else target link = nodes.reference(text=text, refuri=ref, **options) return [link], [] class IssueRole(object): EXTERNAL_REPO_REGEX = re.compile(r"^(\w+)/(.+)([#@])([\w]+)$") def __init__( self, uri_config_option, format_kwarg, github_uri_template, format_text=None, ): self.uri_config_option = uri_config_option self.format_kwarg = format_kwarg self.github_uri_template = github_uri_template self.format_text = format_text or self.default_format_text @staticmethod def default_format_text(issue_no): return "#{0}".format(issue_no) def make_node(self, name, issue_no, config, options=None): name_map = {"pr": "pull", "issue": "issues", "commit": "commit"} options = options or {} repo_match = self.EXTERNAL_REPO_REGEX.match(issue_no) if repo_match: # External repo username, repo, symbol, issue = repo_match.groups() if name not in name_map: raise ValueError( "External repo linking not supported for :{}:".format(name) ) path 
= name_map.get(name) ref = "https://github.com/{issues_github_path}/{path}/{n}".format( issues_github_path="{}/{}".format(username, repo), path=path, n=issue, ) formatted_issue = self.format_text(issue).lstrip("#") text = "{username}/{repo}{symbol}{formatted_issue}".format(**locals()) link = nodes.reference(text=text, refuri=ref, **options) return link if issue_no not in ("-", "0"): uri_template = getattr(config, self.uri_config_option, None) if uri_template: ref = uri_template.format(**{self.format_kwarg: issue_no}) elif config.issues_github_path: ref = self.github_uri_template.format( issues_github_path=config.issues_github_path, n=issue_no ) else: raise ValueError( "Neither {} nor issues_github_path " "is set".format(self.uri_config_option) ) issue_text = self.format_text(issue_no) link = nodes.reference(text=issue_text, refuri=ref, **options) else: link = None return link def __call__( self, name, rawtext, text, lineno, inliner, options=None, content=None ): options = options or {} content = content or [] issue_nos = [each.strip() for each in utils.unescape(text).split(",")] config = inliner.document.settings.env.app.config ret = [] for i, issue_no in enumerate(issue_nos): node = self.make_node(name, issue_no, config, options=options) ret.append(node) if i != len(issue_nos) - 1: sep = nodes.raw(text=", ", format="html") ret.append(sep) return ret, [] """Sphinx role for linking to an issue. Must have `issues_uri` or `issues_github_path` configured in ``conf.py``. Examples: :: :issue:`123` :issue:`42,45` :issue:`sloria/konch#123` """ issue_role = IssueRole( uri_config_option="issues_uri", format_kwarg="issue", github_uri_template="https://github.com/{issues_github_path}/issues/{n}", ) """Sphinx role for linking to a pull request. Must have `issues_pr_uri` or `issues_github_path` configured in ``conf.py``. 
Examples: :: :pr:`123` :pr:`42,45` :pr:`sloria/konch#43` """ pr_role = IssueRole( uri_config_option="issues_pr_uri", format_kwarg="pr", github_uri_template="https://github.com/{issues_github_path}/pull/{n}", ) def format_commit_text(sha): return sha[:7] """Sphinx role for linking to a commit. Must have `issues_pr_uri` or `issues_github_path` configured in ``conf.py``. Examples: :: :commit:`123abc456def` :commit:`sloria/konch@123abc456def` """ commit_role = IssueRole( uri_config_option="issues_commit_uri", format_kwarg="commit", github_uri_template="https://github.com/{issues_github_path}/commit/{n}", format_text=format_commit_text, ) def setup(app): # Format template for issues URI # e.g. 'https://github.com/sloria/marshmallow/issues/{issue} app.add_config_value("issues_uri", default=None, rebuild="html") # Format template for PR URI # e.g. 'https://github.com/sloria/marshmallow/pull/{issue} app.add_config_value("issues_pr_uri", default=None, rebuild="html") # Format template for commit URI # e.g. 'https://github.com/sloria/marshmallow/commits/{commit} app.add_config_value("issues_commit_uri", default=None, rebuild="html") # Shortcut for Github, e.g. 'sloria/marshmallow' app.add_config_value("issues_github_path", default=None, rebuild="html") # Format template for user profile URI # e.g. 'https://github.com/{user}' app.add_config_value("issues_user_uri", default=None, rebuild="html") app.add_role("issue", issue_role) app.add_role("pr", pr_role) app.add_role("user", user_role) app.add_role("commit", commit_role) app.add_role("cve", cve_role) return { "version": __version__, "parallel_read_safe": True, "parallel_write_safe": True, } imbalanced-learn-0.12.2/doc/under_sampling.rst000066400000000000000000000540241460233407600212720ustar00rootroot00000000000000.. _under-sampling: ============== Under-sampling ============== .. 
currentmodule:: imblearn.under_sampling One way of handling imbalanced datasets is to reduce the number of observations from all classes but the minority class. The minority class is that with the least number of observations. The most well known algorithm in this group is random undersampling, where samples from the targeted classes are removed at random. But there are many other algorithms to help us reduce the number of observations in the dataset. These algorithms can be grouped based on their undersampling strategy into: - Prototype generation methods. - Prototype selection methods. And within the latter, we find: - Controlled undersampling - Cleaning methods We will discuss the different algorithms throughout this document. Check also :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`. .. _cluster_centroids: Prototype generation ==================== Given an original data set :math:`S`, prototype generation algorithms will generate a new set :math:`S'` where :math:`|S'| < |S|` and :math:`S' \not\subset S`. In other words, prototype generation techniques will reduce the number of samples in the targeted classes but the remaining samples are generated --- and not selected --- from the original set. :class:`ClusterCentroids` makes use of K-means to reduce the number of samples. Therefore, each class will be synthesized with the centroids of the K-means method instead of the original samples:: >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, ... n_redundant=0, n_repeated=0, n_classes=3, ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... 
class_sep=0.8, random_state=0) >>> print(sorted(Counter(y).items())) [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.under_sampling import ClusterCentroids >>> cc = ClusterCentroids(random_state=0) >>> X_resampled, y_resampled = cc.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] The figure below illustrates such under-sampling. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_001.png :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html :scale: 60 :align: center :class:`ClusterCentroids` offers an efficient way to represent the data cluster with a reduced number of samples. Keep in mind that this method requires that your data are grouped into clusters. In addition, the number of centroids should be set such that the under-sampled clusters are representative of the original one. .. warning:: :class:`ClusterCentroids` supports sparse matrices. However, the new samples generated are not specifically sparse. Therefore, even if the resulting matrix will be sparse, the algorithm will be inefficient in this regard. Prototype selection =================== Prototype selection algorithms will select samples from the original set :math:`S`, generating a dataset :math:`S'`, where :math:`|S'| < |S|` and :math:`S' \subset S`. In other words, :math:`S'` is a subset of :math:`S`. Prototype selection algorithms can be divided into two groups: (i) controlled under-sampling techniques and (ii) cleaning under-sampling techniques. Controlled under-sampling methods reduce the number of observations in the majority class or classes to an arbitrary number of samples specified by the user. Typically, they reduce the number of observations to the number of samples observed in the minority class. In contrast, cleaning under-sampling techniques "clean" the feature space by removing either "noisy" or "too easy to classify" observations, depending on the method. 
The final number of observations in each class varies with the cleaning method and can't be specified by the user. .. _controlled_under_sampling: Controlled under-sampling techniques ------------------------------------ Controlled under-sampling techniques reduce the number of observations from the targeted classes to a number specified by the user. Random under-sampling ^^^^^^^^^^^^^^^^^^^^^ :class:`RandomUnderSampler` is a fast and easy way to balance the data by randomly selecting a subset of data for the targeted classes:: >>> from imblearn.under_sampling import RandomUnderSampler >>> rus = RandomUnderSampler(random_state=0) >>> X_resampled, y_resampled = rus.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_002.png :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html :scale: 60 :align: center :class:`RandomUnderSampler` allows bootstrapping the data by setting ``replacement`` to ``True``. When there are multiple classes, each targeted class is under-sampled independently:: >>> import numpy as np >>> print(np.vstack([tuple(row) for row in X_resampled]).shape) (192, 2) >>> rus = RandomUnderSampler(random_state=0, replacement=True) >>> X_resampled, y_resampled = rus.fit_resample(X, y) >>> print(np.vstack(np.unique([tuple(row) for row in X_resampled], axis=0)).shape) (181, 2) :class:`RandomUnderSampler` handles heterogeneous data types, i.e. numerical, categorical, dates, etc.:: >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], ... 
dtype=object) >>> y_hetero = np.array([0, 0, 1]) >>> X_resampled, y_resampled = rus.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) [['xxx' 1 1.0] ['zzz' 3 3.0]] >>> print(y_resampled) [0 1] :class:`RandomUnderSampler` also supports pandas dataframes as input for undersampling:: >>> from sklearn.datasets import fetch_openml >>> df_adult, y_adult = fetch_openml( ... 'adult', version=2, as_frame=True, return_X_y=True) >>> df_adult.head() # doctest: +SKIP >>> df_resampled, y_resampled = rus.fit_resample(df_adult, y_adult) >>> df_resampled.head() # doctest: +SKIP :class:`NearMiss` adds some heuristic rules to select samples :cite:`mani2003knn`. :class:`NearMiss` implements 3 different types of heuristic which can be selected with the parameter ``version``:: >>> from imblearn.under_sampling import NearMiss >>> nm1 = NearMiss(version=1) >>> X_resampled_nm1, y_resampled = nm1.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] As later stated in the next section, :class:`NearMiss` heuristic rules are based on nearest neighbors algorithm. Therefore, the parameters ``n_neighbors`` and ``n_neighbors_ver3`` accept classifier derived from ``KNeighborsMixin`` from scikit-learn. The former parameter is used to compute the average distance to the neighbors while the latter is used for the pre-selection of the samples of interest. Mathematical formulation ^^^^^^^^^^^^^^^^^^^^^^^^ Let *positive samples* be the samples belonging to the targeted class to be under-sampled. *Negative sample* refers to the samples from the minority class (i.e., the most under-represented class). NearMiss-1 selects the positive samples for which the average distance to the :math:`N` closest samples of the negative class is the smallest. .. 
image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_nearmiss_001.png :target: ./auto_examples/under-sampling/plot_illustration_nearmiss.html :scale: 60 :align: center NearMiss-2 selects the positive samples for which the average distance to the :math:`N` farthest samples of the negative class is the smallest. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_nearmiss_002.png :target: ./auto_examples/under-sampling/plot_illustration_nearmiss.html :scale: 60 :align: center NearMiss-3 is a 2-steps algorithm. First, for each negative sample, their :math:`M` nearest-neighbors will be kept. Then, the positive samples selected are the one for which the average distance to the :math:`N` nearest-neighbors is the largest. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_nearmiss_003.png :target: ./auto_examples/under-sampling/plot_illustration_nearmiss.html :scale: 60 :align: center In the next example, the different :class:`NearMiss` variant are applied on the previous toy example. It can be seen that the decision functions obtained in each case are different. When under-sampling a specific class, NearMiss-1 can be altered by the presence of noise. In fact, it will implied that samples of the targeted class will be selected around these samples as it is the case in the illustration below for the yellow class. However, in the normal case, samples next to the boundaries will be selected. NearMiss-2 will not have this effect since it does not focus on the nearest samples but rather on the farthest samples. We can imagine that the presence of noise can also altered the sampling mainly in the presence of marginal outliers. NearMiss-3 is probably the version which will be less affected by noise due to the first step sample selection. .. 
When :class:`TomekLinks` finds a Tomek's link, it can remove either the sample from the majority class only, or both samples of the link.
The following figure illustrates this behaviour: on the left, only the sample from the majority class is removed, whereas on the right, the entire Tomek's link is removed. .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_tomek_links_002.png :target: ./auto_examples/under-sampling/plot_illustration_tomek_links.html :scale: 60 :align: center .. _edited_nearest_neighbors: Editing data using nearest neighbours ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Edited nearest neighbours ~~~~~~~~~~~~~~~~~~~~~~~~~ The edited nearest neighbours methodology uses K-Nearest Neighbours to identify the neighbours of the targeted class samples, and then removes observations if any or most of their neighbours are from a different class :cite:`wilson1972asymptotic`. :class:`EditedNearestNeighbours` carries out the following steps: 1. Train a K-Nearest neighbours using the entire dataset. 2. Find each observations' K closest neighbours (only for the targeted classes). 3. Remove observations if any or most of its neighbours belong to a different class. Below the code implementation:: >>> sorted(Counter(y).items()) [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.under_sampling import EditedNearestNeighbours >>> enn = EditedNearestNeighbours() >>> X_resampled, y_resampled = enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 213), (2, 4568)] To paraphrase step 3, :class:`EditedNearestNeighbours` will retain observations from the majority class when **most**, or **all** of its neighbours are from the same class. To control this behaviour we set ``kind_sel='mode'`` or ``kind_sel='all'``, respectively. 
Hence, `kind_sel='all'` is less conservative than `kind_sel='mode'`, resulting in the removal of more samples:: >>> enn = EditedNearestNeighbours(kind_sel="all") >>> X_resampled, y_resampled = enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 213), (2, 4568)] >>> enn = EditedNearestNeighbours(kind_sel="mode") >>> X_resampled, y_resampled = enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 234), (2, 4666)] The parameter ``n_neighbors`` accepts integers. The integer refers to the number of neighbours to examine for each sample. It can also take a classifier subclassed from ``KNeighborsMixin`` from scikit-learn. When passing a classifier, note that, if you pass a 3-Nearest Neighbors classifier, only 2 neighbours will be examined for the cleaning, as the third sample is the one being examined for undersampling since it is part of the samples provided at `fit`. Repeated Edited Nearest Neighbours ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :class:`RepeatedEditedNearestNeighbours` extends :class:`EditedNearestNeighbours` by repeating the algorithm multiple times :cite:`tomek1976experiment`. Generally, repeating the algorithm will delete more data:: >>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours >>> renn = RepeatedEditedNearestNeighbours() >>> X_resampled, y_resampled = renn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 208), (2, 4551)] The user can set up the number of times the edited nearest neighbours method should be repeated through the parameter `max_iter`. The repetitions will stop when: 1. the maximum number of iterations is reached, or 2. no more observations are removed, or 3. one of the majority classes becomes a minority class, or 4. one of the majority classes disappears during the undersampling. 
3. Train a 1-Nearest Neighbour on :math:`C`.
The final dataset is :math:`C`, containing all observations from the minority class and those from the majority that were misclassified by the successive 1-Nearest Neighbour algorithms.
:class:`NeighbourhoodCleaningRule` focuses on cleaning the data rather than condensing them :cite:`laurikkala2001improving`. Therefore, it will use the union of samples to be rejected between the :class:`EditedNearestNeighbours` and the output of a 3 nearest neighbors classifier.
The training is performed with cross-validation, which can be specified through the parameter ``cv``.
By :user:`Fernando Nogueira `,
Fix a regression in over-sampler where the string `minority` was rejected as an invalid sampling strategy.
:pr:`946` by :user:`Guillaume Lemaitre `. Enhancements ............ - Add support to accept compatible `NearestNeighbors` objects by only duck-typing. For instance, it allows to accept cuML instances. :pr:`858` by :user:`NV-jpt ` and :user:`Guillaume Lemaitre `. imbalanced-learn-0.12.2/doc/whats_new/v0.11.rst000066400000000000000000000055121460233407600210250ustar00rootroot00000000000000.. _changes_0_11: Version 0.11.0 ============== **July 8, 2023** Changelog --------- Bug fixes ......... - Fix a bug in :func:`~imblearn.metrics.classification_report_imbalanced` where the parameter `target_names` was not taken into account when `output_dict=True`. :pr:`989` by :user:`AYY7 `. - :class:`~imblearn.over_sampling.SMOTENC` now handles mix types of data type such as `bool` and `pd.category` by delegating the conversion to scikit-learn encoder. :pr:`1002` by :user:`Guillaume Lemaitre `. - Handle sparse matrices in :class:`~imblearn.over_sampling.SMOTEN` and raise a warning since it requires a conversion to dense matrices. :pr:`1003` by :user:`Guillaume Lemaitre `. - Remove spurious warning raised when minority class get over-sampled more than the number of sample in the majority class. :pr:`1007` by :user:`Guillaume Lemaitre `. Compatibility ............. - Maintenance release for being compatible with scikit-learn >= 1.3.0. :pr:`999` by :user:`Guillaume Lemaitre `. Deprecation ........... - The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated and will be removed in version 0.13. Use `categorical_encoder_` instead. :pr:`1000` by :user:`Guillaume Lemaitre `. - The default of the parameters `sampling_strategy`, `bootstrap` and `replacement` will change in :class:`~imblearn.ensemble.BalancedRandomForestClassifier` to follow the implementation of the original paper. This changes will take effect in version 0.13. :pr:`1006` by :user:`Guillaume Lemaitre `. Enhancements ............ 
- :class:`~imblearn.over_sampling.SMOTENC` now supports passing array-like of `str` when passing the `categorical_features` parameter. :pr:`1008` by :user:`Guillaume Lemaitre `. - :class:`~imblearn.over_sampling.SMOTENC` now supports automatic categorical inference when `categorical_features` is set to `"auto"`. :pr:`1009` by :user:`Guillaume Lemaitre `.
:pr:`1073` by :user:`Guillaume Lemaitre `. - Fix test to be compatible with Python 3.13. :pr:`1073` by :user:`Guillaume Lemaitre `. Version 0.12.0 ============== **January 24, 2024** Changelog --------- Bug fixes ......... - Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` where the entries of the one-hot encoding should be divided by `sqrt(2)` and not `2`, taking into account that they are plugged into an Euclidean distance computation. :pr:`1014` by :user:`Guillaume Lemaitre `. - Raise an informative error message when all support vectors are tagged as noise in :class:`~imblearn.over_sampling.SVMSMOTE`. :pr:`1016` by :user:`Guillaume Lemaitre `. - Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` where the median of standard deviation of the continuous features was only computed on the minority class. Now, we are computing this statistic for each class that is up-sampled. :pr:`1015` by :user:`Guillaume Lemaitre `. - Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` such that the case where the median of standard deviation of the continuous features is null is handled in the multiclass case as well. :pr:`1015` by :user:`Guillaume Lemaitre `. - Fix a bug in :class:`~imblearn.over_sampling.BorderlineSMOTE` version 2 where samples should be generated from the whole dataset and not only from the minority class. :pr:`1023` by :user:`Guillaume Lemaitre `. - Fix a bug in :class:`~imblearn.under_sampling.NeighbourhoodCleaningRule` where the `kind_sel="all"` was not working as explained in the literature. :pr:`1012` by :user:`Guillaume Lemaitre `. - Fix a bug in :class:`~imblearn.under_sampling.NeighbourhoodCleaningRule` where the `threshold_cleaning` ratio was multiplied on the total number of samples instead of the number of samples in the minority class. :pr:`1012` by :user:`Guillaume Lemaitre `. 
- Deprecate `kind_sel` in :class:`~imblearn.under_sampling.NeighbourhoodCleaningRule`. It will be removed in 0.14. The parameter does not have any effect.
- Fixed a bug in :class:`under_sampling.CondensedNearestNeighbour`, correction of the list of indices returned.
- `size_ngh` has been deprecated in :class:`under_sampling.CondensedNearestNeighbour`. Use `n_neighbors` instead.
- `n_neighbors` accepts `KNeighborsMixin` based objects for :class:`under_sampling.EditedNearestNeighbours`, :class:`under_sampling.CondensedNearestNeighbour`, :class:`under_sampling.NeighbourhoodCleaningRule`, :class:`under_sampling.RepeatedEditedNearestNeighbours`, and :class:`under_sampling.AllKNN`.
- move the under-sampling methods in ``prototype_selection`` and ``prototype_generation`` submodules to make a clearer distinction.
:issue:`290` by :user:`Guillaume Lemaitre `. imbalanced-learn-0.12.2/doc/whats_new/v0.4.rst000066400000000000000000000206611460233407600207510ustar00rootroot00000000000000.. _changes_0_4: Version 0.4.2 ============= **October 21, 2018** Changelog --------- Bug fixes ......... - Fix a bug in :class:`imblearn.over_sampling.SMOTENC` in which the the median of the standard deviation instead of half of the median of the standard deviation. By :user:`Guillaume Lemaitre ` in :issue:`491`. - Raise an error when passing target which is not supported, i.e. regression target or multilabel targets. Imbalanced-learn does not support this case. By :user:`Guillaume Lemaitre ` in :issue:`490`. - Fix a bug in :class:`imblearn.over_sampling.SMOTENC` in which a sparse matrices were densify during ``inverse_transform``. By :user:`Guillaume Lemaitre ` in :issue:`495`. - Fix a bug in :class:`imblearn.over_sampling.SMOTE_NC` in which a the tie breaking was wrongly sampling. By :user:`Guillaume Lemaitre ` in :issue:`497`. Version 0.4 =========== **October 12, 2018** .. warning:: Version 0.4 is the last version of imbalanced-learn to support Python 2.7 and Python 3.4. Imbalanced-learn 0.5 will require Python 3.5 or higher. Highlights ---------- This release brings its set of new feature as well as some API changes to strengthen the foundation of imbalanced-learn. As new feature, 2 new modules :mod:`imblearn.keras` and :mod:`imblearn.tensorflow` have been added in which imbalanced-learn samplers can be used to generate balanced mini-batches. The module :mod:`imblearn.ensemble` has been consolidated with new classifier: :class:`imblearn.ensemble.BalancedRandomForestClassifier`, :class:`imblearn.ensemble.EasyEnsembleClassifier`, :class:`imblearn.ensemble.RUSBoostClassifier`. Support for string has been added in :class:`imblearn.over_sampling.RandomOverSampler` and :class:`imblearn.under_sampling.RandomUnderSampler`. 
In addition, a new class :class:`imblearn.over_sampling.SMOTENC` allows to generate sample with data sets containing both continuous and categorical features. The :class:`imblearn.over_sampling.SMOTE` has been simplified and break down to 2 additional classes: :class:`imblearn.over_sampling.SVMSMOTE` and :class:`imblearn.over_sampling.BorderlineSMOTE`. There is also some changes regarding the API: the parameter ``sampling_strategy`` has been introduced to replace the ``ratio`` parameter. In addition, the ``return_indices`` argument has been deprecated and all samplers will exposed a ``sample_indices_`` whenever this is possible. Changelog --------- API ... - Replace the parameter ``ratio`` by ``sampling_strategy``. :issue:`411` by :user:`Guillaume Lemaitre `. - Enable to use a ``float`` with binary classification for ``sampling_strategy``. :issue:`411` by :user:`Guillaume Lemaitre `. - Enable to use a ``list`` for the cleaning methods to specify the class to sample. :issue:`411` by :user:`Guillaume Lemaitre `. - Replace ``fit_sample`` by ``fit_resample``. An alias is still available for backward compatibility. In addition, ``sample`` has been removed to avoid resampling on different set of data. :issue:`462` by :user:`Guillaume Lemaitre `. New features ............ - Add a :mod:`keras` and :mod:`tensorflow` modules to create balanced mini-batches generator. :issue:`409` by :user:`Guillaume Lemaitre `. - Add :class:`imblearn.ensemble.EasyEnsembleClassifier` which create a bag of AdaBoost classifier trained on balanced bootstrap samples. :issue:`455` by :user:`Guillaume Lemaitre `. - Add :class:`imblearn.ensemble.BalancedRandomForestClassifier` which balanced each bootstrap provided to each tree of the forest. :issue:`459` by :user:`Guillaume Lemaitre `. - Add :class:`imblearn.ensemble.RUSBoostClassifier` which applied a random under-sampling stage before each boosting iteration of AdaBoost. :issue:`469` by :user:`Guillaume Lemaitre `. 
- Add :class:`imblern.over_sampling.SMOTENC` which generate synthetic samples on data set with heterogeneous data type (continuous and categorical features). :issue:`412` by :user:`Denis Dudnik ` and :user:`Guillaume Lemaitre `. Enhancement ........... - Add a documentation node to create a balanced random forest from a balanced bagging classifier. :issue:`372` by :user:`Guillaume Lemaitre `. - Document the metrics to evaluate models on imbalanced dataset. :issue:`367` by :user:`Guillaume Lemaitre `. - Add support for one-vs-all encoded target to support keras. :issue:`409` by :user:`Guillaume Lemaitre `. - Adding specific class for borderline and SVM SMOTE using :class:`BorderlineSMOTE` and :class:`SVMSMOTE`. :issue:`440` by :user:`Guillaume Lemaitre `. - Allow :class:`imblearn.over_sampling.RandomOverSampler` can return indices using the attributes ``return_indices``. :issue:`439` by :user:`Hugo Gascon` and :user:`Guillaume Lemaitre `. - Allow :class:`imblearn.under_sampling.RandomUnderSampler` and :class:`imblearn.over_sampling.RandomOverSampler` to sample object array containing strings. :issue:`451` by :user:`Guillaume Lemaitre `. Bug fixes ......... - Fix bug in :func:`metrics.classification_report_imbalanced` for which `y_pred` and `y_true` where inversed. :issue:`394` by :user:`Ole Silvig .` - Fix bug in ADASYN to consider only samples from the current class when generating new samples. :issue:`354` by :user:`Guillaume Lemaitre `. - Fix bug which allow for sorted behavior of ``sampling_strategy`` dictionary and thus to obtain a deterministic results when using the same random state. :issue:`447` by :user:`Guillaume Lemaitre `. - Force to clone scikit-learn estimator passed as attributes to samplers. :issue:`446` by :user:`Guillaume Lemaitre `. - Fix bug which was not preserving the dtype of X and y when generating samples. :issue:`450` by :user:`Guillaume Lemaitre `. 
- Add the option to pass a ``Memory`` object to :func:`make_pipeline` like in :class:`pipeline.Pipeline` class. :issue:`458` by :user:`Christos Aridas `. Maintenance ........... - Remove deprecated parameters in 0.2 - :issue:`331` by :user:`Guillaume Lemaitre `. - Make some modules private. :issue:`452` by :user:`Guillaume Lemaitre `. - Upgrade requirements to scikit-learn 0.20. :issue:`379` by :user:`Guillaume Lemaitre `. - Catch deprecation warning in testing. :issue:`441` by :user:`Guillaume Lemaitre `. - Refactor and impose `pytest` style tests. :issue:`470` by :user:`Guillaume Lemaitre `. Documentation ............. - Remove some docstring which are not necessary. :issue:`454` by :user:`Guillaume Lemaitre `. - Fix the documentation of the ``sampling_strategy`` parameters when used as a float. :issue:`480` by :user:`Guillaume Lemaitre `. Deprecation ........... - Deprecate ``ratio`` in favor of ``sampling_strategy``. :issue:`411` by :user:`Guillaume Lemaitre `. - Deprecate the use of a ``dict`` for cleaning methods. a ``list`` should be used. :issue:`411` by :user:`Guillaume Lemaitre `. - Deprecate ``random_state`` in :class:`imblearn.under_sampling.NearMiss`, :class:`imblearn.under_sampling.EditedNearestNeighbors`, :class:`imblearn.under_sampling.RepeatedEditedNearestNeighbors`, :class:`imblearn.under_sampling.AllKNN`, :class:`imblearn.under_sampling.NeighbourhoodCleaningRule`, :class:`imblearn.under_sampling.InstanceHardnessThreshold`, :class:`imblearn.under_sampling.CondensedNearestNeighbours`. - Deprecate ``kind``, ``out_step``, ``svm_estimator``, ``m_neighbors`` in :class:`imblearn.over_sampling.SMOTE`. User should use :class:`imblearn.over_sampling.SVMSMOTE` and :class:`imblearn.over_sampling.BorderlineSMOTE`. :issue:`440` by :user:`Guillaume Lemaitre `. - Deprecate :class:`imblearn.ensemble.EasyEnsemble` in favor of meta-estimator :class:`imblearn.ensemble.EasyEnsembleClassifier` which follow the exact algorithm described in the literature. 
:issue:`455` by :user:`Guillaume Lemaitre `. - Deprecate :class:`imblearn.ensemble.BalanceCascade`. :issue:`472` by :user:`Guillaume Lemaitre `. - Deprecate ``return_indices`` in all samplers. Instead, an attribute ``sample_indices_`` is created whenever the sampler is selecting a subset of the original samples. :issue:`474` by :user:`Guillaume Lemaitre `. - Add :class:`imblearn.over_sampling.BorderlineSMOTE` and :class:`imblearn.over_sampling.SVMSMOTE` in the API documenation. :issue:`530` by :user:`Guillaume Lemaitre `. Enhancement ........... - Add Parallelisation for SMOTEENN and SMOTETomek. :pr:`547` by :user:`Michael Hsieh `. - Add :class:`imblearn.utils._show_versions`. Updated the contribution guide and issue template showing how to print system and dependency information from the command line. :pr:`557` by :user:`Alexander L. Hayes `. - Add :class:`imblearn.over_sampling.KMeansSMOTE` which is an over-sampler clustering points before to apply SMOTE. :pr:`435` by :user:`Stephan Heijl `. Maintenance ........... - Make it possible to ``import imblearn`` and access submodule. :pr:`500` by :user:`Guillaume Lemaitre `. - Remove support for Python 2, remove deprecation warning from scikit-learn 0.21. :pr:`576` by :user:`Guillaume Lemaitre `. Bug ... - Fix wrong usage of :class:`keras.layers.BatchNormalization` in ``porto_seguro_keras_under_sampling.py`` example. The batch normalization was moved before the activation function and the bias was removed from the dense layer. :pr:`531` by :user:`Guillaume Lemaitre `. - Fix bug which converting to COO format sparse when stacking the matrices in :class:`imblearn.over_sampling.SMOTENC`. This bug was only old scipy version. :pr:`539` by :user:`Guillaume Lemaitre `. - Fix bug in :class:`imblearn.pipeline.Pipeline` where None could be the final estimator. :pr:`554` by :user:`Oliver Rausch `. 
- Fix bug in :class:`imblearn.over_sampling.SVMSMOTE` and :class:`imblearn.over_sampling.BorderlineSMOTE` where the default parameter of ``n_neighbors`` was not set properly. :pr:`578` by :user:`Guillaume Lemaitre `. - Fix bug by changing the default depth in :class:`imblearn.ensemble.RUSBoostClassifier` to get a decision stump as a weak learner as in the original paper. :pr:`545` by :user:`Christos Aridas `. - Allow to import ``keras`` directly from ``tensorflow`` in the :mod:`imblearn.keras`. :pr:`531` by :user:`Guillaume Lemaitre `. imbalanced-learn-0.12.2/doc/whats_new/v0.6.rst000066400000000000000000000115471460233407600207560ustar00rootroot00000000000000.. _changes_0_6_2: Version 0.6.2 ============== **February 16, 2020** This is a bug-fix release to resolve some issues regarding the handling the input and the output format of the arrays. Changelog --------- - Allow column vectors to be passed as targets. :pr:`673` by :user:`Christos Aridas `. - Better input/output handling for pandas, numpy and plain lists. :pr:`681` by :user:`Christos Aridas `. .. _changes_0_6_1: Version 0.6.1 ============== **December 7, 2019** This is a bug-fix release to primarily resolve some packaging issues in version 0.6.0. It also includes minor documentation improvements and some bug fixes. Changelog --------- Bug fixes ......... - Fix a bug in :class:`imblearn.ensemble.BalancedRandomForestClassifier` leading to a wrong number of samples used during fitting due `max_samples` and therefore a bad computation of the OOB score. :pr:`656` by :user:`Guillaume Lemaitre `. .. _changes_0_6: Version 0.6.0 ============= **December 5, 2019** Changelog --------- Changed models .............. 
The following models might give some different sampling due to changes in scikit-learn: - :class:`imblearn.under_sampling.ClusterCentroids` - :class:`imblearn.under_sampling.InstanceHardnessThreshold` The following samplers will give different results due to change linked to the random state internal usage: - :class:`imblearn.over_sampling.ADASYN` - :class:`imblearn.over_sampling.SMOTENC` Bug fixes ......... - :class:`imblearn.under_sampling.InstanceHardnessThreshold` now take into account the `random_state` and will give deterministic results. In addition, `cross_val_predict` is used to take advantage of the parallelism. :pr:`599` by :user:`Shihab Shahriar Khan `. - Fix a bug in :class:`imblearn.ensemble.BalancedRandomForestClassifier` leading to a wrong computation of the OOB score. :pr:`656` by :user:`Guillaume Lemaitre `. Maintenance ........... - Update imports from scikit-learn after that some modules have been privatize. The following import have been changed: :class:`sklearn.ensemble._base._set_random_states`, :class:`sklearn.ensemble._forest._parallel_build_trees`, :class:`sklearn.metrics._classification._check_targets`, :class:`sklearn.metrics._classification._prf_divide`, :class:`sklearn.utils.Bunch`, :class:`sklearn.utils._safe_indexing`, :class:`sklearn.utils._testing.assert_allclose`, :class:`sklearn.utils._testing.assert_array_equal`, :class:`sklearn.utils._testing.SkipTest`. :pr:`617` by :user:`Guillaume Lemaitre `. - Synchronize :mod:`imblearn.pipeline` with :mod:`sklearn.pipeline`. :pr:`620` by :user:`Guillaume Lemaitre `. - Synchronize :class:`imblearn.ensemble.BalancedRandomForestClassifier` and add parameters `max_samples` and `ccp_alpha`. :pr:`621` by :user:`Guillaume Lemaitre `. Enhancement ........... - :class:`imblearn.under_sampling.RandomUnderSampling`, :class:`imblearn.over_sampling.RandomOverSampling`, :class:`imblearn.datasets.make_imbalance` accepts Pandas DataFrame in and will output Pandas DataFrame. 
Similarly, it will accepts Pandas Series in and will output Pandas Series. :pr:`636` by :user:`Guillaume Lemaitre `. - :class:`imblearn.FunctionSampler` accepts a parameter ``validate`` allowing to check or not the input ``X`` and ``y``. :pr:`637` by :user:`Guillaume Lemaitre `. - :class:`imblearn.under_sampling.RandomUnderSampler`, :class:`imblearn.over_sampling.RandomOverSampler` can resample when non finite values are present in ``X``. :pr:`643` by :user:`Guillaume Lemaitre `. - All samplers will output a Pandas DataFrame if a Pandas DataFrame was given as an input. :pr:`644` by :user:`Guillaume Lemaitre `. - The samples generation in :class:`imblearn.over_sampling.ADASYN`, :class:`imblearn.over_sampling.SMOTE`, :class:`imblearn.over_sampling.BorderlineSMOTE`, :class:`imblearn.over_sampling.SVMSMOTE`, :class:`imblearn.over_sampling.KMeansSMOTE`, :class:`imblearn.over_sampling.SMOTENC` is now vectorize with giving an additional speed-up when `X` in sparse. :pr:`596` and :pr:`649` by :user:`Matt Eding `. Deprecation ........... - The following classes have been removed after 2 deprecation cycles: `ensemble.BalanceCascade` and `ensemble.EasyEnsemble`. :pr:`617` by :user:`Guillaume Lemaitre `. - The following functions have been removed after 2 deprecation cycles: `utils.check_ratio`. :pr:`617` by :user:`Guillaume Lemaitre `. - The parameter `ratio` and `return_indices` has been removed from all samplers. :pr:`617` by :user:`Guillaume Lemaitre `. - The parameters `m_neighbors`, `out_step`, `kind`, `svm_estimator` have been removed from the :class:`imblearn.over_sampling.SMOTE`. :pr:`617` by :user:`Guillaume Lemaitre `. imbalanced-learn-0.12.2/doc/whats_new/v0.7.rst000066400000000000000000000046041460233407600207530ustar00rootroot00000000000000.. _changes_0_7: Version 0.7.0 ============= **June 9, 2020** Changelog --------- Maintenance ........... - Ensure that :class:`imblearn.pipeline.Pipeline` is working when `memory` is activated and `joblib==0.11`. 
:pr:`687` by :user:`Christos Aridas `. - Refactor common test to use the dev tools from `scikit-learn` 0.23. :pr:`710` by :user:`Guillaume Lemaitre `. - Remove `FutureWarning` issued by `scikit-learn` 0.23. :pr:`710` by :user:`Guillaume Lemaitre `. - Impose keywords only argument as in `scikit-learn`. :pr:`721` by :user:`Guillaume Lemaitre `. Changed models .............. The following models might give some different results due to changes: - :class:`imblearn.ensemble.BalancedRandomForestClassifier` Bug fixes ......... - Change the default value `min_samples_leaf` to be consistent with scikit-learn. :pr:`711` by :user:`zerolfx `. - Fix a bug due to change in `scikit-learn` 0.23 in :class:`imblearn.metrics.make_index_balanced_accuracy`. The function was unusable. :pr:`710` by :user:`Guillaume Lemaitre `. - Raise a proper error message when only numerical or categorical features are given in :class:`imblearn.over_sampling.SMOTENC`. :pr:`720` by :user:`Guillaume Lemaitre `. - Fix a bug when the median of the standard deviation is null in :class:`imblearn.over_sampling.SMOTENC`. :pr:`675` by :user:`bganglia `. Enhancements ............ - The classifier implemented in imbalanced-learn, :class:`imblearn.ensemble.BalancedBaggingClassifier`, :class:`imblearn.ensemble.BalancedRandomForestClassifier`, :class:`imblearn.ensemble.EasyEnsembleClassifier`, and :class:`imblearn.ensemble.RUSBoostClassifier`, accept `sampling_strategy` with the same key than in `y` without the need of encoding `y` in advance. :pr:`718` by :user:`Guillaume Lemaitre `. - Lazy import `keras` module when importing `imblearn.keras` :pr:`719` by :user:`Guillaume Lemaitre `. Deprecation ........... - Deprecation of the parameters `n_jobs` in :class:`imblearn.under_sampling.ClusterCentroids` since it was used by :class:`sklearn.cluster.KMeans` which deprecated it. :pr:`710` by :user:`Guillaume Lemaitre `. - Deprecation of passing keyword argument by position similarly to `scikit-learn`. 
:pr:`721` by :user:`Guillaume lemaitre `. imbalanced-learn-0.12.2/doc/whats_new/v0.8.rst000066400000000000000000000051641460233407600207560ustar00rootroot00000000000000.. _changes_0_8: Version 0.8.1 ============= **September 29, 2020** Changelog --------- Maintenance ........... - Make `imbalanced-learn` compatible with `scikit-learn` 1.0. :pr:`864` by :user:`Guillaume Lemaitre `. Version 0.8.0 ============= **February 18, 2021** Changelog --------- New features ............ - Add the the function :func:`imblearn.metrics.macro_averaged_mean_absolute_error` returning the average across class of the MAE. This metric is used in ordinal classification. :pr:`780` by :user:`Aurélien Massiot `. - Add the class :class:`imblearn.metrics.pairwise.ValueDifferenceMetric` to compute pairwise distances between samples containing only categorical values. :pr:`796` by :user:`Guillaume Lemaitre `. - Add the class :class:`imblearn.over_sampling.SMOTEN` to over-sample data only containing categorical features. :pr:`802` by :user:`Guillaume Lemaitre `. - Add the possibility to pass any type of samplers in :class:`imblearn.ensemble.BalancedBaggingClassifier` unlocking the implementation of methods based on resampled bagging. :pr:`808` by :user:`Guillaume Lemaitre `. Enhancements ............ - Add option `output_dict` in :func:`imblearn.metrics.classification_report_imbalanced` to return a dictionary instead of a string. :pr:`770` by :user:`Guillaume Lemaitre `. - Added an option to generate smoothed bootstrap in :class:`imblearn.over_sampling.RandomOverSampler`. It is controls by the parameter `shrinkage`. This method is also known as Random Over-Sampling Examples (ROSE). :pr:`754` by :user:`Andrea Lorenzon ` and :user:`Guillaume Lemaitre `. Bug fixes ......... - Fix a bug in :class:`imblearn.under_sampling.ClusterCentroids` where `voting="hard"` could have lead to select a sample from any class instead of the targeted class. :pr:`769` by :user:`Guillaume Lemaitre `. 
- Fix a bug in :class:`imblearn.FunctionSampler` where validation was performed even with `validate=False` when calling `fit`. :pr:`790` by :user:`Guillaume Lemaitre `. Maintenance ........... - Remove requirements files in favour of adding the packages in the `extras_require` within the `setup.py` file. :pr:`816` by :user:`Guillaume Lemaitre `. - Change the website template to use `pydata-sphinx-theme`. :pr:`801` by :user:`Guillaume Lemaitre `. Deprecation ........... - The context manager :func:`imblearn.utils.testing.warns` is deprecated in 0.8 and will be removed 1.0. :pr:`815` by :user:`Guillaume Lemaitre `. imbalanced-learn-0.12.2/doc/whats_new/v0.9.rst000066400000000000000000000006221460233407600207510ustar00rootroot00000000000000.. _changes_0_9: Version 0.9.1 ============= **May 16, 2022** Changelog --------- This release provides fixes that make `imbalanced-learn` works with the latest release (`1.1.0`) of `scikit-learn`. Version 0.9.0 ============= **January 11, 2022** Changelog --------- This release is mainly providing fixes that make `imbalanced-learn` works with the latest release (`1.0.2`) of `scikit-learn`. imbalanced-learn-0.12.2/doc/zzz_references.rst000066400000000000000000000001031460233407600213060ustar00rootroot00000000000000========== References ========== .. bibliography:: bibtex/refs.bibimbalanced-learn-0.12.2/examples/000077500000000000000000000000001460233407600165755ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/README.txt000066400000000000000000000001701460233407600202710ustar00rootroot00000000000000.. _general_examples: Examples -------- General-purpose and introductory examples for the `imbalanced-learn` toolbox. imbalanced-learn-0.12.2/examples/api/000077500000000000000000000000001460233407600173465ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/api/README.txt000066400000000000000000000002601460233407600210420ustar00rootroot00000000000000.. 
_api_usage: Examples showing API imbalanced-learn usage ------------------------------------------- Examples that show some details regarding the API of imbalanced-learn. imbalanced-learn-0.12.2/examples/api/plot_sampling_strategy_usage.py000066400000000000000000000137231460233407600257040ustar00rootroot00000000000000""" ==================================================== How to use ``sampling_strategy`` in imbalanced-learn ==================================================== This example shows the different usage of the parameter ``sampling_strategy`` for the different family of samplers (i.e. over-sampling, under-sampling. or cleaning methods). """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) import seaborn as sns sns.set_context("poster") # %% [markdown] # Create an imbalanced dataset # ---------------------------- # # First, we will create an imbalanced data set from a the iris data set. # %% from sklearn.datasets import load_iris from imblearn.datasets import make_imbalance iris = load_iris(as_frame=True) sampling_strategy = {0: 10, 1: 20, 2: 47} X, y = make_imbalance(iris.data, iris.target, sampling_strategy=sampling_strategy) # %% import matplotlib.pyplot as plt fig, axs = plt.subplots(ncols=2, figsize=(10, 5)) autopct = "%.2f" iris.target.value_counts().plot.pie(autopct=autopct, ax=axs[0]) axs[0].set_title("Original") y.value_counts().plot.pie(autopct=autopct, ax=axs[1]) axs[1].set_title("Imbalanced") fig.tight_layout() # %% [markdown] # Using ``sampling_strategy`` in resampling algorithms # ==================================================== # # `sampling_strategy` as a `float` # -------------------------------- # # `sampling_strategy` can be given a `float`. 
For **under-sampling # methods**, it corresponds to the ratio :math:`\alpha_{us}` defined by # :math:`N_{rM} = \alpha_{us} \times N_{m}` where :math:`N_{rM}` and # :math:`N_{m}` are the number of samples in the majority class after # resampling and the number of samples in the minority class, respectively. # %% # select only 2 classes since the ratio make sense in this case binary_mask = y.isin([0, 1]) binary_y = y[binary_mask] binary_X = X[binary_mask] # %% from imblearn.under_sampling import RandomUnderSampler sampling_strategy = 0.8 rus = RandomUnderSampler(sampling_strategy=sampling_strategy) X_res, y_res = rus.fit_resample(binary_X, binary_y) ax = y_res.value_counts().plot.pie(autopct=autopct) _ = ax.set_title("Under-sampling") # %% [markdown] # For **over-sampling methods**, it correspond to the ratio # :math:`\alpha_{os}` defined by :math:`N_{rm} = \alpha_{os} \times N_{M}` # where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the # minority class after resampling and the number of samples in the majority # class, respectively. # %% from imblearn.over_sampling import RandomOverSampler ros = RandomOverSampler(sampling_strategy=sampling_strategy) X_res, y_res = ros.fit_resample(binary_X, binary_y) ax = y_res.value_counts().plot.pie(autopct=autopct) _ = ax.set_title("Over-sampling") # %% [markdown] # `sampling_strategy` as a `str` # ------------------------------- # # `sampling_strategy` can be given as a string which specify the class # targeted by the resampling. With under- and over-sampling, the number of # samples will be equalized. # # Note that we are using multiple classes from now on. 
# %% sampling_strategy = "not minority" fig, axs = plt.subplots(ncols=2, figsize=(10, 5)) rus = RandomUnderSampler(sampling_strategy=sampling_strategy) X_res, y_res = rus.fit_resample(X, y) y_res.value_counts().plot.pie(autopct=autopct, ax=axs[0]) axs[0].set_title("Under-sampling") sampling_strategy = "not majority" ros = RandomOverSampler(sampling_strategy=sampling_strategy) X_res, y_res = ros.fit_resample(X, y) y_res.value_counts().plot.pie(autopct=autopct, ax=axs[1]) _ = axs[1].set_title("Over-sampling") # %% [markdown] # With **cleaning method**, the number of samples in each class will not be # equalized even if targeted. # %% from imblearn.under_sampling import TomekLinks sampling_strategy = "not minority" tl = TomekLinks(sampling_strategy=sampling_strategy) X_res, y_res = tl.fit_resample(X, y) ax = y_res.value_counts().plot.pie(autopct=autopct) _ = ax.set_title("Cleaning") # %% [markdown] # `sampling_strategy` as a `dict` # ------------------------------ # # When `sampling_strategy` is a `dict`, the keys correspond to the targeted # classes. The values correspond to the desired number of samples for each # targeted class. This is working for both **under- and over-sampling** # algorithms but not for the **cleaning algorithms**. Use a `list` instead. # %% fig, axs = plt.subplots(ncols=2, figsize=(10, 5)) sampling_strategy = {0: 10, 1: 15, 2: 20} rus = RandomUnderSampler(sampling_strategy=sampling_strategy) X_res, y_res = rus.fit_resample(X, y) y_res.value_counts().plot.pie(autopct=autopct, ax=axs[0]) axs[0].set_title("Under-sampling") sampling_strategy = {0: 25, 1: 35, 2: 47} ros = RandomOverSampler(sampling_strategy=sampling_strategy) X_res, y_res = ros.fit_resample(X, y) y_res.value_counts().plot.pie(autopct=autopct, ax=axs[1]) _ = axs[1].set_title("Under-sampling") # %% [markdown] # `sampling_strategy` as a `list` # ------------------------------- # # When `sampling_strategy` is a `list`, the list contains the targeted # classes. 
It is used only for **cleaning methods** and raise an error # otherwise. # %% sampling_strategy = [0, 1, 2] tl = TomekLinks(sampling_strategy=sampling_strategy) X_res, y_res = tl.fit_resample(X, y) ax = y_res.value_counts().plot.pie(autopct=autopct) _ = ax.set_title("Cleaning") # %% [markdown] # `sampling_strategy` as a callable # --------------------------------- # # When callable, function taking `y` and returns a `dict`. The keys # correspond to the targeted classes. The values correspond to the desired # number of samples for each class. # %% def ratio_multiplier(y): from collections import Counter multiplier = {1: 0.7, 2: 0.95} target_stats = Counter(y) for key, value in target_stats.items(): if key in multiplier: target_stats[key] = int(value * multiplier[key]) return target_stats X_res, y_res = RandomUnderSampler(sampling_strategy=ratio_multiplier).fit_resample(X, y) ax = y_res.value_counts().plot.pie(autopct=autopct) ax.set_title("Under-sampling") plt.show() imbalanced-learn-0.12.2/examples/applications/000077500000000000000000000000001460233407600212635ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/applications/README.txt000066400000000000000000000002131460233407600227550ustar00rootroot00000000000000.. _realword_examples: Examples based on real world datasets ------------------------------------- Examples which use real-word dataset. imbalanced-learn-0.12.2/examples/applications/plot_impact_imbalanced_classes.py000066400000000000000000000301311460233407600300220ustar00rootroot00000000000000""" ========================================================== Fitting model on imbalanced datasets and how to fight bias ========================================================== This example illustrates the problem induced by learning on datasets having imbalanced classes. Subsequently, we compare different approaches alleviating these negative effects. 
""" # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) # %% [markdown] # Problem definition # ------------------ # # We are dropping the following features: # # - "fnlwgt": this feature was created while studying the "adult" dataset. # Thus, we will not use this feature which is not acquired during the survey. # - "education-num": it is encoding the same information than "education". # Thus, we are removing one of these 2 features. # %% from sklearn.datasets import fetch_openml df, y = fetch_openml("adult", version=2, as_frame=True, return_X_y=True) df = df.drop(columns=["fnlwgt", "education-num"]) # %% [markdown] # The "adult" dataset as a class ratio of about 3:1 # %% classes_count = y.value_counts() classes_count # %% [markdown] # This dataset is only slightly imbalanced. To better highlight the effect of # learning from an imbalanced dataset, we will increase its ratio to 30:1 # %% from imblearn.datasets import make_imbalance ratio = 30 df_res, y_res = make_imbalance( df, y, sampling_strategy={classes_count.idxmin(): classes_count.max() // ratio}, ) y_res.value_counts() # %% [markdown] # We will perform a cross-validation evaluation to get an estimate of the test # score. # # As a baseline, we could use a classifier which will always predict the # majority class independently of the features provided. from sklearn.dummy import DummyClassifier # %% from sklearn.model_selection import cross_validate dummy_clf = DummyClassifier(strategy="most_frequent") scoring = ["accuracy", "balanced_accuracy"] cv_result = cross_validate(dummy_clf, df_res, y_res, scoring=scoring) print(f"Accuracy score of a dummy classifier: {cv_result['test_accuracy'].mean():.3f}") # %% [markdown] # Instead of using the accuracy, we can use the balanced accuracy which will # take into account the balancing issue. 
# %% print( f"Balanced accuracy score of a dummy classifier: " f"{cv_result['test_balanced_accuracy'].mean():.3f}" ) # %% [markdown] # Strategies to learn from an imbalanced dataset # ---------------------------------------------- # We will use a dictionary and a list to continuously store the results of # our experiments and show them as a pandas dataframe. # %% index = [] scores = {"Accuracy": [], "Balanced accuracy": []} # %% [markdown] # Dummy baseline # .............. # # Before to train a real machine learning model, we can store the results # obtained with our :class:`~sklearn.dummy.DummyClassifier`. # %% import pandas as pd index += ["Dummy classifier"] cv_result = cross_validate(dummy_clf, df_res, y_res, scoring=scoring) scores["Accuracy"].append(cv_result["test_accuracy"].mean()) scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean()) df_scores = pd.DataFrame(scores, index=index) df_scores # %% [markdown] # Linear classifier baseline # .......................... # # We will create a machine learning pipeline using a # :class:`~sklearn.linear_model.LogisticRegression` classifier. In this regard, # we will need to one-hot encode the categorical columns and standardized the # numerical columns before to inject the data into the # :class:`~sklearn.linear_model.LogisticRegression` classifier. # # First, we define our numerical and categorical pipelines. 
# %% from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler num_pipe = make_pipeline( StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True) ) cat_pipe = make_pipeline( SimpleImputer(strategy="constant", fill_value="missing"), OneHotEncoder(handle_unknown="ignore"), ) # %% [markdown] # Then, we can create a preprocessor which will dispatch the categorical # columns to the categorical pipeline and the numerical columns to the # numerical pipeline # %% from sklearn.compose import make_column_selector as selector from sklearn.compose import make_column_transformer preprocessor_linear = make_column_transformer( (num_pipe, selector(dtype_include="number")), (cat_pipe, selector(dtype_include="category")), n_jobs=2, ) # %% [markdown] # Finally, we connect our preprocessor with our # :class:`~sklearn.linear_model.LogisticRegression`. We can then evaluate our # model. # %% from sklearn.linear_model import LogisticRegression lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000)) # %% index += ["Logistic regression"] cv_result = cross_validate(lr_clf, df_res, y_res, scoring=scoring) scores["Accuracy"].append(cv_result["test_accuracy"].mean()) scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean()) df_scores = pd.DataFrame(scores, index=index) df_scores # %% [markdown] # We can see that our linear model is learning slightly better than our dummy # baseline. However, it is impacted by the class imbalance. # # We can verify that something similar is happening with a tree-based model # such as :class:`~sklearn.ensemble.RandomForestClassifier`. With this type of # classifier, we will not need to scale the numerical data, and we will only # need to ordinal encode the categorical data. 
from sklearn.ensemble import RandomForestClassifier # %% from sklearn.preprocessing import OrdinalEncoder num_pipe = SimpleImputer(strategy="mean", add_indicator=True) cat_pipe = make_pipeline( SimpleImputer(strategy="constant", fill_value="missing"), OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ) preprocessor_tree = make_column_transformer( (num_pipe, selector(dtype_include="number")), (cat_pipe, selector(dtype_include="category")), n_jobs=2, ) rf_clf = make_pipeline( preprocessor_tree, RandomForestClassifier(random_state=42, n_jobs=2) ) # %% index += ["Random forest"] cv_result = cross_validate(rf_clf, df_res, y_res, scoring=scoring) scores["Accuracy"].append(cv_result["test_accuracy"].mean()) scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean()) df_scores = pd.DataFrame(scores, index=index) df_scores # %% [markdown] # The :class:`~sklearn.ensemble.RandomForestClassifier` is as well affected by # the class imbalanced, slightly less than the linear model. Now, we will # present different approach to improve the performance of these 2 models. # # Use `class_weight` # .................. # # Most of the models in `scikit-learn` have a parameter `class_weight`. This # parameter will affect the computation of the loss in linear model or the # criterion in the tree-based model to penalize differently a false # classification from the minority and majority class. We can set # `class_weight="balanced"` such that the weight applied is inversely # proportional to the class frequency. We test this parametrization in both # linear model and tree-based model. 
# %% lr_clf.set_params(logisticregression__class_weight="balanced") index += ["Logistic regression with balanced class weights"] cv_result = cross_validate(lr_clf, df_res, y_res, scoring=scoring) scores["Accuracy"].append(cv_result["test_accuracy"].mean()) scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean()) df_scores = pd.DataFrame(scores, index=index) df_scores # %% rf_clf.set_params(randomforestclassifier__class_weight="balanced") index += ["Random forest with balanced class weights"] cv_result = cross_validate(rf_clf, df_res, y_res, scoring=scoring) scores["Accuracy"].append(cv_result["test_accuracy"].mean()) scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean()) df_scores = pd.DataFrame(scores, index=index) df_scores # %% [markdown] # We can see that using `class_weight` was really effective for the linear # model, alleviating the issue of learning from imbalanced classes. However, # the :class:`~sklearn.ensemble.RandomForestClassifier` is still biased toward # the majority class, mainly due to the criterion which is not suited enough to # fight the class imbalance. # # Resample the training set during learning # ......................................... # # Another way is to resample the training set by under-sampling or # over-sampling some of the samples. `imbalanced-learn` provides some samplers # to do such processing. 
# %% from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler from imblearn.under_sampling import RandomUnderSampler lr_clf = make_pipeline_with_sampler( preprocessor_linear, RandomUnderSampler(random_state=42), LogisticRegression(max_iter=1000), ) # %% index += ["Under-sampling + Logistic regression"] cv_result = cross_validate(lr_clf, df_res, y_res, scoring=scoring) scores["Accuracy"].append(cv_result["test_accuracy"].mean()) scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean()) df_scores = pd.DataFrame(scores, index=index) df_scores # %% rf_clf = make_pipeline_with_sampler( preprocessor_tree, RandomUnderSampler(random_state=42), RandomForestClassifier(random_state=42, n_jobs=2), ) # %% index += ["Under-sampling + Random forest"] cv_result = cross_validate(rf_clf, df_res, y_res, scoring=scoring) scores["Accuracy"].append(cv_result["test_accuracy"].mean()) scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean()) df_scores = pd.DataFrame(scores, index=index) df_scores # %% [markdown] # Applying a random under-sampler before the training of the linear model or # random forest, allows to not focus on the majority class at the cost of # making more mistake for samples in the majority class (i.e. decreased # accuracy). # # We could apply any type of samplers and find which sampler is working best # on the current dataset. # # Instead, we will present another way by using classifiers which will apply # sampling internally. # # Use of specific balanced algorithms from imbalanced-learn # ......................................................... # # We already showed that random under-sampling can be effective on decision # tree. However, instead of under-sampling once the dataset, one could # under-sample the original dataset before to take a bootstrap sample. This is # the base of the :class:`imblearn.ensemble.BalancedRandomForestClassifier` and # :class:`~imblearn.ensemble.BalancedBaggingClassifier`. 
# %% from imblearn.ensemble import BalancedRandomForestClassifier rf_clf = make_pipeline( preprocessor_tree, BalancedRandomForestClassifier( sampling_strategy="all", replacement=True, bootstrap=False, random_state=42, n_jobs=2, ), ) # %% index += ["Balanced random forest"] cv_result = cross_validate(rf_clf, df_res, y_res, scoring=scoring) scores["Accuracy"].append(cv_result["test_accuracy"].mean()) scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean()) df_scores = pd.DataFrame(scores, index=index) df_scores # %% [markdown] # The performance with the # :class:`~imblearn.ensemble.BalancedRandomForestClassifier` is better than # applying a single random under-sampling. We will use a gradient-boosting # classifier within a :class:`~imblearn.ensemble.BalancedBaggingClassifier`. from sklearn.ensemble import HistGradientBoostingClassifier from imblearn.ensemble import BalancedBaggingClassifier bag_clf = make_pipeline( preprocessor_tree, BalancedBaggingClassifier( estimator=HistGradientBoostingClassifier(random_state=42), n_estimators=10, random_state=42, n_jobs=2, ), ) index += ["Balanced bag of histogram gradient boosting"] cv_result = cross_validate(bag_clf, df_res, y_res, scoring=scoring) scores["Accuracy"].append(cv_result["test_accuracy"].mean()) scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean()) df_scores = pd.DataFrame(scores, index=index) df_scores # %% [markdown] # This last approach is the most effective. The different under-sampling allows # to bring some diversity for the different GBDT to learn and not focus on a # portion of the majority class. 
imbalanced-learn-0.12.2/examples/applications/plot_multi_class_under_sampling.py000066400000000000000000000027261460233407600303100ustar00rootroot00000000000000""" ============================================= Multiclass classification with under-sampling ============================================= Some balancing methods allow for balancing dataset with multiples classes. We provide an example to illustrate the use of those methods which do not differ from the binary case. """ # Authors: Guillaume Lemaitre # License: MIT from collections import Counter from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from imblearn.datasets import make_imbalance from imblearn.metrics import classification_report_imbalanced from imblearn.pipeline import make_pipeline from imblearn.under_sampling import NearMiss print(__doc__) RANDOM_STATE = 42 # Create a folder to fetch the dataset iris = load_iris() X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 25, 1: 50, 2: 50}, random_state=RANDOM_STATE, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) print(f"Training target statistics: {Counter(y_train)}") print(f"Testing target statistics: {Counter(y_test)}") # Create a pipeline pipeline = make_pipeline(NearMiss(version=2), StandardScaler(), LogisticRegression()) pipeline.fit(X_train, y_train) # Classify and report the results print(classification_report_imbalanced(y_test, pipeline.predict(X_test))) imbalanced-learn-0.12.2/examples/applications/plot_outlier_rejections.py000066400000000000000000000104011460233407600265770ustar00rootroot00000000000000""" =============================================================== Customized sampler to implement an outlier rejections estimator =============================================================== This example illustrates the use of a custom 
sampler to implement an outlier rejections estimator. It can be used easily within a pipeline in which the number of samples can vary during training, which usually is a limitation of the current scikit-learn pipeline. """ # Authors: Guillaume Lemaitre # License: MIT import matplotlib.pyplot as plt import numpy as np from sklearn.datasets import make_blobs, make_moons from sklearn.ensemble import IsolationForest from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report from imblearn import FunctionSampler from imblearn.pipeline import make_pipeline print(__doc__) rng = np.random.RandomState(42) def plot_scatter(X, y, title): """Function to plot some data as a scatter plot.""" plt.figure() plt.scatter(X[y == 1, 0], X[y == 1, 1], label="Class #1") plt.scatter(X[y == 0, 0], X[y == 0, 1], label="Class #0") plt.legend() plt.title(title) ############################################################################## # Toy data generation ############################################################################## ############################################################################## # We are generating some non Gaussian data set contaminated with some unform # noise. moons, _ = make_moons(n_samples=500, noise=0.05) blobs, _ = make_blobs( n_samples=500, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25 ) outliers = rng.uniform(low=-3, high=3, size=(500, 2)) X_train = np.vstack([moons, blobs, outliers]) y_train = np.hstack( [ np.ones(moons.shape[0], dtype=np.int8), np.zeros(blobs.shape[0], dtype=np.int8), rng.randint(0, 2, size=outliers.shape[0], dtype=np.int8), ] ) plot_scatter(X_train, y_train, "Training dataset") ############################################################################## # We will generate some cleaned test data without outliers. 
moons, _ = make_moons(n_samples=50, noise=0.05) blobs, _ = make_blobs( n_samples=50, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25 ) X_test = np.vstack([moons, blobs]) y_test = np.hstack( [np.ones(moons.shape[0], dtype=np.int8), np.zeros(blobs.shape[0], dtype=np.int8)] ) plot_scatter(X_test, y_test, "Testing dataset") ############################################################################## # How to use the :class:`~imblearn.FunctionSampler` ############################################################################## ############################################################################## # We first define a function which will use # :class:`~sklearn.ensemble.IsolationForest` to eliminate some outliers from # our dataset during training. The function passed to the # :class:`~imblearn.FunctionSampler` will be called when using the method # ``fit_resample``. def outlier_rejection(X, y): """This will be our function used to resample our dataset.""" model = IsolationForest(max_samples=100, contamination=0.4, random_state=rng) model.fit(X) y_pred = model.predict(X) return X[y_pred == 1], y[y_pred == 1] reject_sampler = FunctionSampler(func=outlier_rejection) X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train) plot_scatter(X_inliers, y_inliers, "Training data without outliers") ############################################################################## # Integrate it within a pipeline ############################################################################## ############################################################################## # By elimnating outliers before the training, the classifier will be less # affected during the prediction. 
pipe = make_pipeline( FunctionSampler(func=outlier_rejection), LogisticRegression(solver="lbfgs", multi_class="auto", random_state=rng), ) y_pred = pipe.fit(X_train, y_train).predict(X_test) print(classification_report(y_test, y_pred)) clf = LogisticRegression(solver="lbfgs", multi_class="auto", random_state=rng) y_pred = clf.fit(X_train, y_train).predict(X_test) print(classification_report(y_test, y_pred)) plt.show() imbalanced-learn-0.12.2/examples/applications/plot_over_sampling_benchmark_lfw.py000066400000000000000000000112161460233407600304230ustar00rootroot00000000000000""" ========================================================== Benchmark over-sampling methods in a face recognition task ========================================================== In this face recognition example two faces are used from the LFW (Faces in the Wild) dataset. Several implemented over-sampling methods are used in conjunction with a 3NN classifier in order to examine the improvement of the classifier's output quality by using an over-sampler. """ # Authors: Christos Aridas # Guillaume Lemaitre # License: MIT # %% print(__doc__) import seaborn as sns sns.set_context("poster") # %% [markdown] # Load the dataset # ---------------- # # We will use a dataset containing image from know person where we will # build a model to recognize the person on the image. We will make this problem # a binary problem by taking picture of only George W. Bush and Bill Clinton. # %% import numpy as np from sklearn.datasets import fetch_lfw_people data = fetch_lfw_people() george_bush_id = 1871 # Photos of George W. Bush bill_clinton_id = 531 # Photos of Bill Clinton classes = [george_bush_id, bill_clinton_id] classes_name = np.array(["B. Clinton", "G.W. Bush"], dtype=object) # %% mask_photos = np.isin(data.target, classes) X, y = data.data[mask_photos], data.target[mask_photos] y = (y == george_bush_id).astype(np.int8) y = classes_name[y] # %% [markdown] # We can check the ratio between the two classes. 
# %% import matplotlib.pyplot as plt import pandas as pd class_distribution = pd.Series(y).value_counts(normalize=True) ax = class_distribution.plot.barh() ax.set_title("Class distribution") pos_label = class_distribution.idxmin() plt.tight_layout() print(f"The positive label considered as the minority class is {pos_label}") # %% [markdown] # We see that we have an imbalanced classification problem with ~95% of the # data belonging to the class G.W. Bush. # # Compare over-sampling approaches # -------------------------------- # # We will use different over-sampling approaches and use a kNN classifier # to check if we can recognize the 2 presidents. The evaluation will be # performed through cross-validation and we will plot the mean ROC curve. # # We will create different pipelines and evaluate them. from sklearn.neighbors import KNeighborsClassifier from imblearn import FunctionSampler from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler from imblearn.pipeline import make_pipeline classifier = KNeighborsClassifier(n_neighbors=3) pipeline = [ make_pipeline(FunctionSampler(), classifier), make_pipeline(RandomOverSampler(random_state=42), classifier), make_pipeline(ADASYN(random_state=42), classifier), make_pipeline(SMOTE(random_state=42), classifier), ] # %% from sklearn.model_selection import StratifiedKFold cv = StratifiedKFold(n_splits=3) # %% [markdown] # We will compute the mean ROC curve for each pipeline using a different splits # provided by the :class:`~sklearn.model_selection.StratifiedKFold` # cross-validation. 
# %% from sklearn.metrics import RocCurveDisplay, auc, roc_curve disp = [] for model in pipeline: # compute the mean fpr/tpr to get the mean ROC curve mean_tpr, mean_fpr = 0.0, np.linspace(0, 1, 100) for train, test in cv.split(X, y): model.fit(X[train], y[train]) y_proba = model.predict_proba(X[test]) pos_label_idx = np.flatnonzero(model.classes_ == pos_label)[0] fpr, tpr, thresholds = roc_curve( y[test], y_proba[:, pos_label_idx], pos_label=pos_label ) mean_tpr += np.interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 mean_tpr /= cv.get_n_splits(X, y) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) # Create a display that we will reuse to make the aggregated plots for # all methods disp.append( RocCurveDisplay( fpr=mean_fpr, tpr=mean_tpr, roc_auc=mean_auc, estimator_name=f"{model[0].__class__.__name__}", ) ) # %% [markdown] # In the previous cell, we created the different mean ROC curve and we can plot # them on the same plot. # %% fig, ax = plt.subplots(figsize=(9, 9)) for d in disp: d.plot(ax=ax, linestyle="--") ax.plot([0, 1], [0, 1], linestyle="--", color="k") ax.axis("square") fig.suptitle("Comparison of over-sampling methods \nwith a 3NN classifier") ax.set_xlim([0, 1]) ax.set_ylim([0, 1]) sns.despine(offset=10, ax=ax) plt.legend(loc="lower right", fontsize=16) plt.tight_layout() plt.show() # %% [markdown] # We see that for this task, methods that are generating new samples with some # interpolation (i.e. ADASYN and SMOTE) perform better than random # over-sampling or no resampling. imbalanced-learn-0.12.2/examples/applications/plot_topic_classication.py000066400000000000000000000064361460233407600265560ustar00rootroot00000000000000""" ================================================= Example of topic classification in text documents ================================================= This example shows how to balance the text data before to train a classifier. 
Note that for this example, the data are slightly imbalanced but it can happen that for some data sets, the imbalanced ratio is more significant. """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) # %% [markdown] # Setting the data set # -------------------- # # We use a part of the 20 newsgroups data set by loading 4 topics. Using the # scikit-learn loader, the data are split into a training and a testing set. # # Note the class \#3 is the minority class and has almost twice less samples # than the majority class. # %% from sklearn.datasets import fetch_20newsgroups categories = [ "alt.atheism", "talk.religion.misc", "comp.graphics", "sci.space", ] newsgroups_train = fetch_20newsgroups(subset="train", categories=categories) newsgroups_test = fetch_20newsgroups(subset="test", categories=categories) X_train = newsgroups_train.data X_test = newsgroups_test.data y_train = newsgroups_train.target y_test = newsgroups_test.target # %% from collections import Counter print(f"Training class distributions summary: {Counter(y_train)}") print(f"Test class distributions summary: {Counter(y_test)}") # %% [markdown] # The usual scikit-learn pipeline # ------------------------------- # # You might usually use scikit-learn pipeline by combining the TF-IDF # vectorizer to feed a multinomial naive bayes classifier. A classification # report summarized the results on the testing set. # # As expected, the recall of the class \#3 is low mainly due to the class # imbalanced. 
# %% from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB from sklearn.pipeline import make_pipeline model = make_pipeline(TfidfVectorizer(), MultinomialNB()) model.fit(X_train, y_train) y_pred = model.predict(X_test) # %% from imblearn.metrics import classification_report_imbalanced print(classification_report_imbalanced(y_test, y_pred)) # %% [markdown] # Balancing the class before classification # ----------------------------------------- # # To improve the prediction of the class \#3, it could be interesting to apply # a balancing before to train the naive bayes classifier. Therefore, we will # use a :class:`~imblearn.under_sampling.RandomUnderSampler` to equalize the # number of samples in all the classes before the training. # # It is also important to note that we are using the # :class:`~imblearn.pipeline.make_pipeline` function implemented in # imbalanced-learn to properly handle the samplers. from imblearn.pipeline import make_pipeline as make_pipeline_imb # %% from imblearn.under_sampling import RandomUnderSampler model = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB()) model.fit(X_train, y_train) y_pred = model.predict(X_test) # %% [markdown] # Although the results are almost identical, it can be seen that the resampling # allowed to correct the poor recall of the class \#3 at the cost of reducing # the other metrics for the other classes. However, the overall results are # slightly better. 
# %% print(classification_report_imbalanced(y_test, y_pred)) imbalanced-learn-0.12.2/examples/applications/porto_seguro_keras_under_sampling.py000066400000000000000000000210611460233407600306400ustar00rootroot00000000000000""" ========================================================== Porto Seguro: balancing samples in mini-batches with Keras ========================================================== This example compares two strategies to train a neural-network on the Porto Seguro Kaggle data set [1]_. The data set is imbalanced and we show that balancing each mini-batch allows to improve performance and reduce the training time. References ---------- .. [1] https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data """ # Authors: Guillaume Lemaitre # License: MIT print(__doc__) ############################################################################### # Data loading ############################################################################### from collections import Counter import numpy as np import pandas as pd ############################################################################### # First, you should download the Porto Seguro data set from Kaggle. See the # link in the introduction. training_data = pd.read_csv("./input/train.csv") testing_data = pd.read_csv("./input/test.csv") y_train = training_data[["id", "target"]].set_index("id") X_train = training_data.drop(["target"], axis=1).set_index("id") X_test = testing_data.set_index("id") ############################################################################### # The data set is imbalanced and it will have an effect on the fitting. 
print(f"The data set is imbalanced: {Counter(y_train['target'])}") ############################################################################### # Define the pre-processing pipeline ############################################################################### from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler def convert_float64(X): return X.astype(np.float64) ############################################################################### # We want to standard scale the numerical features while we want to one-hot # encode the categorical features. In this regard, we make use of the # :class:`~sklearn.compose.ColumnTransformer`. numerical_columns = [ name for name in X_train.columns if "_calc_" in name and "_bin" not in name ] numerical_pipeline = make_pipeline( FunctionTransformer(func=convert_float64, validate=False), StandardScaler() ) categorical_columns = [name for name in X_train.columns if "_cat" in name] categorical_pipeline = make_pipeline( SimpleImputer(missing_values=-1, strategy="most_frequent"), OneHotEncoder(categories="auto"), ) preprocessor = ColumnTransformer( [ ("numerical_preprocessing", numerical_pipeline, numerical_columns), ( "categorical_preprocessing", categorical_pipeline, categorical_columns, ), ], remainder="drop", ) # Create an environment variable to avoid using the GPU. This can be changed. 
import os os.environ["CUDA_VISIBLE_DEVICES"] = "-1" from tensorflow.keras.layers import Activation, BatchNormalization, Dense, Dropout ############################################################################### # Create a neural-network ############################################################################### from tensorflow.keras.models import Sequential def make_model(n_features): model = Sequential() model.add(Dense(200, input_shape=(n_features,), kernel_initializer="glorot_normal")) model.add(BatchNormalization()) model.add(Activation("relu")) model.add(Dropout(0.5)) model.add(Dense(100, kernel_initializer="glorot_normal", use_bias=False)) model.add(BatchNormalization()) model.add(Activation("relu")) model.add(Dropout(0.25)) model.add(Dense(50, kernel_initializer="glorot_normal", use_bias=False)) model.add(BatchNormalization()) model.add(Activation("relu")) model.add(Dropout(0.15)) model.add(Dense(25, kernel_initializer="glorot_normal", use_bias=False)) model.add(BatchNormalization()) model.add(Activation("relu")) model.add(Dropout(0.1)) model.add(Dense(1, activation="sigmoid")) model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]) return model ############################################################################### # We create a decorator to report the computation time import time from functools import wraps def timeit(f): @wraps(f) def wrapper(*args, **kwds): start_time = time.time() result = f(*args, **kwds) elapsed_time = time.time() - start_time print(f"Elapsed computation time: {elapsed_time:.3f} secs") return (elapsed_time, result) return wrapper ############################################################################### # The first model will be trained using the ``fit`` method and with imbalanced # mini-batches. 
import tensorflow from sklearn.metrics import roc_auc_score from sklearn.utils.fixes import parse_version tf_version = parse_version(tensorflow.__version__) @timeit def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test): model = make_model(X_train.shape[1]) model.fit(X_train, y_train, epochs=2, verbose=1, batch_size=1000) if tf_version < parse_version("2.6"): # predict_proba was removed in tensorflow 2.6 predict_method = "predict_proba" else: predict_method = "predict" y_pred = getattr(model, predict_method)(X_test, batch_size=1000) return roc_auc_score(y_test, y_pred) ############################################################################### # In the contrary, we will use imbalanced-learn to create a generator of # mini-batches which will yield balanced mini-batches. from imblearn.keras import BalancedBatchGenerator @timeit def fit_predict_balanced_model(X_train, y_train, X_test, y_test): model = make_model(X_train.shape[1]) training_generator = BalancedBatchGenerator( X_train, y_train, batch_size=1000, random_state=42 ) model.fit(training_generator, epochs=5, verbose=1) y_pred = model.predict(X_test, batch_size=1000) return roc_auc_score(y_test, y_pred) ############################################################################### # Classification loop ############################################################################### ############################################################################### # We will perform a 10-fold cross-validation and train the neural-network with # the two different strategies previously presented. 
from sklearn.model_selection import StratifiedKFold skf = StratifiedKFold(n_splits=10) cv_results_imbalanced = [] cv_time_imbalanced = [] cv_results_balanced = [] cv_time_balanced = [] for train_idx, valid_idx in skf.split(X_train, y_train): X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx]) y_local_train = y_train.iloc[train_idx].values.ravel() X_local_test = preprocessor.transform(X_train.iloc[valid_idx]) y_local_test = y_train.iloc[valid_idx].values.ravel() elapsed_time, roc_auc = fit_predict_imbalanced_model( X_local_train, y_local_train, X_local_test, y_local_test ) cv_time_imbalanced.append(elapsed_time) cv_results_imbalanced.append(roc_auc) elapsed_time, roc_auc = fit_predict_balanced_model( X_local_train, y_local_train, X_local_test, y_local_test ) cv_time_balanced.append(elapsed_time) cv_results_balanced.append(roc_auc) ############################################################################### # Plot of the results and computation time ############################################################################### df_results = pd.DataFrame( { "Balanced model": cv_results_balanced, "Imbalanced model": cv_results_imbalanced, } ) df_results = df_results.unstack().reset_index() df_time = pd.DataFrame( {"Balanced model": cv_time_balanced, "Imbalanced model": cv_time_imbalanced} ) df_time = df_time.unstack().reset_index() import matplotlib.pyplot as plt import seaborn as sns plt.figure() sns.boxplot(y="level_0", x=0, data=df_time) sns.despine(top=True, right=True, left=True) plt.xlabel("time [s]") plt.ylabel("") plt.title("Computation time difference using a random under-sampling") plt.figure() sns.boxplot(y="level_0", x=0, data=df_results, whis=10.0) sns.despine(top=True, right=True, left=True) ax = plt.gca() ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, pos: "%i%%" % (100 * x))) plt.xlabel("ROC-AUC") plt.ylabel("") plt.title("Difference in terms of ROC-AUC using a random under-sampling") 
imbalanced-learn-0.12.2/examples/combine/000077500000000000000000000000001460233407600202115ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/combine/README.txt000066400000000000000000000004261460233407600217110ustar00rootroot00000000000000.. _combine_examples: Examples using combine class methods ==================================== Combine methods mixed over- and under-sampling methods. Generally SMOTE is used for over-sampling while some cleaning methods (i.e., ENN and Tomek links) are used to under-sample. imbalanced-learn-0.12.2/examples/combine/plot_comparison_combine.py000066400000000000000000000073541460233407600255000ustar00rootroot00000000000000""" ================================================== Compare sampler combining over- and under-sampling ================================================== This example shows the effect of applying an under-sampling algorithms after SMOTE over-sampling. In the literature, Tomek's link and edited nearest neighbours are the two methods which have been used and are available in imbalanced-learn. """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) import matplotlib.pyplot as plt import seaborn as sns sns.set_context("poster") # %% [markdown] # Dataset generation # ------------------ # # We will create an imbalanced dataset with a couple of samples. We will use # :func:`~sklearn.datasets.make_classification` to generate this dataset. # %% from sklearn.datasets import make_classification X, y = make_classification( n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3, n_clusters_per_class=1, weights=[0.1, 0.2, 0.7], class_sep=0.8, random_state=0, ) # %% _, ax = plt.subplots(figsize=(6, 6)) _ = ax.scatter(X[:, 0], X[:, 1], c=y, alpha=0.8, edgecolor="k") # %% [markdown] # The following function will be used to plot the sample space after resampling # to illustrate the characteristic of an algorithm. 
# %% from collections import Counter def plot_resampling(X, y, sampler, ax): """Plot the resampled dataset using the sampler.""" X_res, y_res = sampler.fit_resample(X, y) ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor="k") sns.despine(ax=ax, offset=10) ax.set_title(f"Decision function for {sampler.__class__.__name__}") return Counter(y_res) # %% [markdown] # The following function will be used to plot the decision function of a # classifier given some data. # %% import numpy as np def plot_decision_function(X, y, clf, ax): """Plot the decision function of the classifier and the original data""" plot_step = 0.02 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid( np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step) ) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, alpha=0.4) ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor="k") ax.set_title(f"Resampling using {clf[0].__class__.__name__}") # %% [markdown] # :class:`~imblearn.over_sampling.SMOTE` allows to generate samples. However, # this method of over-sampling does not have any knowledge regarding the # underlying distribution. Therefore, some noisy samples can be generated, e.g. # when the different classes cannot be well separated. Hence, it can be # beneficial to apply an under-sampling algorithm to clean the noisy samples. # Two methods are usually used in the literature: (i) Tomek's link and (ii) # edited nearest neighbours cleaning methods. Imbalanced-learn provides two # ready-to-use samplers :class:`~imblearn.combine.SMOTETomek` and # :class:`~imblearn.combine.SMOTEENN`. In general, # :class:`~imblearn.combine.SMOTEENN` cleans more noisy data than # :class:`~imblearn.combine.SMOTETomek`. 
from sklearn.linear_model import LogisticRegression from imblearn.combine import SMOTEENN, SMOTETomek # %% from imblearn.over_sampling import SMOTE from imblearn.pipeline import make_pipeline samplers = [SMOTE(random_state=0), SMOTEENN(random_state=0), SMOTETomek(random_state=0)] fig, axs = plt.subplots(3, 2, figsize=(15, 25)) for ax, sampler in zip(axs, samplers): clf = make_pipeline(sampler, LogisticRegression()).fit(X, y) plot_decision_function(X, y, clf, ax[0]) plot_resampling(X, y, sampler, ax[1]) fig.tight_layout() plt.show() imbalanced-learn-0.12.2/examples/datasets/000077500000000000000000000000001460233407600204055ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/datasets/README.txt000066400000000000000000000001721460233407600221030ustar00rootroot00000000000000.. _dataset_examples: Dataset examples ----------------------- Examples concerning the :mod:`imblearn.datasets` module. imbalanced-learn-0.12.2/examples/datasets/plot_make_imbalance.py000066400000000000000000000046521460233407600247340ustar00rootroot00000000000000""" ============================ Create an imbalanced dataset ============================ An illustration of the :func:`~imblearn.datasets.make_imbalance` function to create an imbalanced dataset from a balanced dataset. We show the ability of :func:`~imblearn.datasets.make_imbalance` of dealing with Pandas DataFrame. """ # Authors: Dayvid Oliveira # Christos Aridas # Guillaume Lemaitre # License: MIT # %% print(__doc__) import seaborn as sns sns.set_context("poster") # %% [markdown] # Generate the dataset # -------------------- # # First, we will generate a dataset and convert it to a # :class:`~pandas.DataFrame` with arbitrary column names. We will plot the # original dataset. 
# %% import matplotlib.pyplot as plt import pandas as pd from sklearn.datasets import make_moons X, y = make_moons(n_samples=200, shuffle=True, noise=0.5, random_state=10) X = pd.DataFrame(X, columns=["feature 1", "feature 2"]) ax = X.plot.scatter( x="feature 1", y="feature 2", c=y, colormap="viridis", colorbar=False, ) sns.despine(ax=ax, offset=10) plt.tight_layout() # %% [markdown] # Make a dataset imbalanced # ------------------------- # # Now, we will show the helpers :func:`~imblearn.datasets.make_imbalance` # that is useful to random select a subset of samples. It will impact the # class distribution as specified by the parameters. # %% from collections import Counter def ratio_func(y, multiplier, minority_class): target_stats = Counter(y) return {minority_class: int(multiplier * target_stats[minority_class])} # %% from imblearn.datasets import make_imbalance fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(15, 10)) X.plot.scatter( x="feature 1", y="feature 2", c=y, ax=axs[0, 0], colormap="viridis", colorbar=False, ) axs[0, 0].set_title("Original set") sns.despine(ax=axs[0, 0], offset=10) multipliers = [0.9, 0.75, 0.5, 0.25, 0.1] for ax, multiplier in zip(axs.ravel()[1:], multipliers): X_resampled, y_resampled = make_imbalance( X, y, sampling_strategy=ratio_func, **{"multiplier": multiplier, "minority_class": 1}, ) X_resampled.plot.scatter( x="feature 1", y="feature 2", c=y_resampled, ax=ax, colormap="viridis", colorbar=False, ) ax.set_title(f"Sampling ratio = {multiplier}") sns.despine(ax=ax, offset=10) plt.tight_layout() plt.show() imbalanced-learn-0.12.2/examples/ensemble/000077500000000000000000000000001460233407600203675ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/ensemble/README.txt000066400000000000000000000005511460233407600220660ustar00rootroot00000000000000.. 
_ensemble_examples: Example using ensemble class methods ==================================== Under-sampling methods implies that samples of the majority class are lost during the balancing procedure. Ensemble methods offer an alternative to use most of the samples. In fact, an ensemble of balanced sets is created and used to later train any classifier. imbalanced-learn-0.12.2/examples/ensemble/plot_bagging_classifier.py000066400000000000000000000136061460233407600256070ustar00rootroot00000000000000""" ================================= Bagging classifiers using sampler ================================= In this example, we show how :class:`~imblearn.ensemble.BalancedBaggingClassifier` can be used to create a large variety of classifiers by giving different samplers. We will give several examples that have been published in the passed year. """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) # %% [markdown] # Generate an imbalanced dataset # ------------------------------ # # For this example, we will create a synthetic dataset using the function # :func:`~sklearn.datasets.make_classification`. The problem will be a toy # classification problem with a ratio of 1:9 between the two classes. # %% from sklearn.datasets import make_classification X, y = make_classification( n_samples=10_000, n_features=10, weights=[0.1, 0.9], class_sep=0.5, random_state=0, ) # %% import pandas as pd pd.Series(y).value_counts(normalize=True) # %% [markdown] # In the following sections, we will show a couple of algorithms that have # been proposed over the years. We intend to illustrate how one can reuse the # :class:`~imblearn.ensemble.BalancedBaggingClassifier` by passing different # sampler. 
from sklearn.ensemble import BaggingClassifier # %% from sklearn.model_selection import cross_validate ebb = BaggingClassifier() cv_results = cross_validate(ebb, X, y, scoring="balanced_accuracy") print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}") # %% [markdown] # Exactly Balanced Bagging and Over-Bagging # ----------------------------------------- # # The :class:`~imblearn.ensemble.BalancedBaggingClassifier` can use in # conjunction with a :class:`~imblearn.under_sampling.RandomUnderSampler` or # :class:`~imblearn.over_sampling.RandomOverSampler`. These methods are # referred as Exactly Balanced Bagging and Over-Bagging, respectively and have # been proposed first in [1]_. # %% from imblearn.ensemble import BalancedBaggingClassifier from imblearn.under_sampling import RandomUnderSampler # Exactly Balanced Bagging ebb = BalancedBaggingClassifier(sampler=RandomUnderSampler()) cv_results = cross_validate(ebb, X, y, scoring="balanced_accuracy") print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}") # %% from imblearn.over_sampling import RandomOverSampler # Over-bagging over_bagging = BalancedBaggingClassifier(sampler=RandomOverSampler()) cv_results = cross_validate(over_bagging, X, y, scoring="balanced_accuracy") print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}") # %% [markdown] # SMOTE-Bagging # ------------- # # Instead of using a :class:`~imblearn.over_sampling.RandomOverSampler` that # make a bootstrap, an alternative is to use # :class:`~imblearn.over_sampling.SMOTE` as an over-sampler. This is known as # SMOTE-Bagging [2]_. 
# %% from imblearn.over_sampling import SMOTE # SMOTE-Bagging smote_bagging = BalancedBaggingClassifier(sampler=SMOTE()) cv_results = cross_validate(smote_bagging, X, y, scoring="balanced_accuracy") print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}") # %% [markdown] # Roughly Balanced Bagging # ------------------------ # While using a :class:`~imblearn.under_sampling.RandomUnderSampler` or # :class:`~imblearn.over_sampling.RandomOverSampler` will create exactly the # desired number of samples, it does not follow the statistical spirit wanted # in the bagging framework. The authors in [3]_ proposes to use a negative # binomial distribution to compute the number of samples of the majority # class to be selected and then perform a random under-sampling. # # Here, we illustrate this method by implementing a function in charge of # resampling and use the :class:`~imblearn.FunctionSampler` to integrate it # within a :class:`~imblearn.pipeline.Pipeline` and # :class:`~sklearn.model_selection.cross_validate`. 
# %% from collections import Counter import numpy as np from imblearn import FunctionSampler def roughly_balanced_bagging(X, y, replace=False): """Implementation of Roughly Balanced Bagging for binary problem.""" # find the minority and majority classes class_counts = Counter(y) majority_class = max(class_counts, key=class_counts.get) minority_class = min(class_counts, key=class_counts.get) # compute the number of sample to draw from the majority class using # a negative binomial distribution n_minority_class = class_counts[minority_class] n_majority_resampled = np.random.negative_binomial(n=n_minority_class, p=0.5) # draw randomly with or without replacement majority_indices = np.random.choice( np.flatnonzero(y == majority_class), size=n_majority_resampled, replace=replace, ) minority_indices = np.random.choice( np.flatnonzero(y == minority_class), size=n_minority_class, replace=replace, ) indices = np.hstack([majority_indices, minority_indices]) return X[indices], y[indices] # Roughly Balanced Bagging rbb = BalancedBaggingClassifier( sampler=FunctionSampler(func=roughly_balanced_bagging, kw_args={"replace": True}) ) cv_results = cross_validate(rbb, X, y, scoring="balanced_accuracy") print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}") # %% [markdown] # .. topic:: References: # # .. [1] R. Maclin, and D. Opitz. "An empirical evaluation of bagging and # boosting." AAAI/IAAI 1997 (1997): 546-551. # # .. [2] S. Wang, and X. Yao. "Diversity analysis on imbalanced data sets by # using ensemble models." 2009 IEEE symposium on computational # intelligence and data mining. IEEE, 2009. # # .. [3] S. Hido, H. Kashima, and Y. Takahashi. "Roughly balanced bagging # for imbalanced data." Statistical Analysis and Data Mining: The ASA # Data Science Journal 2.5â€6 (2009): 412-426. 
imbalanced-learn-0.12.2/examples/ensemble/plot_comparison_ensemble_classifier.py000066400000000000000000000162521460233407600302350ustar00rootroot00000000000000""" ============================================= Compare ensemble classifiers using resampling ============================================= Ensemble classifiers have shown to improve classification performance compare to single learner. However, they will be affected by class imbalance. This example shows the benefit of balancing the training set before to learn learners. We are making the comparison with non-balanced ensemble methods. We make a comparison using the balanced accuracy and geometric mean which are metrics widely used in the literature to evaluate models learned on imbalanced set. """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) # %% [markdown] # Load an imbalanced dataset # -------------------------- # # We will load the UCI SatImage dataset which has an imbalanced ratio of 9.3:1 # (number of majority sample for a minority sample). The data are then split # into training and testing. from sklearn.model_selection import train_test_split # %% from imblearn.datasets import fetch_datasets satimage = fetch_datasets()["satimage"] X, y = satimage.data, satimage.target X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) # %% [markdown] # Classification using a single decision tree # ------------------------------------------- # # We train a decision tree classifier which will be used as a baseline for the # rest of this example. # # The results are reported in terms of balanced accuracy and geometric mean # which are metrics widely used in the literature to validate model trained on # imbalanced set. 
# %% from sklearn.tree import DecisionTreeClassifier tree = DecisionTreeClassifier() tree.fit(X_train, y_train) y_pred_tree = tree.predict(X_test) # %% from sklearn.metrics import balanced_accuracy_score from imblearn.metrics import geometric_mean_score print("Decision tree classifier performance:") print( f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_tree):.2f} - " f"Geometric mean {geometric_mean_score(y_test, y_pred_tree):.2f}" ) # %% import seaborn as sns from sklearn.metrics import ConfusionMatrixDisplay sns.set_context("poster") disp = ConfusionMatrixDisplay.from_estimator(tree, X_test, y_test, colorbar=False) _ = disp.ax_.set_title("Decision tree") # %% [markdown] # Classification using bagging classifier with and without sampling # ----------------------------------------------------------------- # # Instead of using a single tree, we will check if an ensemble of decision tree # can actually alleviate the issue induced by the class imbalancing. First, we # will use a bagging classifier and its counter part which internally uses a # random under-sampling to balanced each bootstrap sample. # %% from sklearn.ensemble import BaggingClassifier from imblearn.ensemble import BalancedBaggingClassifier bagging = BaggingClassifier(n_estimators=50, random_state=0) balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0) bagging.fit(X_train, y_train) balanced_bagging.fit(X_train, y_train) y_pred_bc = bagging.predict(X_test) y_pred_bbc = balanced_bagging.predict(X_test) # %% [markdown] # Balancing each bootstrap sample allows to increase significantly the balanced # accuracy and the geometric mean. 
# %% print("Bagging classifier performance:") print( f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_bc):.2f} - " f"Geometric mean {geometric_mean_score(y_test, y_pred_bc):.2f}" ) print("Balanced Bagging classifier performance:") print( f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_bbc):.2f} - " f"Geometric mean {geometric_mean_score(y_test, y_pred_bbc):.2f}" ) # %% import matplotlib.pyplot as plt fig, axs = plt.subplots(ncols=2, figsize=(10, 5)) ConfusionMatrixDisplay.from_estimator( bagging, X_test, y_test, ax=axs[0], colorbar=False ) axs[0].set_title("Bagging") ConfusionMatrixDisplay.from_estimator( balanced_bagging, X_test, y_test, ax=axs[1], colorbar=False ) axs[1].set_title("Balanced Bagging") fig.tight_layout() # %% [markdown] # Classification using random forest classifier with and without sampling # ----------------------------------------------------------------------- # # Random forest is another popular ensemble method and it is usually # outperforming bagging. Here, we used a vanilla random forest and its balanced # counterpart in which each bootstrap sample is balanced. # %% from sklearn.ensemble import RandomForestClassifier from imblearn.ensemble import BalancedRandomForestClassifier rf = RandomForestClassifier(n_estimators=50, random_state=0) brf = BalancedRandomForestClassifier( n_estimators=50, sampling_strategy="all", replacement=True, bootstrap=False, random_state=0, ) rf.fit(X_train, y_train) brf.fit(X_train, y_train) y_pred_rf = rf.predict(X_test) y_pred_brf = brf.predict(X_test) # %% [markdown] # Similarly to the previous experiment, the balanced classifier outperform the # classifier which learn from imbalanced bootstrap samples. In addition, random # forest outperforms the bagging classifier. 
# %% print("Random Forest classifier performance:") print( f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_rf):.2f} - " f"Geometric mean {geometric_mean_score(y_test, y_pred_rf):.2f}" ) print("Balanced Random Forest classifier performance:") print( f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_brf):.2f} - " f"Geometric mean {geometric_mean_score(y_test, y_pred_brf):.2f}" ) # %% fig, axs = plt.subplots(ncols=2, figsize=(10, 5)) ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test, ax=axs[0], colorbar=False) axs[0].set_title("Random forest") ConfusionMatrixDisplay.from_estimator(brf, X_test, y_test, ax=axs[1], colorbar=False) axs[1].set_title("Balanced random forest") fig.tight_layout() # %% [markdown] # Boosting classifier # ------------------- # # In the same manner, easy ensemble classifier is a bag of balanced AdaBoost # classifier. However, it will be slower to train than random forest and will # achieve worse performance. # %% from sklearn.ensemble import AdaBoostClassifier from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier estimator = AdaBoostClassifier(n_estimators=10) eec = EasyEnsembleClassifier(n_estimators=10, estimator=estimator) eec.fit(X_train, y_train) y_pred_eec = eec.predict(X_test) rusboost = RUSBoostClassifier(n_estimators=10, estimator=estimator) rusboost.fit(X_train, y_train) y_pred_rusboost = rusboost.predict(X_test) # %% print("Easy ensemble classifier performance:") print( f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_eec):.2f} - " f"Geometric mean {geometric_mean_score(y_test, y_pred_eec):.2f}" ) print("RUSBoost classifier performance:") print( f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_rusboost):.2f} - " f"Geometric mean {geometric_mean_score(y_test, y_pred_rusboost):.2f}" ) # %% fig, axs = plt.subplots(ncols=2, figsize=(10, 5)) ConfusionMatrixDisplay.from_estimator(eec, X_test, y_test, ax=axs[0], colorbar=False) axs[0].set_title("Easy Ensemble") 
ConfusionMatrixDisplay.from_estimator( rusboost, X_test, y_test, ax=axs[1], colorbar=False ) axs[1].set_title("RUSBoost classifier") fig.tight_layout() plt.show() imbalanced-learn-0.12.2/examples/evaluation/000077500000000000000000000000001460233407600207445ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/evaluation/README.txt000066400000000000000000000002221460233407600224360ustar00rootroot00000000000000.. _evaluation_examples: Evaluation examples ------------------- Examples illustrating how classification using imbalanced dataset can be done. imbalanced-learn-0.12.2/examples/evaluation/plot_classification_report.py000066400000000000000000000030601460233407600267410ustar00rootroot00000000000000""" ============================================= Evaluate classification by compiling a report ============================================= Specific metrics have been developed to evaluate classifier which has been trained using imbalanced data. :mod:`imblearn` provides a classification report similar to :mod:`sklearn`, with additional metrics specific to imbalanced learning problem. 
""" # Authors: Guillaume Lemaitre # License: MIT from sklearn import datasets from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from imblearn import over_sampling as os from imblearn import pipeline as pl from imblearn.metrics import classification_report_imbalanced print(__doc__) RANDOM_STATE = 42 # Generate a dataset X, y = datasets.make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=10, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=4, n_samples=5000, random_state=RANDOM_STATE, ) pipeline = pl.make_pipeline( StandardScaler(), os.SMOTE(random_state=RANDOM_STATE), LogisticRegression(max_iter=10_000), ) # Split the data X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) # Train the classifier with balancing pipeline.fit(X_train, y_train) # Test the classifier and get the prediction y_pred_bal = pipeline.predict(X_test) # Show the classification report print(classification_report_imbalanced(y_test, y_pred_bal)) imbalanced-learn-0.12.2/examples/evaluation/plot_metrics.py000066400000000000000000000055241460233407600240300ustar00rootroot00000000000000""" ======================================= Metrics specific to imbalanced learning ======================================= Specific metrics have been developed to evaluate classifier which has been trained using imbalanced data. :mod:`imblearn` provides mainly two additional metrics which are not implemented in :mod:`sklearn`: (i) geometric mean and (ii) index balanced accuracy. """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) RANDOM_STATE = 42 # %% [markdown] # First, we will generate some imbalanced dataset. 
# %% from sklearn.datasets import make_classification X, y = make_classification( n_classes=3, class_sep=2, weights=[0.1, 0.9], n_informative=10, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=4, n_samples=5000, random_state=RANDOM_STATE, ) # %% [markdown] # We will split the data into a training and testing set. # %% from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, stratify=y, random_state=RANDOM_STATE ) # %% [markdown] # We will create a pipeline made of a :class:`~imblearn.over_sampling.SMOTE` # over-sampler followed by a :class:`~sklearn.linear_model.LogisticRegression` # classifier. from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import StandardScaler from imblearn.over_sampling import SMOTE # %% from imblearn.pipeline import make_pipeline model = make_pipeline( StandardScaler(), SMOTE(random_state=RANDOM_STATE), LogisticRegression(max_iter=10_000, random_state=RANDOM_STATE), ) # %% [markdown] # Now, we will train the model on the training set and get the prediction # associated with the testing set. Be aware that the resampling will happen # only when calling `fit`: the number of samples in `y_pred` is the same than # in `y_test`. # %% model.fit(X_train, y_train) y_pred = model.predict(X_test) # %% [markdown] # The geometric mean corresponds to the square root of the product of the # sensitivity and specificity. Combining the two metrics should account for # the balancing of the dataset. # %% from imblearn.metrics import geometric_mean_score print(f"The geometric mean is {geometric_mean_score(y_test, y_pred):.3f}") # %% [markdown] # The index balanced accuracy can transform any metric to be used in # imbalanced learning problems. 
# %% from imblearn.metrics import make_index_balanced_accuracy alpha = 0.1 geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(geometric_mean_score) print( f"The IBA using alpha={alpha} and the geometric mean: " f"{geo_mean(y_test, y_pred):.3f}" ) # %% alpha = 0.5 geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(geometric_mean_score) print( f"The IBA using alpha={alpha} and the geometric mean: " f"{geo_mean(y_test, y_pred):.3f}" ) imbalanced-learn-0.12.2/examples/model_selection/000077500000000000000000000000001460233407600217425ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/model_selection/README.txt000066400000000000000000000001701460233407600234360ustar00rootroot00000000000000.. _model_selection_examples: Model Selection --------------- Examples related to the selection of balancing methods. imbalanced-learn-0.12.2/examples/model_selection/plot_validation_curve.py000066400000000000000000000061211460233407600267100ustar00rootroot00000000000000""" ========================== Plotting Validation Curves ========================== In this example the impact of the :class:`~imblearn.over_sampling.SMOTE`'s `k_neighbors` parameter is examined. In the plot you can see the validation scores of a SMOTE-CART classifier for different values of the :class:`~imblearn.over_sampling.SMOTE`'s `k_neighbors` parameter. """ # Authors: Christos Aridas # Guillaume Lemaitre # License: MIT # %% print(__doc__) import seaborn as sns sns.set_context("poster") RANDOM_STATE = 42 # %% [markdown] # Let's first generate a dataset with imbalanced class distribution. 
# %% from sklearn.datasets import make_classification X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=10, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=4, n_samples=5000, random_state=RANDOM_STATE, ) # %% [markdown] # We will use an over-sampler :class:`~imblearn.over_sampling.SMOTE` followed # by a :class:`~sklearn.tree.DecisionTreeClassifier`. The aim will be to # search which `k_neighbors` parameter is the most adequate with the dataset # that we generated. from sklearn.tree import DecisionTreeClassifier # %% from imblearn.over_sampling import SMOTE from imblearn.pipeline import make_pipeline model = make_pipeline( SMOTE(random_state=RANDOM_STATE), DecisionTreeClassifier(random_state=RANDOM_STATE) ) # %% [markdown] # We can use the :class:`~sklearn.model_selection.validation_curve` to inspect # the impact of varying the parameter `k_neighbors`. In this case, we need # to use a score to evaluate the generalization score during the # cross-validation. # %% from sklearn.metrics import cohen_kappa_score, make_scorer from sklearn.model_selection import validation_curve scorer = make_scorer(cohen_kappa_score) param_range = range(1, 11) train_scores, test_scores = validation_curve( model, X, y, param_name="smote__k_neighbors", param_range=param_range, cv=3, scoring=scorer, ) # %% train_scores_mean = train_scores.mean(axis=1) train_scores_std = train_scores.std(axis=1) test_scores_mean = test_scores.mean(axis=1) test_scores_std = test_scores.std(axis=1) # %% [markdown] # We can now plot the results of the cross-validation for the different # parameter values that we tried. 
# %% import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(7, 7)) ax.plot(param_range, test_scores_mean, label="SMOTE") ax.fill_between( param_range, test_scores_mean + test_scores_std, test_scores_mean - test_scores_std, alpha=0.2, ) idx_max = test_scores_mean.argmax() ax.scatter( param_range[idx_max], test_scores_mean[idx_max], label=r"Cohen Kappa: ${:.2f}\pm{:.2f}$".format( test_scores_mean[idx_max], test_scores_std[idx_max] ), ) fig.suptitle("Validation Curve with SMOTE-CART") ax.set_xlabel("Number of neighbors") ax.set_ylabel("Cohen's kappa") # make nice plotting sns.despine(ax=ax, offset=10) ax.set_xlim([1, 10]) ax.set_ylim([0.4, 0.8]) ax.legend(loc="lower right", fontsize=16) plt.tight_layout() plt.show() imbalanced-learn-0.12.2/examples/over-sampling/000077500000000000000000000000001460233407600213605ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/over-sampling/README.txt000066400000000000000000000003771460233407600230650ustar00rootroot00000000000000.. _over_sampling_examples: Example using over-sampling class methods ========================================= Data balancing can be performed by over-sampling such that new samples are generated in the minority class to reach a given balancing ratio. imbalanced-learn-0.12.2/examples/over-sampling/plot_comparison_over_sampling.py000066400000000000000000000253531460233407600300770ustar00rootroot00000000000000""" ============================== Compare over-sampling samplers ============================== The following example attends to make a qualitative comparison between the different over-sampling algorithms available in the imbalanced-learn package. """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) import matplotlib.pyplot as plt import seaborn as sns sns.set_context("poster") # %% [markdown] # The following function will be used to create toy dataset. It uses the # :func:`~sklearn.datasets.make_classification` from scikit-learn but fixing # some parameters. 
# %% from sklearn.datasets import make_classification def create_dataset( n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3, class_sep=0.8, n_clusters=1, ): return make_classification( n_samples=n_samples, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters, weights=list(weights), class_sep=class_sep, random_state=0, ) # %% [markdown] # The following function will be used to plot the sample space after resampling # to illustrate the specificities of an algorithm. # %% def plot_resampling(X, y, sampler, ax, title=None): X_res, y_res = sampler.fit_resample(X, y) ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor="k") if title is None: title = f"Resampling with {sampler.__class__.__name__}" ax.set_title(title) sns.despine(ax=ax, offset=10) # %% [markdown] # The following function will be used to plot the decision function of a # classifier given some data. # %% import numpy as np def plot_decision_function(X, y, clf, ax, title=None): plot_step = 0.02 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid( np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step) ) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, alpha=0.4) ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor="k") if title is not None: ax.set_title(title) # %% [markdown] # Illustration of the influence of the balancing ratio # ---------------------------------------------------- # # We will first illustrate the influence of the balancing ratio on some toy # data using a logistic regression classifier which is a linear model. # %% from sklearn.linear_model import LogisticRegression clf = LogisticRegression() # %% [markdown] # We will fit and show the decision boundary model to illustrate the impact of # dealing with imbalanced classes. 
# %% fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 12)) weights_arr = ( (0.01, 0.01, 0.98), (0.01, 0.05, 0.94), (0.2, 0.1, 0.7), (0.33, 0.33, 0.33), ) for ax, weights in zip(axs.ravel(), weights_arr): X, y = create_dataset(n_samples=300, weights=weights) clf.fit(X, y) plot_decision_function(X, y, clf, ax, title=f"weight={weights}") fig.suptitle(f"Decision function of {clf.__class__.__name__}") fig.tight_layout() # %% [markdown] # Greater is the difference between the number of samples in each class, poorer # are the classification results. # # Random over-sampling to balance the data set # -------------------------------------------- # # Random over-sampling can be used to repeat some samples and balance the # number of samples between the dataset. It can be seen that with this trivial # approach the boundary decision is already less biased toward the majority # class. The class :class:`~imblearn.over_sampling.RandomOverSampler` # implements such of a strategy. from imblearn.over_sampling import RandomOverSampler # %% from imblearn.pipeline import make_pipeline X, y = create_dataset(n_samples=100, weights=(0.05, 0.25, 0.7)) fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 7)) clf.fit(X, y) plot_decision_function(X, y, clf, axs[0], title="Without resampling") sampler = RandomOverSampler(random_state=0) model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function(X, y, model, axs[1], f"Using {model[0].__class__.__name__}") fig.suptitle(f"Decision function of {clf.__class__.__name__}") fig.tight_layout() # %% [markdown] # By default, random over-sampling generates a bootstrap. The parameter # `shrinkage` allows adding a small perturbation to the generated data # to generate a smoothed bootstrap instead. The plot below shows the difference # between the two data generation strategies. 
# %% fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 7)) sampler.set_params(shrinkage=None) plot_resampling(X, y, sampler, ax=axs[0], title="Normal bootstrap") sampler.set_params(shrinkage=0.3) plot_resampling(X, y, sampler, ax=axs[1], title="Smoothed bootstrap") fig.suptitle(f"Resampling with {sampler.__class__.__name__}") fig.tight_layout() # %% [markdown] # It looks like more samples are generated with smoothed bootstrap. This is due # to the fact that the samples generated are not superimposing with the # original samples. # # More advanced over-sampling using ADASYN and SMOTE # -------------------------------------------------- # # Instead of repeating the same samples when over-sampling or perturbating the # generated bootstrap samples, one can use some specific heuristic instead. # :class:`~imblearn.over_sampling.ADASYN` and # :class:`~imblearn.over_sampling.SMOTE` can be used in this case. # %% from imblearn import FunctionSampler # to use a idendity sampler from imblearn.over_sampling import ADASYN, SMOTE X, y = create_dataset(n_samples=150, weights=(0.1, 0.2, 0.7)) fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15)) samplers = [ FunctionSampler(), RandomOverSampler(random_state=0), SMOTE(random_state=0), ADASYN(random_state=0), ] for ax, sampler in zip(axs.ravel(), samplers): title = "Original dataset" if isinstance(sampler, FunctionSampler) else None plot_resampling(X, y, sampler, ax, title=title) fig.tight_layout() # %% [markdown] # The following plot illustrates the difference between # :class:`~imblearn.over_sampling.ADASYN` and # :class:`~imblearn.over_sampling.SMOTE`. # :class:`~imblearn.over_sampling.ADASYN` will focus on the samples which are # difficult to classify with a nearest-neighbors rule while regular # :class:`~imblearn.over_sampling.SMOTE` will not make any distinction. # Therefore, the decision function depending of the algorithm. 
X, y = create_dataset(n_samples=150, weights=(0.05, 0.25, 0.7)) fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20, 6)) models = { "Without sampler": clf, "ADASYN sampler": make_pipeline(ADASYN(random_state=0), clf), "SMOTE sampler": make_pipeline(SMOTE(random_state=0), clf), } for ax, (title, model) in zip(axs, models.items()): model.fit(X, y) plot_decision_function(X, y, model, ax=ax, title=title) fig.suptitle(f"Decision function using a {clf.__class__.__name__}") fig.tight_layout() # %% [markdown] # Due to those sampling particularities, it can give rise to some specific # issues as illustrated below. # %% X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8) samplers = [SMOTE(random_state=0), ADASYN(random_state=0)] fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15)) for ax, sampler in zip(axs, samplers): model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function( X, y, clf, ax[0], title=f"Decision function with {sampler.__class__.__name__}" ) plot_resampling(X, y, sampler, ax[1]) fig.suptitle("Particularities of over-sampling with SMOTE and ADASYN") fig.tight_layout() # %% [markdown] # SMOTE proposes several variants by identifying specific samples to consider # during the resampling. The borderline version # (:class:`~imblearn.over_sampling.BorderlineSMOTE`) will detect which point to # select which are in the border between two classes. The SVM version # (:class:`~imblearn.over_sampling.SVMSMOTE`) will use the support vectors # found using an SVM algorithm to create new sample while the KMeans version # (:class:`~imblearn.over_sampling.KMeansSMOTE`) will make a clustering before # to generate samples in each cluster independently depending each cluster # density. 
# %% from sklearn.cluster import MiniBatchKMeans from imblearn.over_sampling import SVMSMOTE, BorderlineSMOTE, KMeansSMOTE X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8) fig, axs = plt.subplots(5, 2, figsize=(15, 30)) samplers = [ SMOTE(random_state=0), BorderlineSMOTE(random_state=0, kind="borderline-1"), BorderlineSMOTE(random_state=0, kind="borderline-2"), KMeansSMOTE( kmeans_estimator=MiniBatchKMeans(n_clusters=10, n_init=1, random_state=0), random_state=0, ), SVMSMOTE(random_state=0), ] for ax, sampler in zip(axs, samplers): model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function( X, y, clf, ax[0], title=f"Decision function for {sampler.__class__.__name__}" ) plot_resampling(X, y, sampler, ax[1]) fig.suptitle("Decision function and resampling using SMOTE variants") fig.tight_layout() # %% [markdown] # When dealing with a mixed of continuous and categorical features, # :class:`~imblearn.over_sampling.SMOTENC` is the only method which can handle # this case. 
# %% from collections import Counter from imblearn.over_sampling import SMOTENC rng = np.random.RandomState(42) n_samples = 50 # Create a dataset of a mix of numerical and categorical data X = np.empty((n_samples, 3), dtype=object) X[:, 0] = rng.choice(["A", "B", "C"], size=n_samples).astype(object) X[:, 1] = rng.randn(n_samples) X[:, 2] = rng.randint(3, size=n_samples) y = np.array([0] * 20 + [1] * 30) print("The original imbalanced dataset") print(sorted(Counter(y).items())) print() print("The first and last columns are containing categorical features:") print(X[:5]) print() smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0) X_resampled, y_resampled = smote_nc.fit_resample(X, y) print("Dataset after resampling:") print(sorted(Counter(y_resampled).items())) print() print("SMOTE-NC will generate categories for the categorical features:") print(X_resampled[-5:]) print() # %% [markdown] # However, if the dataset is composed of only categorical features then one # should use :class:`~imblearn.over_sampling.SMOTEN`. # %% from imblearn.over_sampling import SMOTEN # Generate only categorical data X = np.array(["A"] * 10 + ["B"] * 20 + ["C"] * 30, dtype=object).reshape(-1, 1) y = np.array([0] * 20 + [1] * 40, dtype=np.int32) print(f"Original class counts: {Counter(y)}") print() print(X[:5]) print() sampler = SMOTEN(random_state=0) X_res, y_res = sampler.fit_resample(X, y) print(f"Class counts after resampling {Counter(y_res)}") print() print(X_res[-5:]) print() imbalanced-learn-0.12.2/examples/over-sampling/plot_illustration_generation_sample.py000066400000000000000000000037321460233407600313020ustar00rootroot00000000000000""" ============================================ Sample generator used in SMOTE-like samplers ============================================ This example illustrates how a new sample is generated taking into account the neighbourhood of this sample. 
A new sample is generated by selecting the randomly 2 samples of the same class and interpolating a point between these samples. """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) import matplotlib.pyplot as plt import numpy as np import seaborn as sns sns.set_context("poster") rng = np.random.RandomState(18) f, ax = plt.subplots(figsize=(8, 8)) # generate some data points y = np.array([3.65284, 3.52623, 3.51468, 3.22199, 3.21]) z = np.array([0.43, 0.45, 0.6, 0.4, 0.211]) y_2 = np.array([3.3, 3.6]) z_2 = np.array([0.58, 0.34]) # plot the majority and minority samples ax.scatter(z, y, label="Minority class", s=100) ax.scatter(z_2, y_2, label="Majority class", s=100) idx = rng.randint(len(y), size=2) annotation = [r"$x_i$", r"$x_{zi}$"] for a, i in zip(annotation, idx): ax.annotate(a, (z[i], y[i]), xytext=tuple([z[i] + 0.01, y[i] + 0.005]), fontsize=15) # draw the circle in which the new sample will generated radius = np.sqrt((z[idx[0]] - z[idx[1]]) ** 2 + (y[idx[0]] - y[idx[1]]) ** 2) circle = plt.Circle((z[idx[0]], y[idx[0]]), radius=radius, alpha=0.2) ax.add_artist(circle) # plot the line on which the sample will be generated ax.plot(z[idx], y[idx], "--", alpha=0.5) # create and plot the new sample step = rng.uniform() y_gen = y[idx[0]] + step * (y[idx[1]] - y[idx[0]]) z_gen = z[idx[0]] + step * (z[idx[1]] - z[idx[0]]) ax.scatter(z_gen, y_gen, s=100) ax.annotate( r"$x_{new}$", (z_gen, y_gen), xytext=tuple([z_gen + 0.01, y_gen + 0.005]), fontsize=15, ) # make the plot nicer with legend and label sns.despine(ax=ax, offset=10) ax.set_xlim([0.2, 0.7]) ax.set_ylim([3.2, 3.7]) plt.xlabel(r"$X_1$") plt.ylabel(r"$X_2$") plt.legend() plt.tight_layout() plt.show() imbalanced-learn-0.12.2/examples/over-sampling/plot_shrinkage_effect.py000066400000000000000000000075641460233407600262730ustar00rootroot00000000000000""" ====================================================== Effect of the shrinkage factor in random over-sampling 
====================================================== This example shows the effect of the shrinkage factor used to generate the smoothed bootstrap using the :class:`~imblearn.over_sampling.RandomOverSampler`. """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) import seaborn as sns sns.set_context("poster") # %% # First, we will generate a toy classification dataset with only few samples. # The ratio between the classes will be imbalanced. from collections import Counter from sklearn.datasets import make_classification X, y = make_classification( n_samples=100, n_features=2, n_redundant=0, weights=[0.1, 0.9], random_state=0, ) Counter(y) # %% import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(7, 7)) scatter = plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.4) class_legend = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") ax.add_artist(class_legend) ax.set_xlabel("Feature #1") _ = ax.set_ylabel("Feature #2") plt.tight_layout() # %% # Now, we will use a :class:`~imblearn.over_sampling.RandomOverSampler` to # generate a bootstrap for the minority class with as many samples as in the # majority class. from imblearn.over_sampling import RandomOverSampler sampler = RandomOverSampler(random_state=0) X_res, y_res = sampler.fit_resample(X, y) Counter(y_res) # %% fig, ax = plt.subplots(figsize=(7, 7)) scatter = plt.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.4) class_legend = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") ax.add_artist(class_legend) ax.set_xlabel("Feature #1") _ = ax.set_ylabel("Feature #2") plt.tight_layout() # %% # We observe that the minority samples are less transparent than the samples # from the majority class. Indeed, it is due to the fact that these samples # of the minority class are repeated during the bootstrap generation. # # We can set `shrinkage` to a floating value to add a small perturbation to the # samples created and therefore create a smoothed bootstrap. 
sampler = RandomOverSampler(shrinkage=1, random_state=0) X_res, y_res = sampler.fit_resample(X, y) Counter(y_res) # %% fig, ax = plt.subplots(figsize=(7, 7)) scatter = plt.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.4) class_legend = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") ax.add_artist(class_legend) ax.set_xlabel("Feature #1") _ = ax.set_ylabel("Feature #2") plt.tight_layout() # %% # In this case, we see that the samples in the minority class are not # overlapping anymore due to the added noise. # # The parameter `shrinkage` allows to add more or less perturbation. Let's # add more perturbation when generating the smoothed bootstrap. sampler = RandomOverSampler(shrinkage=3, random_state=0) X_res, y_res = sampler.fit_resample(X, y) Counter(y_res) # %% fig, ax = plt.subplots(figsize=(7, 7)) scatter = plt.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.4) class_legend = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") ax.add_artist(class_legend) ax.set_xlabel("Feature #1") _ = ax.set_ylabel("Feature #2") plt.tight_layout() # %% # Increasing the value of `shrinkage` will disperse the new samples. Forcing # the shrinkage to 0 will be equivalent to generating a normal bootstrap. sampler = RandomOverSampler(shrinkage=0, random_state=0) X_res, y_res = sampler.fit_resample(X, y) Counter(y_res) # %% fig, ax = plt.subplots(figsize=(7, 7)) scatter = plt.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.4) class_legend = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") ax.add_artist(class_legend) ax.set_xlabel("Feature #1") _ = ax.set_ylabel("Feature #2") plt.tight_layout() # %% # Therefore, the `shrinkage` is handy to manually tune the dispersion of the # new samples. 
imbalanced-learn-0.12.2/examples/pipeline/000077500000000000000000000000001460233407600204025ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/pipeline/README.txt000066400000000000000000000002331460233407600220760ustar00rootroot00000000000000.. _pipeline_examples: Pipeline examples ================= Example of how to use the a pipeline to include under-sampling with `scikit-learn` estimators.imbalanced-learn-0.12.2/examples/pipeline/plot_pipeline_classification.py000066400000000000000000000037261460233407600267020ustar00rootroot00000000000000""" ==================================== Usage of pipeline embedding samplers ==================================== An example of the :class:~imblearn.pipeline.Pipeline` object (or :func:`~imblearn.pipeline.make_pipeline` helper function) working with transformers and resamplers. """ # Authors: Christos Aridas # Guillaume Lemaitre # License: MIT # %% print(__doc__) # %% [markdown] # Let's first create an imbalanced dataset and split in to two sets. 
# %% from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split X, y = make_classification( n_classes=2, class_sep=1.25, weights=[0.3, 0.7], n_informative=3, n_redundant=1, flip_y=0, n_features=5, n_clusters_per_class=1, n_samples=5000, random_state=10, ) X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) # %% [markdown] # Now, we will create each individual steps that we would like later to combine # %% from sklearn.decomposition import PCA from sklearn.neighbors import KNeighborsClassifier from imblearn.over_sampling import SMOTE from imblearn.under_sampling import EditedNearestNeighbours pca = PCA(n_components=2) enn = EditedNearestNeighbours() smote = SMOTE(random_state=0) knn = KNeighborsClassifier(n_neighbors=1) # %% [markdown] # Now, we can finally create a pipeline to specify in which order the different # transformers and samplers should be executed before to provide the data to # the final classifier. # %% from imblearn.pipeline import make_pipeline model = make_pipeline(pca, enn, smote, knn) # %% [markdown] # We can now use the pipeline created as a normal classifier where resampling # will happen when calling `fit` and disabled when calling `decision_function`, # `predict_proba`, or `predict`. # %% from sklearn.metrics import classification_report model.fit(X_train, y_train) y_pred = model.predict(X_test) print(classification_report(y_test, y_pred)) imbalanced-learn-0.12.2/examples/under-sampling/000077500000000000000000000000001460233407600215225ustar00rootroot00000000000000imbalanced-learn-0.12.2/examples/under-sampling/README.txt000066400000000000000000000005121460233407600232160ustar00rootroot00000000000000.. _under_sampling_examples: Example using under-sampling class methods ========================================== Under-sampling refers to the process of reducing the number of samples in the majority classes. 
The implemented methods can be categorized into 2 groups: (i) fixed under-sampling and (ii) cleaning under-sampling. imbalanced-learn-0.12.2/examples/under-sampling/plot_comparison_under_sampling.py000066400000000000000000000227531460233407600304040ustar00rootroot00000000000000""" =============================== Compare under-sampling samplers =============================== The following example attends to make a qualitative comparison between the different under-sampling algorithms available in the imbalanced-learn package. """ # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) import seaborn as sns sns.set_context("poster") # %% [markdown] # The following function will be used to create toy dataset. It uses the # :func:`~sklearn.datasets.make_classification` from scikit-learn but fixing # some parameters. # %% from sklearn.datasets import make_classification def create_dataset( n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3, class_sep=0.8, n_clusters=1, ): return make_classification( n_samples=n_samples, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters, weights=list(weights), class_sep=class_sep, random_state=0, ) # %% [markdown] # The following function will be used to plot the sample space after resampling # to illustrate the specificities of an algorithm. # %% def plot_resampling(X, y, sampler, ax, title=None): X_res, y_res = sampler.fit_resample(X, y) ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor="k") if title is None: title = f"Resampling with {sampler.__class__.__name__}" ax.set_title(title) sns.despine(ax=ax, offset=10) # %% [markdown] # The following function will be used to plot the decision function of a # classifier given some data. 
# %% import numpy as np def plot_decision_function(X, y, clf, ax, title=None): plot_step = 0.02 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid( np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step) ) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, alpha=0.4) ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor="k") if title is not None: ax.set_title(title) # %% from sklearn.linear_model import LogisticRegression clf = LogisticRegression() # %% [markdown] # Prototype generation: under-sampling by generating new samples # -------------------------------------------------------------- # # :class:`~imblearn.under_sampling.ClusterCentroids` under-samples by replacing # the original samples by the centroids of the cluster found. # %% import matplotlib.pyplot as plt from sklearn.cluster import MiniBatchKMeans from imblearn import FunctionSampler from imblearn.pipeline import make_pipeline from imblearn.under_sampling import ClusterCentroids X, y = create_dataset(n_samples=400, weights=(0.05, 0.15, 0.8), class_sep=0.8) samplers = { FunctionSampler(), # identity resampler ClusterCentroids( estimator=MiniBatchKMeans(n_init=1, random_state=0), random_state=0 ), } fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15)) for ax, sampler in zip(axs, samplers): model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function( X, y, model, ax[0], title=f"Decision function with {sampler.__class__.__name__}" ) plot_resampling(X, y, sampler, ax[1]) fig.tight_layout() # %% [markdown] # Prototype selection: under-sampling by selecting existing samples # ----------------------------------------------------------------- # # The algorithm performing prototype selection can be subdivided into two # groups: (i) the controlled under-sampling methods and (ii) the cleaning # under-sampling methods. 
# # With the controlled under-sampling methods, the number of samples to be # selected can be specified. # :class:`~imblearn.under_sampling.RandomUnderSampler` is the most naive way of # performing such selection by randomly selecting a given number of samples by # the targeted class. # %% from imblearn.under_sampling import RandomUnderSampler X, y = create_dataset(n_samples=400, weights=(0.05, 0.15, 0.8), class_sep=0.8) samplers = { FunctionSampler(), # identity resampler RandomUnderSampler(random_state=0), } fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15)) for ax, sampler in zip(axs, samplers): model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function( X, y, model, ax[0], title=f"Decision function with {sampler.__class__.__name__}" ) plot_resampling(X, y, sampler, ax[1]) fig.tight_layout() # %% [markdown] # :class:`~imblearn.under_sampling.NearMiss` algorithms implement some # heuristic rules in order to select samples. NearMiss-1 selects samples from # the majority class for which the average distance of the :math:`k`` nearest # samples of the minority class is the smallest. NearMiss-2 selects the samples # from the majority class for which the average distance to the farthest # samples of the negative class is the smallest. NearMiss-3 is a 2-step # algorithm: first, for each minority sample, their :math:`m` # nearest-neighbors will be kept; then, the majority samples selected are the # on for which the average distance to the :math:`k` nearest neighbors is the # largest. 
# %% from imblearn.under_sampling import NearMiss X, y = create_dataset(n_samples=1000, weights=(0.05, 0.15, 0.8), class_sep=1.5) samplers = [NearMiss(version=1), NearMiss(version=2), NearMiss(version=3)] fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 25)) for ax, sampler in zip(axs, samplers): model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function( X, y, model, ax[0], title=f"Decision function for {sampler.__class__.__name__}-{sampler.version}", ) plot_resampling( X, y, sampler, ax[1], title=f"Resampling using {sampler.__class__.__name__}-{sampler.version}", ) fig.tight_layout() # %% [markdown] # :class:`~imblearn.under_sampling.EditedNearestNeighbours` removes samples of # the majority class for which their class differ from the one of their # nearest-neighbors. This sieve can be repeated which is the principle of the # :class:`~imblearn.under_sampling.RepeatedEditedNearestNeighbours`. # :class:`~imblearn.under_sampling.AllKNN` is slightly different from the # :class:`~imblearn.under_sampling.RepeatedEditedNearestNeighbours` by changing # the :math:`k` parameter of the internal nearest neighors algorithm, # increasing it at each iteration. 
# %% from imblearn.under_sampling import ( AllKNN, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, ) X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8) samplers = [ EditedNearestNeighbours(), RepeatedEditedNearestNeighbours(), AllKNN(allow_minority=True), ] fig, axs = plt.subplots(3, 2, figsize=(15, 25)) for ax, sampler in zip(axs, samplers): model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function( X, y, clf, ax[0], title=f"Decision function for \n{sampler.__class__.__name__}" ) plot_resampling( X, y, sampler, ax[1], title=f"Resampling using \n{sampler.__class__.__name__}" ) fig.tight_layout() # %% [markdown] # :class:`~imblearn.under_sampling.CondensedNearestNeighbour` makes use of a # 1-NN to iteratively decide if a sample should be kept in a dataset or not. # The issue is that :class:`~imblearn.under_sampling.CondensedNearestNeighbour` # is sensitive to noise by preserving the noisy samples. # :class:`~imblearn.under_sampling.OneSidedSelection` also used the 1-NN and # use :class:`~imblearn.under_sampling.TomekLinks` to remove the samples # considered noisy. The # :class:`~imblearn.under_sampling.NeighbourhoodCleaningRule` use a # :class:`~imblearn.under_sampling.EditedNearestNeighbours` to remove some # sample. Additionally, they use a 3 nearest-neighbors to remove samples which # do not agree with this rule. 
# %% from imblearn.under_sampling import ( CondensedNearestNeighbour, NeighbourhoodCleaningRule, OneSidedSelection, ) X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8) fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 25)) samplers = [ CondensedNearestNeighbour(random_state=0), OneSidedSelection(random_state=0), NeighbourhoodCleaningRule(n_neighbors=11), ] for ax, sampler in zip(axs, samplers): model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function( X, y, clf, ax[0], title=f"Decision function for \n{sampler.__class__.__name__}" ) plot_resampling( X, y, sampler, ax[1], title=f"Resampling using \n{sampler.__class__.__name__}" ) fig.tight_layout() # %% [markdown] # :class:`~imblearn.under_sampling.InstanceHardnessThreshold` uses the # prediction of classifier to exclude samples. All samples which are classified # with a low probability will be removed. # %% from imblearn.under_sampling import InstanceHardnessThreshold samplers = { FunctionSampler(), # identity resampler InstanceHardnessThreshold( estimator=LogisticRegression(), random_state=0, ), } fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15)) for ax, sampler in zip(axs, samplers): model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function( X, y, model, ax[0], title=f"Decision function with \n{sampler.__class__.__name__}", ) plot_resampling( X, y, sampler, ax[1], title=f"Resampling using \n{sampler.__class__.__name__}" ) fig.tight_layout() plt.show() imbalanced-learn-0.12.2/examples/under-sampling/plot_illustration_nearmiss.py000066400000000000000000000132071460233407600275670ustar00rootroot00000000000000""" ============================ Sample selection in NearMiss ============================ This example illustrates the different way of selecting example in :class:`~imblearn.under_sampling.NearMiss`. 
""" # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) import seaborn as sns sns.set_context("poster") # %% [markdown] # We define a function allowing to make some nice decoration on the plot. # %% def make_plot_despine(ax): sns.despine(ax=ax, offset=10) ax.set_xlim([0, 3.5]) ax.set_ylim([0, 3.5]) ax.set_xticks(np.arange(0, 3.6, 0.5)) ax.set_yticks(np.arange(0, 3.6, 0.5)) ax.set_xlabel(r"$X_1$") ax.set_ylabel(r"$X_2$") ax.legend(loc="upper left", fontsize=16) # %% [markdown] # We can start by generating some data to later illustrate the principle of # each :class:`~imblearn.under_sampling.NearMiss` heuristic rules. # %% import numpy as np rng = np.random.RandomState(18) X_minority = np.transpose( [[1.1, 1.3, 1.15, 0.8, 0.8, 0.6, 0.55], [1.0, 1.5, 1.7, 2.5, 2.0, 1.2, 0.55]] ) X_majority = np.transpose( [ [2.1, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45], [1.5, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9], ] ) # %% [mardown] # NearMiss-1 # ---------- # # NearMiss-1 selects samples from the majority class for which the average # distance to some nearest neighbours is the smallest. In the following # example, we use a 3-NN to compute the average distance on 2 specific samples # of the majority class. Therefore, in this case the point linked by the # green-dashed line will be selected since the average distance is smaller. 
# %% import matplotlib.pyplot as plt from sklearn.neighbors import NearestNeighbors fig, ax = plt.subplots(figsize=(8, 8)) ax.scatter( X_minority[:, 0], X_minority[:, 1], label="Minority class", s=200, marker="_", ) ax.scatter( X_majority[:, 0], X_majority[:, 1], label="Majority class", s=200, marker="+", ) nearest_neighbors = NearestNeighbors(n_neighbors=3) nearest_neighbors.fit(X_minority) dist, ind = nearest_neighbors.kneighbors(X_majority[:2, :]) dist_avg = dist.sum(axis=1) / 3 for positive_idx, (neighbors, distance, color) in enumerate( zip(ind, dist_avg, ["g", "r"]) ): for make_plot, sample_idx in enumerate(neighbors): ax.plot( [X_majority[positive_idx, 0], X_minority[sample_idx, 0]], [X_majority[positive_idx, 1], X_minority[sample_idx, 1]], "--" + color, alpha=0.3, label=f"Avg. dist.={distance:.2f}" if make_plot == 0 else "", ) ax.set_title("NearMiss-1") make_plot_despine(ax) plt.tight_layout() # %% [mardown] # NearMiss-2 # ---------- # # NearMiss-2 selects samples from the majority class for which the average # distance to the farthest neighbors is the smallest. With the same # configuration as previously presented, the sample linked to the green-dashed # line will be selected since its distance the 3 farthest neighbors is the # smallest. 
# %% fig, ax = plt.subplots(figsize=(8, 8)) ax.scatter( X_minority[:, 0], X_minority[:, 1], label="Minority class", s=200, marker="_", ) ax.scatter( X_majority[:, 0], X_majority[:, 1], label="Majority class", s=200, marker="+", ) nearest_neighbors = NearestNeighbors(n_neighbors=X_minority.shape[0]) nearest_neighbors.fit(X_minority) dist, ind = nearest_neighbors.kneighbors(X_majority[:2, :]) dist = dist[:, -3::] ind = ind[:, -3::] dist_avg = dist.sum(axis=1) / 3 for positive_idx, (neighbors, distance, color) in enumerate( zip(ind, dist_avg, ["g", "r"]) ): for make_plot, sample_idx in enumerate(neighbors): ax.plot( [X_majority[positive_idx, 0], X_minority[sample_idx, 0]], [X_majority[positive_idx, 1], X_minority[sample_idx, 1]], "--" + color, alpha=0.3, label=f"Avg. dist.={distance:.2f}" if make_plot == 0 else "", ) ax.set_title("NearMiss-2") make_plot_despine(ax) plt.tight_layout() # %% [mardown] # NearMiss-3 # ---------- # # NearMiss-3 can be divided into 2 steps. First, a nearest-neighbors is used to # short-list samples from the majority class (i.e. correspond to the # highlighted samples in the following plot). Then, the sample with the largest # average distance to the *k* nearest-neighbors are selected. 
# %% fig, ax = plt.subplots(figsize=(8.5, 8.5)) ax.scatter( X_minority[:, 0], X_minority[:, 1], label="Minority class", s=200, marker="_", ) ax.scatter( X_majority[:, 0], X_majority[:, 1], label="Majority class", s=200, marker="+", ) nearest_neighbors = NearestNeighbors(n_neighbors=3) nearest_neighbors.fit(X_majority) # select only the majority point of interest selected_idx = nearest_neighbors.kneighbors(X_minority, return_distance=False) X_majority = X_majority[np.unique(selected_idx), :] ax.scatter( X_majority[:, 0], X_majority[:, 1], label="Short-listed samples", s=200, alpha=0.3, color="g", ) nearest_neighbors = NearestNeighbors(n_neighbors=3) nearest_neighbors.fit(X_minority) dist, ind = nearest_neighbors.kneighbors(X_majority[:2, :]) dist_avg = dist.sum(axis=1) / 3 for positive_idx, (neighbors, distance, color) in enumerate( zip(ind, dist_avg, ["r", "g"]) ): for make_plot, sample_idx in enumerate(neighbors): ax.plot( [X_majority[positive_idx, 0], X_minority[sample_idx, 0]], [X_majority[positive_idx, 1], X_minority[sample_idx, 1]], "--" + color, alpha=0.3, label=f"Avg. dist.={distance:.2f}" if make_plot == 0 else "", ) ax.set_title("NearMiss-3") make_plot_despine(ax) plt.tight_layout() plt.show() imbalanced-learn-0.12.2/examples/under-sampling/plot_illustration_tomek_links.py000066400000000000000000000061541460233407600302700ustar00rootroot00000000000000""" ============================================== Illustration of the definition of a Tomek link ============================================== This example illustrates what is a Tomek link. 
""" # Authors: Guillaume Lemaitre # License: MIT # %% print(__doc__) import matplotlib.pyplot as plt import seaborn as sns sns.set_context("poster") # %% [markdown] # This function allows to make nice plotting # %% def make_plot_despine(ax): sns.despine(ax=ax, offset=10) ax.set_xlim([0, 3]) ax.set_ylim([0, 3]) ax.set_xlabel(r"$X_1$") ax.set_ylabel(r"$X_2$") ax.legend(loc="lower right") # %% [markdown] # We will generate some toy data that illustrates how # :class:`~imblearn.under_sampling.TomekLinks` is used to clean a dataset. # %% import numpy as np rng = np.random.RandomState(18) X_minority = np.transpose( [[1.1, 1.3, 1.15, 0.8, 0.55, 2.1], [1.0, 1.5, 1.7, 2.5, 0.55, 1.9]] ) X_majority = np.transpose( [ [2.1, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45], [1.5, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9], ] ) # %% [markdown] # In the figure above, the samples highlighted in green form a Tomek link since # they are of different classes and are nearest neighbors of each other. fig, ax = plt.subplots(figsize=(8, 8)) ax.scatter( X_minority[:, 0], X_minority[:, 1], label="Minority class", s=200, marker="_", ) ax.scatter( X_majority[:, 0], X_majority[:, 1], label="Majority class", s=200, marker="+", ) # highlight the samples of interest ax.scatter( [X_minority[-1, 0], X_majority[1, 0]], [X_minority[-1, 1], X_majority[1, 1]], label="Tomek link", s=200, alpha=0.3, ) make_plot_despine(ax) fig.suptitle("Illustration of a Tomek link") fig.tight_layout() # %% [markdown] # We can run the :class:`~imblearn.under_sampling.TomekLinks` sampling to # remove the corresponding samples. If `sampling_strategy='auto'` only the # sample from the majority class will be removed. If `sampling_strategy='all'` # both samples will be removed. 
# %% from imblearn.under_sampling import TomekLinks fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8)) samplers = { "Removing only majority samples": TomekLinks(sampling_strategy="auto"), "Removing all samples": TomekLinks(sampling_strategy="all"), } for ax, (title, sampler) in zip(axs, samplers.items()): X_res, y_res = sampler.fit_resample( np.vstack((X_minority, X_majority)), np.array([0] * X_minority.shape[0] + [1] * X_majority.shape[0]), ) ax.scatter( X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1], label="Minority class", s=200, marker="_", ) ax.scatter( X_res[y_res == 1][:, 0], X_res[y_res == 1][:, 1], label="Majority class", s=200, marker="+", ) # highlight the samples of interest ax.scatter( [X_minority[-1, 0], X_majority[1, 0]], [X_minority[-1, 1], X_majority[1, 1]], label="Tomek link", s=200, alpha=0.3, ) ax.set_title(title) make_plot_despine(ax) fig.tight_layout() plt.show() imbalanced-learn-0.12.2/imblearn/000077500000000000000000000000001460233407600165505ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/__init__.py000066400000000000000000000075731460233407600206750ustar00rootroot00000000000000"""Toolbox for imbalanced dataset in machine learning. ``imbalanced-learn`` is a set of python methods to deal with imbalanced datset in machine learning and pattern recognition. Subpackages ----------- combine Module which provides methods based on over-sampling and under-sampling. ensemble Module which provides methods generating an ensemble of under-sampled subsets. exceptions Module including custom warnings and error clases used across imbalanced-learn. keras Module which provides custom generator, layers for deep learning using keras. metrics Module which provides metrics to quantified the classification performance with imbalanced dataset. over_sampling Module which provides methods to over-sample a dataset. tensorflow Module which provides custom generator, layers for deep learning using tensorflow. 
under-sampling Module which provides methods to under-sample a dataset. utils Module including various utilities. pipeline Module which allowing to create pipeline with scikit-learn estimators. """ import importlib import sys import types try: # This variable is injected in the __builtins__ by the build # process. It is used to enable importing subpackages of sklearn when # the binaries are not built # mypy error: Cannot determine type of '__SKLEARN_SETUP__' __IMBLEARN_SETUP__ # type: ignore except NameError: __IMBLEARN_SETUP__ = False if __IMBLEARN_SETUP__: sys.stderr.write("Partial import of imblearn during the build process.\n") # We are not importing the rest of scikit-learn during the build # process, as it may not be compiled yet else: from . import ( combine, ensemble, exceptions, metrics, over_sampling, pipeline, tensorflow, under_sampling, utils, ) from ._version import __version__ from .base import FunctionSampler from .utils._show_versions import show_versions # noqa: F401 # FIXME: When we get Python 3.7 as minimal version, we will need to switch to # the following solution: # https://snarky.ca/lazy-importing-in-python-3-7/ class LazyLoader(types.ModuleType): """Lazily import a module, mainly to avoid pulling in large dependencies. 
Adapted from TensorFlow: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/ python/util/lazy_loader.py """ def __init__(self, local_name, parent_module_globals, name, warning=None): self._local_name = local_name self._parent_module_globals = parent_module_globals self._warning = warning super(LazyLoader, self).__init__(name) def _load(self): """Load the module and insert it into the parent's globals.""" # Import the target module and insert it into the parent's namespace module = importlib.import_module(self.__name__) self._parent_module_globals[self._local_name] = module # Update this object's dict so that if someone keeps a reference to the # LazyLoader, lookups are efficient (__getattr__ is only called on # lookups that fail). self.__dict__.update(module.__dict__) return module def __getattr__(self, item): module = self._load() return getattr(module, item) def __dir__(self): module = self._load() return dir(module) # delay the import of keras since we are going to import either tensorflow # or keras keras = LazyLoader("keras", globals(), "imblearn.keras") __all__ = [ "combine", "ensemble", "exceptions", "keras", "metrics", "over_sampling", "tensorflow", "under_sampling", "utils", "pipeline", "FunctionSampler", "__version__", ] imbalanced-learn-0.12.2/imblearn/_config.py000066400000000000000000000325351460233407600205360ustar00rootroot00000000000000"""This is copy of sklearn/_config.py # TODO: remove this file when scikit-learn minimum version is 1.3 We remove the array_api_dispatch for the moment. 
""" import os import threading from contextlib import contextmanager as contextmanager import sklearn from sklearn.utils.fixes import parse_version sklearn_version = parse_version(sklearn.__version__) if sklearn_version < parse_version("1.3"): _global_config = { "assume_finite": bool(os.environ.get("SKLEARN_ASSUME_FINITE", False)), "working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)), "print_changed_only": True, "display": "diagram", "pairwise_dist_chunk_size": int( os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256) ), "enable_cython_pairwise_dist": True, "transform_output": "default", "enable_metadata_routing": False, "skip_parameter_validation": False, } _threadlocal = threading.local() def _get_threadlocal_config(): """Get a threadlocal **mutable** configuration. If the configuration does not exist, copy the default global configuration.""" if not hasattr(_threadlocal, "global_config"): _threadlocal.global_config = _global_config.copy() return _threadlocal.global_config def get_config(): """Retrieve current values for configuration set by :func:`set_config`. Returns ------- config : dict Keys are parameter names that can be passed to :func:`set_config`. See Also -------- config_context : Context manager for global scikit-learn configuration. set_config : Set global scikit-learn configuration. """ # Return a copy of the threadlocal configuration so that users will # not be able to modify the configuration with the returned dict. return _get_threadlocal_config().copy() def set_config( assume_finite=None, working_memory=None, print_changed_only=None, display=None, pairwise_dist_chunk_size=None, enable_cython_pairwise_dist=None, transform_output=None, enable_metadata_routing=None, skip_parameter_validation=None, ): """Set global scikit-learn configuration .. versionadded:: 0.19 Parameters ---------- assume_finite : bool, default=None If True, validation for finiteness will be skipped, saving time, but leading to potential crashes. 
If False, validation for finiteness will be performed, avoiding error. Global default: False. .. versionadded:: 0.19 working_memory : int, default=None If set, scikit-learn will attempt to limit the size of temporary arrays to this number of MiB (per job when parallelised), often saving both computation time and memory on expensive operations that can be performed in chunks. Global default: 1024. .. versionadded:: 0.20 print_changed_only : bool, default=None If True, only the parameters that were set to non-default values will be printed when printing an estimator. For example, ``print(SVC())`` while True will only print 'SVC()' while the default behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters. .. versionadded:: 0.21 display : {'text', 'diagram'}, default=None If 'diagram', estimators will be displayed as a diagram in a Jupyter lab or notebook context. If 'text', estimators will be displayed as text. Default is 'diagram'. .. versionadded:: 0.23 pairwise_dist_chunk_size : int, default=None The number of row vectors per chunk for the accelerated pairwise- distances reduction backend. Default is 256 (suitable for most of modern laptops' caches and architectures). Intended for easier benchmarking and testing of scikit-learn internals. End users are not expected to benefit from customizing this configuration setting. .. versionadded:: 1.1 enable_cython_pairwise_dist : bool, default=None Use the accelerated pairwise-distances reduction backend when possible. Global default: True. Intended for easier benchmarking and testing of scikit-learn internals. End users are not expected to benefit from customizing this configuration setting. .. versionadded:: 1.1 transform_output : str, default=None Configure output of `transform` and `fit_transform`. See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py` for an example on how to use the API. 
- `"default"`: Default output format of a transformer - `"pandas"`: DataFrame output - `None`: Transform configuration is unchanged .. versionadded:: 1.2 enable_metadata_routing : bool, default=None Enable metadata routing. By default this feature is disabled. Refer to :ref:`metadata routing user guide ` for more details. - `True`: Metadata routing is enabled - `False`: Metadata routing is disabled, use the old syntax. - `None`: Configuration is unchanged .. versionadded:: 1.3 skip_parameter_validation : bool, default=None If `True`, disable the validation of the hyper-parameters' types and values in the fit method of estimators and for arguments passed to public helper functions. It can save time in some situations but can lead to low level crashes and exceptions with confusing error messages. Note that for data parameters, such as `X` and `y`, only type validation is skipped but validation with `check_array` will continue to run. .. versionadded:: 1.3 See Also -------- config_context : Context manager for global scikit-learn configuration. get_config : Retrieve current values of the global configuration. 
""" local_config = _get_threadlocal_config() if assume_finite is not None: local_config["assume_finite"] = assume_finite if working_memory is not None: local_config["working_memory"] = working_memory if print_changed_only is not None: local_config["print_changed_only"] = print_changed_only if display is not None: local_config["display"] = display if pairwise_dist_chunk_size is not None: local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size if enable_cython_pairwise_dist is not None: local_config["enable_cython_pairwise_dist"] = enable_cython_pairwise_dist if transform_output is not None: local_config["transform_output"] = transform_output if enable_metadata_routing is not None: local_config["enable_metadata_routing"] = enable_metadata_routing if skip_parameter_validation is not None: local_config["skip_parameter_validation"] = skip_parameter_validation @contextmanager def config_context( *, assume_finite=None, working_memory=None, print_changed_only=None, display=None, pairwise_dist_chunk_size=None, enable_cython_pairwise_dist=None, transform_output=None, enable_metadata_routing=None, skip_parameter_validation=None, ): """Context manager for global scikit-learn configuration. Parameters ---------- assume_finite : bool, default=None If True, validation for finiteness will be skipped, saving time, but leading to potential crashes. If False, validation for finiteness will be performed, avoiding error. If None, the existing value won't change. The default value is False. working_memory : int, default=None If set, scikit-learn will attempt to limit the size of temporary arrays to this number of MiB (per job when parallelised), often saving both computation time and memory on expensive operations that can be performed in chunks. If None, the existing value won't change. The default value is 1024. print_changed_only : bool, default=None If True, only the parameters that were set to non-default values will be printed when printing an estimator. 
For example, ``print(SVC())`` while True will only print 'SVC()', but would print 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters when False. If None, the existing value won't change. The default value is True. .. versionchanged:: 0.23 Default changed from False to True. display : {'text', 'diagram'}, default=None If 'diagram', estimators will be displayed as a diagram in a Jupyter lab or notebook context. If 'text', estimators will be displayed as text. If None, the existing value won't change. The default value is 'diagram'. .. versionadded:: 0.23 pairwise_dist_chunk_size : int, default=None The number of row vectors per chunk for the accelerated pairwise- distances reduction backend. Default is 256 (suitable for most of modern laptops' caches and architectures). Intended for easier benchmarking and testing of scikit-learn internals. End users are not expected to benefit from customizing this configuration setting. .. versionadded:: 1.1 enable_cython_pairwise_dist : bool, default=None Use the accelerated pairwise-distances reduction backend when possible. Global default: True. Intended for easier benchmarking and testing of scikit-learn internals. End users are not expected to benefit from customizing this configuration setting. .. versionadded:: 1.1 transform_output : str, default=None Configure output of `transform` and `fit_transform`. See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py` for an example on how to use the API. - `"default"`: Default output format of a transformer - `"pandas"`: DataFrame output - `None`: Transform configuration is unchanged .. versionadded:: 1.2 enable_metadata_routing : bool, default=None Enable metadata routing. By default this feature is disabled. Refer to :ref:`metadata routing user guide ` for more details. - `True`: Metadata routing is enabled - `False`: Metadata routing is disabled, use the old syntax. - `None`: Configuration is unchanged .. 
versionadded:: 1.3 skip_parameter_validation : bool, default=None If `True`, disable the validation of the hyper-parameters' types and values in the fit method of estimators and for arguments passed to public helper functions. It can save time in some situations but can lead to low level crashes and exceptions with confusing error messages. Note that for data parameters, such as `X` and `y`, only type validation is skipped but validation with `check_array` will continue to run. .. versionadded:: 1.3 Yields ------ None. See Also -------- set_config : Set global scikit-learn configuration. get_config : Retrieve current values of the global configuration. Notes ----- All settings, not just those presently modified, will be returned to their previous values when the context manager is exited. Examples -------- >>> import sklearn >>> from sklearn.utils.validation import assert_all_finite >>> with sklearn.config_context(assume_finite=True): ... assert_all_finite([float('nan')]) >>> with sklearn.config_context(assume_finite=True): ... with sklearn.config_context(assume_finite=False): ... assert_all_finite([float('nan')]) Traceback (most recent call last): ... ValueError: Input contains NaN... 
""" old_config = get_config() set_config( assume_finite=assume_finite, working_memory=working_memory, print_changed_only=print_changed_only, display=display, pairwise_dist_chunk_size=pairwise_dist_chunk_size, enable_cython_pairwise_dist=enable_cython_pairwise_dist, transform_output=transform_output, enable_metadata_routing=enable_metadata_routing, skip_parameter_validation=skip_parameter_validation, ) try: yield finally: set_config(**old_config) else: from sklearn._config import ( # type: ignore[no-redef] _get_threadlocal_config, _global_config, config_context, # noqa get_config, ) imbalanced-learn-0.12.2/imblearn/_min_dependencies.py000066400000000000000000000043001460233407600225470ustar00rootroot00000000000000"""All minimum dependencies for imbalanced-learn.""" import argparse NUMPY_MIN_VERSION = "1.17.3" SCIPY_MIN_VERSION = "1.5.0" PANDAS_MIN_VERSION = "1.0.5" SKLEARN_MIN_VERSION = "1.0.2" TENSORFLOW_MIN_VERSION = "2.4.3" KERAS_MIN_VERSION = "2.4.3" JOBLIB_MIN_VERSION = "1.1.1" THREADPOOLCTL_MIN_VERSION = "2.0.0" PYTEST_MIN_VERSION = "5.0.1" # 'build' and 'install' is included to have structured metadata for CI. 
# It will NOT be included in setup's extras_require # The values are (version_spec, comma separated tags) dependent_packages = { "numpy": (NUMPY_MIN_VERSION, "install"), "scipy": (SCIPY_MIN_VERSION, "install"), "scikit-learn": (SKLEARN_MIN_VERSION, "install"), "joblib": (JOBLIB_MIN_VERSION, "install"), "threadpoolctl": (THREADPOOLCTL_MIN_VERSION, "install"), "pandas": (PANDAS_MIN_VERSION, "optional, docs, examples, tests"), "tensorflow": (TENSORFLOW_MIN_VERSION, "optional, docs, examples, tests"), "keras": (KERAS_MIN_VERSION, "optional, docs, examples, tests"), "matplotlib": ("3.1.2", "docs, examples"), "seaborn": ("0.9.0", "docs, examples"), "memory_profiler": ("0.57.0", "docs"), "pytest": (PYTEST_MIN_VERSION, "tests"), "pytest-cov": ("2.9.0", "tests"), "flake8": ("3.8.2", "tests"), "black": ("23.3.0", "tests"), "mypy": ("1.3.0", "tests"), "sphinx": ("6.0.0", "docs"), "sphinx-gallery": ("0.13.0", "docs"), "sphinx-copybutton": ("0.5.2", "docs"), "numpydoc": ("1.5.0", "docs"), "sphinxcontrib-bibtex": ("2.4.1", "docs"), "pydata-sphinx-theme": ("0.13.3", "docs"), } # create inverse mapping for setuptools tag_to_packages: dict = { extra: [] for extra in ["install", "optional", "docs", "examples", "tests"] } for package, (min_version, extras) in dependent_packages.items(): for extra in extras.split(", "): tag_to_packages[extra].append("{}>={}".format(package, min_version)) # Used by CI to get the min dependencies if __name__ == "__main__": parser = argparse.ArgumentParser(description="Get min dependencies for a package") parser.add_argument("package", choices=dependent_packages) args = parser.parse_args() min_version = dependent_packages[args.package][0] print(min_version) imbalanced-learn-0.12.2/imblearn/_version.py000066400000000000000000000011601460233407600207440ustar00rootroot00000000000000""" ``imbalanced-learn`` is a set of python methods to deal with imbalanced datset in machine learning and pattern recognition. 
""" # Based on NiLearn package # License: simplified BSD # PEP0440 compatible formatted version, see: # https://www.python.org/dev/peps/pep-0440/ # # Generic release markers: # X.Y # X.Y.Z # For bugfix releases # # Admissible pre-release markers: # X.YaN # Alpha release # X.YbN # Beta release # X.YrcN # Release Candidate # X.Y # Final release # # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # __version__ = "0.12.2" imbalanced-learn-0.12.2/imblearn/base.py000066400000000000000000000312421460233407600200360ustar00rootroot00000000000000"""Base class for sampling""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from abc import ABCMeta, abstractmethod import numpy as np import sklearn from sklearn.base import BaseEstimator try: # scikit-learn >= 1.2 from sklearn.base import OneToOneFeatureMixin except ImportError: from sklearn.base import _OneToOneFeatureMixin as OneToOneFeatureMixin from sklearn.preprocessing import label_binarize from sklearn.utils.fixes import parse_version from sklearn.utils.multiclass import check_classification_targets from .utils import check_sampling_strategy, check_target_type from .utils._param_validation import validate_parameter_constraints from .utils._validation import ArraysTransformer sklearn_version = parse_version(sklearn.__version__) class _ParamsValidationMixin: """Mixin class to validate parameters.""" def _validate_params(self): """Validate types and values of constructor parameters. The expected type and values must be defined in the `_parameter_constraints` class attribute, which is a dictionary `param_name: list of constraints`. See the docstring of `validate_parameter_constraints` for a description of the accepted constraints. 
""" if hasattr(self, "_parameter_constraints"): validate_parameter_constraints( self._parameter_constraints, self.get_params(deep=False), caller_name=self.__class__.__name__, ) class SamplerMixin(_ParamsValidationMixin, BaseEstimator, metaclass=ABCMeta): """Mixin class for samplers with abstract method. Warning: This class should not be used directly. Use the derive classes instead. """ _estimator_type = "sampler" def fit(self, X, y): """Check inputs and statistics of the sampler. You should use ``fit_resample`` in all cases. Parameters ---------- X : {array-like, dataframe, sparse matrix} of shape \ (n_samples, n_features) Data array. y : array-like of shape (n_samples,) Target array. Returns ------- self : object Return the instance itself. """ X, y, _ = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type ) return self def fit_resample(self, X, y): """Resample the dataset. Parameters ---------- X : {array-like, dataframe, sparse matrix} of shape \ (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like of shape (n_samples,) Corresponding label for each sample in X. Returns ------- X_resampled : {array-like, dataframe, sparse matrix} of shape \ (n_samples_new, n_features) The array containing the resampled data. y_resampled : array-like of shape (n_samples_new,) The corresponding label of `X_resampled`. 
""" check_classification_targets(y) arrays_transformer = ArraysTransformer(X, y) X, y, binarize_y = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type ) output = self._fit_resample(X, y) y_ = ( label_binarize(output[1], classes=np.unique(y)) if binarize_y else output[1] ) X_, y_ = arrays_transformer.transform(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) @abstractmethod def _fit_resample(self, X, y): """Base method defined in each sampler to defined the sampling strategy. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like of shape (n_samples,) Corresponding label for each sample in X. Returns ------- X_resampled : {ndarray, sparse matrix} of shape \ (n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray of shape (n_samples_new,) The corresponding label of `X_resampled`. """ pass class BaseSampler(SamplerMixin, OneToOneFeatureMixin): """Base class for sampling algorithms. Warning: This class should not be used directly. Use the derive classes instead. """ def __init__(self, sampling_strategy="auto"): self.sampling_strategy = sampling_strategy def _check_X_y(self, X, y, accept_sparse=None): if accept_sparse is None: accept_sparse = ["csr", "csc"] y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse) return X, y, binarize_y def fit(self, X, y): """Check inputs and statistics of the sampler. You should use ``fit_resample`` in all cases. Parameters ---------- X : {array-like, dataframe, sparse matrix} of shape \ (n_samples, n_features) Data array. y : array-like of shape (n_samples,) Target array. Returns ------- self : object Return the instance itself. 
""" self._validate_params() return super().fit(X, y) def fit_resample(self, X, y): """Resample the dataset. Parameters ---------- X : {array-like, dataframe, sparse matrix} of shape \ (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like of shape (n_samples,) Corresponding label for each sample in X. Returns ------- X_resampled : {array-like, dataframe, sparse matrix} of shape \ (n_samples_new, n_features) The array containing the resampled data. y_resampled : array-like of shape (n_samples_new,) The corresponding label of `X_resampled`. """ self._validate_params() return super().fit_resample(X, y) def _more_tags(self): return {"X_types": ["2darray", "sparse", "dataframe"]} def _identity(X, y): return X, y def is_sampler(estimator): """Return True if the given estimator is a sampler, False otherwise. Parameters ---------- estimator : object Estimator to test. Returns ------- is_sampler : bool True if estimator is a sampler, otherwise False. """ if estimator._estimator_type == "sampler": return True return False class FunctionSampler(BaseSampler): """Construct a sampler from calling an arbitrary callable. Read more in the :ref:`User Guide `. Parameters ---------- func : callable, default=None The callable to use for the transformation. This will be passed the same arguments as transform, with args and kwargs forwarded. If func is None, then func will be the identity function. accept_sparse : bool, default=True Whether sparse input are supported. By default, sparse inputs are supported. kw_args : dict, default=None The keyword argument expected by ``func``. validate : bool, default=True Whether or not to bypass the validation of ``X`` and ``y``. Turning-off validation allows to use the ``FunctionSampler`` with any type of data. .. versionadded:: 0.6 Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. 
The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- sklearn.preprocessing.FunctionTransfomer : Stateless transformer. Notes ----- See :ref:`sphx_glr_auto_examples_applications_plot_outlier_rejections.py` Examples -------- >>> import numpy as np >>> from sklearn.datasets import make_classification >>> from imblearn import FunctionSampler >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) We can create to select only the first ten samples for instance. >>> def func(X, y): ... return X[:10], y[:10] >>> sampler = FunctionSampler(func=func) >>> X_res, y_res = sampler.fit_resample(X, y) >>> np.all(X_res == X[:10]) True >>> np.all(y_res == y[:10]) True We can also create a specific function which take some arguments. >>> from collections import Counter >>> from imblearn.under_sampling import RandomUnderSampler >>> def func(X, y, sampling_strategy, random_state): ... return RandomUnderSampler( ... sampling_strategy=sampling_strategy, ... random_state=random_state).fit_resample(X, y) >>> sampler = FunctionSampler(func=func, ... kw_args={'sampling_strategy': 'auto', ... 
'random_state': 0}) >>> X_res, y_res = sampler.fit_resample(X, y) >>> print(f'Resampled dataset shape {sorted(Counter(y_res).items())}') Resampled dataset shape [(0, 100), (1, 100)] """ _sampling_type = "bypass" _parameter_constraints: dict = { "func": [callable, None], "accept_sparse": ["boolean"], "kw_args": [dict, None], "validate": ["boolean"], } def __init__(self, *, func=None, accept_sparse=True, kw_args=None, validate=True): super().__init__() self.func = func self.accept_sparse = accept_sparse self.kw_args = kw_args self.validate = validate def fit(self, X, y): """Check inputs and statistics of the sampler. You should use ``fit_resample`` in all cases. Parameters ---------- X : {array-like, dataframe, sparse matrix} of shape \ (n_samples, n_features) Data array. y : array-like of shape (n_samples,) Target array. Returns ------- self : object Return the instance itself. """ self._validate_params() # we need to overwrite SamplerMixin.fit to bypass the validation if self.validate: check_classification_targets(y) X, y, _ = self._check_X_y(X, y, accept_sparse=self.accept_sparse) self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type ) return self def fit_resample(self, X, y): """Resample the dataset. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like of shape (n_samples,) Corresponding label for each sample in X. Returns ------- X_resampled : {array-like, sparse matrix} of shape \ (n_samples_new, n_features) The array containing the resampled data. y_resampled : array-like of shape (n_samples_new,) The corresponding label of `X_resampled`. 
""" self._validate_params() arrays_transformer = ArraysTransformer(X, y) if self.validate: check_classification_targets(y) X, y, binarize_y = self._check_X_y(X, y, accept_sparse=self.accept_sparse) self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type ) output = self._fit_resample(X, y) if self.validate: y_ = ( label_binarize(output[1], classes=np.unique(y)) if binarize_y else output[1] ) X_, y_ = arrays_transformer.transform(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) return output def _fit_resample(self, X, y): func = _identity if self.func is None else self.func output = func(X, y, **(self.kw_args if self.kw_args else {})) return output imbalanced-learn-0.12.2/imblearn/combine/000077500000000000000000000000001460233407600201645ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/combine/__init__.py000066400000000000000000000003211460233407600222710ustar00rootroot00000000000000"""The :mod:`imblearn.combine` provides methods which combine over-sampling and under-sampling. 
""" from ._smote_enn import SMOTEENN from ._smote_tomek import SMOTETomek __all__ = ["SMOTEENN", "SMOTETomek"] imbalanced-learn-0.12.2/imblearn/combine/_smote_enn.py000066400000000000000000000117421460233407600226710ustar00rootroot00000000000000"""Class to perform over-sampling using SMOTE and cleaning using ENN.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numbers from sklearn.base import clone from sklearn.utils import check_X_y from ..base import BaseSampler from ..over_sampling import SMOTE from ..over_sampling.base import BaseOverSampler from ..under_sampling import EditedNearestNeighbours from ..utils import Substitution, check_target_type from ..utils._docstring import _n_jobs_docstring, _random_state_docstring @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class SMOTEENN(BaseSampler): """Over-sampling using SMOTE and cleaning using ENN. Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} smote : sampler object, default=None The :class:`~imblearn.over_sampling.SMOTE` object to use. If not given, a :class:`~imblearn.over_sampling.SMOTE` object with default parameters will be given. enn : sampler object, default=None The :class:`~imblearn.under_sampling.EditedNearestNeighbours` object to use. If not given, a :class:`~imblearn.under_sampling.EditedNearestNeighbours` object with sampling strategy='all' will be given. {n_jobs} Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. smote_ : sampler object The validated :class:`~imblearn.over_sampling.SMOTE` instance. 
enn_ : sampler object The validated :class:`~imblearn.under_sampling.EditedNearestNeighbours` instance. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- SMOTETomek : Over-sample using SMOTE followed by under-sampling removing the Tomek's links. Notes ----- The method is presented in [1]_. Supports multi-class resampling. Refer to SMOTE and ENN regarding the scheme which used. References ---------- .. [1] G. Batista, R. C. Prati, M. C. Monard. "A study of the behavior of several methods for balancing machine learning training data," ACM Sigkdd Explorations Newsletter 6 (1), 20-29, 2004. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.combine import SMOTEENN >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sme = SMOTEENN(random_state=42) >>> X_res, y_res = sme.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 881}}) """ _sampling_type = "over-sampling" _parameter_constraints: dict = { **BaseOverSampler._parameter_constraints, "smote": [SMOTE, None], "enn": [EditedNearestNeighbours, None], "n_jobs": [numbers.Integral, None], } def __init__( self, *, sampling_strategy="auto", random_state=None, smote=None, enn=None, n_jobs=None, ): super().__init__() self.sampling_strategy = sampling_strategy self.random_state = random_state self.smote = smote self.enn = enn self.n_jobs = n_jobs def _validate_estimator(self): "Private function to validate SMOTE and ENN objects" if self.smote is not None: self.smote_ = clone(self.smote) else: self.smote_ = SMOTE( sampling_strategy=self.sampling_strategy, random_state=self.random_state, n_jobs=self.n_jobs, ) if self.enn is not None: self.enn_ = clone(self.enn) else: self.enn_ = EditedNearestNeighbours( sampling_strategy="all", n_jobs=self.n_jobs ) def _fit_resample(self, X, y): self._validate_estimator() y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=["csr", "csc"]) self.sampling_strategy_ = self.sampling_strategy X_res, y_res = self.smote_.fit_resample(X, y) return self.enn_.fit_resample(X_res, y_res) imbalanced-learn-0.12.2/imblearn/combine/_smote_tomek.py000066400000000000000000000115241460233407600232260ustar00rootroot00000000000000"""Class to perform over-sampling using SMOTE and cleaning using Tomek links.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numbers from sklearn.base import clone from sklearn.utils import check_X_y from ..base import BaseSampler from ..over_sampling import SMOTE from ..over_sampling.base import BaseOverSampler from 
..under_sampling import TomekLinks from ..utils import Substitution, check_target_type from ..utils._docstring import _n_jobs_docstring, _random_state_docstring @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class SMOTETomek(BaseSampler): """Over-sampling using SMOTE and cleaning using Tomek links. Combine over- and under-sampling using SMOTE and Tomek links. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} smote : sampler object, default=None The :class:`~imblearn.over_sampling.SMOTE` object to use. If not given, a :class:`~imblearn.over_sampling.SMOTE` object with default parameters will be given. tomek : sampler object, default=None The :class:`~imblearn.under_sampling.TomekLinks` object to use. If not given, a :class:`~imblearn.under_sampling.TomekLinks` object with sampling strategy='all' will be given. {n_jobs} Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. smote_ : sampler object The validated :class:`~imblearn.over_sampling.SMOTE` instance. tomek_ : sampler object The validated :class:`~imblearn.under_sampling.TomekLinks` instance. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- SMOTEENN : Over-sample using SMOTE followed by under-sampling using Edited Nearest Neighbours. Notes ----- The method is presented in [1]_. Supports multi-class resampling. Refer to SMOTE and TomekLinks regarding the scheme which used. References ---------- .. [1] G. Batista, B. Bazzan, M. 
Monard, "Balancing Training Data for Automated Annotation of Keywords: a Case Study," In WOB, 10-18, 2003. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.combine import SMOTETomek >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> smt = SMOTETomek(random_state=42) >>> X_res, y_res = smt.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) """ _sampling_type = "over-sampling" _parameter_constraints: dict = { **BaseOverSampler._parameter_constraints, "smote": [SMOTE, None], "tomek": [TomekLinks, None], "n_jobs": [numbers.Integral, None], } def __init__( self, *, sampling_strategy="auto", random_state=None, smote=None, tomek=None, n_jobs=None, ): super().__init__() self.sampling_strategy = sampling_strategy self.random_state = random_state self.smote = smote self.tomek = tomek self.n_jobs = n_jobs def _validate_estimator(self): "Private function to validate SMOTE and ENN objects" if self.smote is not None: self.smote_ = clone(self.smote) else: self.smote_ = SMOTE( sampling_strategy=self.sampling_strategy, random_state=self.random_state, n_jobs=self.n_jobs, ) if self.tomek is not None: self.tomek_ = clone(self.tomek) else: self.tomek_ = TomekLinks(sampling_strategy="all", n_jobs=self.n_jobs) def _fit_resample(self, X, y): self._validate_estimator() y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=["csr", "csc"]) self.sampling_strategy_ = self.sampling_strategy X_res, y_res = self.smote_.fit_resample(X, y) return self.tomek_.fit_resample(X_res, y_res) 
imbalanced-learn-0.12.2/imblearn/combine/tests/000077500000000000000000000000001460233407600213265ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/combine/tests/__init__.py000066400000000000000000000000001460233407600234250ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/combine/tests/test_smote_enn.py000066400000000000000000000113001460233407600247210ustar00rootroot00000000000000"""Test the module SMOTE ENN.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from sklearn.utils._testing import assert_allclose, assert_array_equal from imblearn.combine import SMOTEENN from imblearn.over_sampling import SMOTE from imblearn.under_sampling import EditedNearestNeighbours RND_SEED = 0 X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], ] ) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 def test_sample_regular(): smote = SMOTEENN(random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [1.52091956, -0.49283504], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.08711622, 0.93259929], ] ) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_pass_smote_enn(): smote = SMOTEENN( smote=SMOTE(sampling_strategy="auto", random_state=RND_SEED), 
enn=EditedNearestNeighbours(sampling_strategy="all"), random_state=RND_SEED, ) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [1.52091956, -0.49283504], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.08711622, 0.93259929], ] ) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_half(): sampling_strategy = {0: 10, 1: 12} smote = SMOTEENN(sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.08711622, 0.93259929], ] ) y_gt = np.array([0, 1, 1, 1]) assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_init(): smote = SMOTE(random_state=RND_SEED) enn = EditedNearestNeighbours(sampling_strategy="all") smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array( [ [1.52091956, -0.49283504], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.08711622, 0.93259929], ] ) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_default(): smt = SMOTEENN(random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array( [ [1.52091956, -0.49283504], [0.84976473, -0.15570176], [0.61319159, -0.11571667], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.08711622, 0.93259929], ] ) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_parallelisation(): # Check if default job count is none smt 
= SMOTEENN(random_state=RND_SEED) smt._validate_estimator() assert smt.n_jobs is None assert smt.smote_.n_jobs is None assert smt.enn_.n_jobs is None # Check if job count is set smt = SMOTEENN(random_state=RND_SEED, n_jobs=8) smt._validate_estimator() assert smt.n_jobs == 8 assert smt.smote_.n_jobs == 8 assert smt.enn_.n_jobs == 8 imbalanced-learn-0.12.2/imblearn/combine/tests/test_smote_tomek.py000066400000000000000000000127101460233407600252660ustar00rootroot00000000000000"""Test the module SMOTE ENN.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from sklearn.utils._testing import assert_allclose, assert_array_equal from imblearn.combine import SMOTETomek from imblearn.over_sampling import SMOTE from imblearn.under_sampling import TomekLinks RND_SEED = 0 X = np.array( [ [0.20622591, 0.0582794], [0.68481731, 0.51935141], [1.34192108, -0.13367336], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [-0.37162401, -2.19400981], [0.74680821, 1.63827342], [0.2184254, 0.24299982], [0.61472253, -0.82309052], [0.19893132, -0.47761769], [1.06514042, -0.0770537], [0.97407872, 0.44454207], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [-0.27410027, -0.54194484], [0.8381014, 0.44085498], [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], ] ) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 def test_sample_regular(): smote = SMOTETomek(random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [0.68481731, 0.51935141], [1.34192108, -0.13367336], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [-0.37162401, -2.19400981], [0.74680821, 1.63827342], [0.61472253, -0.82309052], [0.19893132, -0.47761769], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [-0.23374509, 0.18370049], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [0.38307743, -0.05670439], [0.70319159, -0.02571667], 
[0.75052536, -0.19246518], ] ) y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_half(): sampling_strategy = {0: 9, 1: 12} smote = SMOTETomek(sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [0.68481731, 0.51935141], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [-0.37162401, -2.19400981], [0.74680821, 1.63827342], [0.61472253, -0.82309052], [0.19893132, -0.47761769], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [-0.23374509, 0.18370049], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [0.45784496, -0.1053161], ] ) y_gt = np.array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_init(): smote = SMOTE(random_state=RND_SEED) tomek = TomekLinks(sampling_strategy="all") smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array( [ [0.68481731, 0.51935141], [1.34192108, -0.13367336], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [-0.37162401, -2.19400981], [0.74680821, 1.63827342], [0.61472253, -0.82309052], [0.19893132, -0.47761769], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [-0.23374509, 0.18370049], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [0.38307743, -0.05670439], [0.70319159, -0.02571667], [0.75052536, -0.19246518], ] ) y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_default(): smt = SMOTETomek(random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array( [ [0.68481731, 0.51935141], [1.34192108, -0.13367336], [0.62366841, -0.21312976], [1.61091956, -0.40283504], 
[-0.37162401, -2.19400981], [0.74680821, 1.63827342], [0.61472253, -0.82309052], [0.19893132, -0.47761769], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [-0.23374509, 0.18370049], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [0.38307743, -0.05670439], [0.70319159, -0.02571667], [0.75052536, -0.19246518], ] ) y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_parallelisation(): # Check if default job count is None smt = SMOTETomek(random_state=RND_SEED) smt._validate_estimator() assert smt.n_jobs is None assert smt.smote_.n_jobs is None assert smt.tomek_.n_jobs is None # Check if job count is set smt = SMOTETomek(random_state=RND_SEED, n_jobs=8) smt._validate_estimator() assert smt.n_jobs == 8 assert smt.smote_.n_jobs == 8 assert smt.tomek_.n_jobs == 8 imbalanced-learn-0.12.2/imblearn/datasets/000077500000000000000000000000001460233407600203605ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/datasets/__init__.py000066400000000000000000000003171460233407600224720ustar00rootroot00000000000000""" The :mod:`imblearn.datasets` provides methods to generate imbalanced data. 
""" from ._imbalance import make_imbalance from ._zenodo import fetch_datasets __all__ = ["make_imbalance", "fetch_datasets"] imbalanced-learn-0.12.2/imblearn/datasets/_imbalance.py000066400000000000000000000100301460233407600227760ustar00rootroot00000000000000"""Transform a dataset into an imbalanced dataset.""" # Authors: Dayvid Oliveira # Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter from collections.abc import Mapping from ..under_sampling import RandomUnderSampler from ..utils import check_sampling_strategy from ..utils._param_validation import validate_params @validate_params( { "X": ["array-like"], "y": ["array-like"], "sampling_strategy": [Mapping, callable, None], "random_state": ["random_state"], "verbose": ["boolean"], }, prefer_skip_nested_validation=True, ) def make_imbalance( X, y, *, sampling_strategy=None, random_state=None, verbose=False, **kwargs ): """Turn a dataset into an imbalanced dataset with a specific sampling strategy. A simple toy dataset to visualize clustering and classification algorithms. Read more in the :ref:`User Guide `. Parameters ---------- X : {array-like, dataframe} of shape (n_samples, n_features) Matrix containing the data to be imbalanced. y : array-like of shape (n_samples,) Corresponding label for each sample in X. sampling_strategy : dict or callable, Ratio to use for resampling the data set. - When ``dict``, the keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted class. - When callable, function taking ``y`` and returns a ``dict``. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class. 
random_state : int, RandomState instance or None, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random. verbose : bool, default=False Show information regarding the sampling. **kwargs : dict Dictionary of additional keyword arguments to pass to ``sampling_strategy``. Returns ------- X_resampled : {ndarray, dataframe} of shape (n_samples_new, n_features) The array containing the imbalanced data. y_resampled : ndarray of shape (n_samples_new) The corresponding label of `X_resampled`. Notes ----- See :ref:`sphx_glr_auto_examples_applications_plot_multi_class_under_sampling.py`, :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py`, and :ref:`sphx_glr_auto_examples_api_plot_sampling_strategy_usage.py`. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import load_iris >>> from imblearn.datasets import make_imbalance >>> data = load_iris() >>> X, y = data.data, data.target >>> print(f'Distribution before imbalancing: {Counter(y)}') Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50}) >>> X_res, y_res = make_imbalance(X, y, ... sampling_strategy={0: 10, 1: 20, 2: 30}, ... 
random_state=42) >>> print(f'Distribution after imbalancing: {Counter(y_res)}') Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10}) """ target_stats = Counter(y) # restrict ratio to be a dict or a callable if isinstance(sampling_strategy, Mapping) or callable(sampling_strategy): sampling_strategy_ = check_sampling_strategy( sampling_strategy, y, "under-sampling", **kwargs ) if verbose: print(f"The original target distribution in the dataset is: {target_stats}") rus = RandomUnderSampler( sampling_strategy=sampling_strategy_, replacement=False, random_state=random_state, ) X_resampled, y_resampled = rus.fit_resample(X, y) if verbose: print(f"Make the dataset imbalanced: {Counter(y_resampled)}") return X_resampled, y_resampled imbalanced-learn-0.12.2/imblearn/datasets/_zenodo.py000066400000000000000000000312531460233407600223730ustar00rootroot00000000000000"""Collection of imbalanced datasets. This collection of datasets has been proposed in [1]_. The characteristics of the available datasets are presented in the table below. 
ID Name Repository & Target Ratio #S #F 1 ecoli UCI, target: imU 8.6:1 336 7 2 optical_digits UCI, target: 8 9.1:1 5,620 64 3 satimage UCI, target: 4 9.3:1 6,435 36 4 pen_digits UCI, target: 5 9.4:1 10,992 16 5 abalone UCI, target: 7 9.7:1 4,177 10 6 sick_euthyroid UCI, target: sick euthyroid 9.8:1 3,163 42 7 spectrometer UCI, target: >=44 11:1 531 93 8 car_eval_34 UCI, target: good, v good 12:1 1,728 21 9 isolet UCI, target: A, B 12:1 7,797 617 10 us_crime UCI, target: >0.65 12:1 1,994 100 11 yeast_ml8 LIBSVM, target: 8 13:1 2,417 103 12 scene LIBSVM, target: >one label 13:1 2,407 294 13 libras_move UCI, target: 1 14:1 360 90 14 thyroid_sick UCI, target: sick 15:1 3,772 52 15 coil_2000 KDD, CoIL, target: minority 16:1 9,822 85 16 arrhythmia UCI, target: 06 17:1 452 278 17 solar_flare_m0 UCI, target: M->0 19:1 1,389 32 18 oil UCI, target: minority 22:1 937 49 19 car_eval_4 UCI, target: vgood 26:1 1,728 21 20 wine_quality UCI, wine, target: <=4 26:1 4,898 11 21 letter_img UCI, target: Z 26:1 20,000 16 22 yeast_me2 UCI, target: ME2 28:1 1,484 8 23 webpage LIBSVM, w7a, target: minority 33:1 34,780 300 24 ozone_level UCI, ozone, data 34:1 2,536 72 25 mammography UCI, target: minority 42:1 11,183 6 26 protein_homo KDD CUP 2004, minority 111:1 145,751 74 27 abalone_19 UCI, target: 19 130:1 4,177 10 References ---------- .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly Imbalanced Data Learning and their Application in Bioinformatics." Dissertation, Georgia State University, (2011). 
""" # Author: Guillaume Lemaitre # License: BSD 3 clause import tarfile from collections import OrderedDict from io import BytesIO from os import makedirs from os.path import isfile, join from urllib.request import urlopen import numpy as np from sklearn.datasets import get_data_home from sklearn.utils import Bunch, check_random_state from ..utils._param_validation import validate_params URL = "https://zenodo.org/record/61452/files/benchmark-imbalanced-learn.tar.gz" PRE_FILENAME = "x" POST_FILENAME = "data.npz" MAP_NAME_ID_KEYS = [ "ecoli", "optical_digits", "satimage", "pen_digits", "abalone", "sick_euthyroid", "spectrometer", "car_eval_34", "isolet", "us_crime", "yeast_ml8", "scene", "libras_move", "thyroid_sick", "coil_2000", "arrhythmia", "solar_flare_m0", "oil", "car_eval_4", "wine_quality", "letter_img", "yeast_me2", "webpage", "ozone_level", "mammography", "protein_homo", "abalone_19", ] MAP_NAME_ID = OrderedDict() MAP_ID_NAME = OrderedDict() for v, k in enumerate(MAP_NAME_ID_KEYS): MAP_NAME_ID[k] = v + 1 MAP_ID_NAME[v + 1] = k @validate_params( { "data_home": [None, str], "filter_data": [None, tuple], "download_if_missing": ["boolean"], "random_state": ["random_state"], "shuffle": ["boolean"], "verbose": ["boolean"], }, prefer_skip_nested_validation=True, ) def fetch_datasets( *, data_home=None, filter_data=None, download_if_missing=True, random_state=None, shuffle=False, verbose=False, ): """Load the benchmark datasets from Zenodo, downloading it if necessary. .. versionadded:: 0.3 Parameters ---------- data_home : str, default=None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. filter_data : tuple of str/int, default=None A tuple containing the ID or the name of the datasets to be returned. Refer to the above table to get the ID and name of the datasets. 
download_if_missing : bool, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. random_state : int, RandomState instance or None, default=None Random state for shuffling the dataset. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. shuffle : bool, default=False Whether to shuffle dataset. verbose : bool, default=False Show information regarding the fetching. Returns ------- datasets : OrderedDict of Bunch object, The ordered is defined by ``filter_data``. Each Bunch object --- referred as dataset --- have the following attributes: dataset.data : ndarray of shape (n_samples, n_features) dataset.target : ndarray of shape (n_samples,) dataset.DESCR : str Description of the each dataset. Notes ----- This collection of datasets have been proposed in [1]_. The characteristics of the available datasets are presented in the table below. 
+--+--------------+-------------------------------+-------+---------+-----+ |ID|Name | Repository & Target | Ratio | #S | #F | +==+==============+===============================+=======+=========+=====+ |1 |ecoli | UCI, target: imU | 8.6:1 | 336 | 7 | +--+--------------+-------------------------------+-------+---------+-----+ |2 |optical_digits| UCI, target: 8 | 9.1:1 | 5,620 | 64 | +--+--------------+-------------------------------+-------+---------+-----+ |3 |satimage | UCI, target: 4 | 9.3:1 | 6,435 | 36 | +--+--------------+-------------------------------+-------+---------+-----+ |4 |pen_digits | UCI, target: 5 | 9.4:1 | 10,992 | 16 | +--+--------------+-------------------------------+-------+---------+-----+ |5 |abalone | UCI, target: 7 | 9.7:1 | 4,177 | 10 | +--+--------------+-------------------------------+-------+---------+-----+ |6 |sick_euthyroid| UCI, target: sick euthyroid | 9.8:1 | 3,163 | 42 | +--+--------------+-------------------------------+-------+---------+-----+ |7 |spectrometer | UCI, target: >=44 | 11:1 | 531 | 93 | +--+--------------+-------------------------------+-------+---------+-----+ |8 |car_eval_34 | UCI, target: good, v good | 12:1 | 1,728 | 21 | +--+--------------+-------------------------------+-------+---------+-----+ |9 |isolet | UCI, target: A, B | 12:1 | 7,797 | 617 | +--+--------------+-------------------------------+-------+---------+-----+ |10|us_crime | UCI, target: >0.65 | 12:1 | 1,994 | 100 | +--+--------------+-------------------------------+-------+---------+-----+ |11|yeast_ml8 | LIBSVM, target: 8 | 13:1 | 2,417 | 103 | +--+--------------+-------------------------------+-------+---------+-----+ |12|scene | LIBSVM, target: >one label | 13:1 | 2,407 | 294 | +--+--------------+-------------------------------+-------+---------+-----+ |13|libras_move | UCI, target: 1 | 14:1 | 360 | 90 | +--+--------------+-------------------------------+-------+---------+-----+ |14|thyroid_sick | UCI, target: sick | 15:1 | 3,772 | 52 | 
+--+--------------+-------------------------------+-------+---------+-----+ |15|coil_2000 | KDD, CoIL, target: minority | 16:1 | 9,822 | 85 | +--+--------------+-------------------------------+-------+---------+-----+ |16|arrhythmia | UCI, target: 06 | 17:1 | 452 | 278 | +--+--------------+-------------------------------+-------+---------+-----+ |17|solar_flare_m0| UCI, target: M->0 | 19:1 | 1,389 | 32 | +--+--------------+-------------------------------+-------+---------+-----+ |18|oil | UCI, target: minority | 22:1 | 937 | 49 | +--+--------------+-------------------------------+-------+---------+-----+ |19|car_eval_4 | UCI, target: vgood | 26:1 | 1,728 | 21 | +--+--------------+-------------------------------+-------+---------+-----+ |20|wine_quality | UCI, wine, target: <=4 | 26:1 | 4,898 | 11 | +--+--------------+-------------------------------+-------+---------+-----+ |21|letter_img | UCI, target: Z | 26:1 | 20,000 | 16 | +--+--------------+-------------------------------+-------+---------+-----+ |22|yeast_me2 | UCI, target: ME2 | 28:1 | 1,484 | 8 | +--+--------------+-------------------------------+-------+---------+-----+ |23|webpage | LIBSVM, w7a, target: minority | 33:1 | 34,780 | 300 | +--+--------------+-------------------------------+-------+---------+-----+ |24|ozone_level | UCI, ozone, data | 34:1 | 2,536 | 72 | +--+--------------+-------------------------------+-------+---------+-----+ |25|mammography | UCI, target: minority | 42:1 | 11,183 | 6 | +--+--------------+-------------------------------+-------+---------+-----+ |26|protein_homo | KDD CUP 2004, minority | 111:1 | 145,751 | 74 | +--+--------------+-------------------------------+-------+---------+-----+ |27|abalone_19 | UCI, target: 19 | 130:1 | 4,177 | 10 | +--+--------------+-------------------------------+-------+---------+-----+ References ---------- .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly Imbalanced Data Learning and their Application in Bioinformatics." 
Dissertation, Georgia State University, (2011). """ data_home = get_data_home(data_home=data_home) zenodo_dir = join(data_home, "zenodo") datasets = OrderedDict() if filter_data is None: filter_data_ = MAP_NAME_ID.keys() else: list_data = MAP_NAME_ID.keys() filter_data_ = [] for it in filter_data: if isinstance(it, str): if it not in list_data: raise ValueError( f"{it} is not a dataset available. " f"The available datasets are {list_data}" ) else: filter_data_.append(it) elif isinstance(it, int): if it < 1 or it > 27: raise ValueError( f"The dataset with the ID={it} is not an " f"available dataset. The IDs are " f"{range(1, 28)}" ) else: # The index start at one, then we need to remove one # to not have issue with the indexing. filter_data_.append(MAP_ID_NAME[it]) else: raise ValueError( f"The value in the tuple should be str or int." f" Got {type(it)} instead." ) # go through the list and check if the data are available for it in filter_data_: filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME filename = join(zenodo_dir, filename) available = isfile(filename) if download_if_missing and not available: makedirs(zenodo_dir, exist_ok=True) if verbose: print("Downloading %s" % URL) f = BytesIO(urlopen(URL).read()) tar = tarfile.open(fileobj=f) tar.extractall(path=zenodo_dir) elif not download_if_missing and not available: raise IOError("Data not found and `download_if_missing` is False") data = np.load(filename) X, y = data["data"], data["label"] if shuffle: ind = np.arange(X.shape[0]) rng = check_random_state(random_state) rng.shuffle(ind) X = X[ind] y = y[ind] datasets[it] = Bunch(data=X, target=y, DESCR=it) return datasets 
imbalanced-learn-0.12.2/imblearn/datasets/tests/000077500000000000000000000000001460233407600215225ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/datasets/tests/__init__.py000066400000000000000000000000001460233407600236210ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/datasets/tests/test_imbalance.py000066400000000000000000000047261460233407600250570ustar00rootroot00000000000000"""Test the module easy ensemble.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter import numpy as np import pytest from sklearn.datasets import load_iris from imblearn.datasets import make_imbalance @pytest.fixture def iris(): return load_iris(return_X_y=True) @pytest.mark.parametrize( "sampling_strategy, err_msg", [ ({0: -100, 1: 50, 2: 50}, "in a class cannot be negative"), ({0: 10, 1: 70}, "should be less or equal to the original"), ], ) def test_make_imbalance_error(iris, sampling_strategy, err_msg): # we are reusing part of utils.check_sampling_strategy, however this is not # cover in the common tests so we will repeat it here X, y = iris with pytest.raises(ValueError, match=err_msg): make_imbalance(X, y, sampling_strategy=sampling_strategy) def test_make_imbalance_error_single_class(iris): X, y = iris y = np.zeros_like(y) with pytest.raises(ValueError, match="needs to have more than 1 class."): make_imbalance(X, y, sampling_strategy={0: 10}) @pytest.mark.parametrize( "sampling_strategy, expected_counts", [ ({0: 10, 1: 20, 2: 30}, {0: 10, 1: 20, 2: 30}), ({0: 10, 1: 20}, {0: 10, 1: 20, 2: 50}), ], ) def test_make_imbalance_dict(iris, sampling_strategy, expected_counts): X, y = iris _, y_ = make_imbalance(X, y, sampling_strategy=sampling_strategy) assert Counter(y_) == expected_counts @pytest.mark.parametrize("as_frame", [True, False], ids=["dataframe", "array"]) @pytest.mark.parametrize( "sampling_strategy, expected_counts", [ ( {"setosa": 10, "versicolor": 20, "virginica": 30}, {"setosa": 10, "versicolor": 
20, "virginica": 30}, ), ( {"setosa": 10, "versicolor": 20}, {"setosa": 10, "versicolor": 20, "virginica": 50}, ), ], ) def test_make_imbalanced_iris(as_frame, sampling_strategy, expected_counts): pd = pytest.importorskip("pandas") iris = load_iris(as_frame=as_frame) X, y = iris.data, iris.target y = iris.target_names[iris.target] if as_frame: y = pd.Series(iris.target_names[iris.target], name="target") X_res, y_res = make_imbalance(X, y, sampling_strategy=sampling_strategy) if as_frame: assert hasattr(X_res, "loc") pd.testing.assert_index_equal(X_res.index, y_res.index) assert Counter(y_res) == expected_counts imbalanced-learn-0.12.2/imblearn/datasets/tests/test_zenodo.py000066400000000000000000000053251460233407600244360ustar00rootroot00000000000000"""Test the datasets loader. Skipped if datasets is not already downloaded to data_home. """ # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import pytest from sklearn.utils._testing import SkipTest from imblearn.datasets import fetch_datasets DATASET_SHAPE = { "ecoli": (336, 7), "optical_digits": (5620, 64), "satimage": (6435, 36), "pen_digits": (10992, 16), "abalone": (4177, 10), "sick_euthyroid": (3163, 42), "spectrometer": (531, 93), "car_eval_34": (1728, 21), "isolet": (7797, 617), "us_crime": (1994, 100), "yeast_ml8": (2417, 103), "scene": (2407, 294), "libras_move": (360, 90), "thyroid_sick": (3772, 52), "coil_2000": (9822, 85), "arrhythmia": (452, 278), "solar_flare_m0": (1389, 32), "oil": (937, 49), "car_eval_4": (1728, 21), "wine_quality": (4898, 11), "letter_img": (20000, 16), "yeast_me2": (1484, 8), "webpage": (34780, 300), "ozone_level": (2536, 72), "mammography": (11183, 6), "protein_homo": (145751, 74), "abalone_19": (4177, 10), } def fetch(*args, **kwargs): return fetch_datasets(*args, download_if_missing=True, **kwargs) @pytest.mark.xfail def test_fetch(): try: datasets1 = fetch(shuffle=True, random_state=42) except IOError: raise SkipTest("Zenodo dataset can not be loaded.") datasets2 = 
fetch(shuffle=True, random_state=37) for k in DATASET_SHAPE.keys(): X1, X2 = datasets1[k].data, datasets2[k].data assert DATASET_SHAPE[k] == X1.shape assert X1.shape == X2.shape y1, y2 = datasets1[k].target, datasets2[k].target assert (X1.shape[0],) == y1.shape assert (X1.shape[0],) == y2.shape def test_fetch_filter(): try: datasets1 = fetch(filter_data=tuple([1]), shuffle=True, random_state=42) except IOError: raise SkipTest("Zenodo dataset can not be loaded.") datasets2 = fetch(filter_data=tuple(["ecoli"]), shuffle=True, random_state=37) X1, X2 = datasets1["ecoli"].data, datasets2["ecoli"].data assert DATASET_SHAPE["ecoli"] == X1.shape assert X1.shape == X2.shape assert X1.sum() == pytest.approx(X2.sum()) y1, y2 = datasets1["ecoli"].target, datasets2["ecoli"].target assert (X1.shape[0],) == y1.shape assert (X1.shape[0],) == y2.shape @pytest.mark.parametrize( "filter_data, err_msg", [ (("rnf",), "is not a dataset available"), ((-1,), "dataset with the ID="), ((100,), "dataset with the ID="), ((1.00,), "value in the tuple"), ], ) def test_fetch_error(filter_data, err_msg): with pytest.raises(ValueError, match=err_msg): fetch_datasets(filter_data=filter_data) imbalanced-learn-0.12.2/imblearn/ensemble/000077500000000000000000000000001460233407600203425ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/ensemble/__init__.py000066400000000000000000000007211460233407600224530ustar00rootroot00000000000000""" The :mod:`imblearn.ensemble` module include methods generating under-sampled subsets combined inside an ensemble. 
""" from ._bagging import BalancedBaggingClassifier from ._easy_ensemble import EasyEnsembleClassifier from ._forest import BalancedRandomForestClassifier from ._weight_boosting import RUSBoostClassifier __all__ = [ "BalancedBaggingClassifier", "BalancedRandomForestClassifier", "EasyEnsembleClassifier", "RUSBoostClassifier", ] imbalanced-learn-0.12.2/imblearn/ensemble/_bagging.py000066400000000000000000000405411460233407600224550ustar00rootroot00000000000000"""Bagging classifier trained on balanced bootstrap samples.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import copy import numbers import warnings import numpy as np import sklearn from sklearn.base import clone from sklearn.ensemble import BaggingClassifier from sklearn.ensemble._bagging import _parallel_decision_function from sklearn.ensemble._base import _partition_estimators from sklearn.exceptions import NotFittedError from sklearn.tree import DecisionTreeClassifier from sklearn.utils.fixes import parse_version from sklearn.utils.validation import check_is_fitted try: # scikit-learn >= 1.2 from sklearn.utils.parallel import Parallel, delayed except (ImportError, ModuleNotFoundError): from joblib import Parallel from sklearn.utils.fixes import delayed from ..base import _ParamsValidationMixin from ..pipeline import Pipeline from ..under_sampling import RandomUnderSampler from ..under_sampling.base import BaseUnderSampler from ..utils import Substitution, check_sampling_strategy, check_target_type from ..utils._available_if import available_if from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils.fixes import _fit_context from ._common import _bagging_parameter_constraints, _estimator_has sklearn_version = parse_version(sklearn.__version__) @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) 
class BalancedBaggingClassifier(_ParamsValidationMixin, BaggingClassifier): """A Bagging classifier with additional balancing. This implementation of Bagging is similar to the scikit-learn implementation. It includes an additional step to balance the training set at fit time using a given sampler. This classifier can serves as a basis to implement various methods such as Exactly Balanced Bagging [6]_, Roughly Balanced Bagging [7]_, Over-Bagging [6]_, or SMOTE-Bagging [8]_. Read more in the :ref:`User Guide `. Parameters ---------- estimator : estimator object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree. .. versionadded:: 0.10 n_estimators : int, default=10 The number of base estimators in the ensemble. max_samples : int or float, default=1.0 The number of samples to draw from X to train each base estimator. - If int, then draw ``max_samples`` samples. - If float, then draw ``max_samples * X.shape[0]`` samples. max_features : int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw ``max_features`` features. - If float, then draw ``max_features * X.shape[1]`` features. bootstrap : bool, default=True Whether samples are drawn with replacement. .. note:: Note that this bootstrap will be generated from the resampled dataset. bootstrap_features : bool, default=False Whether features are drawn with replacement. oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization error. warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. {sampling_strategy} replacement : bool, default=False Whether or not to randomly sample with replacement or not when `sampler is None`, corresponding to a :class:`~imblearn.under_sampling.RandomUnderSampler`. 
{n_jobs} {random_state} verbose : int, default=0 Controls the verbosity of the building process. sampler : sampler object, default=None The sampler used to balanced the dataset before to bootstrap (if `bootstrap=True`) and `fit` a base estimator. By default, a :class:`~imblearn.under_sampling.RandomUnderSampler` is used. .. versionadded:: 0.8 Attributes ---------- estimator_ : estimator The base estimator from which the ensemble is grown. .. versionadded:: 0.10 n_features_ : int The number of features when `fit` is performed. .. deprecated:: 1.0 `n_features_` is deprecated in `scikit-learn` 1.0 and will be removed in version 1.2. When the minimum version of `scikit-learn` supported by `imbalanced-learn` will reach 1.2, this attribute will be removed. estimators_ : list of estimators The collection of fitted base estimators. sampler_ : sampler object The validate sampler created from the `sampler` parameter. estimators_samples_ : list of ndarray The subset of drawn samples (i.e., the in-bag samples) for each base estimator. Each subset is defined by a boolean mask. estimators_features_ : list of ndarray The subset of drawn features for each base estimator. classes_ : ndarray of shape (n_classes,) The classes labels. n_classes_ : int or list The number of classes. oob_score_ : float Score of the training dataset obtained using an out-of-bag estimate. oob_decision_function_ : ndarray of shape (n_samples, n_classes) Decision function computed with out-of-bag estimate on the training set. If n_estimators is small it might be possible that a data point was never left out during the bootstrap. In this case, ``oob_decision_function_`` might contain NaN. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. 
versionadded:: 0.9 See Also -------- BalancedRandomForestClassifier : Random forest applying random-under sampling to balance the different bootstraps. EasyEnsembleClassifier : Ensemble of AdaBoost classifier trained on balanced bootstraps. RUSBoostClassifier : AdaBoost classifier where each bootstrap is balanced using random-under sampling at each round of boosting. Notes ----- It is possible to turn this classifier into a balanced random forest [5]_ by passing a :class:`~sklearn.tree.DecisionTreeClassifier` with `max_features='auto'` as a base estimator. See :ref:`sphx_glr_auto_examples_ensemble_plot_comparison_ensemble_classifier.py`. References ---------- .. [1] L. Breiman, "Pasting small votes for classification in large databases and on-line", Machine Learning, 36(1), 85-103, 1999. .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, 1996. .. [3] T. Ho, "The random subspace method for constructing decision forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, 1998. .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. .. [5] C. Chen Chao, A. Liaw, and L. Breiman. "Using random forest to learn imbalanced data." University of California, Berkeley 110, 2004. .. [6] R. Maclin, and D. Opitz. "An empirical evaluation of bagging and boosting." AAAI/IAAI 1997 (1997): 546-551. .. [7] S. Hido, H. Kashima, and Y. Takahashi. "Roughly balanced bagging for imbalanced data." Statistical Analysis and Data Mining: The ASA Data Science Journal 2.5-6 (2009): 412-426. .. [8] S. Wang, and X. Yao. "Diversity analysis on imbalanced data sets by using ensemble models." 2009 IEEE symposium on computational intelligence and data mining. IEEE, 2009.
Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from sklearn.model_selection import train_test_split >>> from sklearn.metrics import confusion_matrix >>> from imblearn.ensemble import BalancedBaggingClassifier >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, ... random_state=0) >>> bbc = BalancedBaggingClassifier(random_state=42) >>> bbc.fit(X_train, y_train) BalancedBaggingClassifier(...) >>> y_pred = bbc.predict(X_test) >>> print(confusion_matrix(y_test, y_pred)) [[ 23 0] [ 2 225]] """ # make a deepcopy to not modify the original dictionary if sklearn_version >= parse_version("1.4"): _parameter_constraints = copy.deepcopy(BaggingClassifier._parameter_constraints) else: _parameter_constraints = copy.deepcopy(_bagging_parameter_constraints) _parameter_constraints.update( { "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "majority", "not minority", "not majority", "all"}), dict, callable, ], "replacement": ["boolean"], "sampler": [HasMethods(["fit_resample"]), None], } ) # TODO: remove when minimum supported version of scikit-learn is 1.4 if "base_estimator" in _parameter_constraints: del _parameter_constraints["base_estimator"] def __init__( self, estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, sampling_strategy="auto", replacement=False, n_jobs=None, random_state=None, verbose=0, sampler=None, ): super().__init__( n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, bootstrap=bootstrap, bootstrap_features=bootstrap_features, 
oob_score=oob_score, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose, ) self.estimator = estimator self.sampling_strategy = sampling_strategy self.replacement = replacement self.sampler = sampler def _validate_y(self, y): y_encoded = super()._validate_y(y) if ( isinstance(self.sampling_strategy, dict) and self.sampler_._sampling_type != "bypass" ): self._sampling_strategy = { np.where(self.classes_ == key)[0][0]: value for key, value in check_sampling_strategy( self.sampling_strategy, y, self.sampler_._sampling_type, ).items() } else: self._sampling_strategy = self.sampling_strategy return y_encoded def _validate_estimator(self, default=DecisionTreeClassifier()): """Check the estimator and the n_estimator attribute, set the `estimator_` attribute.""" if self.estimator is not None: estimator = clone(self.estimator) else: estimator = clone(default) if self.sampler_._sampling_type != "bypass": self.sampler_.set_params(sampling_strategy=self._sampling_strategy) self.estimator_ = Pipeline( [("sampler", self.sampler_), ("classifier", estimator)] ) # TODO: remove when supporting scikit-learn>=1.2 @property def n_features_(self): """Number of features when ``fit`` is performed.""" warnings.warn( "`n_features_` was deprecated in scikit-learn 1.0. This attribute will " "not be accessible when the minimum supported version of scikit-learn " "is 1.2.", FutureWarning, ) return self.n_features_in_ @_fit_context(prefer_skip_nested_validation=False) def fit(self, X, y): """Build a Bagging ensemble of estimators from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. y : array-like of shape (n_samples,) The target values (class labels in classification, real numbers in regression). Returns ------- self : object Fitted estimator. 
""" # overwrite the base class method by disallowing `sample_weight` self._validate_params() return super().fit(X, y) def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): check_target_type(y) # the sampler needs to be validated before to call _fit because # _validate_y is called before _validate_estimator and would require # to know which type of sampler we are using. if self.sampler is None: self.sampler_ = RandomUnderSampler( replacement=self.replacement, ) else: self.sampler_ = clone(self.sampler) # RandomUnderSampler is not supporting sample_weight. We need to pass # None. return super()._fit(X, y, self.max_samples, sample_weight=None) # TODO: remove when minimum supported version of scikit-learn is 1.1 @available_if(_estimator_has("decision_function")) def decision_function(self, X): """Average of the decision functions of the base classifiers. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. Returns ------- score : ndarray of shape (n_samples, k) The decision function of the input samples. The columns correspond to the classes in sorted order, as they appear in the attribute ``classes_``. Regression and binary classification are special cases with ``k == 1``, otherwise ``k==n_classes``. 
""" check_is_fitted(self) # Check data X = self._validate_data( X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False, reset=False, ) # Parallel loop n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)( delayed(_parallel_decision_function)( self.estimators_[starts[i] : starts[i + 1]], self.estimators_features_[starts[i] : starts[i + 1]], X, ) for i in range(n_jobs) ) # Reduce decisions = sum(all_decisions) / self.n_estimators return decisions @property def base_estimator_(self): """Attribute for older sklearn version compatibility.""" error = AttributeError( f"{self.__class__.__name__} object has no attribute 'base_estimator_'." ) if sklearn_version < parse_version("1.2"): # The base class require to have the attribute defined. For scikit-learn # > 1.2, we are going to raise an error. try: check_is_fitted(self) return self.estimator_ except NotFittedError: raise error raise error def _more_tags(self): tags = super()._more_tags() tags_key = "_xfail_checks" failing_test = "check_estimators_nan_inf" reason = "Fails because the sampler removed infinity and NaN values" if tags_key in tags: tags[tags_key][failing_test] = reason else: tags[tags_key] = {failing_test: reason} return tags imbalanced-learn-0.12.2/imblearn/ensemble/_common.py000066400000000000000000000066521460233407600223540ustar00rootroot00000000000000from numbers import Integral, Real from sklearn.tree._criterion import Criterion from ..utils._param_validation import ( HasMethods, Hidden, Interval, RealNotInt, StrOptions, ) def _estimator_has(attr): """Check if we can delegate a method to the underlying estimator. First, we check the first fitted estimator if available, otherwise we check the estimator attribute. 
""" def check(self): if hasattr(self, "estimators_"): return hasattr(self.estimators_[0], attr) elif self.estimator is not None: return hasattr(self.estimator, attr) else: # TODO(1.4): Remove when the base_estimator deprecation cycle ends return hasattr(self.base_estimator, attr) return check _bagging_parameter_constraints = { "estimator": [HasMethods(["fit", "predict"]), None], "n_estimators": [Interval(Integral, 1, None, closed="left")], "max_samples": [ Interval(Integral, 1, None, closed="left"), Interval(RealNotInt, 0, 1, closed="right"), ], "max_features": [ Interval(Integral, 1, None, closed="left"), Interval(RealNotInt, 0, 1, closed="right"), ], "bootstrap": ["boolean"], "bootstrap_features": ["boolean"], "oob_score": ["boolean"], "warm_start": ["boolean"], "n_jobs": [None, Integral], "random_state": ["random_state"], "verbose": ["verbose"], "base_estimator": [ HasMethods(["fit", "predict"]), StrOptions({"deprecated"}), None, ], } _adaboost_classifier_parameter_constraints = { "estimator": [HasMethods(["fit", "predict"]), None], "n_estimators": [Interval(Integral, 1, None, closed="left")], "learning_rate": [Interval(Real, 0, None, closed="neither")], "random_state": ["random_state"], "base_estimator": [HasMethods(["fit", "predict"]), StrOptions({"deprecated"})], "algorithm": [StrOptions({"SAMME", "SAMME.R"})], } _random_forest_classifier_parameter_constraints = { "n_estimators": [Interval(Integral, 1, None, closed="left")], "bootstrap": ["boolean"], "oob_score": ["boolean"], "n_jobs": [Integral, None], "random_state": ["random_state"], "verbose": ["verbose"], "warm_start": ["boolean"], "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], "max_samples": [ None, Interval(Real, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], "max_depth": [Interval(Integral, 1, None, closed="left"), None], "min_samples_split": [ Interval(Integral, 2, None, closed="left"), Interval(RealNotInt, 0.0, 1.0, closed="right"), ], 
"min_samples_leaf": [ Interval(Integral, 1, None, closed="left"), Interval(RealNotInt, 0.0, 1.0, closed="neither"), ], "min_weight_fraction_leaf": [Interval(Real, 0.0, 0.5, closed="both")], "max_features": [ Interval(Integral, 1, None, closed="left"), Interval(RealNotInt, 0.0, 1.0, closed="right"), StrOptions({"sqrt", "log2"}), None, ], "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], "class_weight": [ StrOptions({"balanced_subsample", "balanced"}), dict, list, None, ], "monotonic_cst": ["array-like", None], } imbalanced-learn-0.12.2/imblearn/ensemble/_easy_ensemble.py000066400000000000000000000310521460233407600236670ustar00rootroot00000000000000"""Class to perform under-sampling using easy ensemble.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import copy import numbers import warnings import numpy as np import sklearn from sklearn.base import clone from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier from sklearn.ensemble._bagging import _parallel_decision_function from sklearn.ensemble._base import _partition_estimators from sklearn.exceptions import NotFittedError from sklearn.utils._tags import _safe_tags from sklearn.utils.fixes import parse_version from sklearn.utils.validation import check_is_fitted try: # scikit-learn >= 1.2 from sklearn.utils.parallel import Parallel, delayed except (ImportError, ModuleNotFoundError): from joblib import Parallel from sklearn.utils.fixes import delayed from ..base import _ParamsValidationMixin from ..pipeline import Pipeline from ..under_sampling import RandomUnderSampler from ..under_sampling.base import BaseUnderSampler from ..utils import Substitution, check_sampling_strategy, check_target_type from ..utils._available_if import available_if from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from 
..utils._param_validation import Interval, StrOptions from ..utils.fixes import _fit_context from ._common import _bagging_parameter_constraints, _estimator_has MAX_INT = np.iinfo(np.int32).max sklearn_version = parse_version(sklearn.__version__) @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class EasyEnsembleClassifier(_ParamsValidationMixin, BaggingClassifier): """Bag of balanced boosted learners also known as EasyEnsemble. This algorithm is known as EasyEnsemble [1]_. The classifier is an ensemble of AdaBoost learners trained on different balanced bootstrap samples. The balancing is achieved by random under-sampling. Read more in the :ref:`User Guide `. .. versionadded:: 0.4 Parameters ---------- n_estimators : int, default=10 Number of AdaBoost learners in the ensemble. estimator : estimator object, default=AdaBoostClassifier() The base AdaBoost classifier used in the inner ensemble. Note that you can set the number of inner learner by passing your own instance. .. versionadded:: 0.10 warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. {sampling_strategy} replacement : bool, default=False Whether or not to sample randomly with replacement or not. {n_jobs} {random_state} verbose : int, default=0 Controls the verbosity of the building process. Attributes ---------- estimator_ : estimator The base estimator from which the ensemble is grown. .. versionadded:: 0.10 estimators_ : list of estimators The collection of fitted base estimators. estimators_samples_ : list of arrays The subset of drawn samples for each base estimator. estimators_features_ : list of arrays The subset of drawn features for each base estimator. classes_ : array, shape (n_classes,) The classes labels. n_classes_ : int or list The number of classes. 
n_features_ : int The number of features when `fit` is performed. .. deprecated:: 1.0 `n_features_` is deprecated in `scikit-learn` 1.0 and will be removed in version 1.2. When the minimum version of `scikit-learn` supported by `imbalanced-learn` will reach 1.2, this attribute will be removed. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.9 See Also -------- BalancedBaggingClassifier : Bagging classifier for which each base estimator is trained on a balanced bootstrap. BalancedRandomForestClassifier : Random forest applying random-under sampling to balance the different bootstraps. RUSBoostClassifier : AdaBoost classifier were each bootstrap is balanced using random-under sampling at each round of boosting. Notes ----- The method is described in [1]_. Supports multi-class resampling by sampling each class independently. References ---------- .. [1] X. Y. Liu, J. Wu and Z. H. Zhou, "Exploratory Undersampling for Class-Imbalance Learning," in IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550, April 2009. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from sklearn.model_selection import train_test_split >>> from sklearn.metrics import confusion_matrix >>> from imblearn.ensemble import EasyEnsembleClassifier >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, ... 
random_state=0) >>> eec = EasyEnsembleClassifier(random_state=42) >>> eec.fit(X_train, y_train) EasyEnsembleClassifier(...) >>> y_pred = eec.predict(X_test) >>> print(confusion_matrix(y_test, y_pred)) [[ 23 0] [ 2 225]] """ # make a deepcopy to not modify the original dictionary if sklearn_version >= parse_version("1.4"): _parameter_constraints = copy.deepcopy(BaggingClassifier._parameter_constraints) else: _parameter_constraints = copy.deepcopy(_bagging_parameter_constraints) excluded_params = { "bootstrap", "bootstrap_features", "max_features", "oob_score", "max_samples", } for param in excluded_params: _parameter_constraints.pop(param, None) _parameter_constraints.update( { "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "majority", "not minority", "not majority", "all"}), dict, callable, ], "replacement": ["boolean"], } ) # TODO: remove when minimum supported version of scikit-learn is 1.4 if "base_estimator" in _parameter_constraints: del _parameter_constraints["base_estimator"] def __init__( self, n_estimators=10, estimator=None, *, warm_start=False, sampling_strategy="auto", replacement=False, n_jobs=None, random_state=None, verbose=0, ): super().__init__( n_estimators=n_estimators, max_samples=1.0, max_features=1.0, bootstrap=False, bootstrap_features=False, oob_score=False, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose, ) self.estimator = estimator self.sampling_strategy = sampling_strategy self.replacement = replacement def _validate_y(self, y): y_encoded = super()._validate_y(y) if isinstance(self.sampling_strategy, dict): self._sampling_strategy = { np.where(self.classes_ == key)[0][0]: value for key, value in check_sampling_strategy( self.sampling_strategy, y, "under-sampling", ).items() } else: self._sampling_strategy = self.sampling_strategy return y_encoded def _validate_estimator(self, default=AdaBoostClassifier(algorithm="SAMME")): """Check the estimator and the 
n_estimator attribute, set the `estimator_` attribute.""" if self.estimator is not None: estimator = clone(self.estimator) else: estimator = clone(default) sampler = RandomUnderSampler( sampling_strategy=self._sampling_strategy, replacement=self.replacement, ) self.estimator_ = Pipeline([("sampler", sampler), ("classifier", estimator)]) # TODO: remove when supporting scikit-learn>=1.2 @property def n_features_(self): """Number of features when ``fit`` is performed.""" warnings.warn( "`n_features_` was deprecated in scikit-learn 1.0. This attribute will " "not be accessible when the minimum supported version of scikit-learn " "is 1.2.", FutureWarning, ) return self.n_features_in_ @_fit_context(prefer_skip_nested_validation=False) def fit(self, X, y): """Build a Bagging ensemble of estimators from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. y : array-like of shape (n_samples,) The target values (class labels in classification, real numbers in regression). Returns ------- self : object Fitted estimator. """ self._validate_params() # overwrite the base class method by disallowing `sample_weight` return super().fit(X, y) def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): check_target_type(y) # RandomUnderSampler is not supporting sample_weight. We need to pass # None. return super()._fit(X, y, self.max_samples, sample_weight=None) # TODO: remove when minimum supported version of scikit-learn is 1.1 @available_if(_estimator_has("decision_function")) def decision_function(self, X): """Average of the decision functions of the base classifiers. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. 
Returns ------- score : ndarray of shape (n_samples, k) The decision function of the input samples. The columns correspond to the classes in sorted order, as they appear in the attribute ``classes_``. Regression and binary classification are special cases with ``k == 1``, otherwise ``k==n_classes``. """ check_is_fitted(self) # Check data X = self._validate_data( X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False, reset=False, ) # Parallel loop n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)( delayed(_parallel_decision_function)( self.estimators_[starts[i] : starts[i + 1]], self.estimators_features_[starts[i] : starts[i + 1]], X, ) for i in range(n_jobs) ) # Reduce decisions = sum(all_decisions) / self.n_estimators return decisions @property def base_estimator_(self): """Attribute for older sklearn version compatibility.""" error = AttributeError( f"{self.__class__.__name__} object has no attribute 'base_estimator_'." ) if sklearn_version < parse_version("1.2"): # The base class require to have the attribute defined. For scikit-learn # > 1.2, we are going to raise an error. 
try: check_is_fitted(self) return self.estimator_ except NotFittedError: raise error raise error def _more_tags(self): if self.estimator is None: estimator = AdaBoostClassifier(algorithm="SAMME") else: estimator = self.estimator return {"allow_nan": _safe_tags(estimator, "allow_nan")} imbalanced-learn-0.12.2/imblearn/ensemble/_forest.py000066400000000000000000001062771460233407600223720ustar00rootroot00000000000000"""Forest classifiers trained on balanced boostrasp samples.""" # Authors: Guillaume Lemaitre # License: MIT import numbers from copy import deepcopy from warnings import warn import numpy as np import sklearn from numpy import float32 as DTYPE from numpy import float64 as DOUBLE from scipy.sparse import issparse from sklearn.base import clone, is_classifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble._base import _set_random_states from sklearn.ensemble._forest import ( _generate_unsampled_indices, _get_n_samples_bootstrap, _parallel_build_trees, ) from sklearn.exceptions import DataConversionWarning from sklearn.tree import DecisionTreeClassifier from sklearn.utils import _safe_indexing, check_random_state from sklearn.utils.fixes import parse_version from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _check_sample_weight try: # scikit-learn >= 1.2 from sklearn.utils.parallel import Parallel, delayed except (ImportError, ModuleNotFoundError): from joblib import Parallel from sklearn.utils.fixes import delayed from ..base import _ParamsValidationMixin from ..pipeline import make_pipeline from ..under_sampling import RandomUnderSampler from ..utils import Substitution from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils._validation import check_sampling_strategy from ..utils.fixes import _fit_context from ._common import _random_forest_classifier_parameter_constraints MAX_INT = 
np.iinfo(np.int32).max sklearn_version = parse_version(sklearn.__version__) def _local_parallel_build_trees( sampler, tree, bootstrap, X, y, sample_weight, tree_idx, n_trees, verbose=0, class_weight=None, n_samples_bootstrap=None, forest=None, missing_values_in_feature_mask=None, ): # resample before to fit the tree X_resampled, y_resampled = sampler.fit_resample(X, y) if sample_weight is not None: sample_weight = _safe_indexing(sample_weight, sampler.sample_indices_) if _get_n_samples_bootstrap is not None: n_samples_bootstrap = min(n_samples_bootstrap, X_resampled.shape[0]) params_parallel_build_trees = { "tree": tree, "X": X_resampled, "y": y_resampled, "sample_weight": sample_weight, "tree_idx": tree_idx, "n_trees": n_trees, "verbose": verbose, "class_weight": class_weight, "n_samples_bootstrap": n_samples_bootstrap, } if parse_version(sklearn_version.base_version) >= parse_version("1.4"): # TODO: remove when the minimum supported version of scikit-learn will be 1.4 # support for missing values params_parallel_build_trees[ "missing_values_in_feature_mask" ] = missing_values_in_feature_mask # TODO: remove when the minimum supported version of scikit-learn will be 1.1 # change of signature in scikit-learn 1.1 if parse_version(sklearn_version.base_version) >= parse_version("1.1"): params_parallel_build_trees["bootstrap"] = bootstrap else: params_parallel_build_trees["forest"] = forest tree = _parallel_build_trees(**params_parallel_build_trees) return sampler, tree @Substitution( n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class BalancedRandomForestClassifier(_ParamsValidationMixin, RandomForestClassifier): """A balanced random forest classifier. A balanced random forest differs from a classical random forest by the fact that it will draw a bootstrap sample from the minority class and sample with replacement the same number of samples from the majority class. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.4 Parameters ---------- n_estimators : int, default=100 The number of trees in the forest. criterion : {{"gini", "entropy"}}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. Note: this parameter is tree-specific. max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. min_samples_leaf : int or float, default=1 The minimum number of samples required to be at a leaf node: - If int, then consider ``min_samples_leaf`` as the minimum number. - If float, then ``min_samples_leaf`` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. min_weight_fraction_leaf : float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. max_features : {{"auto", "sqrt", "log2"}}, int, float, or None, \ default="sqrt" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto"). - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. 
Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. max_leaf_nodes : int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. min_impurity_decrease : float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. bootstrap : bool, default=True Whether bootstrap samples are used when building trees. .. versionchanged:: 0.13 The default of `bootstrap` will change from `True` to `False` in version 0.13. Bootstrapping is already taken care by the internal sampler using `replacement=True`. This implementation follows the algorithm proposed in [1]_. oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization accuracy. sampling_strategy : float, str, dict, callable, default="auto" Sampling information to sample the data set. - When ``float``, it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling. Therefore, the ratio is expressed as :math:`\\alpha_{{us}} = N_{{m}} / N_{{rM}}` where :math:`N_{{m}}` is the number of samples in the minority class and :math:`N_{{rM}}` is the number of samples in the majority class after resampling. .. 
warning:: ``float`` is only available for **binary** classification. An error is raised for multi-class classification. - When ``str``, specify the class targeted by the resampling. The number of samples in the different classes will be equalized. Possible choices are: ``'majority'``: resample only the majority class; ``'not minority'``: resample all classes but the minority class; ``'not majority'``: resample all classes but the majority class; ``'all'``: resample all classes; ``'auto'``: equivalent to ``'not minority'``. - When ``dict``, the keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted class. - When callable, a function taking ``y`` and returning a ``dict``. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class. .. versionchanged:: 0.11 The default of `sampling_strategy` will change from `"auto"` to `"all"` in version 0.13. This forces the use of a bootstrap of the minority class as proposed in [1]_. replacement : bool, default=False Whether or not to sample randomly with replacement. .. versionchanged:: 0.11 The default of `replacement` will change from `False` to `True` in version 0.13. This forces the use of a bootstrap of the minority class drawn with replacement as proposed in [1]_. {n_jobs} {random_state} verbose : int, default=0 Controls the verbosity of the tree building process. warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. class_weight : dict, list of dicts, {{"balanced", "balanced_subsample"}}, \ default=None Weights associated with classes in the form dictionary with the key being the class_label and the value the weight. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y.
Note that for multioutput (including multilabel) weights should be defined for each class of every column in its own dict. For example, for four-class multilabel classification weights should be [{{0: 1, 1: 1}}, {{0: 1, 1: 5}}, {{0: 1, 1: 1}}, {{0: 1, 1: 1}}] instead of [{{1:1}}, {{2:5}}, {{3:1}}, {{4:1}}]. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` The "balanced_subsample" mode is the same as "balanced" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. .. versionadded:: 0.6 Added in `scikit-learn` in 0.22 max_samples : int or float, default=None If bootstrap is True, the number of samples to draw from X to train each base estimator. - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0, 1)`. Be aware that the final number of samples used will be the minimum between the number of samples given in `max_samples` and the number of samples obtained after resampling. .. versionadded:: 0.6 Added in `scikit-learn` in 0.22 monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase - 0: no constraint - -1: monotonic decrease If monotonic_cst is None, no constraints are applied.
Monotonicity constraints are not supported for: - multiclass classifications (i.e. when `n_classes > 2`), - multioutput classifications (i.e. when `n_outputs_ > 1`), - classifications trained on data with missing values. The constraints hold over the probability of the positive class. .. versionadded:: 0.12 Only supported when scikit-learn >= 1.4 is installed. Otherwise, a `ValueError` is raised. Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` instance The child estimator template used to create the collection of fitted sub-estimators. .. versionadded:: 0.10 estimators_ : list of :class:`~sklearn.tree.DecisionTreeClassifier` The collection of fitted sub-estimators. base_sampler_ : :class:`~imblearn.under_sampling.RandomUnderSampler` The base sampler used to construct the subsequent list of samplers. samplers_ : list of :class:`~imblearn.under_sampling.RandomUnderSampler` The collection of fitted samplers. pipelines_ : list of Pipeline. The collection of fitted pipelines (samplers + trees). classes_ : ndarray of shape (n_classes,) or a list of such arrays The classes labels (single output problem), or a list of arrays of class labels (multi-output problem). n_classes_ : int or list The number of classes (single output problem), or a list containing the number of classes for each output (multi-output problem). n_features_ : int The number of features when `fit` is performed. .. deprecated:: 1.0 `n_features_` is deprecated in `scikit-learn` 1.0 and will be removed in version 1.2. When the minimum version of `scikit-learn` supported by `imbalanced-learn` will reach 1.2, this attribute will be removed. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.9 n_outputs_ : int The number of outputs when ``fit`` is performed. 
feature_importances_ : ndarray of shape (n_features,) The feature importances (the higher, the more important the feature). oob_score_ : float Score of the training dataset obtained using an out-of-bag estimate. oob_decision_function_ : ndarray of shape (n_samples, n_classes) Decision function computed with out-of-bag estimate on the training set. If n_estimators is small it might be possible that a data point was never left out during the bootstrap. In this case, `oob_decision_function_` might contain NaN. See Also -------- BalancedBaggingClassifier : Bagging classifier for which each base estimator is trained on a balanced bootstrap. EasyEnsembleClassifier : Ensemble of AdaBoost classifier trained on balanced bootstraps. RUSBoostClassifier : AdaBoost classifier were each bootstrap is balanced using random-under sampling at each round of boosting. References ---------- .. [1] Chen, Chao, Andy Liaw, and Leo Breiman. "Using random forest to learn imbalanced data." University of California, Berkeley 110 (2004): 1-12. Examples -------- >>> from imblearn.ensemble import BalancedRandomForestClassifier >>> from sklearn.datasets import make_classification >>> >>> X, y = make_classification(n_samples=1000, n_classes=3, ... n_informative=4, weights=[0.2, 0.3, 0.5], ... random_state=0) >>> clf = BalancedRandomForestClassifier( ... sampling_strategy="all", replacement=True, max_depth=2, random_state=0, ... bootstrap=False) >>> clf.fit(X, y) BalancedRandomForestClassifier(...) >>> print(clf.feature_importances_) [...] >>> print(clf.predict([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) [1] """ # make a deepcopy to not modify the original dictionary if sklearn_version >= parse_version("1.4"): _parameter_constraints = deepcopy(RandomForestClassifier._parameter_constraints) else: _parameter_constraints = deepcopy( _random_forest_classifier_parameter_constraints ) _parameter_constraints.update( { "bootstrap": ["boolean", Hidden(StrOptions({"warn"}))], "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "majority", "not minority", "not majority", "all"}), dict, callable, Hidden(StrOptions({"warn"})), ], "replacement": ["boolean", Hidden(StrOptions({"warn"}))], } ) def __init__( self, n_estimators=100, *, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap="warn", oob_score=False, sampling_strategy="warn", replacement="warn", n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None, monotonic_cst=None, ): params_random_forest = { "criterion": criterion, "max_depth": max_depth, "n_estimators": n_estimators, "bootstrap": bootstrap, "oob_score": oob_score, "n_jobs": n_jobs, "random_state": random_state, "verbose": verbose, "warm_start": warm_start, "class_weight": class_weight, "min_samples_split": min_samples_split, "min_samples_leaf": min_samples_leaf, "min_weight_fraction_leaf": min_weight_fraction_leaf, "max_features": max_features, "max_leaf_nodes": max_leaf_nodes, "min_impurity_decrease": min_impurity_decrease, "ccp_alpha": ccp_alpha, "max_samples": max_samples, } # TODO: remove when the minimum supported version of scikit-learn will be 1.4 if parse_version(sklearn_version.base_version) >= parse_version("1.4"): # use scikit-learn support for monotonic constraints params_random_forest["monotonic_cst"] = monotonic_cst else: if monotonic_cst is not None: raise ValueError( "Monotonic constraints are 
not supported for scikit-learn " "version < 1.4." ) # create an attribute for compatibility with other scikit-learn tools such # as HTML representation. self.monotonic_cst = monotonic_cst super().__init__(**params_random_forest) self.sampling_strategy = sampling_strategy self.replacement = replacement def _validate_estimator(self, default=DecisionTreeClassifier()): """Check the estimator and the n_estimator attribute, set the `estimator_` attribute.""" if hasattr(self, "estimator"): base_estimator = self.estimator else: base_estimator = self.base_estimator if base_estimator is not None: self.estimator_ = clone(base_estimator) else: self.estimator_ = clone(default) self.base_sampler_ = RandomUnderSampler( sampling_strategy=self._sampling_strategy, replacement=self._replacement, ) def _make_sampler_estimator(self, random_state=None): """Make and configure a copy of the `base_estimator_` attribute. Warning: This method should be used to properly instantiate new sub-estimators. """ estimator = clone(self.estimator_) estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params}) sampler = clone(self.base_sampler_) if random_state is not None: _set_random_states(estimator, random_state) _set_random_states(sampler, random_state) return estimator, sampler @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Build a forest of trees from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. y : array-like of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). sample_weight : array-like of shape (n_samples,) Sample weights. If None, then samples are equally weighted. 
Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. Returns ------- self : object The fitted instance. """ self._validate_params() # TODO: remove in 0.13 if self.sampling_strategy == "warn": warn( "The default of `sampling_strategy` will change from `'auto'` to " "`'all'` in version 0.13. This change will follow the implementation " "proposed in the original paper. Set to `'all'` to silence this " "warning and adopt the future behaviour.", FutureWarning, ) self._sampling_strategy = "auto" else: self._sampling_strategy = self.sampling_strategy if self.replacement == "warn": warn( "The default of `replacement` will change from `False` to " "`True` in version 0.13. This change will follow the implementation " "proposed in the original paper. Set to `True` to silence this " "warning and adopt the future behaviour.", FutureWarning, ) self._replacement = False else: self._replacement = self.replacement if self.bootstrap == "warn": warn( "The default of `bootstrap` will change from `True` to " "`False` in version 0.13. This change will follow the implementation " "proposed in the original paper. 
Set to `False` to silence this " "warning and adopt the future behaviour.", FutureWarning, ) self._bootstrap = True else: self._bootstrap = self.bootstrap # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") # TODO: remove when the minimum supported version of scipy will be 1.4 # Support for missing values if parse_version(sklearn_version.base_version) >= parse_version("1.4"): force_all_finite = False else: force_all_finite = True X, y = self._validate_data( X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE, force_all_finite=force_all_finite, ) # TODO: remove when the minimum supported version of scikit-learn will be 1.4 if parse_version(sklearn_version.base_version) >= parse_version("1.4"): # _compute_missing_values_in_feature_mask checks if X has missing values and # will raise an error if the underlying tree base estimator can't handle # missing values. Only the criterion is required to determine if the tree # supports missing values. estimator = type(self.estimator)(criterion=self.criterion) missing_values_in_feature_mask = ( estimator._compute_missing_values_in_feature_mask( X, estimator_name=self.__class__.__name__ ) ) else: missing_values_in_feature_mask = None if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) self._n_features = X.shape[1] if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. X.sort_indices() y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: warn( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, stacklevel=2, ) if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. 
y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] y_encoded, expanded_class_weight = self._validate_y_class_weight(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y_encoded = np.ascontiguousarray(y_encoded, dtype=DOUBLE) if isinstance(self._sampling_strategy, dict): self._sampling_strategy = { np.where(self.classes_[0] == key)[0][0]: value for key, value in check_sampling_strategy( self.sampling_strategy, y, "under-sampling", ).items() } else: self._sampling_strategy = self._sampling_strategy if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Get bootstrap sample size n_samples_bootstrap = _get_n_samples_bootstrap( n_samples=X.shape[0], max_samples=self.max_samples ) # Check parameters self._validate_estimator() if not self._bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available if bootstrap=True") random_state = check_random_state(self.random_state) if not self.warm_start or not hasattr(self, "estimators_"): # Free allocated memory, if any self.estimators_ = [] self.samplers_ = [] self.pipelines_ = [] n_more_estimators = self.n_estimators - len(self.estimators_) if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " "len(estimators_)=%d when warm_start==True" % (self.n_estimators, len(self.estimators_)) ) elif n_more_estimators == 0: warn( "Warm-start fitting without increasing n_estimators does not " "fit new trees." ) else: if self.warm_start and len(self.estimators_) > 0: # We draw from the random state to get the random state we # would have got if we hadn't used a warm_start. 
random_state.randint(MAX_INT, size=len(self.estimators_)) trees = [] samplers = [] for _ in range(n_more_estimators): tree, sampler = self._make_sampler_estimator(random_state=random_state) trees.append(tree) samplers.append(sampler) # Parallel loop: we prefer the threading backend as the Cython code # for fitting the trees is internally releasing the Python GIL # making threading more efficient than multiprocessing in # that case. However, we respect any parallel_backend contexts set # at a higher level, since correctness does not rely on using # threads. samplers_trees = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads", )( delayed(_local_parallel_build_trees)( s, t, self._bootstrap, X, y_encoded, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, forest=self, missing_values_in_feature_mask=missing_values_in_feature_mask, ) for i, (s, t) in enumerate(zip(samplers, trees)) ) samplers, trees = zip(*samplers_trees) # Collect newly grown trees self.estimators_.extend(trees) self.samplers_.extend(samplers) # Create pipeline with the fitted samplers and trees self.pipelines_.extend( [ make_pipeline(deepcopy(s), deepcopy(t)) for s, t in zip(samplers, trees) ] ) if self.oob_score: y_type = type_of_target(y) if y_type in ("multiclass-multioutput", "unknown"): # FIXME: we could consider to support multiclass-multioutput if # we introduce or reuse a constructor parameter (e.g. # oob_score) allowing our user to pass a callable defining the # scoring strategy on OOB sample. raise ValueError( "The type of target cannot be used to compute OOB " f"estimates. Got {y_type} while only the following are " "supported: continuous, continuous-multioutput, binary, " "multiclass, multilabel-indicator." 
) self._set_oob_score_and_attributes(X, y_encoded) # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self def _set_oob_score_and_attributes(self, X, y): """Compute and set the OOB score and attributes. Parameters ---------- X : array-like of shape (n_samples, n_features) The data matrix. y : ndarray of shape (n_samples, n_outputs) The target matrix. """ self.oob_decision_function_ = self._compute_oob_predictions(X, y) if self.oob_decision_function_.shape[-1] == 1: # drop the n_outputs axis if there is a single output self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1) from sklearn.metrics import accuracy_score self.oob_score_ = accuracy_score( y, np.argmax(self.oob_decision_function_, axis=1) ) def _compute_oob_predictions(self, X, y): """Compute and set the OOB score. Parameters ---------- X : array-like of shape (n_samples, n_features) The data matrix. y : ndarray of shape (n_samples, n_outputs) The target matrix. Returns ------- oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \ (n_samples, 1, n_outputs) The OOB predictions. 
""" # Prediction requires X to be in CSR format if issparse(X): X = X.tocsr() n_samples = y.shape[0] n_outputs = self.n_outputs_ if is_classifier(self) and hasattr(self, "n_classes_"): # n_classes_ is a ndarray at this stage # all the supported type of target will have the same number of # classes in all outputs oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs) else: # for regression, n_classes_ does not exist and we create an empty # axis to be consistent with the classification case and make # the array operations compatible with the 2 settings oob_pred_shape = (n_samples, 1, n_outputs) oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64) n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64) for sampler, estimator in zip(self.samplers_, self.estimators_): X_resample = X[sampler.sample_indices_] y_resample = y[sampler.sample_indices_] n_sample_subset = y_resample.shape[0] n_samples_bootstrap = _get_n_samples_bootstrap( n_sample_subset, self.max_samples ) unsampled_indices = _generate_unsampled_indices( estimator.random_state, n_sample_subset, n_samples_bootstrap ) y_pred = self._get_oob_predictions( estimator, X_resample[unsampled_indices, :] ) indices = sampler.sample_indices_[unsampled_indices] oob_pred[indices, ...] += y_pred n_oob_pred[indices, :] += 1 for k in range(n_outputs): if (n_oob_pred == 0).any(): warn( "Some inputs do not have OOB scores. This probably means " "too few trees were used to compute any reliable OOB " "estimates.", UserWarning, ) n_oob_pred[n_oob_pred == 0] = 1 oob_pred[..., k] /= n_oob_pred[..., [k]] return oob_pred # TODO: remove when supporting scikit-learn>=1.2 @property def n_features_(self): """Number of features when ``fit`` is performed.""" warn( "`n_features_` was deprecated in scikit-learn 1.0. 
This attribute will " "not be accessible when the minimum supported version of scikit-learn " "is 1.2.", FutureWarning, ) return self.n_features_in_ def _more_tags(self): return { "multioutput": False, "multilabel": False, } imbalanced-learn-0.12.2/imblearn/ensemble/_weight_boosting.py000066400000000000000000000340451460233407600242540ustar00rootroot00000000000000import copy import numbers from copy import deepcopy import numpy as np import sklearn from sklearn.base import clone from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble._base import _set_random_states from sklearn.tree import DecisionTreeClassifier from sklearn.utils import _safe_indexing from sklearn.utils.fixes import parse_version from sklearn.utils.validation import has_fit_parameter from ..base import _ParamsValidationMixin from ..pipeline import make_pipeline from ..under_sampling import RandomUnderSampler from ..under_sampling.base import BaseUnderSampler from ..utils import Substitution, check_target_type from ..utils._docstring import _random_state_docstring from ..utils._param_validation import Interval, StrOptions from ..utils.fixes import _fit_context from ._common import _adaboost_classifier_parameter_constraints sklearn_version = parse_version(sklearn.__version__) @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, random_state=_random_state_docstring, ) class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier): """Random under-sampling integrated in the learning of AdaBoost. During learning, the problem of class balancing is alleviated by random under-sampling the sample at each iteration of the boosting algorithm. Read more in the :ref:`User Guide `. .. versionadded:: 0.4 Parameters ---------- estimator : estimator object, default=None The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper ``classes_`` and ``n_classes_`` attributes. 
If ``None``, then the base estimator is ``DecisionTreeClassifier(max_depth=1)``. .. versionadded:: 0.12 n_estimators : int, default=50 The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. learning_rate : float, default=1.0 Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``. algorithm : {{'SAMME', 'SAMME.R'}}, default='SAMME.R' If 'SAMME.R' then use the SAMME.R real boosting algorithm. ``base_estimator`` must support calculation of class probabilities. If 'SAMME' then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations. .. deprecated:: 0.12 `"SAMME.R"` is deprecated and will be removed in version 0.14. '"SAMME"' will become the default. {sampling_strategy} replacement : bool, default=False Whether or not to sample randomly with replacement. {random_state} Attributes ---------- estimator_ : estimator The base estimator from which the ensemble is grown. .. versionadded:: 0.10 estimators_ : list of classifiers The collection of fitted sub-estimators. base_sampler_ : :class:`~imblearn.under_sampling.RandomUnderSampler` The base sampler used to generate the subsequent samplers. samplers_ : list of :class:`~imblearn.under_sampling.RandomUnderSampler` The collection of fitted samplers. pipelines_ : list of Pipeline The collection of fitted pipelines (samplers + trees). classes_ : ndarray of shape (n_classes,) The class labels. n_classes_ : int The number of classes. estimator_weights_ : ndarray of shape (n_estimator,) Weights for each estimator in the boosted ensemble. estimator_errors_ : ndarray of shape (n_estimator,) Classification error for each estimator in the boosted ensemble.
feature_importances_ : ndarray of shape (n_features,) The feature importances if supported by the ``base_estimator``. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.9 See Also -------- BalancedBaggingClassifier : Bagging classifier for which each base estimator is trained on a balanced bootstrap. BalancedRandomForestClassifier : Random forest applying random-under sampling to balance the different bootstraps. EasyEnsembleClassifier : Ensemble of AdaBoost classifier trained on balanced bootstraps. References ---------- .. [1] Seiffert, C., Khoshgoftaar, T. M., Van Hulse, J., & Napolitano, A. "RUSBoost: A hybrid approach to alleviating class imbalance." IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans 40.1 (2010): 185-197. Examples -------- >>> from imblearn.ensemble import RUSBoostClassifier >>> from sklearn.datasets import make_classification >>> >>> X, y = make_classification(n_samples=1000, n_classes=3, ... n_informative=4, weights=[0.2, 0.3, 0.5], ... random_state=0) >>> clf = RUSBoostClassifier(random_state=0) >>> clf.fit(X, y) RUSBoostClassifier(...) 
>>> clf.predict(X) array([...]) """ # make a deepcopy to not modify the original dictionary if sklearn_version >= parse_version("1.4"): _parameter_constraints = copy.deepcopy( AdaBoostClassifier._parameter_constraints ) else: _parameter_constraints = copy.deepcopy( _adaboost_classifier_parameter_constraints ) _parameter_constraints.update( { "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "majority", "not minority", "not majority", "all"}), dict, callable, ], "replacement": ["boolean"], } ) # TODO: remove when minimum supported version of scikit-learn is 1.4 if "base_estimator" in _parameter_constraints: del _parameter_constraints["base_estimator"] def __init__( self, estimator=None, *, n_estimators=50, learning_rate=1.0, algorithm="SAMME.R", sampling_strategy="auto", replacement=False, random_state=None, ): super().__init__( n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algorithm, random_state=random_state, ) self.estimator = estimator self.sampling_strategy = sampling_strategy self.replacement = replacement @_fit_context(prefer_skip_nested_validation=False) def fit(self, X, y, sample_weight=None): """Build a boosted classifier from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Sparse matrix can be CSC, CSR, COO, DOK, or LIL. DOK and LIL are converted to CSR. y : array-like of shape (n_samples,) The target values (class labels). sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, the sample weights are initialized to ``1 / n_samples``. Returns ------- self : object Returns self. """ self._validate_params() check_target_type(y) self.samplers_ = [] self.pipelines_ = [] super().fit(X, y, sample_weight) return self def _validate_estimator(self): """Check the estimator and the n_estimator attribute. Sets the `estimator_` attributes. 
""" default = DecisionTreeClassifier(max_depth=1) if self.estimator is not None: self.estimator_ = clone(self.estimator) else: self.estimator_ = clone(default) # SAMME-R requires predict_proba-enabled estimators if self.algorithm == "SAMME.R": if not hasattr(self.estimator_, "predict_proba"): raise TypeError( "AdaBoostClassifier with algorithm='SAMME.R' requires " "that the weak learner supports the calculation of class " "probabilities with a predict_proba method.\n" "Please change the base estimator or set " "algorithm='SAMME' instead." ) if not has_fit_parameter(self.estimator_, "sample_weight"): raise ValueError( f"{self.estimator_.__class__.__name__} doesn't support sample_weight." ) self.base_sampler_ = RandomUnderSampler( sampling_strategy=self.sampling_strategy, replacement=self.replacement, ) def _make_sampler_estimator(self, append=True, random_state=None): """Make and configure a copy of the `base_estimator_` attribute. Warning: This method should be used to properly instantiate new sub-estimators. 
""" estimator = clone(self.estimator_) estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params}) sampler = clone(self.base_sampler_) if random_state is not None: _set_random_states(estimator, random_state) _set_random_states(sampler, random_state) if append: self.estimators_.append(estimator) self.samplers_.append(sampler) self.pipelines_.append( make_pipeline(deepcopy(sampler), deepcopy(estimator)) ) return estimator, sampler def _boost_real(self, iboost, X, y, sample_weight, random_state): """Implement a single boost using the SAMME.R real algorithm.""" estimator, sampler = self._make_sampler_estimator(random_state=random_state) X_res, y_res = sampler.fit_resample(X, y) sample_weight_res = _safe_indexing(sample_weight, sampler.sample_indices_) estimator.fit(X_res, y_res, sample_weight=sample_weight_res) y_predict_proba = estimator.predict_proba(X) if iboost == 0: self.classes_ = getattr(estimator, "classes_", None) self.n_classes_ = len(self.classes_) y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1), axis=0) # Instances incorrectly classified incorrect = y_predict != y # Error fraction estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0)) # Stop if classification is perfect if estimator_error <= 0: return sample_weight, 1.0, 0.0 # Construct y coding as described in Zhu et al [2]: # # y_k = 1 if c == k else -1 / (K - 1) # # where K == n_classes_ and c, k in [0, K) are indices along the second # axis of the y coding with c being the index corresponding to the true # class label. n_classes = self.n_classes_ classes = self.classes_ y_codes = np.array([-1.0 / (n_classes - 1), 1.0]) y_coding = y_codes.take(classes == y[:, np.newaxis]) # Displace zero probabilities so the log is defined. # Also fix negative elements which may occur with # negative sample weights. 
proba = y_predict_proba # alias for readability np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba) # Boost weight using multi-class AdaBoost SAMME.R alg estimator_weight = ( -1.0 * self.learning_rate * ((n_classes - 1.0) / n_classes) * (y_coding * np.log(y_predict_proba)).sum(axis=1) ) # Only boost the weights if it will fit again if not iboost == self.n_estimators - 1: # Only boost positive weights sample_weight *= np.exp( estimator_weight * ((sample_weight > 0) | (estimator_weight < 0)) ) return sample_weight, 1.0, estimator_error def _boost_discrete(self, iboost, X, y, sample_weight, random_state): """Implement a single boost using the SAMME discrete algorithm.""" estimator, sampler = self._make_sampler_estimator(random_state=random_state) X_res, y_res = sampler.fit_resample(X, y) sample_weight_res = _safe_indexing(sample_weight, sampler.sample_indices_) estimator.fit(X_res, y_res, sample_weight=sample_weight_res) y_predict = estimator.predict(X) if iboost == 0: self.classes_ = getattr(estimator, "classes_", None) self.n_classes_ = len(self.classes_) # Instances incorrectly classified incorrect = y_predict != y # Error fraction estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0)) # Stop if classification is perfect if estimator_error <= 0: return sample_weight, 1.0, 0.0 n_classes = self.n_classes_ # Stop if the error is at least as bad as random guessing if estimator_error >= 1.0 - (1.0 / n_classes): self.estimators_.pop(-1) self.samplers_.pop(-1) self.pipelines_.pop(-1) if len(self.estimators_) == 0: raise ValueError( "BaseClassifier in AdaBoostClassifier " "ensemble is worse than random, ensemble " "can not be fit." 
) return None, None, None # Boost weight using multi-class AdaBoost SAMME alg estimator_weight = self.learning_rate * ( np.log((1.0 - estimator_error) / estimator_error) + np.log(n_classes - 1.0) ) # Only boost the weights if I will fit again if not iboost == self.n_estimators - 1: # Only boost positive weights sample_weight *= np.exp(estimator_weight * incorrect * (sample_weight > 0)) return sample_weight, estimator_weight, estimator_error imbalanced-learn-0.12.2/imblearn/ensemble/tests/000077500000000000000000000000001460233407600215045ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/ensemble/tests/__init__.py000066400000000000000000000000001460233407600236030ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/ensemble/tests/test_bagging.py000066400000000000000000000454631460233407600245270ustar00rootroot00000000000000"""Test the module ensemble classifiers.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter import numpy as np import pytest import sklearn from sklearn.cluster import KMeans from sklearn.datasets import load_iris, make_classification, make_hastie_10_2 from sklearn.dummy import DummyClassifier from sklearn.feature_selection import SelectKBest from sklearn.linear_model import LogisticRegression, Perceptron from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.utils._testing import ( assert_allclose, assert_array_almost_equal, assert_array_equal, ) from sklearn.utils.fixes import parse_version from imblearn import FunctionSampler from imblearn.datasets import make_imbalance from imblearn.ensemble import BalancedBaggingClassifier from imblearn.over_sampling import SMOTE, RandomOverSampler from imblearn.pipeline import make_pipeline from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler sklearn_version 
= parse_version(sklearn.__version__) iris = load_iris() @pytest.mark.parametrize( "estimator", [ None, DummyClassifier(strategy="prior"), Perceptron(max_iter=1000, tol=1e-3), DecisionTreeClassifier(), KNeighborsClassifier(), SVC(gamma="scale"), ], ) @pytest.mark.parametrize( "params", ParameterGrid( { "max_samples": [0.5, 1.0], "max_features": [1, 2, 4], "bootstrap": [True, False], "bootstrap_features": [True, False], } ), ) def test_balanced_bagging_classifier(estimator, params): # Check classification for various parameter settings. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) bag = BalancedBaggingClassifier(estimator=estimator, random_state=0, **params).fit( X_train, y_train ) bag.predict(X_test) bag.predict_proba(X_test) bag.score(X_test, y_test) if hasattr(estimator, "decision_function"): bag.decision_function(X_test) def test_bootstrap_samples(): # Test that bootstrapping samples generate non-perfect base estimators. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) estimator = DecisionTreeClassifier().fit(X_train, y_train) # without bootstrap, all trees are perfect on the training set # disable the resampling by passing an empty dictionary. 
ensemble = BalancedBaggingClassifier( estimator=DecisionTreeClassifier(), max_samples=1.0, bootstrap=False, n_estimators=10, sampling_strategy={}, random_state=0, ).fit(X_train, y_train) assert ensemble.score(X_train, y_train) == estimator.score(X_train, y_train) # with bootstrap, trees are no longer perfect on the training set ensemble = BalancedBaggingClassifier( estimator=DecisionTreeClassifier(), max_samples=1.0, bootstrap=True, random_state=0, ).fit(X_train, y_train) assert ensemble.score(X_train, y_train) < estimator.score(X_train, y_train) def test_bootstrap_features(): # Test that bootstrapping features may generate duplicate features. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) ensemble = BalancedBaggingClassifier( estimator=DecisionTreeClassifier(), max_features=1.0, bootstrap_features=False, random_state=0, ).fit(X_train, y_train) for features in ensemble.estimators_features_: assert np.unique(features).shape[0] == X.shape[1] ensemble = BalancedBaggingClassifier( estimator=DecisionTreeClassifier(), max_features=1.0, bootstrap_features=True, random_state=0, ).fit(X_train, y_train) unique_features = [ np.unique(features).shape[0] for features in ensemble.estimators_features_ ] assert np.median(unique_features) < X.shape[1] def test_probability(): # Predict probabilities. 
X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) with np.errstate(divide="ignore", invalid="ignore"): # Normal case ensemble = BalancedBaggingClassifier( estimator=DecisionTreeClassifier(), random_state=0 ).fit(X_train, y_train) assert_array_almost_equal( np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)), ) assert_array_almost_equal( ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)), ) # Degenerate case, where some classes are missing ensemble = BalancedBaggingClassifier( estimator=LogisticRegression(solver="lbfgs", multi_class="auto"), random_state=0, max_samples=5, ) ensemble.fit(X_train, y_train) assert_array_almost_equal( np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)), ) assert_array_almost_equal( ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)), ) def test_oob_score_classification(): # Check that oob prediction is a good estimation of the generalization # error. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for estimator in [DecisionTreeClassifier(), SVC(gamma="scale")]: clf = BalancedBaggingClassifier( estimator=estimator, n_estimators=100, bootstrap=True, oob_score=True, random_state=0, ).fit(X_train, y_train) test_score = clf.score(X_test, y_test) assert abs(test_score - clf.oob_score_) < 0.1 # Test with few estimators with pytest.warns(UserWarning): BalancedBaggingClassifier( estimator=estimator, n_estimators=1, bootstrap=True, oob_score=True, random_state=0, ).fit(X_train, y_train) def test_single_estimator(): # Check singleton ensembles. 
X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = BalancedBaggingClassifier( estimator=KNeighborsClassifier(), n_estimators=1, bootstrap=False, bootstrap_features=False, random_state=0, ).fit(X_train, y_train) clf2 = make_pipeline( RandomUnderSampler(random_state=clf1.estimators_[0].steps[0][1].random_state), KNeighborsClassifier(), ).fit(X_train, y_train) assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) def test_gridsearch(): # Check that bagging ensembles can be grid-searched. # Transform iris into a binary classification task X, y = iris.data, iris.target.copy() y[y == 2] = 1 # Grid search with scoring based on decision_function parameters = {"n_estimators": (1, 2), "estimator__C": (1, 2)} GridSearchCV( BalancedBaggingClassifier(SVC(gamma="scale")), parameters, cv=3, scoring="roc_auc", ).fit(X, y) def test_estimator(): # Check estimator and its default values. 
X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) ensemble = BalancedBaggingClassifier(None, n_jobs=3, random_state=0).fit( X_train, y_train ) assert isinstance(ensemble.estimator_.steps[-1][1], DecisionTreeClassifier) ensemble = BalancedBaggingClassifier( DecisionTreeClassifier(), n_jobs=3, random_state=0 ).fit(X_train, y_train) assert isinstance(ensemble.estimator_.steps[-1][1], DecisionTreeClassifier) ensemble = BalancedBaggingClassifier( Perceptron(max_iter=1000, tol=1e-3), n_jobs=3, random_state=0 ).fit(X_train, y_train) assert isinstance(ensemble.estimator_.steps[-1][1], Perceptron) def test_bagging_with_pipeline(): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) estimator = BalancedBaggingClassifier( make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2, ) estimator.fit(X, y).predict(X) def test_warm_start(random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. X, y = make_hastie_10_2(n_samples=20, random_state=1) clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: clf_ws = BalancedBaggingClassifier( n_estimators=n_estimators, random_state=random_state, warm_start=True, ) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert len(clf_ws) == n_estimators clf_no_ws = BalancedBaggingClassifier( n_estimators=10, random_state=random_state, warm_start=False ) clf_no_ws.fit(X, y) assert {pipe.steps[-1][1].random_state for pipe in clf_ws} == { pipe.steps[-1][1].random_state for pipe in clf_no_ws } def test_warm_start_smaller_n_estimators(): # Test if warm start'ed second fit with smaller n_estimators raises error. 
X, y = make_hastie_10_2(n_samples=20, random_state=1) clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True) clf.fit(X, y) clf.set_params(n_estimators=4) with pytest.raises(ValueError): clf.fit(X, y) def test_warm_start_equal_n_estimators(): # Test that nothing happens when fitting without increasing n_estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True, random_state=83) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # modify X to nonsense values, this should not change anything X_train += 1.0 warn_msg = "Warm-start fitting without increasing n_estimators does not" with pytest.warns(UserWarning, match=warn_msg): clf.fit(X_train, y_train) assert_array_equal(y_pred, clf.predict(X_test)) def test_warm_start_equivalence(): # warm started classifier with 5+5 estimators should be equivalent to # one classifier with 10 estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf_ws = BalancedBaggingClassifier( n_estimators=5, warm_start=True, random_state=3141 ) clf_ws.fit(X_train, y_train) clf_ws.set_params(n_estimators=10) clf_ws.fit(X_train, y_train) y1 = clf_ws.predict(X_test) clf = BalancedBaggingClassifier( n_estimators=10, warm_start=False, random_state=3141 ) clf.fit(X_train, y_train) y2 = clf.predict(X_test) assert_array_almost_equal(y1, y2) def test_warm_start_with_oob_score_fails(): # Check using oob_score and warm_start simultaneously fails X, y = make_hastie_10_2(n_samples=20, random_state=1) clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True, oob_score=True) with pytest.raises(ValueError): clf.fit(X, y) def test_oob_score_removed_on_warm_start(): X, y = make_hastie_10_2(n_samples=2000, random_state=1) clf = BalancedBaggingClassifier(n_estimators=50, oob_score=True) clf.fit(X, y) 
clf.set_params(warm_start=True, oob_score=False, n_estimators=100) clf.fit(X, y) with pytest.raises(AttributeError): getattr(clf, "oob_score_") def test_oob_score_consistency(): # Make sure OOB scores are identical when random_state, estimator, and # training data are fixed and fitting is done twice X, y = make_hastie_10_2(n_samples=200, random_state=1) bagging = BalancedBaggingClassifier( KNeighborsClassifier(), max_samples=0.5, max_features=0.5, oob_score=True, random_state=1, ) assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_ def test_estimators_samples(): # Check that format of estimators_samples_ is correct and that results # generated at fit time can be identically reproduced at a later time # using data saved in object attributes. X, y = make_hastie_10_2(n_samples=200, random_state=1) # remap the y outside of the BalancedBaggingclassifier # _, y = np.unique(y, return_inverse=True) bagging = BalancedBaggingClassifier( LogisticRegression(solver="lbfgs", multi_class="auto"), max_samples=0.5, max_features=0.5, random_state=1, bootstrap=False, ) bagging.fit(X, y) # Get relevant attributes estimators_samples = bagging.estimators_samples_ estimators_features = bagging.estimators_features_ estimators = bagging.estimators_ # Test for correct formatting assert len(estimators_samples) == len(estimators) assert len(estimators_samples[0]) == len(X) // 2 assert estimators_samples[0].dtype.kind == "i" # Re-fit single estimator to test for consistent sampling estimator_index = 0 estimator_samples = estimators_samples[estimator_index] estimator_features = estimators_features[estimator_index] estimator = estimators[estimator_index] X_train = (X[estimator_samples])[:, estimator_features] y_train = y[estimator_samples] orig_coefs = estimator.steps[-1][1].coef_ estimator.fit(X_train, y_train) new_coefs = estimator.steps[-1][1].coef_ assert_allclose(orig_coefs, new_coefs) def test_max_samples_consistency(): # Make sure validated max_samples and original 
max_samples are identical # when valid integer max_samples supplied by user max_samples = 100 X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1) bagging = BalancedBaggingClassifier( KNeighborsClassifier(), max_samples=max_samples, max_features=0.5, random_state=1, ) bagging.fit(X, y) assert bagging._max_samples == max_samples class CountDecisionTreeClassifier(DecisionTreeClassifier): """DecisionTreeClassifier that will memorize the number of samples seen at fit.""" def fit(self, X, y, sample_weight=None): self.class_counts_ = Counter(y) return super().fit(X, y, sample_weight=sample_weight) @pytest.mark.filterwarnings("ignore:Number of distinct clusters") @pytest.mark.parametrize( "sampler, n_samples_bootstrap", [ (None, 15), (RandomUnderSampler(), 15), # under-sampling with sample_indices_ ( ClusterCentroids(estimator=KMeans(n_init=1)), 15, ), # under-sampling without sample_indices_ (RandomOverSampler(), 40), # over-sampling with sample_indices_ (SMOTE(), 40), # over-sampling without sample_indices_ ], ) def test_balanced_bagging_classifier_samplers(sampler, n_samples_bootstrap): # check that we can pass any kind of sampler to a bagging classifier X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = BalancedBaggingClassifier( estimator=CountDecisionTreeClassifier(), n_estimators=2, sampler=sampler, random_state=0, ) clf.fit(X_train, y_train) clf.predict(X_test) # check that we have balanced class with the right counts of class # sample depending on the sampling strategy assert_array_equal( list(clf.estimators_[0][-1].class_counts_.values()), n_samples_bootstrap ) @pytest.mark.parametrize("replace", [True, False]) def test_balanced_bagging_classifier_with_function_sampler(replace): # check that we can provide a FunctionSampler in BalancedBaggingClassifier X, y = make_classification( n_samples=1_000, n_features=10, 
n_classes=2, weights=[0.3, 0.7], random_state=0, ) def roughly_balanced_bagging(X, y, replace=False): """Implementation of Roughly Balanced Bagging for binary problem.""" # find the minority and majority classes class_counts = Counter(y) majority_class = max(class_counts, key=class_counts.get) minority_class = min(class_counts, key=class_counts.get) # compute the number of sample to draw from the majority class using # a negative binomial distribution n_minority_class = class_counts[minority_class] n_majority_resampled = np.random.negative_binomial(n=n_minority_class, p=0.5) # draw randomly with or without replacement majority_indices = np.random.choice( np.flatnonzero(y == majority_class), size=n_majority_resampled, replace=replace, ) minority_indices = np.random.choice( np.flatnonzero(y == minority_class), size=n_minority_class, replace=replace, ) indices = np.hstack([majority_indices, minority_indices]) return X[indices], y[indices] # Roughly Balanced Bagging rbb = BalancedBaggingClassifier( estimator=CountDecisionTreeClassifier(random_state=0), n_estimators=2, sampler=FunctionSampler( func=roughly_balanced_bagging, kw_args={"replace": replace} ), random_state=0, ) rbb.fit(X, y) for estimator in rbb.estimators_: class_counts = estimator[-1].class_counts_ assert (class_counts[0] / class_counts[1]) > 0.78 def test_balanced_bagging_classifier_n_features(): """Check that we raise a FutureWarning when accessing `n_features_`.""" X, y = load_iris(return_X_y=True) estimator = BalancedBaggingClassifier().fit(X, y) with pytest.warns(FutureWarning, match="`n_features_` was deprecated"): estimator.n_features_ imbalanced-learn-0.12.2/imblearn/ensemble/tests/test_easy_ensemble.py000066400000000000000000000165321460233407600257370ustar00rootroot00000000000000"""Test the module easy ensemble.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np import pytest import sklearn from sklearn.datasets import load_iris, make_hastie_10_2 from 
sklearn.ensemble import AdaBoostClassifier from sklearn.feature_selection import SelectKBest from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.utils.fixes import parse_version from imblearn.datasets import make_imbalance from imblearn.ensemble import EasyEnsembleClassifier from imblearn.pipeline import make_pipeline from imblearn.under_sampling import RandomUnderSampler sklearn_version = parse_version(sklearn.__version__) iris = load_iris() # Generate a global dataset to use RND_SEED = 0 X = np.array( [ [0.5220963, 0.11349303], [0.59091459, 0.40692742], [1.10915364, 0.05718352], [0.22039505, 0.26469445], [1.35269503, 0.44812421], [0.85117925, 1.0185556], [-2.10724436, 0.70263997], [-0.23627356, 0.30254174], [-1.23195149, 0.15427291], [-0.58539673, 0.62515052], ] ) Y = np.array([1, 2, 2, 2, 1, 0, 1, 1, 1, 0]) @pytest.mark.parametrize("n_estimators", [10, 20]) @pytest.mark.parametrize( "estimator", [ AdaBoostClassifier(algorithm="SAMME", n_estimators=5), AdaBoostClassifier(algorithm="SAMME", n_estimators=10), ], ) def test_easy_ensemble_classifier(n_estimators, estimator): # Check classification for various parameter settings. X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) eec = EasyEnsembleClassifier( n_estimators=n_estimators, estimator=estimator, n_jobs=-1, random_state=RND_SEED, ) eec.fit(X_train, y_train).score(X_test, y_test) assert len(eec.estimators_) == n_estimators for est in eec.estimators_: assert len(est.named_steps["classifier"]) == estimator.n_estimators # test the different prediction function eec.predict(X_test) eec.predict_proba(X_test) eec.predict_log_proba(X_test) eec.decision_function(X_test) def test_estimator(): # Check estimator and its default values. 
X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) ensemble = EasyEnsembleClassifier(2, None, n_jobs=-1, random_state=0).fit( X_train, y_train ) assert isinstance(ensemble.estimator_.steps[-1][1], AdaBoostClassifier) ensemble = EasyEnsembleClassifier( 2, AdaBoostClassifier(algorithm="SAMME"), n_jobs=-1, random_state=0 ).fit(X_train, y_train) assert isinstance(ensemble.estimator_.steps[-1][1], AdaBoostClassifier) def test_bagging_with_pipeline(): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) estimator = EasyEnsembleClassifier( n_estimators=2, estimator=make_pipeline( SelectKBest(k=1), AdaBoostClassifier(algorithm="SAMME") ), ) estimator.fit(X, y).predict(X) def test_warm_start(random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. X, y = make_hastie_10_2(n_samples=20, random_state=1) clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: clf_ws = EasyEnsembleClassifier( n_estimators=n_estimators, random_state=random_state, warm_start=True, ) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert len(clf_ws) == n_estimators clf_no_ws = EasyEnsembleClassifier( n_estimators=10, random_state=random_state, warm_start=False ) clf_no_ws.fit(X, y) assert {pipe.steps[-1][1].random_state for pipe in clf_ws} == { pipe.steps[-1][1].random_state for pipe in clf_no_ws } def test_warm_start_smaller_n_estimators(): # Test if warm start'ed second fit with smaller n_estimators raises error. 
X, y = make_hastie_10_2(n_samples=20, random_state=1) clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True) clf.fit(X, y) clf.set_params(n_estimators=4) with pytest.raises(ValueError): clf.fit(X, y) def test_warm_start_equal_n_estimators(): # Test that nothing happens when fitting without increasing n_estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True, random_state=83) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # modify X to nonsense values, this should not change anything X_train += 1.0 warn_msg = "Warm-start fitting without increasing n_estimators" with pytest.warns(UserWarning, match=warn_msg): clf.fit(X_train, y_train) assert_array_equal(y_pred, clf.predict(X_test)) def test_warm_start_equivalence(): # warm started classifier with 5+5 estimators should be equivalent to # one classifier with 10 estimators X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) clf_ws = EasyEnsembleClassifier(n_estimators=5, warm_start=True, random_state=3141) clf_ws.fit(X_train, y_train) clf_ws.set_params(n_estimators=10) clf_ws.fit(X_train, y_train) y1 = clf_ws.predict(X_test) clf = EasyEnsembleClassifier(n_estimators=10, warm_start=False, random_state=3141) clf.fit(X_train, y_train) y2 = clf.predict(X_test) assert_allclose(y1, y2) def test_easy_ensemble_classifier_single_estimator(): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = EasyEnsembleClassifier(n_estimators=1, random_state=0).fit(X_train, y_train) clf2 = make_pipeline( RandomUnderSampler(random_state=0), AdaBoostClassifier(algorithm="SAMME", random_state=0), ).fit(X_train, y_train) assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) 
def test_easy_ensemble_classifier_grid_search(): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) parameters = { "n_estimators": [1, 2], "estimator__n_estimators": [3, 4], } grid_search = GridSearchCV( EasyEnsembleClassifier(estimator=AdaBoostClassifier(algorithm="SAMME")), parameters, cv=5, ) grid_search.fit(X, y) def test_easy_ensemble_classifier_n_features(): """Check that we raise a FutureWarning when accessing `n_features_`.""" X, y = load_iris(return_X_y=True) estimator = EasyEnsembleClassifier().fit(X, y) with pytest.warns(FutureWarning, match="`n_features_` was deprecated"): estimator.n_features_ imbalanced-learn-0.12.2/imblearn/ensemble/tests/test_forest.py000066400000000000000000000260641460233407600244270ustar00rootroot00000000000000import numpy as np import pytest import sklearn from sklearn.datasets import load_iris, make_classification from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.utils.fixes import parse_version from imblearn.ensemble import BalancedRandomForestClassifier sklearn_version = parse_version(sklearn.__version__) @pytest.fixture def imbalanced_dataset(): return make_classification( n_samples=10000, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3, n_clusters_per_class=1, weights=[0.01, 0.05, 0.94], class_sep=0.8, random_state=0, ) def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset): brf = BalancedRandomForestClassifier( n_estimators=5, sampling_strategy="all", replacement=True, bootstrap=False ) brf.fit(*imbalanced_dataset) with pytest.raises(ValueError, match="must be larger or equal to"): brf.set_params(warm_start=True, n_estimators=2) brf.fit(*imbalanced_dataset) brf.set_params(n_estimators=10) brf.fit(*imbalanced_dataset) with pytest.warns(UserWarning, match="Warm-start fitting without"): brf.fit(*imbalanced_dataset) def 
test_balanced_random_forest(imbalanced_dataset): n_estimators = 10 brf = BalancedRandomForestClassifier( n_estimators=n_estimators, random_state=0, sampling_strategy="all", replacement=True, bootstrap=False, ) brf.fit(*imbalanced_dataset) assert len(brf.samplers_) == n_estimators assert len(brf.estimators_) == n_estimators assert len(brf.pipelines_) == n_estimators assert len(brf.feature_importances_) == imbalanced_dataset[0].shape[1] def test_balanced_random_forest_attributes(imbalanced_dataset): X, y = imbalanced_dataset n_estimators = 10 brf = BalancedRandomForestClassifier( n_estimators=n_estimators, random_state=0, sampling_strategy="all", replacement=True, bootstrap=False, ) brf.fit(X, y) for idx in range(n_estimators): X_res, y_res = brf.samplers_[idx].fit_resample(X, y) X_res_2, y_res_2 = ( brf.pipelines_[idx].named_steps["randomundersampler"].fit_resample(X, y) ) assert_allclose(X_res, X_res_2) assert_array_equal(y_res, y_res_2) y_pred = brf.estimators_[idx].fit(X_res, y_res).predict(X) y_pred_2 = brf.pipelines_[idx].fit(X, y).predict(X) assert_array_equal(y_pred, y_pred_2) y_pred = brf.estimators_[idx].fit(X_res, y_res).predict_proba(X) y_pred_2 = brf.pipelines_[idx].fit(X, y).predict_proba(X) assert_array_equal(y_pred, y_pred_2) def test_balanced_random_forest_sample_weight(imbalanced_dataset): rng = np.random.RandomState(42) X, y = imbalanced_dataset sample_weight = rng.rand(y.shape[0]) brf = BalancedRandomForestClassifier( n_estimators=5, random_state=0, sampling_strategy="all", replacement=True, bootstrap=False, ) brf.fit(X, y, sample_weight) @pytest.mark.filterwarnings("ignore:Some inputs do not have OOB scores") def test_balanced_random_forest_oob(imbalanced_dataset): X, y = imbalanced_dataset X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42, stratify=y ) est = BalancedRandomForestClassifier( oob_score=True, random_state=0, n_estimators=1000, min_samples_leaf=2, sampling_strategy="all", replacement=True, bootstrap=True, ) 
est.fit(X_train, y_train) test_score = est.score(X_test, y_test) assert abs(test_score - est.oob_score_) < 0.1 # Check warning if not enough estimators est = BalancedRandomForestClassifier( oob_score=True, random_state=0, n_estimators=1, bootstrap=True, sampling_strategy="all", replacement=True, ) with pytest.warns(UserWarning) and np.errstate(divide="ignore", invalid="ignore"): est.fit(X, y) def test_balanced_random_forest_grid_search(imbalanced_dataset): brf = BalancedRandomForestClassifier( sampling_strategy="all", replacement=True, bootstrap=False ) grid = GridSearchCV(brf, {"n_estimators": (1, 2), "max_depth": (1, 2)}, cv=3) grid.fit(*imbalanced_dataset) def test_little_tree_with_small_max_samples(): rng = np.random.RandomState(1) X = rng.randn(10000, 2) y = rng.randn(10000) > 0 # First fit with no restriction on max samples est1 = BalancedRandomForestClassifier( n_estimators=1, random_state=rng, max_samples=None, sampling_strategy="all", replacement=True, bootstrap=True, ) # Second fit with max samples restricted to just 2 est2 = BalancedRandomForestClassifier( n_estimators=1, random_state=rng, max_samples=2, sampling_strategy="all", replacement=True, bootstrap=True, ) est1.fit(X, y) est2.fit(X, y) tree1 = est1.estimators_[0].tree_ tree2 = est2.estimators_[0].tree_ msg = "Tree without `max_samples` restriction should have more nodes" assert tree1.node_count > tree2.node_count, msg def test_balanced_random_forest_pruning(imbalanced_dataset): brf = BalancedRandomForestClassifier( sampling_strategy="all", replacement=True, bootstrap=False ) brf.fit(*imbalanced_dataset) n_nodes_no_pruning = brf.estimators_[0].tree_.node_count brf_pruned = BalancedRandomForestClassifier( ccp_alpha=0.015, sampling_strategy="all", replacement=True, bootstrap=False ) brf_pruned.fit(*imbalanced_dataset) n_nodes_pruning = brf_pruned.estimators_[0].tree_.node_count assert n_nodes_no_pruning > n_nodes_pruning @pytest.mark.parametrize("ratio", [0.5, 0.1]) 
@pytest.mark.filterwarnings("ignore:Some inputs do not have OOB scores") def test_balanced_random_forest_oob_binomial(ratio): # Regression test for #655: check that the oob score is closed to 0.5 # a binomial experiment. rng = np.random.RandomState(42) n_samples = 1000 X = np.arange(n_samples).reshape(-1, 1) y = rng.binomial(1, ratio, size=n_samples) erf = BalancedRandomForestClassifier( oob_score=True, random_state=42, sampling_strategy="not minority", replacement=False, bootstrap=True, ) erf.fit(X, y) assert np.abs(erf.oob_score_ - 0.5) < 0.1 def test_balanced_bagging_classifier_n_features(): """Check that we raise a FutureWarning when accessing `n_features_`.""" X, y = load_iris(return_X_y=True) estimator = BalancedRandomForestClassifier( sampling_strategy="all", replacement=True, bootstrap=False ).fit(X, y) with pytest.warns(FutureWarning, match="`n_features_` was deprecated"): estimator.n_features_ # TODO: remove in 0.13 def test_balanced_random_forest_change_behaviour(imbalanced_dataset): """Check that we raise a change of behaviour for the parameters `sampling_strategy` and `replacement`. 
""" estimator = BalancedRandomForestClassifier(sampling_strategy="all", bootstrap=False) with pytest.warns(FutureWarning, match="The default of `replacement`"): estimator.fit(*imbalanced_dataset) estimator = BalancedRandomForestClassifier(replacement=True, bootstrap=False) with pytest.warns(FutureWarning, match="The default of `sampling_strategy`"): estimator.fit(*imbalanced_dataset) estimator = BalancedRandomForestClassifier( sampling_strategy="all", replacement=True ) with pytest.warns(FutureWarning, match="The default of `bootstrap`"): estimator.fit(*imbalanced_dataset) @pytest.mark.skipif( parse_version(sklearn_version.base_version) < parse_version("1.4"), reason="scikit-learn should be >= 1.4", ) def test_missing_values_is_resilient(): """Check that forest can deal with missing values and has decent performance.""" rng = np.random.RandomState(0) n_samples, n_features = 1000, 10 X, y = make_classification( n_samples=n_samples, n_features=n_features, random_state=rng ) # Create dataset with missing values X_missing = X.copy() X_missing[rng.choice([False, True], size=X.shape, p=[0.95, 0.05])] = np.nan assert np.isnan(X_missing).any() X_missing_train, X_missing_test, y_train, y_test = train_test_split( X_missing, y, random_state=0 ) # Train forest with missing values forest_with_missing = BalancedRandomForestClassifier( sampling_strategy="all", replacement=True, bootstrap=False, random_state=rng, n_estimators=50, ) forest_with_missing.fit(X_missing_train, y_train) score_with_missing = forest_with_missing.score(X_missing_test, y_test) # Train forest without missing values X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) forest = BalancedRandomForestClassifier( sampling_strategy="all", replacement=True, bootstrap=False, random_state=rng, n_estimators=50, ) forest.fit(X_train, y_train) score_without_missing = forest.score(X_test, y_test) # Score is still 80 percent of the forest's score that had no missing values assert score_with_missing >= 
0.80 * score_without_missing @pytest.mark.skipif( parse_version(sklearn_version.base_version) < parse_version("1.4"), reason="scikit-learn should be >= 1.4", ) def test_missing_value_is_predictive(): """Check that the forest learns when missing values are only present for a predictive feature.""" rng = np.random.RandomState(0) n_samples = 300 X_non_predictive = rng.standard_normal(size=(n_samples, 10)) y = rng.randint(0, high=2, size=n_samples) # Create a predictive feature using `y` and with some noise X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05]) y_mask = y.astype(bool) y_mask[X_random_mask] = ~y_mask[X_random_mask] predictive_feature = rng.standard_normal(size=n_samples) predictive_feature[y_mask] = np.nan assert np.isnan(predictive_feature).any() X_predictive = X_non_predictive.copy() X_predictive[:, 5] = predictive_feature ( X_predictive_train, X_predictive_test, X_non_predictive_train, X_non_predictive_test, y_train, y_test, ) = train_test_split(X_predictive, X_non_predictive, y, random_state=0) forest_predictive = BalancedRandomForestClassifier( sampling_strategy="all", replacement=True, bootstrap=False, random_state=0 ).fit(X_predictive_train, y_train) forest_non_predictive = BalancedRandomForestClassifier( sampling_strategy="all", replacement=True, bootstrap=False, random_state=0 ).fit(X_non_predictive_train, y_train) predictive_test_score = forest_predictive.score(X_predictive_test, y_test) assert predictive_test_score >= 0.75 assert predictive_test_score >= forest_non_predictive.score( X_non_predictive_test, y_test ) imbalanced-learn-0.12.2/imblearn/ensemble/tests/test_weight_boosting.py000066400000000000000000000063661460233407600263230ustar00rootroot00000000000000import numpy as np import pytest import sklearn from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.utils._testing import assert_array_equal from sklearn.utils.fixes import parse_version from 
imblearn.ensemble import RUSBoostClassifier sklearn_version = parse_version(sklearn.__version__) @pytest.fixture def imbalanced_dataset(): return make_classification( n_samples=10000, n_features=3, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3, n_clusters_per_class=1, weights=[0.01, 0.05, 0.94], class_sep=0.8, random_state=0, ) @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) @pytest.mark.filterwarnings("ignore:The SAMME.R algorithm (the default) is") def test_rusboost(imbalanced_dataset, algorithm): X, y = imbalanced_dataset X_train, X_test, y_train, y_test = train_test_split( X, y, stratify=y, random_state=1 ) classes = np.unique(y) n_estimators = 500 rusboost = RUSBoostClassifier( n_estimators=n_estimators, algorithm=algorithm, random_state=0 ) rusboost.fit(X_train, y_train) assert_array_equal(classes, rusboost.classes_) # check that we have an ensemble of samplers and estimators with a # consistent size assert len(rusboost.estimators_) > 1 assert len(rusboost.estimators_) == len(rusboost.samplers_) assert len(rusboost.pipelines_) == len(rusboost.samplers_) # each sampler in the ensemble should have different random state assert len({sampler.random_state for sampler in rusboost.samplers_}) == len( rusboost.samplers_ ) # each estimator in the ensemble should have different random state assert len({est.random_state for est in rusboost.estimators_}) == len( rusboost.estimators_ ) # check the consistency of the feature importances assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1] # check the consistency of the prediction outpus y_pred = rusboost.predict_proba(X_test) assert y_pred.shape[1] == len(classes) assert rusboost.decision_function(X_test).shape[1] == len(classes) score = rusboost.score(X_test, y_test) assert score > 0.6, f"Failed with algorithm {algorithm} and score {score}" y_pred = rusboost.predict(X_test) assert y_pred.shape == y_test.shape @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) 
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm (the default) is") def test_rusboost_sample_weight(imbalanced_dataset, algorithm): X, y = imbalanced_dataset sample_weight = np.ones_like(y) rusboost = RUSBoostClassifier(algorithm=algorithm, random_state=0) # Predictions should be the same when sample_weight are all ones y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X) y_pred_no_sample_weight = rusboost.fit(X, y).predict(X) assert_array_equal(y_pred_sample_weight, y_pred_no_sample_weight) rng = np.random.RandomState(42) sample_weight = rng.rand(y.shape[0]) y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X) with pytest.raises(AssertionError): assert_array_equal(y_pred_no_sample_weight, y_pred_sample_weight) imbalanced-learn-0.12.2/imblearn/exceptions.py000066400000000000000000000014211460233407600213010ustar00rootroot00000000000000""" The :mod:`imblearn.exceptions` module includes all custom warnings and error classes and functions used across imbalanced-learn. """ # Authors: Guillaume Lemaitre # License: MIT def raise_isinstance_error(variable_name, possible_type, variable): """Raise consistent error message for isinstance() function. Parameters ---------- variable_name : str The name of the variable. possible_type : type The possible type of the variable. variable : object The variable to check. Raises ------ ValueError If the instance is not of the possible type. """ raise ValueError( f"{variable_name} has to be one of {possible_type}. " f"Got {type(variable)} instead." 
) imbalanced-learn-0.12.2/imblearn/keras/000077500000000000000000000000001460233407600176555ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/keras/__init__.py000066400000000000000000000003511460233407600217650ustar00rootroot00000000000000"""The :mod:`imblearn.keras` provides utilities to deal with imbalanced dataset in keras.""" from ._generator import BalancedBatchGenerator, balanced_batch_generator __all__ = ["BalancedBatchGenerator", "balanced_batch_generator"] imbalanced-learn-0.12.2/imblearn/keras/_generator.py000066400000000000000000000240441460233407600223600ustar00rootroot00000000000000"""Implement generators for ``keras`` which will balance the data.""" # This is a trick to avoid an error during tests collection with pytest. We # avoid the error when importing the package raise the error at the moment of # creating the instance. # This is a trick to avoid an error during tests collection with pytest. We # avoid the error when importing the package raise the error at the moment of # creating the instance. def import_keras(): """Try to import keras from keras and tensorflow. This is possible to import the sequence from keras or tensorflow. 
""" def import_from_keras(): try: import keras # noqa if hasattr(keras.utils, "Sequence"): return (keras.utils.Sequence,), True else: return (keras.utils.data_utils.Sequence,), True except ImportError: return tuple(), False def import_from_tensforflow(): try: from tensorflow import keras if hasattr(keras.utils, "Sequence"): return (keras.utils.Sequence,), True else: return (keras.utils.data_utils.Sequence,), True except ImportError: return tuple(), False ParentClassKeras, has_keras_k = import_from_keras() ParentClassTensorflow, has_keras_tf = import_from_tensforflow() has_keras = has_keras_k or has_keras_tf if has_keras: if has_keras_k: ParentClass = ParentClassKeras else: ParentClass = ParentClassTensorflow else: ParentClass = (object,) return ParentClass, has_keras ParentClass, HAS_KERAS = import_keras() from scipy.sparse import issparse # noqa from sklearn.base import clone # noqa from sklearn.utils import _safe_indexing # noqa from sklearn.utils import check_random_state # noqa from ..tensorflow import balanced_batch_generator as tf_bbg # noqa from ..under_sampling import RandomUnderSampler # noqa from ..utils import Substitution # noqa from ..utils._docstring import _random_state_docstring # noqa class BalancedBatchGenerator(*ParentClass): # type: ignore """Create balanced batches when training a keras model. Create a keras ``Sequence`` which is given to ``fit``. The sampler defines the sampling strategy used to balance the dataset ahead of creating the batch. The sampler should have an attribute ``sample_indices_``. .. versionadded:: 0.4 Parameters ---------- X : ndarray of shape (n_samples, n_features) Original imbalanced dataset. y : ndarray of shape (n_samples,) or (n_samples, n_classes) Associated targets. sample_weight : ndarray of shape (n_samples,) Sample weight. sampler : sampler object, default=None A sampler instance which has an attribute ``sample_indices_``. By default, the sampler used is a :class:`~imblearn.under_sampling.RandomUnderSampler`. 
batch_size : int, default=32 Number of samples per gradient update. keep_sparse : bool, default=False Either or not to conserve or not the sparsity of the input (i.e. ``X``, ``y``, ``sample_weight``). By default, the returned batches will be dense. random_state : int, RandomState instance or None, default=None Control the randomization of the algorithm: - If int, ``random_state`` is the seed used by the random number generator; - If ``RandomState`` instance, random_state is the random number generator; - If ``None``, the random number generator is the ``RandomState`` instance used by ``np.random``. Attributes ---------- sampler_ : sampler object The sampler used to balance the dataset. indices_ : ndarray of shape (n_samples, n_features) The indices of the samples selected during sampling. Examples -------- >>> from sklearn.datasets import load_iris >>> iris = load_iris() >>> from imblearn.datasets import make_imbalance >>> class_dict = dict() >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 >>> X, y = make_imbalance(iris.data, iris.target, sampling_strategy=class_dict) >>> import tensorflow >>> y = tensorflow.keras.utils.to_categorical(y, 3) >>> model = tensorflow.keras.models.Sequential() >>> model.add( ... tensorflow.keras.layers.Dense( ... y.shape[1], input_dim=X.shape[1], activation='softmax' ... ) ... ) >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', ... metrics=['accuracy']) >>> from imblearn.keras import BalancedBatchGenerator >>> from imblearn.under_sampling import NearMiss >>> training_generator = BalancedBatchGenerator( ... 
X, y, sampler=NearMiss(), batch_size=10, random_state=42) >>> callback_history = model.fit(training_generator, epochs=10, verbose=0) """ # flag for keras sequence duck-typing use_sequence_api = True def __init__( self, X, y, *, sample_weight=None, sampler=None, batch_size=32, keep_sparse=False, random_state=None, ): if not HAS_KERAS: raise ImportError("'No module named 'keras'") self.X = X self.y = y self.sample_weight = sample_weight self.sampler = sampler self.batch_size = batch_size self.keep_sparse = keep_sparse self.random_state = random_state self._sample() def _sample(self): random_state = check_random_state(self.random_state) if self.sampler is None: self.sampler_ = RandomUnderSampler(random_state=random_state) else: self.sampler_ = clone(self.sampler) self.sampler_.fit_resample(self.X, self.y) if not hasattr(self.sampler_, "sample_indices_"): raise ValueError("'sampler' needs to have an attribute 'sample_indices_'.") self.indices_ = self.sampler_.sample_indices_ # shuffle the indices since the sampler are packing them by class random_state.shuffle(self.indices_) def __len__(self): return int(self.indices_.size // self.batch_size) def __getitem__(self, index): X_resampled = _safe_indexing( self.X, self.indices_[index * self.batch_size : (index + 1) * self.batch_size], ) y_resampled = _safe_indexing( self.y, self.indices_[index * self.batch_size : (index + 1) * self.batch_size], ) if issparse(X_resampled) and not self.keep_sparse: X_resampled = X_resampled.toarray() if self.sample_weight is not None: sample_weight_resampled = _safe_indexing( self.sample_weight, self.indices_[index * self.batch_size : (index + 1) * self.batch_size], ) if self.sample_weight is None: return X_resampled, y_resampled else: return X_resampled, y_resampled, sample_weight_resampled @Substitution(random_state=_random_state_docstring) def balanced_batch_generator( X, y, *, sample_weight=None, sampler=None, batch_size=32, keep_sparse=False, random_state=None, ): """Create a balanced 
batch generator to train keras model. Returns a generator --- as well as the number of step per epoch --- which is given to ``fit``. The sampler defines the sampling strategy used to balance the dataset ahead of creating the batch. The sampler should have an attribute ``sample_indices_``. Parameters ---------- X : ndarray of shape (n_samples, n_features) Original imbalanced dataset. y : ndarray of shape (n_samples,) or (n_samples, n_classes) Associated targets. sample_weight : ndarray of shape (n_samples,), default=None Sample weight. sampler : sampler object, default=None A sampler instance which has an attribute ``sample_indices_``. By default, the sampler used is a :class:`~imblearn.under_sampling.RandomUnderSampler`. batch_size : int, default=32 Number of samples per gradient update. keep_sparse : bool, default=False Either or not to conserve or not the sparsity of the input (i.e. ``X``, ``y``, ``sample_weight``). By default, the returned batches will be dense. {random_state} Returns ------- generator : generator of tuple Generate batch of data. The tuple generated are either (X_batch, y_batch) or (X_batch, y_batch, sampler_weight_batch). steps_per_epoch : int The number of samples per epoch. Required by ``fit_generator`` in keras. Examples -------- >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> from imblearn.datasets import make_imbalance >>> class_dict = dict() >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 >>> from imblearn.datasets import make_imbalance >>> X, y = make_imbalance(X, y, sampling_strategy=class_dict) >>> import tensorflow >>> y = tensorflow.keras.utils.to_categorical(y, 3) >>> model = tensorflow.keras.models.Sequential() >>> model.add( ... tensorflow.keras.layers.Dense( ... y.shape[1], input_dim=X.shape[1], activation='softmax' ... ) ... ) >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', ... 
metrics=['accuracy']) >>> from imblearn.keras import balanced_batch_generator >>> from imblearn.under_sampling import NearMiss >>> training_generator, steps_per_epoch = balanced_batch_generator( ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) >>> callback_history = model.fit(training_generator, ... steps_per_epoch=steps_per_epoch, ... epochs=10, verbose=0) """ return tf_bbg( X=X, y=y, sample_weight=sample_weight, sampler=sampler, batch_size=batch_size, keep_sparse=keep_sparse, random_state=random_state, ) imbalanced-learn-0.12.2/imblearn/keras/tests/000077500000000000000000000000001460233407600210175ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/keras/tests/__init__.py000066400000000000000000000000001460233407600231160ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/keras/tests/test_generator.py000066400000000000000000000102311460233407600244130ustar00rootroot00000000000000import numpy as np import pytest from scipy import sparse from sklearn.cluster import KMeans from sklearn.datasets import load_iris from sklearn.preprocessing import LabelBinarizer keras = pytest.importorskip("keras") from keras.layers import Dense # noqa: E402 from keras.models import Sequential # noqa: E402 from imblearn.datasets import make_imbalance # noqa: E402 from imblearn.keras import ( BalancedBatchGenerator, # noqa: E402 balanced_batch_generator, # noqa: E402 ) from imblearn.over_sampling import RandomOverSampler # noqa: E402 from imblearn.under_sampling import ( ClusterCentroids, # noqa: E402 NearMiss, # noqa: E402 ) 3 @pytest.fixture def data(): iris = load_iris() X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 30, 1: 50, 2: 40} ) y = LabelBinarizer().fit_transform(y) return X, y def _build_keras_model(n_classes, n_features): model = Sequential() model.add(Dense(n_classes, input_dim=n_features, activation="softmax")) model.compile( optimizer="sgd", loss="categorical_crossentropy", metrics=["accuracy"] ) return model def 
test_balanced_batch_generator_class_no_return_indices(data):
    # The generator only works with samplers exposing `sample_indices_`;
    # ClusterCentroids does not, so construction must raise.
    with pytest.raises(ValueError, match="needs to have an attribute"):
        BalancedBatchGenerator(
            *data, sampler=ClusterCentroids(estimator=KMeans(n_init=1)), batch_size=10
        )


@pytest.mark.filterwarnings("ignore:`wait_time` is not used")  # keras 2.2.4
@pytest.mark.parametrize(
    "sampler, sample_weight",
    [
        (None, None),
        (RandomOverSampler(), None),
        (NearMiss(), None),
        (None, np.random.uniform(size=120)),
    ],
)
def test_balanced_batch_generator_class(data, sampler, sample_weight):
    # Smoke test: fit a small keras model from the `Sequence`-based generator
    # for several sampler / sample-weight combinations.
    X, y = data
    model = _build_keras_model(y.shape[1], X.shape[1])
    training_generator = BalancedBatchGenerator(
        X,
        y,
        sample_weight=sample_weight,
        sampler=sampler,
        batch_size=10,
        random_state=42,
    )
    model.fit(training_generator, epochs=10)


@pytest.mark.parametrize("keep_sparse", [True, False])
def test_balanced_batch_generator_class_sparse(data, keep_sparse):
    # Batches must stay sparse only when `keep_sparse=True`.
    X, y = data
    training_generator = BalancedBatchGenerator(
        sparse.csr_matrix(X),
        y,
        batch_size=10,
        keep_sparse=keep_sparse,
        random_state=42,
    )
    for idx in range(len(training_generator)):
        X_batch, _ = training_generator.__getitem__(idx)
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)


def test_balanced_batch_generator_function_no_return_indices(data):
    # Same contract as the class API: the sampler must expose
    # `sample_indices_`, otherwise a ValueError is raised.
    with pytest.raises(ValueError, match="needs to have an attribute"):
        balanced_batch_generator(
            *data,
            sampler=ClusterCentroids(estimator=KMeans(n_init=10)),
            batch_size=10,
            random_state=42,
        )


@pytest.mark.filterwarnings("ignore:`wait_time` is not used")  # keras 2.2.4
@pytest.mark.parametrize(
    "sampler, sample_weight",
    [
        (None, None),
        (RandomOverSampler(), None),
        (NearMiss(), None),
        (None, np.random.uniform(size=120)),
    ],
)
def test_balanced_batch_generator_function(data, sampler, sample_weight):
    # Smoke test: fit a small keras model from the function-based generator.
    X, y = data
    model = _build_keras_model(y.shape[1], X.shape[1])
    training_generator, steps_per_epoch = balanced_batch_generator(
        X,
        y,
        sample_weight=sample_weight,
        sampler=sampler,
        batch_size=10,
        random_state=42,
    )
    model.fit(
        training_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=10,
    )


@pytest.mark.parametrize("keep_sparse", [True, False])
def test_balanced_batch_generator_function_sparse(data, keep_sparse):
    # Batches must stay sparse only when `keep_sparse=True`.
    X, y = data
    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X),
        y,
        keep_sparse=keep_sparse,
        batch_size=10,
        random_state=42,
    )
    for _ in range(steps_per_epoch):
        X_batch, _ = next(training_generator)
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
imbalanced-learn-0.12.2/imblearn/metrics/000077500000000000000000000000001460233407600202165ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/metrics/__init__.py000066400000000000000000000012021460233407600223240ustar00rootroot00000000000000"""
The :mod:`imblearn.metrics` module includes score functions, performance
metrics and pairwise metrics and distance computations.
"""

from ._classification import (
    classification_report_imbalanced,
    geometric_mean_score,
    macro_averaged_mean_absolute_error,
    make_index_balanced_accuracy,
    sensitivity_score,
    sensitivity_specificity_support,
    specificity_score,
)

__all__ = [
    "sensitivity_specificity_support",
    "sensitivity_score",
    "specificity_score",
    "geometric_mean_score",
    "make_index_balanced_accuracy",
    "classification_report_imbalanced",
    "macro_averaged_mean_absolute_error",
]
imbalanced-learn-0.12.2/imblearn/metrics/_classification.py000066400000000000000000001160271460233407600237310ustar00rootroot00000000000000# coding: utf-8
"""Metrics to assess performance on a classification task given class
predictions.

The available metrics are complementary from the metrics available in
scikit-learn.
Functions named as ``*_score`` return a scalar value to maximize: the higher the better Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: the lower the better """ # Authors: Guillaume Lemaitre # Dariusz Brzezinski # License: MIT import functools import numbers import warnings from inspect import signature import numpy as np import scipy as sp from sklearn.metrics import mean_absolute_error, precision_recall_fscore_support from sklearn.metrics._classification import _check_targets, _prf_divide from sklearn.preprocessing import LabelEncoder from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_consistent_length, column_or_1d from ..utils._param_validation import Interval, StrOptions, validate_params @validate_params( { "y_true": ["array-like"], "y_pred": ["array-like"], "labels": ["array-like", None], "pos_label": [str, numbers.Integral, None], "average": [ None, StrOptions({"binary", "micro", "macro", "weighted", "samples"}), ], "warn_for": ["array-like"], "sample_weight": ["array-like", None], }, prefer_skip_nested_validation=True, ) def sensitivity_specificity_support( y_true, y_pred, *, labels=None, pos_label=1, average=None, warn_for=("sensitivity", "specificity"), sample_weight=None, ): """Compute sensitivity, specificity, and support for each class. The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The sensitivity quantifies the ability to avoid false negatives_[1]. The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number of true negatives and ``fn`` the number of false negatives. The specificity quantifies the ability to avoid false positives_[1]. The support is the number of occurrences of each class in ``y_true``. If ``pos_label is None`` and in binary classification, this function returns the average sensitivity and specificity if ``average`` is one of ``'weighted'``. 
Read more in the :ref:`User Guide `. Parameters ---------- y_true : array-like of shape (n_samples,) Ground truth (correct) target values. y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. labels : array-like, default=None The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. If ``pos_label is None`` and in binary classification, this function returns the average sensitivity and specificity if ``average`` is one of ``'weighted'``. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str, default=None If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. 
``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). warn_for : tuple or set of {{"sensitivity", "specificity"}}, for internal use This determines which warnings will be made in the case that this function is being used to return only one of its metrics. sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns ------- sensitivity : float (if `average is None`) or ndarray of \ shape (n_unique_labels,) The sensitivity metric. specificity : float (if `average is None`) or ndarray of \ shape (n_unique_labels,) The specificity metric. support : int (if `average is None`) or ndarray of \ shape (n_unique_labels,) The number of occurrences of each label in ``y_true``. References ---------- .. [1] `Wikipedia entry for the Sensitivity and specificity `_ Examples -------- >>> import numpy as np >>> from imblearn.metrics import sensitivity_specificity_support >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) >>> sensitivity_specificity_support(y_true, y_pred, average='macro') (0.33..., 0.66..., None) >>> sensitivity_specificity_support(y_true, y_pred, average='micro') (0.33..., 0.66..., None) >>> sensitivity_specificity_support(y_true, y_pred, average='weighted') (0.33..., 0.66..., None) """ average_options = (None, "micro", "macro", "weighted", "samples") if average not in average_options and average != "binary": raise ValueError("average has to be one of " + str(average_options)) y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) if average == "binary": if y_type == "binary": if pos_label not in present_labels: if len(present_labels) < 2: # Only negative labels return (0.0, 0.0, 0) else: raise ValueError( "pos_label=%r is not a valid label: %r" % (pos_label, present_labels) ) labels = [pos_label] 
else: raise ValueError( "Target is %s but average='binary'. Please " "choose another average setting." % y_type ) elif pos_label not in (None, 1): warnings.warn( "Note that pos_label (set to %r) is ignored when " "average != 'binary' (got %r). You may use " "labels=[pos_label] to specify a single positive class." % (pos_label, average), UserWarning, ) if labels is None: labels = present_labels n_labels = None else: n_labels = len(labels) labels = np.hstack( [labels, np.setdiff1d(present_labels, labels, assume_unique=True)] ) # Calculate tp_sum, pred_sum, true_sum ### if y_type.startswith("multilabel"): raise ValueError("imblearn does not support multilabel") elif average == "samples": raise ValueError( "Sample-based precision, recall, fscore is " "not meaningful outside multilabel " "classification. See the accuracy_score instead." ) else: le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) sorted_labels = le.classes_ # labels are now from 0 to len(labels) - 1 -> use bincount tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None if len(tp_bins): tp_sum = np.bincount( tp_bins, weights=tp_bins_weights, minlength=len(labels) ) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): pred_sum = np.bincount(y_pred, weights=sample_weight, minlength=len(labels)) if len(y_true): true_sum = np.bincount(y_true, weights=sample_weight, minlength=len(labels)) # Compute the true negative tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] pred_sum = pred_sum[indices] tn_sum = tn_sum[indices] if average == "micro": tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) true_sum = np.array([true_sum.sum()]) tn_sum = np.array([tn_sum.sum()]) # 
Finally, we have all our sufficient statistics. Divide! # with np.errstate(divide="ignore", invalid="ignore"): # Divide, and on zero-division, set scores to 0 and warn: # Oddly, we may get an "invalid" rather than a "divide" error # here. specificity = _prf_divide( tn_sum, tn_sum + pred_sum - tp_sum, "specificity", "predicted", average, warn_for, ) sensitivity = _prf_divide( tp_sum, true_sum, "sensitivity", "true", average, warn_for ) # Average the results if average == "weighted": weights = true_sum if weights.sum() == 0: return 0, 0, None elif average == "samples": weights = sample_weight else: weights = None if average is not None: assert average != "binary" or len(specificity) == 1 specificity = np.average(specificity, weights=weights) sensitivity = np.average(sensitivity, weights=weights) true_sum = None # return no support return sensitivity, specificity, true_sum @validate_params( { "y_true": ["array-like"], "y_pred": ["array-like"], "labels": ["array-like", None], "pos_label": [str, numbers.Integral, None], "average": [ None, StrOptions({"binary", "micro", "macro", "weighted", "samples"}), ], "sample_weight": ["array-like", None], }, prefer_skip_nested_validation=True, ) def sensitivity_score( y_true, y_pred, *, labels=None, pos_label=1, average="binary", sample_weight=None, ): """Compute the sensitivity. The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The sensitivity quantifies the ability to avoid false negatives. The best value is 1 and the worst value is 0. Read more in the :ref:`User Guide `. Parameters ---------- y_true : array-like of shape (n_samples,) Ground truth (correct) target values. y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. labels : array-like, default=None The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. 
Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average.

    pos_label : str, int or None, default=1
        The class to report if ``average='binary'`` and the data is binary.
        If ``pos_label is None`` and in binary classification, this function
        returns the average sensitivity if ``average`` is one of
        ``'weighted'``.
        If the data are multiclass, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : str, default=None
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    sensitivity : float (if `average is None`) or ndarray of \
            shape (n_unique_labels,)
        The sensitivity metric.

    
Examples -------- >>> import numpy as np >>> from imblearn.metrics import sensitivity_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> sensitivity_score(y_true, y_pred, average='macro') 0.33... >>> sensitivity_score(y_true, y_pred, average='micro') 0.33... >>> sensitivity_score(y_true, y_pred, average='weighted') 0.33... >>> sensitivity_score(y_true, y_pred, average=None) array([1., 0., 0.]) """ s, _, _ = sensitivity_specificity_support( y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=("sensitivity",), sample_weight=sample_weight, ) return s @validate_params( { "y_true": ["array-like"], "y_pred": ["array-like"], "labels": ["array-like", None], "pos_label": [str, numbers.Integral, None], "average": [ None, StrOptions({"binary", "micro", "macro", "weighted", "samples"}), ], "sample_weight": ["array-like", None], }, prefer_skip_nested_validation=True, ) def specificity_score( y_true, y_pred, *, labels=None, pos_label=1, average="binary", sample_weight=None, ): """Compute the specificity. The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number of true negatives and ``fp`` the number of false positives. The specificity quantifies the ability to avoid false positives. The best value is 1 and the worst value is 0. Read more in the :ref:`User Guide `. Parameters ---------- y_true : array-like of shape (n_samples,) Ground truth (correct) target values. y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. labels : array-like, default=None The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. 
If ``pos_label is None`` and in binary classification, this function returns the average specificity if ``average`` is one of ``'weighted'``. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str, default=None If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns ------- specificity : float (if `average is None`) or ndarray of \ shape (n_unique_labels,) The specificity metric. Examples -------- >>> import numpy as np >>> from imblearn.metrics import specificity_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> specificity_score(y_true, y_pred, average='macro') 0.66... >>> specificity_score(y_true, y_pred, average='micro') 0.66... >>> specificity_score(y_true, y_pred, average='weighted') 0.66... 
>>> specificity_score(y_true, y_pred, average=None) array([0.75, 0.5 , 0.75]) """ _, s, _ = sensitivity_specificity_support( y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=("specificity",), sample_weight=sample_weight, ) return s @validate_params( { "y_true": ["array-like"], "y_pred": ["array-like"], "labels": ["array-like", None], "pos_label": [str, numbers.Integral, None], "average": [ None, StrOptions( {"binary", "micro", "macro", "weighted", "samples", "multiclass"} ), ], "sample_weight": ["array-like", None], "correction": [Interval(numbers.Real, 0, None, closed="left")], }, prefer_skip_nested_validation=True, ) def geometric_mean_score( y_true, y_pred, *, labels=None, pos_label=1, average="multiclass", sample_weight=None, correction=0.0, ): """Compute the geometric mean. The geometric mean (G-mean) is the root of the product of class-wise sensitivity. This measure tries to maximize the accuracy on each of the classes while keeping these accuracies balanced. For binary classification G-mean is the squared root of the product of the sensitivity and specificity. For multi-class problems it is a higher root of the product of sensitivity for each class. For compatibility with other imbalance performance measures, G-mean can be calculated for each class separately on a one-vs-rest basis when ``average != 'multiclass'``. The best value is 1 and the worst value is 0. Traditionally if at least one class is unrecognized by the classifier, G-mean resolves to zero. To alleviate this property, for highly multi-class the sensitivity of unrecognized classes can be "corrected" to be a user specified value (instead of zero). This option works only if ``average == 'multiclass'``. Read more in the :ref:`User Guide `. Parameters ---------- y_true : array-like of shape (n_samples,) Ground truth (correct) target values. y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. 
labels : array-like, default=None The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. If ``pos_label is None`` and in binary classification, this function returns the average geometric mean if ``average`` is one of ``'weighted'``. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str or None, default='multiclass' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'multiclass'``: No average is taken. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). sample_weight : array-like of shape (n_samples,), default=None Sample weights. correction : float, default=0.0 Substitutes sensitivity of unrecognized classes from zero to a given value. 
Returns ------- geometric_mean : float Returns the geometric mean. Notes ----- See :ref:`sphx_glr_auto_examples_evaluation_plot_metrics.py`. References ---------- .. [1] Kubat, M. and Matwin, S. "Addressing the curse of imbalanced training sets: one-sided selection" ICML (1997) .. [2] Barandela, R., Sánchez, J. S., Garcıa, V., & Rangel, E. "Strategies for learning in class imbalance problems", Pattern Recognition, 36(3), (2003), pp 849-851. Examples -------- >>> from imblearn.metrics import geometric_mean_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> geometric_mean_score(y_true, y_pred) 0.0 >>> geometric_mean_score(y_true, y_pred, correction=0.001) 0.010... >>> geometric_mean_score(y_true, y_pred, average='macro') 0.471... >>> geometric_mean_score(y_true, y_pred, average='micro') 0.471... >>> geometric_mean_score(y_true, y_pred, average='weighted') 0.471... >>> geometric_mean_score(y_true, y_pred, average=None) array([0.866..., 0. , 0. ]) """ if average is None or average != "multiclass": sen, spe, _ = sensitivity_specificity_support( y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=("specificity", "specificity"), sample_weight=sample_weight, ) return np.sqrt(sen * spe) else: present_labels = unique_labels(y_true, y_pred) if labels is None: labels = present_labels n_labels = None else: n_labels = len(labels) labels = np.hstack( [labels, np.setdiff1d(present_labels, labels, assume_unique=True)] ) le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) sorted_labels = le.classes_ # labels are now from 0 to len(labels) - 1 -> use bincount tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None if len(tp_bins): tp_sum = np.bincount( tp_bins, weights=tp_bins_weights, minlength=len(labels) ) else: # Pathological case true_sum = tp_sum = np.zeros(len(labels)) if len(y_true): true_sum = 
np.bincount(y_true, weights=sample_weight, minlength=len(labels)) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] with np.errstate(divide="ignore", invalid="ignore"): recall = _prf_divide(tp_sum, true_sum, "recall", "true", None, "recall") recall[recall == 0] = correction with np.errstate(divide="ignore", invalid="ignore"): gmean = sp.stats.gmean(recall) # old version of scipy return MaskedConstant instead of 0.0 if isinstance(gmean, np.ma.core.MaskedConstant): return 0.0 return gmean @validate_params( {"alpha": [numbers.Real], "squared": ["boolean"]}, prefer_skip_nested_validation=True, ) def make_index_balanced_accuracy(*, alpha=0.1, squared=True): """Balance any scoring function using the index balanced accuracy. This factory function wraps scoring function to express it as the index balanced accuracy (IBA). You need to use this function to decorate any scoring function. Only metrics requiring ``y_pred`` can be corrected with the index balanced accuracy. ``y_score`` cannot be used since the dominance cannot be computed. Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, default=0.1 Weighting factor. squared : bool, default=True If ``squared`` is True, then the metric computed will be squared before to be weighted. Returns ------- iba_scoring_func : callable, Returns the scoring metric decorated which will automatically compute the index balanced accuracy. Notes ----- See :ref:`sphx_glr_auto_examples_evaluation_plot_metrics.py`. References ---------- .. [1] García, Vicente, Javier Salvador Sánchez, and Ramón Alberto Mollineda. "On the effectiveness of preprocessing methods when dealing with different levels of class imbalance." Knowledge-Based Systems 25.1 (2012): 13-21. 
Examples -------- >>> from imblearn.metrics import geometric_mean_score as gmean >>> from imblearn.metrics import make_index_balanced_accuracy as iba >>> gmean = iba(alpha=0.1, squared=True)(gmean) >>> y_true = [1, 0, 0, 1, 0, 1] >>> y_pred = [0, 0, 1, 1, 0, 1] >>> print(gmean(y_true, y_pred, average=None)) [0.44... 0.44...] """ def decorate(scoring_func): @functools.wraps(scoring_func) def compute_score(*args, **kwargs): signature_scoring_func = signature(scoring_func) params_scoring_func = set(signature_scoring_func.parameters.keys()) # check that the scoring function does not need a score # and only a prediction prohibitied_y_pred = set(["y_score", "y_prob", "y2"]) if prohibitied_y_pred.intersection(params_scoring_func): raise AttributeError( f"The function {scoring_func.__name__} has an unsupported" f" attribute. Metric with`y_pred` are the" f" only supported metrics is the only" f" supported." ) args_scoring_func = signature_scoring_func.bind(*args, **kwargs) args_scoring_func.apply_defaults() _score = scoring_func(*args_scoring_func.args, **args_scoring_func.kwargs) if squared: _score = np.power(_score, 2) signature_sens_spec = signature(sensitivity_specificity_support) params_sens_spec = set(signature_sens_spec.parameters.keys()) common_params = params_sens_spec.intersection( set(args_scoring_func.arguments.keys()) ) args_sens_spec = {k: args_scoring_func.arguments[k] for k in common_params} if scoring_func.__name__ == "geometric_mean_score": if "average" in args_sens_spec: if args_sens_spec["average"] == "multiclass": args_sens_spec["average"] = "macro" elif ( scoring_func.__name__ == "accuracy_score" or scoring_func.__name__ == "jaccard_score" ): # We do not support multilabel so the only average supported # is binary args_sens_spec["average"] = "binary" sensitivity, specificity, _ = sensitivity_specificity_support( **args_sens_spec ) dominance = sensitivity - specificity return (1.0 + alpha * dominance) * _score return compute_score return decorate 
@validate_params( { "y_true": ["array-like"], "y_pred": ["array-like"], "labels": ["array-like", None], "target_names": ["array-like", None], "sample_weight": ["array-like", None], "digits": [Interval(numbers.Integral, 0, None, closed="left")], "alpha": [numbers.Real], "output_dict": ["boolean"], "zero_division": [ StrOptions({"warn"}), Interval(numbers.Integral, 0, 1, closed="both"), ], }, prefer_skip_nested_validation=True, ) def classification_report_imbalanced( y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, alpha=0.1, output_dict=False, zero_division="warn", ): """Build a classification report based on metrics used with imbalanced dataset. Specific metrics have been proposed to evaluate the classification performed on imbalanced dataset. This report compiles the state-of-the-art metrics: precision/recall/specificity, geometric mean, and index balanced accuracy of the geometric mean. Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) target values. y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. labels : array-like of shape (n_labels,), default=None Optional list of label indices to include in the report. target_names : list of str of shape (n_labels,), default=None Optional display names matching the labels (same order). sample_weight : array-like of shape (n_samples,), default=None Sample weights. digits : int, default=2 Number of digits for formatting output floating point values. When ``output_dict`` is ``True``, this will be ignored and the returned values will not be rounded. alpha : float, default=0.1 Weighting factor. output_dict : bool, default=False If True, return output as dict. .. versionadded:: 0.8 zero_division : "warn" or {0, 1}, default="warn" Sets the value to return when there is a zero division. 
If set to "warn", this acts as 0, but warnings are also raised. .. versionadded:: 0.8 Returns ------- report : string / dict Text summary of the precision, recall, specificity, geometric mean, and index balanced accuracy. Dictionary returned if output_dict is True. Dictionary has the following structure:: {'label 1': {'pre':0.5, 'rec':1.0, ... }, 'label 2': { ... }, ... } Examples -------- >>> import numpy as np >>> from imblearn.metrics import classification_report_imbalanced >>> y_true = [0, 1, 2, 2, 2] >>> y_pred = [0, 0, 2, 2, 1] >>> target_names = ['class 0', 'class 1', 'class 2'] >>> print(classification_report_imbalanced(y_true, y_pred, \ target_names=target_names)) pre rec spe f1 geo iba\ sup class 0 0.50 1.00 0.75 0.67 0.87 0.77\ 1 class 1 0.00 0.00 0.75 0.00 0.00 0.00\ 1 class 2 1.00 0.67 1.00 0.80 0.82 0.64\ 3 avg / total 0.70 0.60 0.90 0.61 0.66 0.54\ 5 """ if labels is None: labels = unique_labels(y_true, y_pred) else: labels = np.asarray(labels) last_line_heading = "avg / total" if target_names is None: target_names = [f"{label}" for label in labels] name_width = max(len(cn) for cn in target_names) width = max(name_width, len(last_line_heading), digits) headers = ["pre", "rec", "spe", "f1", "geo", "iba", "sup"] fmt = "%% %ds" % width # first column: class name fmt += " " fmt += " ".join(["% 9s" for _ in headers]) fmt += "\n" headers = [""] + headers report = fmt % tuple(headers) report += "\n" # Compute the different metrics # Precision/recall/f1 precision, recall, f1, support = precision_recall_fscore_support( y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight, zero_division=zero_division, ) # Specificity specificity = specificity_score( y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight, ) # Geometric mean geo_mean = geometric_mean_score( y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight, ) # Index balanced accuracy iba_gmean = make_index_balanced_accuracy(alpha=alpha, squared=True)( 
geometric_mean_score ) iba = iba_gmean( y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight, ) report_dict = {} for i, label in enumerate(labels): report_dict_label = {} values = [target_names[i]] for score_name, score_value in zip( headers[1:-1], [ precision[i], recall[i], specificity[i], f1[i], geo_mean[i], iba[i], ], ): values += ["{0:0.{1}f}".format(score_value, digits)] report_dict_label[score_name] = score_value values += [f"{support[i]}"] report_dict_label[headers[-1]] = support[i] report += fmt % tuple(values) report_dict[target_names[i]] = report_dict_label report += "\n" # compute averages values = [last_line_heading] for score_name, score_value in zip( headers[1:-1], [ np.average(precision, weights=support), np.average(recall, weights=support), np.average(specificity, weights=support), np.average(f1, weights=support), np.average(geo_mean, weights=support), np.average(iba, weights=support), ], ): values += ["{0:0.{1}f}".format(score_value, digits)] report_dict[f"avg_{score_name}"] = score_value values += [f"{np.sum(support)}"] report += fmt % tuple(values) report_dict["total_support"] = np.sum(support) if output_dict: return report_dict return report @validate_params( { "y_true": ["array-like"], "y_pred": ["array-like"], "sample_weight": ["array-like", None], }, prefer_skip_nested_validation=True, ) def macro_averaged_mean_absolute_error(y_true, y_pred, *, sample_weight=None): """Compute Macro-Averaged MAE for imbalanced ordinal classification. This function computes each MAE for each class and average them, giving an equal weight to each class. Read more in the :ref:`User Guide `. .. versionadded:: 0.8 Parameters ---------- y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) Ground truth (correct) target values. y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) Estimated targets as returned by a classifier. sample_weight : array-like of shape (n_samples,), default=None Sample weights. 
Returns ------- loss : float or ndarray of floats Macro-Averaged MAE output is non-negative floating point. The best value is 0.0. Examples -------- >>> import numpy as np >>> from sklearn.metrics import mean_absolute_error >>> from imblearn.metrics import macro_averaged_mean_absolute_error >>> y_true_balanced = [1, 1, 2, 2] >>> y_true_imbalanced = [1, 2, 2, 2] >>> y_pred = [1, 2, 1, 2] >>> mean_absolute_error(y_true_balanced, y_pred) 0.5 >>> mean_absolute_error(y_true_imbalanced, y_pred) 0.25 >>> macro_averaged_mean_absolute_error(y_true_balanced, y_pred) 0.5 >>> macro_averaged_mean_absolute_error(y_true_imbalanced, y_pred) 0.16... """ _, y_true, y_pred = _check_targets(y_true, y_pred) if sample_weight is not None: sample_weight = column_or_1d(sample_weight) else: sample_weight = np.ones(y_true.shape) check_consistent_length(y_true, y_pred, sample_weight) labels = unique_labels(y_true, y_pred) mae = [] for possible_class in labels: indices = np.flatnonzero(y_true == possible_class) mae.append( mean_absolute_error( y_true[indices], y_pred[indices], sample_weight=sample_weight[indices], ) ) return np.sum(mae) / len(mae) imbalanced-learn-0.12.2/imblearn/metrics/pairwise.py000066400000000000000000000207361460233407600224230ustar00rootroot00000000000000"""Metrics to perform pairwise computation.""" # Authors: Guillaume Lemaitre # License: MIT import numbers import numpy as np from scipy.spatial import distance_matrix from sklearn.base import BaseEstimator from sklearn.utils import check_consistent_length from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_is_fitted from ..base import _ParamsValidationMixin from ..utils._param_validation import StrOptions class ValueDifferenceMetric(_ParamsValidationMixin, BaseEstimator): r"""Class implementing the Value Difference Metric. This metric computes the distance between samples containing only categorical features. The distance between feature values of two samples is defined as: .. 
math::
        \delta(x, y) = \sum_{c=1}^{C} |p(c|x_{f}) - p(c|y_{f})|^{k} \ ,

    where :math:`x` and :math:`y` are two samples and :math:`f` a given
    feature, :math:`C` is the number of classes, :math:`p(c|x_{f})` is the
    conditional probability that the output class is :math:`c` given that the
    feature value :math:`f` has the value :math:`x` and :math:`k` an exponent
    usually defined to 1 or 2.

    The distance for the feature vectors :math:`X` and :math:`Y` is
    subsequently defined as:

    .. math::
        \Delta(X, Y) = \sum_{f=1}^{F} \delta(X_{f}, Y_{f})^{r} \ ,

    where :math:`F` is the number of features and :math:`r` an exponent
    usually defined equal to 1 or 2.

    The definition of this distance was proposed in [1]_.

    Read more in the :ref:`User Guide <vdm>`.

    .. versionadded:: 0.8

    Parameters
    ----------
    n_categories : "auto" or array-like of shape (n_features,), default="auto"
        The number of unique categories per feature. If `"auto"`, the number
        of categories will be computed from `X` at `fit`. Otherwise, you can
        provide an array-like of such counts to avoid computation. You can use
        the fitted attribute `categories_` of the
        :class:`~sklearn.preprocessing.OrdinalEncoder` to deduce these counts.

    k : int, default=1
        Exponent used to compute the distance between feature values.

    r : int, default=2
        Exponent used to compute the distance between the feature vectors.

    Attributes
    ----------
    n_categories_ : ndarray of shape (n_features,)
        The number of categories per feature.

    proba_per_class_ : list of ndarray of shape (n_categories, n_classes)
        List of length `n_features` containing the conditional probabilities
        for each category given a class.

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.10

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 0.10

    See Also
    --------
    sklearn.neighbors.DistanceMetric : Interface for fast metric computation.

    
Notes ----- The input data `X` are expected to be encoded by an :class:`~sklearn.preprocessing.OrdinalEncoder` and the data type is used should be `np.int32`. If other data types are given, `X` will be converted to `np.int32`. References ---------- .. [1] Stanfill, Craig, and David Waltz. "Toward memory-based reasoning." Communications of the ACM 29.12 (1986): 1213-1228. Examples -------- >>> import numpy as np >>> X = np.array(["green"] * 10 + ["red"] * 10 + ["blue"] * 10).reshape(-1, 1) >>> y = [1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1] >>> from sklearn.preprocessing import OrdinalEncoder >>> encoder = OrdinalEncoder(dtype=np.int32) >>> X_encoded = encoder.fit_transform(X) >>> from imblearn.metrics.pairwise import ValueDifferenceMetric >>> vdm = ValueDifferenceMetric().fit(X_encoded, y) >>> pairwise_distance = vdm.pairwise(X_encoded) >>> pairwise_distance.shape (30, 30) >>> X_test = np.array(["green", "red", "blue"]).reshape(-1, 1) >>> X_test_encoded = encoder.transform(X_test) >>> vdm.pairwise(X_test_encoded) array([[0. , 0.04, 1.96], [0.04, 0. , 1.44], [1.96, 1.44, 0. ]]) """ _parameter_constraints: dict = { "n_categories": [StrOptions({"auto"}), "array-like"], "k": [numbers.Integral], "r": [numbers.Integral], } def __init__(self, *, n_categories="auto", k=1, r=2): self.n_categories = n_categories self.k = k self.r = r def fit(self, X, y): """Compute the necessary statistics from the training set. Parameters ---------- X : ndarray of shape (n_samples, n_features), dtype=np.int32 The input data. The data are expected to be encoded with a :class:`~sklearn.preprocessing.OrdinalEncoder`. y : ndarray of shape (n_features,) The target. Returns ------- self : object Return the instance itself. 
""" self._validate_params() check_consistent_length(X, y) X, y = self._validate_data(X, y, reset=True, dtype=np.int32) if isinstance(self.n_categories, str) and self.n_categories == "auto": # categories are expected to be encoded from 0 to n_categories - 1 self.n_categories_ = X.max(axis=0) + 1 else: if len(self.n_categories) != self.n_features_in_: raise ValueError( f"The length of n_categories is not consistent with the " f"number of feature in X. Got {len(self.n_categories)} " f"elements in n_categories and {self.n_features_in_} in " f"X." ) self.n_categories_ = np.array(self.n_categories, copy=False) classes = unique_labels(y) # list of length n_features of ndarray (n_categories, n_classes) # compute the counts self.proba_per_class_ = [ np.empty(shape=(n_cat, len(classes)), dtype=np.float64) for n_cat in self.n_categories_ ] for feature_idx in range(self.n_features_in_): for klass_idx, klass in enumerate(classes): self.proba_per_class_[feature_idx][:, klass_idx] = np.bincount( X[y == klass, feature_idx], minlength=self.n_categories_[feature_idx], ) # normalize by the summing over the classes with np.errstate(invalid="ignore"): # silence potential warning due to in-place division by zero for feature_idx in range(self.n_features_in_): self.proba_per_class_[feature_idx] /= ( self.proba_per_class_[feature_idx].sum(axis=1).reshape(-1, 1) ) np.nan_to_num(self.proba_per_class_[feature_idx], copy=False) return self def pairwise(self, X, Y=None): """Compute the VDM distance pairwise. Parameters ---------- X : ndarray of shape (n_samples, n_features), dtype=np.int32 The input data. The data are expected to be encoded with a :class:`~sklearn.preprocessing.OrdinalEncoder`. Y : ndarray of shape (n_samples, n_features), dtype=np.int32 The input data. The data are expected to be encoded with a :class:`~sklearn.preprocessing.OrdinalEncoder`. Returns ------- distance_matrix : ndarray of shape (n_samples, n_samples) The VDM pairwise distance. 
""" check_is_fitted(self) X = self._validate_data(X, reset=False, dtype=np.int32) n_samples_X = X.shape[0] if Y is not None: Y = self._validate_data(Y, reset=False, dtype=np.int32) n_samples_Y = Y.shape[0] else: n_samples_Y = n_samples_X distance = np.zeros(shape=(n_samples_X, n_samples_Y), dtype=np.float64) for feature_idx in range(self.n_features_in_): proba_feature_X = self.proba_per_class_[feature_idx][X[:, feature_idx]] if Y is not None: proba_feature_Y = self.proba_per_class_[feature_idx][Y[:, feature_idx]] else: proba_feature_Y = proba_feature_X distance += ( distance_matrix(proba_feature_X, proba_feature_Y, p=self.k) ** self.r ) return distance def _more_tags(self): return { "requires_positive_X": True, # X should be encoded with OrdinalEncoder } imbalanced-learn-0.12.2/imblearn/metrics/tests/000077500000000000000000000000001460233407600213605ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/metrics/tests/__init__.py000066400000000000000000000000001460233407600234570ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/metrics/tests/test_classification.py000066400000000000000000000427701460233407600257760ustar00rootroot00000000000000# coding: utf-8 """Testing the metric for classification with imbalanced dataset""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from functools import partial import numpy as np import pytest from sklearn import datasets, svm from sklearn.metrics import ( accuracy_score, average_precision_score, brier_score_loss, cohen_kappa_score, jaccard_score, precision_score, recall_score, roc_auc_score, ) from sklearn.preprocessing import label_binarize from sklearn.utils._testing import ( assert_allclose, assert_array_equal, assert_no_warnings, ) from sklearn.utils.validation import check_random_state from imblearn.metrics import ( classification_report_imbalanced, geometric_mean_score, macro_averaged_mean_absolute_error, make_index_balanced_accuracy, sensitivity_score, sensitivity_specificity_support, 
specificity_score, ) RND_SEED = 42 R_TOL = 1e-2 ############################################################################### # Utilities for testing def make_prediction(dataset=None, binary=False): """Make some classification predictions on a toy dataset using a SVC If binary is True restrict to a binary classification problem instead of a multiclass classification problem """ if dataset is None: # import some data to play with dataset = datasets.load_iris() X = dataset.data y = dataset.target if binary: # restrict to a binary classification task X, y = X[y < 2], y[y < 2] n_samples, n_features = X.shape p = np.arange(n_samples) rng = check_random_state(37) rng.shuffle(p) X, y = X[p], y[p] half = int(n_samples / 2) # add noisy features to make the problem harder and avoid perfect results rng = np.random.RandomState(0) X = np.c_[X, rng.randn(n_samples, 200 * n_features)] # run classifier, get class probabilities and label predictions clf = svm.SVC(kernel="linear", probability=True, random_state=0) probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) if binary: # only interested in probabilities of the positive case # XXX: do we really want a special API for the binary case? probas_pred = probas_pred[:, 1] y_pred = clf.predict(X[half:]) y_true = y[half:] return y_true, y_pred, probas_pred ############################################################################### # Tests def test_sensitivity_specificity_score_binary(): y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class sen, spe, sup = sensitivity_specificity_support(y_true, y_pred, average=None) assert_allclose(sen, [0.88, 0.68], rtol=R_TOL) assert_allclose(spe, [0.68, 0.88], rtol=R_TOL) assert_array_equal(sup, [25, 25]) # individual scoring function that can be used for grid search: in the # binary class case the score is the value of the measure for the positive # class (e.g. label == 1). This is deprecated for average != 'binary'. 
for kwargs in ({}, {"average": "binary"}): sen = assert_no_warnings(sensitivity_score, y_true, y_pred, **kwargs) assert sen == pytest.approx(0.68, rel=R_TOL) spe = assert_no_warnings(specificity_score, y_true, y_pred, **kwargs) assert spe == pytest.approx(0.88, rel=R_TOL) @pytest.mark.filterwarnings("ignore:Specificity is ill-defined") @pytest.mark.parametrize( "y_pred, expected_sensitivity, expected_specificity", [(([1, 1], [1, 1]), 1.0, 0.0), (([-1, -1], [-1, -1]), 0.0, 0.0)], ) def test_sensitivity_specificity_f_binary_single_class( y_pred, expected_sensitivity, expected_specificity ): # Such a case may occur with non-stratified cross-validation assert sensitivity_score(*y_pred) == expected_sensitivity assert specificity_score(*y_pred) == expected_specificity @pytest.mark.parametrize( "average, expected_specificty", [ (None, [1.0, 0.67, 1.0, 1.0, 1.0]), ("macro", np.mean([1.0, 0.67, 1.0, 1.0, 1.0])), ("micro", 15 / 16), ], ) def test_sensitivity_specificity_extra_labels(average, expected_specificty): y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] actual = specificity_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], average=average) assert_allclose(expected_specificty, actual, rtol=R_TOL) def test_sensitivity_specificity_ignored_labels(): y_true = [1, 1, 2, 3] y_pred = [1, 3, 3, 3] specificity_13 = partial(specificity_score, y_true, y_pred, labels=[1, 3]) specificity_all = partial(specificity_score, y_true, y_pred, labels=None) assert_allclose([1.0, 0.33], specificity_13(average=None), rtol=R_TOL) assert_allclose(np.mean([1.0, 0.33]), specificity_13(average="macro"), rtol=R_TOL) assert_allclose( np.average([1.0, 0.33], weights=[2.0, 1.0]), specificity_13(average="weighted"), rtol=R_TOL, ) assert_allclose(3.0 / (3.0 + 2.0), specificity_13(average="micro"), rtol=R_TOL) # ensure the above were meaningful tests: for each in ["macro", "weighted", "micro"]: assert specificity_13(average=each) != specificity_all(average=each) def 
test_sensitivity_specificity_error_multilabels(): y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] y_true_bin = label_binarize(y_true, classes=np.arange(5)) y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) with pytest.raises(ValueError): sensitivity_score(y_true_bin, y_pred_bin) def test_sensitivity_specificity_support_errors(): y_true, y_pred, _ = make_prediction(binary=True) # Bad pos_label with pytest.raises(ValueError): sensitivity_specificity_support(y_true, y_pred, pos_label=2, average="binary") # Bad average option with pytest.raises(ValueError): sensitivity_specificity_support([0, 1, 2], [1, 2, 0], average="mega") def test_sensitivity_specificity_unused_pos_label(): # but average != 'binary'; even if data is binary msg = r"use labels=\[pos_label\] to specify a single" with pytest.warns(UserWarning, match=msg): sensitivity_specificity_support( [1, 2, 1], [1, 2, 2], pos_label=2, average="macro" ) def test_geometric_mean_support_binary(): y_true, y_pred, _ = make_prediction(binary=True) # compute the geometric mean for the binary problem geo_mean = geometric_mean_score(y_true, y_pred) assert_allclose(geo_mean, 0.77, rtol=R_TOL) @pytest.mark.filterwarnings("ignore:Recall is ill-defined") @pytest.mark.parametrize( "y_true, y_pred, correction, expected_gmean", [ ([0, 0, 1, 1], [0, 0, 1, 1], 0.0, 1.0), ([0, 0, 0, 0], [1, 1, 1, 1], 0.0, 0.0), ([0, 0, 0, 0], [0, 0, 0, 0], 0.001, 1.0), ([0, 0, 0, 0], [1, 1, 1, 1], 0.001, 0.001), ([0, 0, 1, 1], [0, 1, 1, 0], 0.001, 0.5), ( [0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], 0.001, (0.001**2) ** (1 / 3), ), ([0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], 0.001, 1), ([0, 1, 1, 1, 1, 0], [0, 0, 1, 1, 1, 1], 0.001, (0.5 * 0.75) ** 0.5), ], ) def test_geometric_mean_multiclass(y_true, y_pred, correction, expected_gmean): gmean = geometric_mean_score(y_true, y_pred, correction=correction) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) @pytest.mark.filterwarnings("ignore:Recall is ill-defined") @pytest.mark.parametrize( "y_true, 
y_pred, average, expected_gmean", [ ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "macro", 0.471), ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "micro", 0.471), ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "weighted", 0.471), ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], None, [0.8660254, 0.0, 0.0]), ], ) def test_geometric_mean_average(y_true, y_pred, average, expected_gmean): gmean = geometric_mean_score(y_true, y_pred, average=average) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) @pytest.mark.parametrize( "y_true, y_pred, sample_weight, average, expected_gmean", [ ([0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], None, "multiclass", 0.707), ( [0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], [1, 2, 1, 1, 2, 1], "multiclass", 0.707, ), ( [0, 1, 2, 0, 1, 2], [0, 1, 1, 0, 0, 1], [1, 2, 1, 1, 2, 1], "weighted", 0.333, ), ], ) def test_geometric_mean_sample_weight( y_true, y_pred, sample_weight, average, expected_gmean ): gmean = geometric_mean_score( y_true, y_pred, labels=[0, 1], sample_weight=sample_weight, average=average, ) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) @pytest.mark.parametrize( "average, expected_gmean", [ ("multiclass", 0.41), (None, [0.85, 0.29, 0.7]), ("macro", 0.68), ("weighted", 0.65), ], ) def test_geometric_mean_score_prediction(average, expected_gmean): y_true, y_pred, _ = make_prediction(binary=False) gmean = geometric_mean_score(y_true, y_pred, average=average) assert gmean == pytest.approx(expected_gmean, rel=R_TOL) def test_iba_geo_mean_binary(): y_true, y_pred, _ = make_prediction(binary=True) iba_gmean = make_index_balanced_accuracy(alpha=0.5, squared=True)( geometric_mean_score ) iba = iba_gmean(y_true, y_pred) assert_allclose(iba, 0.5948, rtol=R_TOL) def _format_report(report): return " ".join(report.split()) def test_classification_report_imbalanced_multiclass(): iris = datasets.load_iris() y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names expected_report = ( "pre rec spe f1 geo iba 
sup setosa 0.83 0.79 0.92 " "0.81 0.85 0.72 24 versicolor 0.33 0.10 0.86 0.15 " "0.29 0.08 31 virginica 0.42 0.90 0.55 0.57 0.70 " "0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced( y_true, y_pred, labels=np.arange(len(iris.target_names)), target_names=iris.target_names, ) assert _format_report(report) == expected_report # print classification report with label detection expected_report = ( "pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 " "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total " "0.51 0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report def test_classification_report_imbalanced_multiclass_with_digits(): iris = datasets.load_iris() y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names expected_report = ( "pre rec spe f1 geo iba sup setosa 0.82609 0.79167 " "0.92157 0.80851 0.85415 0.72010 24 versicolor " "0.33333 0.09677 0.86364 0.15000 0.28910 0.07717 " "31 virginica 0.41860 0.90000 0.54545 0.57143 0.70065 " "0.50831 20 avg / total 0.51375 0.53333 0.79733 " "0.47310 0.57966 0.39788 75" ) report = classification_report_imbalanced( y_true, y_pred, labels=np.arange(len(iris.target_names)), target_names=iris.target_names, digits=5, ) assert _format_report(report) == expected_report # print classification report with label detection expected_report = ( "pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 " "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 " "0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report def test_classification_report_imbalanced_multiclass_with_string_label(): y_true, y_pred, _ = make_prediction(binary=False) y_true = np.array(["blue", "green", "red"])[y_true] 
y_pred = np.array(["blue", "green", "red"])[y_pred] expected_report = ( "pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 " "0.85 0.72 24 green 0.33 0.10 0.86 0.15 0.29 0.08 31 " "red 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total " "0.51 0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report expected_report = ( "pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 0.85 " "0.72 24 b 0.33 0.10 0.86 0.15 0.29 0.08 31 c 0.42 " "0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 0.53 " "0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced( y_true, y_pred, target_names=["a", "b", "c"] ) assert _format_report(report) == expected_report def test_classification_report_imbalanced_multiclass_with_unicode_label(): y_true, y_pred, _ = make_prediction(binary=False) labels = np.array(["blue\xa2", "green\xa2", "red\xa2"]) y_true = labels[y_true] y_pred = labels[y_pred] expected_report = ( "pre rec spe f1 geo iba sup blue¢ 0.83 0.79 0.92 0.81 " "0.85 0.72 24 green¢ 0.33 0.10 0.86 0.15 0.29 0.08 31 " "red¢ 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total " "0.51 0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report def test_classification_report_imbalanced_multiclass_with_long_string_label(): y_true, y_pred, _ = make_prediction(binary=False) labels = np.array(["blue", "green" * 5, "red"]) y_true = labels[y_true] y_pred = labels[y_pred] expected_report = ( "pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 " "0.85 0.72 24 greengreengreengreengreen 0.33 0.10 " "0.86 0.15 0.29 0.08 31 red 0.42 0.90 0.55 0.57 0.70 " "0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75" ) report = classification_report_imbalanced(y_true, y_pred) assert _format_report(report) == expected_report @pytest.mark.parametrize( "score, expected_score", [ (accuracy_score, 0.54756), (jaccard_score, 0.33176), (precision_score, 0.65025), 
(recall_score, 0.41616), ], ) def test_iba_sklearn_metrics(score, expected_score): y_true, y_pred, _ = make_prediction(binary=True) score_iba = make_index_balanced_accuracy(alpha=0.5, squared=True)(score) score = score_iba(y_true, y_pred) assert score == pytest.approx(expected_score) @pytest.mark.parametrize( "score_loss", [average_precision_score, brier_score_loss, cohen_kappa_score, roc_auc_score], ) def test_iba_error_y_score_prob_error(score_loss): y_true, y_pred, _ = make_prediction(binary=True) aps = make_index_balanced_accuracy(alpha=0.5, squared=True)(score_loss) with pytest.raises(AttributeError): aps(y_true, y_pred) def test_classification_report_imbalanced_dict_with_target_names(): iris = datasets.load_iris() y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) report = classification_report_imbalanced( y_true, y_pred, labels=np.arange(len(iris.target_names)), target_names=iris.target_names, output_dict=True, ) outer_keys = set(report.keys()) inner_keys = set(report["setosa"].keys()) expected_outer_keys = { "setosa", "versicolor", "virginica", "avg_pre", "avg_rec", "avg_spe", "avg_f1", "avg_geo", "avg_iba", "total_support", } expected_inner_keys = {"spe", "f1", "sup", "rec", "geo", "iba", "pre"} assert outer_keys == expected_outer_keys assert inner_keys == expected_inner_keys def test_classification_report_imbalanced_dict_without_target_names(): iris = datasets.load_iris() y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) print(iris.target_names) report = classification_report_imbalanced( y_true, y_pred, labels=np.arange(len(iris.target_names)), output_dict=True, ) print(report.keys()) outer_keys = set(report.keys()) inner_keys = set(report["0"].keys()) expected_outer_keys = { "0", "1", "2", "avg_pre", "avg_rec", "avg_spe", "avg_f1", "avg_geo", "avg_iba", "total_support", } expected_inner_keys = {"spe", "f1", "sup", "rec", "geo", "iba", "pre"} assert outer_keys == expected_outer_keys assert inner_keys == expected_inner_keys 
@pytest.mark.parametrize( "y_true, y_pred, expected_ma_mae", [ ([1, 1, 1, 2, 2, 2], [1, 2, 1, 2, 1, 2], 0.333), ([1, 1, 1, 1, 1, 2], [1, 2, 1, 2, 1, 2], 0.2), ([1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 3, 1, 2, 1, 1, 2, 3, 3], 0.555), ([1, 1, 1, 1, 1, 1, 2, 3, 3], [1, 3, 1, 2, 1, 1, 2, 3, 3], 0.166), ], ) def test_macro_averaged_mean_absolute_error(y_true, y_pred, expected_ma_mae): ma_mae = macro_averaged_mean_absolute_error(y_true, y_pred) assert ma_mae == pytest.approx(expected_ma_mae, rel=R_TOL) def test_macro_averaged_mean_absolute_error_sample_weight(): y_true = [1, 1, 1, 2, 2, 2] y_pred = [1, 2, 1, 2, 1, 2] ma_mae_no_weights = macro_averaged_mean_absolute_error(y_true, y_pred) sample_weight = [1, 1, 1, 1, 1, 1] ma_mae_unit_weights = macro_averaged_mean_absolute_error( y_true, y_pred, sample_weight=sample_weight, ) assert ma_mae_unit_weights == pytest.approx(ma_mae_no_weights) imbalanced-learn-0.12.2/imblearn/metrics/tests/test_pairwise.py000066400000000000000000000143731460233407600246240ustar00rootroot00000000000000"""Test for the metrics that perform pairwise distance computation.""" # Authors: Guillaume Lemaitre # License: MIT import numpy as np import pytest from sklearn.exceptions import NotFittedError from sklearn.preprocessing import LabelEncoder, OrdinalEncoder from sklearn.utils._testing import _convert_container from imblearn.metrics.pairwise import ValueDifferenceMetric @pytest.fixture def data(): rng = np.random.RandomState(0) feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 feature_2 = ["A"] * 40 + ["B"] * 20 feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 X = np.array([feature_1, feature_2, feature_3], dtype=object).T rng.shuffle(X) y = rng.randint(low=0, high=2, size=X.shape[0]) y_labels = np.array(["not apple", "apple"], dtype=object) y = y_labels[y] return X, y @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @pytest.mark.parametrize("k, r", [(1, 1), (1, 2), (2, 1), (2, 2)]) 
@pytest.mark.parametrize("y_type", ["list", "array"]) @pytest.mark.parametrize("encode_label", [True, False]) def test_value_difference_metric(data, dtype, k, r, y_type, encode_label): # Check basic feature of the metric: # * the shape of the distance matrix is (n_samples, n_samples) # * computing pairwise distance of X is the same than explicitely between # X and X. X, y = data y = _convert_container(y, y_type) if encode_label: y = LabelEncoder().fit_transform(y) encoder = OrdinalEncoder(dtype=dtype) X_encoded = encoder.fit_transform(X) vdm = ValueDifferenceMetric(k=k, r=r) vdm.fit(X_encoded, y) dist_1 = vdm.pairwise(X_encoded) dist_2 = vdm.pairwise(X_encoded, X_encoded) np.testing.assert_allclose(dist_1, dist_2) assert dist_1.shape == (X.shape[0], X.shape[0]) assert dist_2.shape == (X.shape[0], X.shape[0]) @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @pytest.mark.parametrize("k, r", [(1, 1), (1, 2), (2, 1), (2, 2)]) @pytest.mark.parametrize("y_type", ["list", "array"]) @pytest.mark.parametrize("encode_label", [True, False]) def test_value_difference_metric_property(dtype, k, r, y_type, encode_label): # Check the property of the vdm distance. Let's check the property # described in "Improved Heterogeneous Distance Functions", D.R. Wilson and # T.R. Martinez, Journal of Artificial Intelligence Research 6 (1997) 1-34 # https://arxiv.org/pdf/cs/9701101.pdf # # "if an attribute color has three values red, green and blue, and the # application is to identify whether or not an object is an apple, red and # green would be considered closer than red and blue because the former two # both have similar correlations with the output class apple." 
# defined our feature X = np.array(["green"] * 10 + ["red"] * 10 + ["blue"] * 10).reshape(-1, 1) # 0 - not an apple / 1 - an apple y = np.array([1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1]) y_labels = np.array(["not apple", "apple"], dtype=object) y = y_labels[y] y = _convert_container(y, y_type) if encode_label: y = LabelEncoder().fit_transform(y) encoder = OrdinalEncoder(dtype=dtype) X_encoded = encoder.fit_transform(X) vdm = ValueDifferenceMetric(k=k, r=r) vdm.fit(X_encoded, y) sample_green = encoder.transform([["green"]]) sample_red = encoder.transform([["red"]]) sample_blue = encoder.transform([["blue"]]) for sample in (sample_green, sample_red, sample_blue): # computing the distance between a sample of the same category should # give a null distance dist = vdm.pairwise(sample).squeeze() assert dist == pytest.approx(0) # check the property explained in the introduction example dist_1 = vdm.pairwise(sample_green, sample_red).squeeze() dist_2 = vdm.pairwise(sample_blue, sample_red).squeeze() dist_3 = vdm.pairwise(sample_blue, sample_green).squeeze() # green and red are very close # blue is closer to red than green assert dist_1 < dist_2 assert dist_1 < dist_3 assert dist_2 < dist_3 def test_value_difference_metric_categories(data): # Check that "auto" is equivalent to provide the number categories # beforehand X, y = data encoder = OrdinalEncoder(dtype=np.int32) X_encoded = encoder.fit_transform(X) n_categories = np.array([len(cat) for cat in encoder.categories_]) vdm_auto = ValueDifferenceMetric().fit(X_encoded, y) vdm_categories = ValueDifferenceMetric(n_categories=n_categories) vdm_categories.fit(X_encoded, y) np.testing.assert_array_equal(vdm_auto.n_categories_, n_categories) np.testing.assert_array_equal(vdm_auto.n_categories_, vdm_categories.n_categories_) def test_value_difference_metric_categories_error(data): # Check that we raise an error if n_categories is inconsistent with the # number of features in X X, y = data encoder = 
OrdinalEncoder(dtype=np.int32) X_encoded = encoder.fit_transform(X) n_categories = [1, 2] vdm = ValueDifferenceMetric(n_categories=n_categories) err_msg = "The length of n_categories is not consistent with the number" with pytest.raises(ValueError, match=err_msg): vdm.fit(X_encoded, y) def test_value_difference_metric_missing_categories(data): # Check that we don't get issue when a category is missing between 0 # n_categories - 1 X, y = data encoder = OrdinalEncoder(dtype=np.int32) X_encoded = encoder.fit_transform(X) n_categories = np.array([len(cat) for cat in encoder.categories_]) # remove a categories that could be between 0 and n_categories X_encoded[X_encoded[:, -1] == 1] = 0 np.testing.assert_array_equal(np.unique(X_encoded[:, -1]), [0, 2, 3]) vdm = ValueDifferenceMetric(n_categories=n_categories) vdm.fit(X_encoded, y) for n_cats, proba in zip(n_categories, vdm.proba_per_class_): assert proba.shape == (n_cats, len(np.unique(y))) def test_value_difference_value_unfitted(data): # Check that we raise a NotFittedError when `fit` is not not called before # pairwise. 
X, y = data encoder = OrdinalEncoder(dtype=np.int32) X_encoded = encoder.fit_transform(X) with pytest.raises(NotFittedError): ValueDifferenceMetric().pairwise(X_encoded) imbalanced-learn-0.12.2/imblearn/metrics/tests/test_score_objects.py000066400000000000000000000040531460233407600256170ustar00rootroot00000000000000"""Test for score""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import pytest from sklearn.datasets import make_blobs from sklearn.linear_model import LogisticRegression from sklearn.metrics import make_scorer from sklearn.model_selection import GridSearchCV, train_test_split from imblearn.metrics import ( geometric_mean_score, make_index_balanced_accuracy, sensitivity_score, specificity_score, ) R_TOL = 1e-2 @pytest.fixture def data(): X, y = make_blobs(random_state=0, centers=2) return train_test_split(X, y, random_state=0) @pytest.mark.parametrize( "score, expected_score", [ (sensitivity_score, 0.90), (specificity_score, 0.90), (geometric_mean_score, 0.90), (make_index_balanced_accuracy()(geometric_mean_score), 0.82), ], ) @pytest.mark.parametrize("average", ["macro", "weighted", "micro"]) def test_scorer_common_average(data, score, expected_score, average): X_train, X_test, y_train, _ = data scorer = make_scorer(score, pos_label=None, average=average) grid = GridSearchCV( LogisticRegression(), param_grid={"C": [1, 10]}, scoring=scorer, cv=3, ) grid.fit(X_train, y_train).predict(X_test) assert grid.best_score_ >= expected_score @pytest.mark.parametrize( "score, average, expected_score", [ (sensitivity_score, "binary", 0.94), (specificity_score, "binary", 0.89), (geometric_mean_score, "multiclass", 0.90), ( make_index_balanced_accuracy()(geometric_mean_score), "multiclass", 0.82, ), ], ) def test_scorer_default_average(data, score, average, expected_score): X_train, X_test, y_train, _ = data scorer = make_scorer(score, pos_label=1, average=average) grid = GridSearchCV( LogisticRegression(), param_grid={"C": [1, 10]}, 
scoring=scorer, cv=3, ) grid.fit(X_train, y_train).predict(X_test) assert grid.best_score_ >= expected_score imbalanced-learn-0.12.2/imblearn/over_sampling/000077500000000000000000000000001460233407600214155ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/over_sampling/__init__.py000066400000000000000000000006331460233407600235300ustar00rootroot00000000000000""" The :mod:`imblearn.over_sampling` provides a set of method to perform over-sampling. """ from ._adasyn import ADASYN from ._random_over_sampler import RandomOverSampler from ._smote import SMOTE, SMOTEN, SMOTENC, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE __all__ = [ "ADASYN", "RandomOverSampler", "KMeansSMOTE", "SMOTE", "BorderlineSMOTE", "SVMSMOTE", "SMOTENC", "SMOTEN", ] imbalanced-learn-0.12.2/imblearn/over_sampling/_adasyn.py000066400000000000000000000205431460233407600234110ustar00rootroot00000000000000"""Class to perform over-sampling using ADASYN.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numbers import warnings import numpy as np from scipy import sparse from sklearn.utils import _safe_indexing, check_random_state from ..utils import Substitution, check_neighbors_object from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import HasMethods, Interval from .base import BaseOverSampler @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class ADASYN(BaseOverSampler): """Oversample using Adaptive Synthetic (ADASYN) algorithm. This method is similar to SMOTE but it generates different number of samples depending on an estimate of the local distribution of the class to be oversampled. Read more in the :ref:`User Guide `. 
Parameters ---------- {sampling_strategy} {random_state} n_neighbors : int or estimator object, default=5 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. You can pass: - an `int` corresponding to the number of neighbors to use. A `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. - an instance of a compatible nearest neighbors algorithm that should implement both methods `kneighbors` and `kneighbors_graph`. For instance, it could correspond to a :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to any compatible class. {n_jobs} .. deprecated:: 0.10 `n_jobs` has been deprecated in 0.10 and will be removed in 0.12. It was previously used to set `n_jobs` of nearest neighbors algorithm. From now on, you can pass an estimator where `n_jobs` is already set instead. Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. nn_ : estimator object Validated K-nearest Neighbours estimator linked to the parameter `n_neighbors`. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- SMOTE : Over-sample using SMOTE. SMOTENC : Over-sample using SMOTE for continuous and categorical features. SMOTEN : Over-sample using the SMOTE variant specifically for categorical features only. SVMSMOTE : Over-sample using SVM-SMOTE variant. BorderlineSMOTE : Over-sample using Borderline-SMOTE variant. Notes ----- The implementation is based on [1]_. Supports multi-class resampling. A one-vs.-rest scheme is used. References ---------- .. [1] He, Haibo, Yang Bai, Edwardo A. Garcia, and Shutao Li. 
"ADASYN: Adaptive synthetic sampling approach for imbalanced learning," In IEEE International Joint Conference on Neural Networks (IEEE World Congress on Computational Intelligence), pp. 1322-1328, 2008. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import ADASYN >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, ... random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> ada = ADASYN(random_state=42) >>> X_res, y_res = ada.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 904, 1: 900}}) """ _parameter_constraints: dict = { **BaseOverSampler._parameter_constraints, "n_neighbors": [ Interval(numbers.Integral, 1, None, closed="left"), HasMethods(["kneighbors", "kneighbors_graph"]), ], "n_jobs": [numbers.Integral, None], } def __init__( self, *, sampling_strategy="auto", random_state=None, n_neighbors=5, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.n_neighbors = n_neighbors self.n_jobs = n_jobs def _validate_estimator(self): """Create the necessary objects for ADASYN""" self.nn_ = check_neighbors_object( "n_neighbors", self.n_neighbors, additional_neighbor=1 ) def _fit_resample(self, X, y): # FIXME: to be removed in 0.12 if self.n_jobs is not None: warnings.warn( "The parameter `n_jobs` has been deprecated in 0.10 and will be " "removed in 0.12. 
You can pass an nearest neighbors estimator where " "`n_jobs` is already set instead.", FutureWarning, ) self._validate_estimator() random_state = check_random_state(self.random_state) X_resampled = [X.copy()] y_resampled = [y.copy()] for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = _safe_indexing(X, target_class_indices) self.nn_.fit(X) nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] # The ratio is computed using a one-vs-rest manner. Using majority # in multi-class would lead to slightly different results at the # cost of introducing a new parameter. n_neighbors = self.nn_.n_neighbors - 1 ratio_nn = np.sum(y[nns] != class_sample, axis=1) / n_neighbors if not np.sum(ratio_nn): raise RuntimeError( "Not any neigbours belong to the majority" " class. This case will induce a NaN case" " with a division by zero. ADASYN is not" " suited for this specific dataset." " Use SMOTE instead." ) ratio_nn /= np.sum(ratio_nn) n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) # rounding may cause new amount for n_samples n_samples = np.sum(n_samples_generate) if not n_samples: raise ValueError( "No samples will be generated with the provided ratio settings." 
) # the nearest neighbors need to be fitted only on the current class # to find the class NN to generate new samples self.nn_.fit(X_class) nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] enumerated_class_indices = np.arange(len(target_class_indices)) rows = np.repeat(enumerated_class_indices, n_samples_generate) cols = random_state.choice(n_neighbors, size=n_samples) diffs = X_class[nns[rows, cols]] - X_class[rows] steps = random_state.uniform(size=(n_samples, 1)) if sparse.issparse(X): sparse_func = type(X).__name__ steps = getattr(sparse, sparse_func)(steps) X_new = X_class[rows] + steps.multiply(diffs) else: X_new = X_class[rows] + steps * diffs X_new = X_new.astype(X.dtype) y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype) X_resampled.append(X_new) y_resampled.append(y_new) if sparse.issparse(X): X_resampled = sparse.vstack(X_resampled, format=X.format) else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) return X_resampled, y_resampled def _more_tags(self): return { "X_types": ["2darray"], } imbalanced-learn-0.12.2/imblearn/over_sampling/_random_over_sampler.py000066400000000000000000000225221460233407600261670ustar00rootroot00000000000000"""Class to perform random over-sampling.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections.abc import Mapping from numbers import Real import numpy as np from scipy import sparse from sklearn.utils import _safe_indexing, check_array, check_random_state from sklearn.utils.sparsefuncs import mean_variance_axis from ..utils import Substitution, check_target_type from ..utils._docstring import _random_state_docstring from ..utils._param_validation import Interval from ..utils._validation import _check_X from .base import BaseOverSampler @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, random_state=_random_state_docstring, ) class RandomOverSampler(BaseOverSampler): """Class to perform random over-sampling. 
Object to over-sample the minority class(es) by picking samples at random with replacement. The bootstrap can be generated in a smoothed manner. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} shrinkage : float or dict, default=None Parameter controlling the shrinkage applied to the covariance matrix. when a smoothed bootstrap is generated. The options are: - if `None`, a normal bootstrap will be generated without perturbation. It is equivalent to `shrinkage=0` as well; - if a `float` is given, the shrinkage factor will be used for all classes to generate the smoothed bootstrap; - if a `dict` is given, the shrinkage factor will specific for each class. The key correspond to the targeted class and the value is the shrinkage factor. The value needs of the shrinkage parameter needs to be higher or equal to 0. .. versionadded:: 0.8 Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 shrinkage_ : dict or None The per-class shrinkage factor used to generate the smoothed bootstrap sample. When `shrinkage=None` a normal bootstrap will be generated. .. versionadded:: 0.8 n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- BorderlineSMOTE : Over-sample using the borderline-SMOTE variant. SMOTE : Over-sample using SMOTE. SMOTENC : Over-sample using SMOTE for continuous and categorical features. SMOTEN : Over-sample using the SMOTE variant specifically for categorical features only. SVMSMOTE : Over-sample using SVM-SMOTE variant. 
ADASYN : Over-sample using ADASYN. KMeansSMOTE : Over-sample applying a clustering before to oversample using SMOTE. Notes ----- Supports multi-class resampling by sampling each class independently. Supports heterogeneous data as object array containing string and numeric data. When generating a smoothed bootstrap, this method is also known as Random Over-Sampling Examples (ROSE) [1]_. .. warning:: Since smoothed bootstrap are generated by adding a small perturbation to the drawn samples, this method is not adequate when working with sparse matrices. References ---------- .. [1] G Menardi, N. Torelli, "Training and assessing classification rules with imbalanced data," Data Mining and Knowledge Discovery, 28(1), pp.92-122, 2014. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import RandomOverSampler >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
_parameter_constraints: dict = {
    **BaseOverSampler._parameter_constraints,
    "shrinkage": [Interval(Real, 0, None, closed="left"), dict, None],
}

def __init__(
    self,
    *,
    sampling_strategy="auto",
    random_state=None,
    shrinkage=None,
):
    super().__init__(sampling_strategy=sampling_strategy)
    self.random_state = random_state
    self.shrinkage = shrinkage

def _check_X_y(self, X, y):
    """Validate `X` and `y`, allowing heterogeneous (e.g. string) data in `X`."""
    y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
    X = _check_X(X)
    self._check_n_features(X, reset=True)
    self._check_feature_names(X, reset=True)
    return X, y, binarize_y

def _fit_resample(self, X, y):
    """Resample by drawing a (possibly smoothed) bootstrap per class.

    A plain bootstrap duplicates rows; when a shrinkage factor is
    provided, a Gaussian perturbation scaled by the per-class standard
    deviation is added to each drawn sample (ROSE).
    """
    rng = check_random_state(self.random_state)

    # Normalize `shrinkage` into either None or a per-class mapping.
    if isinstance(self.shrinkage, Real):
        self.shrinkage_ = {
            klass: self.shrinkage for klass in self.sampling_strategy_
        }
    elif self.shrinkage is None or isinstance(self.shrinkage, Mapping):
        self.shrinkage_ = self.shrinkage

    if self.shrinkage_ is not None:
        # Every class to be resampled must have a shrinkage factor >= 0.
        missing_shrinkage_keys = (
            self.sampling_strategy_.keys() - self.shrinkage_.keys()
        )
        if missing_shrinkage_keys:
            raise ValueError(
                f"`shrinkage` should contain a shrinkage factor for "
                f"each class that will be resampled. The missing "
                f"classes are: {repr(missing_shrinkage_keys)}"
            )

        for klass, shrink_factor in self.shrinkage_.items():
            if shrink_factor < 0:
                raise ValueError(
                    f"The shrinkage factor needs to be >= 0. "
                    f"Got {shrink_factor} for class {klass}."
                )

        # smoothed bootstrap imposes to make numerical operation; we need
        # to be sure to have only numerical data in X
        try:
            X = check_array(X, accept_sparse=["csr", "csc"], dtype="numeric")
        except ValueError as exc:
            raise ValueError(
                "When shrinkage is not None, X needs to contain only "
                "numerical data to later generate a smoothed bootstrap "
                "sample."
            ) from exc

    all_X = [X.copy()]
    all_y = [y.copy()]
    sample_indices = range(X.shape[0])

    for class_sample, num_samples in self.sampling_strategy_.items():
        target_class_indices = np.flatnonzero(y == class_sample)
        bootstrap_indices = rng.choice(
            target_class_indices,
            size=num_samples,
            replace=True,
        )
        sample_indices = np.append(sample_indices, bootstrap_indices)

        if self.shrinkage_ is None:
            # plain bootstrap: duplicate the drawn rows as-is
            all_X.append(_safe_indexing(X, bootstrap_indices))
        else:
            # smoothed bootstrap: perturb the drawn rows with Gaussian
            # noise scaled by Silverman's rule-of-thumb constant
            n_samples, n_features = X.shape
            smoothing_constant = (4 / ((n_features + 2) * n_samples)) ** (
                1 / (n_features + 4)
            )
            if sparse.issparse(X):
                _, X_class_variance = mean_variance_axis(
                    X[target_class_indices, :],
                    axis=0,
                )
                X_class_scale = np.sqrt(X_class_variance, out=X_class_variance)
            else:
                X_class_scale = np.std(X[target_class_indices, :], axis=0)
            smoothing_matrix = np.diagflat(
                self.shrinkage_[class_sample] * smoothing_constant * X_class_scale
            )
            X_new = rng.randn(num_samples, n_features)
            X_new = X_new.dot(smoothing_matrix) + X[bootstrap_indices, :]
            if sparse.issparse(X):
                X_new = sparse.csr_matrix(X_new, dtype=X.dtype)
            all_X.append(X_new)

        all_y.append(_safe_indexing(y, bootstrap_indices))

    self.sample_indices_ = np.array(sample_indices)

    if sparse.issparse(X):
        X_resampled = sparse.vstack(all_X, format=X.format)
    else:
        X_resampled = np.vstack(all_X)
    y_resampled = np.hstack(all_y)
    return X_resampled, y_resampled
"_xfail_checks": { "check_complex_data": "Robust to this type of data.", }, } imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/000077500000000000000000000000001460233407600227035ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/__init__.py000066400000000000000000000003531460233407600250150ustar00rootroot00000000000000from .base import SMOTE, SMOTEN, SMOTENC from .cluster import KMeansSMOTE from .filter import SVMSMOTE, BorderlineSMOTE __all__ = [ "SMOTE", "SMOTEN", "SMOTENC", "KMeansSMOTE", "BorderlineSMOTE", "SVMSMOTE", ] imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/base.py000066400000000000000000001143551460233407600242000ustar00rootroot00000000000000"""Base class and original SMOTE methods for over-sampling""" # Authors: Guillaume Lemaitre # Fernando Nogueira # Christos Aridas # Dzianis Dudnik # License: MIT import math import numbers import warnings import numpy as np from scipy import sparse from sklearn.base import clone from sklearn.exceptions import DataConversionWarning from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from sklearn.utils import ( _get_column_indices, _safe_indexing, check_array, check_random_state, ) from sklearn.utils.sparsefuncs_fast import ( csr_mean_variance_axis0, ) from sklearn.utils.validation import _num_features from ...metrics.pairwise import ValueDifferenceMetric from ...utils import Substitution, check_neighbors_object, check_target_type from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation import HasMethods, Interval, StrOptions from ...utils._validation import _check_X from ...utils.fixes import _is_pandas_df, _mode from ..base import BaseOverSampler class BaseSMOTE(BaseOverSampler): """Base class for the different SMOTE algorithms.""" _parameter_constraints: dict = { **BaseOverSampler._parameter_constraints, "k_neighbors": [ Interval(numbers.Integral, 1, None, closed="left"), HasMethods(["kneighbors", 
"kneighbors_graph"]), ], "n_jobs": [numbers.Integral, None], } def __init__( self, sampling_strategy="auto", random_state=None, k_neighbors=5, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.k_neighbors = k_neighbors self.n_jobs = n_jobs def _validate_estimator(self): """Check the NN estimators shared across the different SMOTE algorithms. """ self.nn_k_ = check_neighbors_object( "k_neighbors", self.k_neighbors, additional_neighbor=1 ) def _make_samples( self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None ): """A support function that returns artificial samples constructed along the line connecting nearest neighbours. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Points from which the points will be created. y_dtype : dtype The data type of the targets. y_type : str or int The minority target value, just so the function can return the target values for the synthetic variables with correct length in a clear format. nn_data : ndarray of shape (n_samples_all, n_features) Data set carrying all the neighbours to be used nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours) The nearest neighbours of each sample in `nn_data`. n_samples : int The number of samples to generate. step_size : float, default=1.0 The step size to create samples. y : ndarray of shape (n_samples_all,), default=None The true target associated with `nn_data`. Used by Borderline SMOTE-2 to weight the distances in the sample generation process. Returns ------- X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features) Synthetically generated samples. y_new : ndarray of shape (n_samples_new,) Target values for synthetic samples. 
""" random_state = check_random_state(self.random_state) samples_indices = random_state.randint(low=0, high=nn_num.size, size=n_samples) # np.newaxis for backwards compatability with random_state steps = step_size * random_state.uniform(size=n_samples)[:, np.newaxis] rows = np.floor_divide(samples_indices, nn_num.shape[1]) cols = np.mod(samples_indices, nn_num.shape[1]) X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y) y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype) return X_new, y_new def _generate_samples( self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None ): r"""Generate a synthetic sample. The rule for the generation is: .. math:: \mathbf{s_{s}} = \mathbf{s_{i}} + \mathcal{u}(0, 1) \times (\mathbf{s_{i}} - \mathbf{s_{nn}}) \, where \mathbf{s_{s}} is the new synthetic samples, \mathbf{s_{i}} is the current sample, \mathbf{s_{nn}} is a randomly selected neighbors of \mathbf{s_{i}} and \mathcal{u}(0, 1) is a random number between [0, 1). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Points from which the points will be created. nn_data : ndarray of shape (n_samples_all, n_features) Data set carrying all the neighbours to be used. nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours) The nearest neighbours of each sample in `nn_data`. rows : ndarray of shape (n_samples,), dtype=int Indices pointing at feature vector in X which will be used as a base for creating new samples. cols : ndarray of shape (n_samples,), dtype=int Indices pointing at which nearest neighbor of base feature vector will be used when creating new samples. steps : ndarray of shape (n_samples,), dtype=float Step sizes for new samples. y_type : str, int or None, default=None Class label of the current target classes for which we want to generate samples. y : ndarray of shape (n_samples_all,), default=None The true target associated with `nn_data`. 
Used by Borderline SMOTE-2 to weight the distances in the sample generation process. Returns ------- X_new : {ndarray, sparse matrix} of shape (n_samples, n_features) Synthetically generated samples. """ diffs = nn_data[nn_num[rows, cols]] - X[rows] if y is not None: # only entering for BorderlineSMOTE-2 random_state = check_random_state(self.random_state) mask_pair_samples = y[nn_num[rows, cols]] != y_type diffs[mask_pair_samples] *= random_state.uniform( low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1) ) if sparse.issparse(X): sparse_func = type(X).__name__ steps = getattr(sparse, sparse_func)(steps) X_new = X[rows] + steps.multiply(diffs) else: X_new = X[rows] + steps * diffs return X_new.astype(X.dtype) def _in_danger_noise(self, nn_estimator, samples, target_class, y, kind="danger"): """Estimate if a set of sample are in danger or noise. Used by BorderlineSMOTE and SVMSMOTE. Parameters ---------- nn_estimator : estimator object An estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` use to determine if a sample is in danger/noise. samples : {array-like, sparse matrix} of shape (n_samples, n_features) The samples to check if either they are in danger or not. target_class : int or str The target corresponding class being over-sampled. y : array-like of shape (n_samples,) The true label in order to check the neighbour labels. kind : {'danger', 'noise'}, default='danger' The type of classification to use. Can be either: - If 'danger', check if samples are in danger, - If 'noise', check if samples are noise. Returns ------- output : ndarray of shape (n_samples,) A boolean array where True refer to samples in danger or noise. 
""" x = nn_estimator.kneighbors(samples, return_distance=False)[:, 1:] nn_label = (y[x] != target_class).astype(int) n_maj = np.sum(nn_label, axis=1) if kind == "danger": # Samples are in danger for m/2 <= m' < m return np.bitwise_and( n_maj >= (nn_estimator.n_neighbors - 1) / 2, n_maj < nn_estimator.n_neighbors - 1, ) else: # kind == "noise": # Samples are noise for m = m' return n_maj == nn_estimator.n_neighbors - 1 @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class SMOTE(BaseSMOTE): """Class to perform over-sampling using SMOTE. This object is an implementation of SMOTE - Synthetic Minority Over-sampling Technique as presented in [1]_. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} k_neighbors : int or object, default=5 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. You can pass: - an `int` corresponding to the number of neighbors to use. A `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. - an instance of a compatible nearest neighbors algorithm that should implement both methods `kneighbors` and `kneighbors_graph`. For instance, it could correspond to a :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to any compatible class. {n_jobs} .. deprecated:: 0.10 `n_jobs` has been deprecated in 0.10 and will be removed in 0.12. It was previously used to set `n_jobs` of nearest neighbors algorithm. From now on, you can pass an estimator where `n_jobs` is already set instead. Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. nn_k_ : estimator object Validated k-nearest neighbours created from the `k_neighbors` parameter. 
n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- SMOTENC : Over-sample using SMOTE for continuous and categorical features. SMOTEN : Over-sample using the SMOTE variant specifically for categorical features only. BorderlineSMOTE : Over-sample using the borderline-SMOTE variant. SVMSMOTE : Over-sample using the SVM-SMOTE variant. ADASYN : Over-sample using ADASYN. KMeansSMOTE : Over-sample applying a clustering before to oversample using SMOTE. Notes ----- See the original papers: [1]_ for more details. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import SMOTE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
def __init__(
    self,
    *,
    sampling_strategy="auto",
    random_state=None,
    k_neighbors=5,
    n_jobs=None,
):
    super().__init__(
        sampling_strategy=sampling_strategy,
        random_state=random_state,
        k_neighbors=k_neighbors,
        n_jobs=n_jobs,
    )

def _fit_resample(self, X, y):
    """Over-sample each minority class independently with vanilla SMOTE.

    For every class to resample, synthetic points are interpolated
    between each minority sample and one of its k nearest same-class
    neighbours.
    """
    # FIXME: to be removed in 0.12
    if self.n_jobs is not None:
        warnings.warn(
            "The parameter `n_jobs` has been deprecated in 0.10 and will be "
            "removed in 0.12. You can pass an nearest neighbors estimator where "
            "`n_jobs` is already set instead.",
            FutureWarning,
        )
    self._validate_estimator()

    parts_X = [X.copy()]
    parts_y = [y.copy()]

    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        class_indices = np.flatnonzero(y == class_sample)
        X_class = _safe_indexing(X, class_indices)

        # first neighbor is the sample itself, hence the [:, 1:] slice
        self.nn_k_.fit(X_class)
        nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
        X_new, y_new = self._make_samples(
            X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
        )
        parts_X.append(X_new)
        parts_y.append(y_new)

    if sparse.issparse(X):
        X_resampled = sparse.vstack(parts_X, format=X.format)
    else:
        X_resampled = np.vstack(parts_X)
    return X_resampled, np.hstack(parts_y)
Read more in the :ref:`User Guide `. .. versionadded:: 0.4 Parameters ---------- categorical_features : "infer" or array-like of shape (n_cat_features,) or \ (n_features,), dtype={{bool, int, str}} Specified which features are categorical. Can either be: - "auto" (default) to automatically detect categorical features. Only supported when `X` is a :class:`pandas.DataFrame` and it corresponds to columns that have a :class:`pandas.CategoricalDtype`; - array of `int` corresponding to the indices specifying the categorical features; - array of `str` corresponding to the feature names. `X` should be a pandas :class:`pandas.DataFrame` in this case. - mask array of shape (n_features, ) and ``bool`` dtype for which ``True`` indicates the categorical features. categorical_encoder : estimator, default=None One-hot encoder used to encode the categorical features. If `None`, a :class:`~sklearn.preprocessing.OneHotEncoder` is used with default parameters apart from `handle_unknown` which is set to 'ignore'. {sampling_strategy} {random_state} k_neighbors : int or object, default=5 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. You can pass: - an `int` corresponding to the number of neighbors to use. A `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. - an instance of a compatible nearest neighbors algorithm that should implement both methods `kneighbors` and `kneighbors_graph`. For instance, it could correspond to a :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to any compatible class. {n_jobs} .. deprecated:: 0.10 `n_jobs` has been deprecated in 0.10 and will be removed in 0.12. It was previously used to set `n_jobs` of nearest neighbors algorithm. From now on, you can pass an estimator where `n_jobs` is already set instead. Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. 
The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. nn_k_ : estimator object Validated k-nearest neighbours created from the `k_neighbors` parameter. ohe_ : :class:`~sklearn.preprocessing.OneHotEncoder` The one-hot encoder used to encode the categorical features. .. deprecated:: 0.11 `ohe_` is deprecated in 0.11 and will be removed in 0.13. Use `categorical_encoder_` instead. categorical_encoder_ : estimator The encoder used to encode the categorical features. categorical_features_ : ndarray of shape (n_cat_features,), dtype=np.int64 Indices of the categorical features. continuous_features_ : ndarray of shape (n_cont_features,), dtype=np.int64 Indices of the continuous features. median_std_ : dict of int -> float Median of the standard deviation of the continuous features for each class to be over-sampled. n_features_ : int Number of features observed at `fit`. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- SMOTE : Over-sample using SMOTE. SMOTEN : Over-sample using the SMOTE variant specifically for categorical features only. SVMSMOTE : Over-sample using SVM-SMOTE variant. BorderlineSMOTE : Over-sample using Borderline-SMOTE variant. ADASYN : Over-sample using ADASYN. KMeansSMOTE : Over-sample applying a clustering before to oversample using SMOTE. Notes ----- See the original paper [1]_ for more details. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. See :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`, and :ref:`sphx_glr_auto_examples_over-sampling_plot_illustration_generation_sample.py`. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. 
_required_parameters = ["categorical_features"]

_parameter_constraints: dict = {
    **SMOTE._parameter_constraints,
    "categorical_features": ["array-like", StrOptions({"auto"})],
    "categorical_encoder": [
        HasMethods(["fit_transform", "inverse_transform"]),
        None,
    ],
}

def __init__(
    self,
    categorical_features,
    *,
    categorical_encoder=None,
    sampling_strategy="auto",
    random_state=None,
    k_neighbors=5,
    n_jobs=None,
):
    super().__init__(
        sampling_strategy=sampling_strategy,
        random_state=random_state,
        k_neighbors=k_neighbors,
        n_jobs=n_jobs,
    )
    self.categorical_features = categorical_features
    self.categorical_encoder = categorical_encoder

def _check_X_y(self, X, y):
    """Overwrite the checking to let pass some string for categorical
    features.
    """
    y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
    X = _check_X(X)
    self._check_n_features(X, reset=True)
    self._check_feature_names(X, reset=True)
    return X, y, binarize_y

def _validate_column_types(self, X):
    """Compute the indices of the categorical and continuous features.

    Sets `categorical_features_` and `continuous_features_` from
    `categorical_features`, either by auto-detecting pandas categorical
    dtypes or by resolving the user-provided indices/names.
    """
    # Only compare against "auto" when a string was actually passed:
    # `ndarray == "auto"` on an array of feature names would produce an
    # elementwise boolean array and make the `if` ambiguous.
    if isinstance(self.categorical_features, str) and (
        self.categorical_features == "auto"
    ):
        if not _is_pandas_df(X):
            raise ValueError(
                "When `categorical_features='auto'`, the input data "
                f"should be a pandas.DataFrame. Got {type(X)} instead."
            )
        import pandas as pd  # safely import pandas now

        are_columns_categorical = np.array(
            [isinstance(col_dtype, pd.CategoricalDtype) for col_dtype in X.dtypes]
        )
        self.categorical_features_ = np.flatnonzero(are_columns_categorical)
        self.continuous_features_ = np.flatnonzero(~are_columns_categorical)
    else:
        self.categorical_features_ = np.array(
            _get_column_indices(X, self.categorical_features)
        )
        self.continuous_features_ = np.setdiff1d(
            np.arange(self.n_features_), self.categorical_features_
        )

def _validate_estimator(self):
    """Validate the NN estimator and check the categorical/continuous mix.

    Raises
    ------
    ValueError
        If the data is entirely categorical or entirely numerical, since
        SMOTE-NC requires both kinds of features.
    """
    super()._validate_estimator()
    if self.categorical_features_.size == self.n_features_in_:
        raise ValueError(
            "SMOTE-NC is not designed to work only with categorical "
            "features. It requires some numerical features."
        )
    elif self.categorical_features_.size == 0:
        raise ValueError(
            "SMOTE-NC is not designed to work only with numerical "
            "features. It requires some categorical features."
        )
def _fit_resample(self, X, y):
    """Resample with SMOTE-NC: SMOTE on the one-hot encoded design matrix.

    Continuous features are kept as-is; categorical features are one-hot
    encoded, their non-zero entries rescaled to the median per-class
    standard deviation, and the synthetic categorical values are decoded
    back after interpolation.
    """
    # FIXME: to be removed in 0.12
    if self.n_jobs is not None:
        warnings.warn(
            "The parameter `n_jobs` has been deprecated in 0.10 and will be "
            "removed in 0.12. You can pass an nearest neighbors estimator where "
            "`n_jobs` is already set instead.",
            FutureWarning,
        )
    self.n_features_ = _num_features(X)
    self._validate_column_types(X)
    self._validate_estimator()

    X_continuous = _safe_indexing(X, self.continuous_features_, axis=1)
    X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
    X_categorical = _safe_indexing(X, self.categorical_features_, axis=1)

    # keep the continuous dtype unless it is object, in which case the
    # encoded matrix falls back to float64
    if X_continuous.dtype.name != "object":
        dtype_ohe = X_continuous.dtype
    else:
        dtype_ohe = np.float64

    if self.categorical_encoder is None:
        self.categorical_encoder_ = OneHotEncoder(
            handle_unknown="ignore", dtype=dtype_ohe
        )
    else:
        self.categorical_encoder_ = clone(self.categorical_encoder)

    # the input of the OneHotEncoder needs to be dense
    X_ohe = self.categorical_encoder_.fit_transform(
        X_categorical.toarray() if sparse.issparse(X_categorical) else X_categorical
    )
    if not sparse.issparse(X_ohe):
        X_ohe = sparse.csr_matrix(X_ohe, dtype=dtype_ohe)

    X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr", dtype=dtype_ohe)
    X_resampled = [X_encoded.copy()]
    y_resampled = [y.copy()]

    n_continuous = self.continuous_features_.size

    # SMOTE resampling starts here
    self.median_std_ = {}
    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = _safe_indexing(X_encoded, target_class_indices)

        _, var = csr_mean_variance_axis0(X_class[:, :n_continuous])
        self.median_std_[class_sample] = np.median(np.sqrt(var))

        # In the edge case where the median of the std is equal to 0, the
        # 1s entries will be also nullified. In this case, we store the
        # original categorical encoding which will be later used for
        # inverting the OHE
        if math.isclose(self.median_std_[class_sample], 0):
            # This variable will be used when generating data
            self._X_categorical_minority_encoded = X_class[
                :, n_continuous:
            ].toarray()

        # we can replace the 1 entries of the categorical features with the
        # median of the standard deviation. It will ensure that whenever
        # distance is computed between 2 samples, the difference will be
        # equal to the median of the standard deviation as in the original
        # paper. With one-hot encoding, the median will be repeated twice;
        # dividing by sqrt(2) keeps a single median contribution in the
        # Euclidean distance.
        X_class_categorical = X_class[:, n_continuous:]
        X_class_categorical.data[:] = self.median_std_[class_sample] / np.sqrt(2)
        X_class[:, n_continuous:] = X_class_categorical

        self.nn_k_.fit(X_class)
        nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
        X_new, y_new = self._make_samples(
            X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
        )
        X_resampled.append(X_new)
        y_resampled.append(y_new)

    X_resampled = sparse.vstack(X_resampled, format=X_encoded.format)
    y_resampled = np.hstack(y_resampled)
    # SMOTE resampling ends here

    # reverse the encoding of the categorical features
    X_res_cat = X_resampled[:, n_continuous:]
    X_res_cat.data = np.ones_like(X_res_cat.data)
    X_res_cat_dec = self.categorical_encoder_.inverse_transform(X_res_cat)

    if sparse.issparse(X):
        X_resampled = sparse.hstack(
            (
                X_resampled[:, :n_continuous],
                X_res_cat_dec,
            ),
            format="csr",
        )
    else:
        X_resampled = np.hstack(
            (
                X_resampled[:, :n_continuous].toarray(),
                X_res_cat_dec,
            )
        )

    # restore the original column order (continuous then categorical were
    # stacked in that order above)
    indices_reordered = np.argsort(
        np.hstack((self.continuous_features_, self.categorical_features_))
    )
    if sparse.issparse(X_resampled):
        # the matrix is supposed to be in the CSR format after the stacking
        col_indices = X_resampled.indices.copy()
        for idx, col_idx in enumerate(indices_reordered):
            mask = X_resampled.indices == col_idx
            col_indices[mask] = idx
        X_resampled.indices = col_indices
    else:
        X_resampled = X_resampled[:, indices_reordered]
    return X_resampled, y_resampled

def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type, y=None):
    """Generate a synthetic sample with an additional step for the
    categorical features.

    Each new sample is generated the same way as in SMOTE. However, the
    categorical features are mapped to the most frequent nearest
    neighbors of the majority class.
    """
    rng = check_random_state(self.random_state)
    X_new = super()._generate_samples(X, nn_data, nn_num, rows, cols, steps)
    # change in sparsity structure more efficient with LIL than CSR
    X_new = X_new.tolil() if sparse.issparse(X_new) else X_new

    # convert to dense array since scipy.sparse doesn't handle 3D
    nn_data = nn_data.toarray() if sparse.issparse(nn_data) else nn_data

    n_continuous = self.continuous_features_.size
    # In the case that the median std was equal to zero, we have to
    # recreate non-null entries from the stored OHE encoding
    if math.isclose(self.median_std_[y_type], 0):
        nn_data[:, n_continuous:] = self._X_categorical_minority_encoded

    all_neighbors = nn_data[nn_num[rows]]

    categories_size = [n_continuous] + [
        cat.size for cat in self.categorical_encoder_.categories_
    ]
    bounds = np.cumsum(categories_size)
    for start_idx, end_idx in zip(bounds[:-1], bounds[1:]):
        col_maxs = all_neighbors[:, :, start_idx:end_idx].sum(axis=1)
        # tie breaking argmax
        is_max = np.isclose(col_maxs, col_maxs.max(axis=1, keepdims=True))
        max_idxs = rng.permutation(np.argwhere(is_max))
        xs, idx_sels = np.unique(max_idxs[:, 0], return_index=True)
        col_sels = max_idxs[idx_sels, 1]

        ys = start_idx + col_sels
        X_new[:, start_idx:end_idx] = 0
        X_new[xs, ys] = 1

    return X_new
_parameter_constraints: dict = {
    **SMOTE._parameter_constraints,
    "categorical_encoder": [
        HasMethods(["fit_transform", "inverse_transform"]),
        None,
    ],
}

def __init__(
    self,
    categorical_encoder=None,
    *,
    sampling_strategy="auto",
    random_state=None,
    k_neighbors=5,
    n_jobs=None,
):
    super().__init__(
        sampling_strategy=sampling_strategy,
        random_state=random_state,
        k_neighbors=k_neighbors,
        n_jobs=n_jobs,
    )
    self.categorical_encoder = categorical_encoder

def _check_X_y(self, X, y):
    """Check should accept strings and not sparse matrices."""
    y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
    X, y = self._validate_data(
        X,
        y,
        reset=True,
        dtype=None,
        accept_sparse=["csr", "csc"],
    )
    return X, y, binarize_y

def _validate_estimator(self):
    """Force to use precomputed distance matrix."""
    super()._validate_estimator()
    self.nn_k_.set_params(metric="precomputed")

def _make_samples(self, X_class, klass, y_dtype, nn_indices, n_samples):
    """Draw base samples with replacement and synthesize each one as the
    per-feature mode of its k nearest neighbours."""
    rng = check_random_state(self.random_state)
    # generate sample indices that will be used to generate new samples
    draws = rng.choice(np.arange(X_class.shape[0]), size=n_samples, replace=True)
    # for each drawn sample, select its k-neighbors and generate a sample
    # where for each feature individually, each category generated is the
    # most common category
    X_new = np.squeeze(_mode(X_class[nn_indices[draws]], axis=1).mode, axis=1)
    y_new = np.full(n_samples, fill_value=klass, dtype=y_dtype)
    return X_new, y_new

def _fit_resample(self, X, y):
    """Resample purely categorical data using the VDM distance and
    per-feature mode aggregation (SMOTEN)."""
    # FIXME: to be removed in 0.12
    if self.n_jobs is not None:
        warnings.warn(
            "The parameter `n_jobs` has been deprecated in 0.10 and will be "
            "removed in 0.12. You can pass an nearest neighbors estimator where "
            "`n_jobs` is already set instead.",
            FutureWarning,
        )

    if sparse.issparse(X):
        X_sparse_format = X.format
        X = X.toarray()
        warnings.warn(
            "Passing a sparse matrix to SMOTEN is not really efficient since it is"
            " converted to a dense array internally.",
            DataConversionWarning,
        )
    else:
        X_sparse_format = None

    self._validate_estimator()

    parts_X = [X.copy()]
    parts_y = [y.copy()]

    if self.categorical_encoder is None:
        self.categorical_encoder_ = OrdinalEncoder(dtype=np.int32)
    else:
        self.categorical_encoder_ = clone(self.categorical_encoder)
    X_encoded = self.categorical_encoder_.fit_transform(X)

    vdm = ValueDifferenceMetric(
        n_categories=[len(cat) for cat in self.categorical_encoder_.categories_]
    ).fit(X_encoded, y)

    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        class_indices = np.flatnonzero(y == class_sample)
        X_class = _safe_indexing(X_encoded, class_indices)

        X_class_dist = vdm.pairwise(X_class)
        self.nn_k_.fit(X_class_dist)
        # the kneigbors search will include the sample itself which is
        # expected from the original algorithm
        nn_indices = self.nn_k_.kneighbors(X_class_dist, return_distance=False)
        X_new, y_new = self._make_samples(
            X_class, class_sample, y.dtype, nn_indices, n_samples
        )

        X_new = self.categorical_encoder_.inverse_transform(X_new)
        parts_X.append(X_new)
        parts_y.append(y_new)

    X_resampled = np.vstack(parts_X)
    y_resampled = np.hstack(parts_y)

    if X_sparse_format == "csr":
        return sparse.csr_matrix(X_resampled), y_resampled
    elif X_sparse_format == "csc":
        return sparse.csc_matrix(X_resampled), y_resampled
    else:
        return X_resampled, y_resampled
["2darray", "dataframe", "string"]} imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/cluster.py000066400000000000000000000254241460233407600247450ustar00rootroot00000000000000"""SMOTE variant employing some clustering before the generation.""" # Authors: Guillaume Lemaitre # Fernando Nogueira # Christos Aridas # License: MIT import math import numbers import numpy as np from scipy import sparse from sklearn.base import clone from sklearn.cluster import MiniBatchKMeans from sklearn.metrics import pairwise_distances from sklearn.utils import _safe_indexing from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation import HasMethods, Interval, StrOptions from ..base import BaseOverSampler from .base import BaseSMOTE @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class KMeansSMOTE(BaseSMOTE): """Apply a KMeans clustering before to over-sample using SMOTE. This is an implementation of the algorithm described in [1]_. Read more in the :ref:`User Guide `. .. versionadded:: 0.5 Parameters ---------- {sampling_strategy} {random_state} k_neighbors : int or object, default=2 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. You can pass: - an `int` corresponding to the number of neighbors to use. A `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. - an instance of a compatible nearest neighbors algorithm that should implement both methods `kneighbors` and `kneighbors_graph`. For instance, it could correspond to a :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to any compatible class. {n_jobs} kmeans_estimator : int or object, default=None A KMeans instance or the number of clusters to be used. 
By default, we used a :class:`~sklearn.cluster.MiniBatchKMeans` which tend to be better with large number of samples. cluster_balance_threshold : "auto" or float, default="auto" The threshold at which a cluster is called balanced and where samples of the class selected for SMOTE will be oversampled. If "auto", this will be determined by the ratio for each class, or it can be set manually. density_exponent : "auto" or float, default="auto" This exponent is used to determine the density of a cluster. Leaving this to "auto" will use a feature-length based exponent. Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. kmeans_estimator_ : estimator The fitted clustering method used before to apply SMOTE. nn_k_ : estimator The fitted k-NN estimator used in SMOTE. cluster_balance_threshold_ : float The threshold used during ``fit`` for calling a cluster balanced. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- SMOTE : Over-sample using SMOTE. SMOTENC : Over-sample using SMOTE for continuous and categorical features. SMOTEN : Over-sample using the SMOTE variant specifically for categorical features only. SVMSMOTE : Over-sample using SVM-SMOTE variant. BorderlineSMOTE : Over-sample using Borderline-SMOTE variant. ADASYN : Over-sample using ADASYN. References ---------- .. 
[1] Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for Imbalanced Learning Based on K-Means and SMOTE" https://arxiv.org/abs/1711.00837 Examples -------- >>> import numpy as np >>> from imblearn.over_sampling import KMeansSMOTE >>> from sklearn.datasets import make_blobs >>> blobs = [100, 800, 100] >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)]) >>> # Add a single 0 sample in the middle blob >>> X = np.concatenate([X, [[0, 0]]]) >>> y = np.append(y, 0) >>> # Make this a binary classification problem >>> y = y == 1 >>> sm = KMeansSMOTE( ... kmeans_estimator=MiniBatchKMeans(n_init=1, random_state=0), random_state=42 ... ) >>> X_res, y_res = sm.fit_resample(X, y) >>> # Find the number of new samples in the middle blob >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum() >>> print("Samples in the middle blob: %s" % n_res_in_middle) Samples in the middle blob: 801 >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1)) Middle blob unchanged: True >>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum())) More 0 samples: True """ _parameter_constraints: dict = { **BaseSMOTE._parameter_constraints, "kmeans_estimator": [ HasMethods(["fit", "predict"]), Interval(numbers.Integral, 1, None, closed="left"), None, ], "cluster_balance_threshold": [StrOptions({"auto"}), numbers.Real], "density_exponent": [StrOptions({"auto"}), numbers.Real], } def __init__( self, *, sampling_strategy="auto", random_state=None, k_neighbors=2, n_jobs=None, kmeans_estimator=None, cluster_balance_threshold="auto", density_exponent="auto", ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs, ) self.kmeans_estimator = kmeans_estimator self.cluster_balance_threshold = cluster_balance_threshold self.density_exponent = density_exponent def _validate_estimator(self): super()._validate_estimator() if self.kmeans_estimator is None: self.kmeans_estimator_ = 
MiniBatchKMeans(random_state=self.random_state) elif isinstance(self.kmeans_estimator, int): self.kmeans_estimator_ = MiniBatchKMeans( n_clusters=self.kmeans_estimator, random_state=self.random_state, ) else: self.kmeans_estimator_ = clone(self.kmeans_estimator) self.cluster_balance_threshold_ = ( self.cluster_balance_threshold if self.kmeans_estimator_.n_clusters != 1 else -np.inf ) def _find_cluster_sparsity(self, X): """Compute the cluster sparsity.""" euclidean_distances = pairwise_distances( X, metric="euclidean", n_jobs=self.n_jobs ) # negate diagonal elements for ind in range(X.shape[0]): euclidean_distances[ind, ind] = 0 non_diag_elements = (X.shape[0] ** 2) - X.shape[0] mean_distance = euclidean_distances.sum() / non_diag_elements exponent = ( math.log(X.shape[0], 1.6) ** 1.8 * 0.16 if self.density_exponent == "auto" else self.density_exponent ) return (mean_distance**exponent) / X.shape[0] def _fit_resample(self, X, y): self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() total_inp_samples = sum(self.sampling_strategy_.values()) for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue X_clusters = self.kmeans_estimator_.fit_predict(X) valid_clusters = [] cluster_sparsities = [] # identify cluster which are answering the requirements for cluster_idx in range(self.kmeans_estimator_.n_clusters): cluster_mask = np.flatnonzero(X_clusters == cluster_idx) if cluster_mask.size == 0: # empty cluster continue X_cluster = _safe_indexing(X, cluster_mask) y_cluster = _safe_indexing(y, cluster_mask) cluster_class_mean = (y_cluster == class_sample).mean() if self.cluster_balance_threshold_ == "auto": balance_threshold = n_samples / total_inp_samples / 2 else: balance_threshold = self.cluster_balance_threshold_ # the cluster is already considered balanced if cluster_class_mean < balance_threshold: continue # not enough samples to apply SMOTE anticipated_samples = cluster_class_mean * X_cluster.shape[0] if 
anticipated_samples < self.nn_k_.n_neighbors: continue X_cluster_class = _safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample) ) valid_clusters.append(cluster_mask) cluster_sparsities.append(self._find_cluster_sparsity(X_cluster_class)) cluster_sparsities = np.array(cluster_sparsities) cluster_weights = cluster_sparsities / cluster_sparsities.sum() if not valid_clusters: raise RuntimeError( f"No clusters found with sufficient samples of " f"class {class_sample}. Try lowering the " f"cluster_balance_threshold or increasing the number of " f"clusters." ) for valid_cluster_idx, valid_cluster in enumerate(valid_clusters): X_cluster = _safe_indexing(X, valid_cluster) y_cluster = _safe_indexing(y, valid_cluster) X_cluster_class = _safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample) ) self.nn_k_.fit(X_cluster_class) nns = self.nn_k_.kneighbors(X_cluster_class, return_distance=False)[ :, 1: ] cluster_n_samples = int( math.ceil(n_samples * cluster_weights[valid_cluster_idx]) ) X_new, y_new = self._make_samples( X_cluster_class, y.dtype, class_sample, X_cluster_class, nns, cluster_n_samples, 1.0, ) stack = [np.vstack, sparse.vstack][int(sparse.issparse(X_new))] X_resampled = stack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/filter.py000066400000000000000000000467421460233407600245570ustar00rootroot00000000000000"""SMOTE variant applying some filtering before the generation process.""" # Authors: Guillaume Lemaitre # Fernando Nogueira # Christos Aridas # Dzianis Dudnik # License: MIT import numbers import warnings import numpy as np from scipy import sparse from sklearn.base import clone from sklearn.svm import SVC from sklearn.utils import _safe_indexing, check_random_state from ...utils import Substitution, check_neighbors_object from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation 
import HasMethods, Interval, StrOptions from ..base import BaseOverSampler from .base import BaseSMOTE @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class BorderlineSMOTE(BaseSMOTE): """Over-sampling using Borderline SMOTE. This algorithm is a variant of the original SMOTE algorithm proposed in [2]_. Borderline samples will be detected and used to generate new synthetic samples. Read more in the :ref:`User Guide `. .. versionadded:: 0.4 Parameters ---------- {sampling_strategy} {random_state} k_neighbors : int or object, default=5 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. You can pass: - an `int` corresponding to the number of neighbors to use. A `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. - an instance of a compatible nearest neighbors algorithm that should implement both methods `kneighbors` and `kneighbors_graph`. For instance, it could correspond to a :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to any compatible class. {n_jobs} .. deprecated:: 0.10 `n_jobs` has been deprecated in 0.10 and will be removed in 0.12. It was previously used to set `n_jobs` of nearest neighbors algorithm. From now on, you can pass an estimator where `n_jobs` is already set instead. m_neighbors : int or object, default=10 The nearest neighbors used to determine if a minority sample is in "danger". You can pass: - an `int` corresponding to the number of neighbors to use. A `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. - an instance of a compatible nearest neighbors algorithm that should implement both methods `kneighbors` and `kneighbors_graph`. For instance, it could correspond to a :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to any compatible class. 
kind : {{"borderline-1", "borderline-2"}}, default='borderline-1' The type of SMOTE algorithm to use one of the following options: ``'borderline-1'``, ``'borderline-2'``. Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. nn_k_ : estimator object Validated k-nearest neighbours created from the `k_neighbors` parameter. nn_m_ : estimator object Validated m-nearest neighbours created from the `m_neighbors` parameter. in_danger_indices : dict of ndarray Dictionary containing the indices of the samples considered in danger that are used to generate new synthetic samples. The keys corresponds to the class label. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- SMOTE : Over-sample using SMOTE. SMOTENC : Over-sample using SMOTE for continuous and categorical features. SVMSMOTE : Over-sample using SVM-SMOTE variant. ADASYN : Over-sample using ADASYN. KMeansSMOTE : Over-sample applying a clustering before to oversample using SMOTE. Notes ----- See the original papers: [2]_ for more details. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002. .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new over-sampling method in imbalanced data sets learning," Advances in intelligent computing, 878-887, 2005. 
Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import BorderlineSMOTE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sm = BorderlineSMOTE(random_state=42) >>> X_res, y_res = sm.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) """ _parameter_constraints: dict = { **BaseSMOTE._parameter_constraints, "m_neighbors": [ Interval(numbers.Integral, 1, None, closed="left"), HasMethods(["kneighbors", "kneighbors_graph"]), ], "kind": [StrOptions({"borderline-1", "borderline-2"})], } def __init__( self, *, sampling_strategy="auto", random_state=None, k_neighbors=5, n_jobs=None, m_neighbors=10, kind="borderline-1", ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs, ) self.m_neighbors = m_neighbors self.kind = kind def _validate_estimator(self): super()._validate_estimator() self.nn_m_ = check_neighbors_object( "m_neighbors", self.m_neighbors, additional_neighbor=1 ) def _fit_resample(self, X, y): # FIXME: to be removed in 0.12 if self.n_jobs is not None: warnings.warn( "The parameter `n_jobs` has been deprecated in 0.10 and will be " "removed in 0.12. 
You can pass an nearest neighbors estimator where " "`n_jobs` is already set instead.", FutureWarning, ) self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() self.in_danger_indices = {} for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = _safe_indexing(X, target_class_indices) self.nn_m_.fit(X) mask_danger = self._in_danger_noise( self.nn_m_, X_class, class_sample, y, kind="danger" ) if not any(mask_danger): continue X_danger = _safe_indexing(X_class, mask_danger) self.in_danger_indices[class_sample] = target_class_indices[mask_danger] if self.kind == "borderline-1": X_to_sample_from = X_class # consider the positive class only y_to_check_neighbors = None else: # self.kind == "borderline-2" X_to_sample_from = X # consider the whole dataset y_to_check_neighbors = y self.nn_k_.fit(X_to_sample_from) nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:] X_new, y_new = self._make_samples( X_danger, y.dtype, class_sample, X_to_sample_from, nns, n_samples, y=y_to_check_neighbors, ) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) else: X_resampled = np.vstack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class SVMSMOTE(BaseSMOTE): """Over-sampling using SVM-SMOTE. Variant of SMOTE algorithm which use an SVM algorithm to detect sample to use for generating new synthetic samples as proposed in [2]_. Read more in the :ref:`User Guide `. .. versionadded:: 0.4 Parameters ---------- {sampling_strategy} {random_state} k_neighbors : int or object, default=5 The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. 
You can pass: - an `int` corresponding to the number of neighbors to use. A `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. - an instance of a compatible nearest neighbors algorithm that should implement both methods `kneighbors` and `kneighbors_graph`. For instance, it could correspond to a :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to any compatible class. {n_jobs} .. deprecated:: 0.10 `n_jobs` has been deprecated in 0.10 and will be removed in 0.12. It was previously used to set `n_jobs` of nearest neighbors algorithm. From now on, you can pass an estimator where `n_jobs` is already set instead. m_neighbors : int or object, default=10 The nearest neighbors used to determine if a minority sample is in "danger". You can pass: - an `int` corresponding to the number of neighbors to use. A `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case. - an instance of a compatible nearest neighbors algorithm that should implement both methods `kneighbors` and `kneighbors_graph`. For instance, it could correspond to a :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to any compatible class. svm_estimator : estimator object, default=SVC() A parametrized :class:`~sklearn.svm.SVC` classifier can be passed. A scikit-learn compatible estimator can be passed but it is required to expose a `support_` fitted attribute. out_step : float, default=0.5 Step size when extrapolating. Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. nn_k_ : estimator object Validated k-nearest neighbours created from the `k_neighbors` parameter. nn_m_ : estimator object Validated m-nearest neighbours created from the `m_neighbors` parameter. 
svm_estimator_ : estimator object The validated SVM classifier used to detect samples from which to generate new synthetic samples. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- SMOTE : Over-sample using SMOTE. SMOTENC : Over-sample using SMOTE for continuous and categorical features. SMOTEN : Over-sample using the SMOTE variant specifically for categorical features only. BorderlineSMOTE : Over-sample using Borderline-SMOTE. ADASYN : Over-sample using ADASYN. KMeansSMOTE : Over-sample applying a clustering before to oversample using SMOTE. Notes ----- See the original papers: [2]_ for more details. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002. .. [2] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for imbalanced data classification," International Journal of Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2009. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import SVMSMOTE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> sm = SVMSMOTE(random_state=42) >>> X_res, y_res = sm.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 900, 1: 900}}) """ _parameter_constraints: dict = { **BaseSMOTE._parameter_constraints, "m_neighbors": [ Interval(numbers.Integral, 1, None, closed="left"), HasMethods(["kneighbors", "kneighbors_graph"]), ], "svm_estimator": [HasMethods(["fit", "predict"]), None], "out_step": [Interval(numbers.Real, 0, 1, closed="both")], } def __init__( self, *, sampling_strategy="auto", random_state=None, k_neighbors=5, n_jobs=None, m_neighbors=10, svm_estimator=None, out_step=0.5, ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs, ) self.m_neighbors = m_neighbors self.svm_estimator = svm_estimator self.out_step = out_step def _validate_estimator(self): super()._validate_estimator() self.nn_m_ = check_neighbors_object( "m_neighbors", self.m_neighbors, additional_neighbor=1 ) if self.svm_estimator is None: self.svm_estimator_ = SVC(gamma="scale", random_state=self.random_state) else: self.svm_estimator_ = clone(self.svm_estimator) def _fit_resample(self, X, y): # FIXME: to be removed in 0.12 if self.n_jobs is not None: warnings.warn( "The parameter `n_jobs` has been deprecated in 0.10 and will be " "removed in 0.12. 
You can pass an nearest neighbors estimator where " "`n_jobs` is already set instead.", FutureWarning, ) self._validate_estimator() random_state = check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = _safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) if not hasattr(self.svm_estimator_, "support_"): raise RuntimeError( "`svm_estimator` is required to exposed a `support_` fitted " "attribute. Such estimator belongs to the familly of Support " "Vector Machine." ) support_index = self.svm_estimator_.support_[ y[self.svm_estimator_.support_] == class_sample ] support_vector = _safe_indexing(X, support_index) self.nn_m_.fit(X) noise_bool = self._in_danger_noise( self.nn_m_, support_vector, class_sample, y, kind="noise" ) support_vector = _safe_indexing( support_vector, np.flatnonzero(np.logical_not(noise_bool)) ) if support_vector.shape[0] == 0: raise ValueError( "All support vectors are considered as noise. SVM-SMOTE is not " "adapted to your dataset. Try another SMOTE variant." 
) danger_bool = self._in_danger_noise( self.nn_m_, support_vector, class_sample, y, kind="danger" ) safety_bool = np.logical_not(danger_bool) self.nn_k_.fit(X_class) fractions = random_state.beta(10, 10) n_generated_samples = int(fractions * (n_samples + 1)) if np.count_nonzero(danger_bool) > 0: nns = self.nn_k_.kneighbors( _safe_indexing(support_vector, np.flatnonzero(danger_bool)), return_distance=False, )[:, 1:] X_new_1, y_new_1 = self._make_samples( _safe_indexing(support_vector, np.flatnonzero(danger_bool)), y.dtype, class_sample, X_class, nns, n_generated_samples, step_size=1.0, ) if np.count_nonzero(safety_bool) > 0: nns = self.nn_k_.kneighbors( _safe_indexing(support_vector, np.flatnonzero(safety_bool)), return_distance=False, )[:, 1:] X_new_2, y_new_2 = self._make_samples( _safe_indexing(support_vector, np.flatnonzero(safety_bool)), y.dtype, class_sample, X_class, nns, n_samples - n_generated_samples, step_size=-self.out_step, ) if np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0: if sparse.issparse(X_resampled): X_resampled = sparse.vstack([X_resampled, X_new_1, X_new_2]) else: X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), axis=0) elif np.count_nonzero(danger_bool) == 0: if sparse.issparse(X_resampled): X_resampled = sparse.vstack([X_resampled, X_new_2]) else: X_resampled = np.vstack((X_resampled, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_2), axis=0) elif np.count_nonzero(safety_bool) == 0: if sparse.issparse(X_resampled): X_resampled = sparse.vstack([X_resampled, X_new_1]) else: X_resampled = np.vstack((X_resampled, X_new_1)) y_resampled = np.concatenate((y_resampled, y_new_1), axis=0) return X_resampled, y_resampled 
imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/tests/000077500000000000000000000000001460233407600240455ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/tests/__init__.py000066400000000000000000000000001460233407600261440ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/tests/test_borderline_smote.py000066400000000000000000000066421460233407600310220ustar00rootroot00000000000000from collections import Counter import pytest from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression from sklearn.utils._testing import assert_allclose, assert_array_equal from imblearn.over_sampling import BorderlineSMOTE @pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"]) def test_borderline_smote_no_in_danger_samples(kind): """Check that the algorithm behave properly even on a dataset without any sample in danger. """ X, y = make_classification( n_samples=500, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_clusters_per_class=1, n_classes=3, weights=[0.1, 0.2, 0.7], class_sep=1.5, random_state=1, ) smote = BorderlineSMOTE(kind=kind, m_neighbors=3, k_neighbors=5, random_state=0) X_res, y_res = smote.fit_resample(X, y) assert_allclose(X, X_res) assert_allclose(y, y_res) assert not smote.in_danger_indices def test_borderline_smote_kind(): """Check the behaviour of the `kind` parameter. In short, "borderline-2" generates sample closer to the boundary decision than "borderline-1". We generate an example where a logistic regression will perform worse on "borderline-2" than on "borderline-1". 
""" X, y = make_classification( n_samples=500, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_clusters_per_class=1, n_classes=3, weights=[0.1, 0.2, 0.7], class_sep=1.0, random_state=1, ) smote = BorderlineSMOTE( kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0 ) X_res_borderline_1, y_res_borderline_1 = smote.fit_resample(X, y) smote.set_params(kind="borderline-2") X_res_borderline_2, y_res_borderline_2 = smote.fit_resample(X, y) score_borderline_1 = ( LogisticRegression() .fit(X_res_borderline_1, y_res_borderline_1) .score(X_res_borderline_1, y_res_borderline_1) ) score_borderline_2 = ( LogisticRegression() .fit(X_res_borderline_2, y_res_borderline_2) .score(X_res_borderline_2, y_res_borderline_2) ) assert score_borderline_1 > score_borderline_2 def test_borderline_smote_in_danger(): X, y = make_classification( n_samples=500, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_clusters_per_class=1, n_classes=3, weights=[0.1, 0.2, 0.7], class_sep=0.8, random_state=1, ) smote = BorderlineSMOTE( kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0, ) _, y_res_1 = smote.fit_resample(X, y) in_danger_indices_borderline_1 = smote.in_danger_indices smote.set_params(kind="borderline-2") _, y_res_2 = smote.fit_resample(X, y) in_danger_indices_borderline_2 = smote.in_danger_indices for key1, key2 in zip( in_danger_indices_borderline_1, in_danger_indices_borderline_2 ): assert_array_equal( in_danger_indices_borderline_1[key1], in_danger_indices_borderline_2[key2] ) assert len(in_danger_indices_borderline_1) == len(in_danger_indices_borderline_2) counter = Counter(y_res_1) assert counter[0] == counter[1] == counter[2] counter = Counter(y_res_2) assert counter[0] == counter[1] == counter[2] imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/tests/test_kmeans_smote.py000066400000000000000000000070601460233407600301460ustar00rootroot00000000000000import numpy as np import pytest from sklearn.cluster import KMeans, 
MiniBatchKMeans from sklearn.datasets import make_classification from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_allclose, assert_array_equal from imblearn.over_sampling import SMOTE, KMeansSMOTE @pytest.fixture def data(): X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], ] ) y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) return X, y @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") def test_kmeans_smote(data): X, y = data kmeans_smote = KMeansSMOTE( kmeans_estimator=1, random_state=42, cluster_balance_threshold=0.0, k_neighbors=5, ) smote = SMOTE(random_state=42) X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y) X_res_2, y_res_2 = smote.fit_resample(X, y) assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) assert kmeans_smote.nn_k_.n_neighbors == 6 assert kmeans_smote.kmeans_estimator_.n_clusters == 1 assert "batch_size" in kmeans_smote.kmeans_estimator_.get_params() @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") @pytest.mark.parametrize("k_neighbors", [2, NearestNeighbors(n_neighbors=3)]) @pytest.mark.parametrize( "kmeans_estimator", [ 3, KMeans(n_clusters=3, n_init=1, random_state=42), MiniBatchKMeans(n_clusters=3, n_init=1, random_state=42), ], ) def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator): X, y = data kmeans_smote = KMeansSMOTE( random_state=42, kmeans_estimator=kmeans_estimator, 
k_neighbors=k_neighbors, ) X_resampled, y_resampled = kmeans_smote.fit_resample(X, y) assert X_resampled.shape == (24, 2) assert y_resampled.shape == (24,) assert kmeans_smote.nn_k_.n_neighbors == 3 assert kmeans_smote.kmeans_estimator_.n_clusters == 3 @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") def test_sample_kmeans_not_enough_clusters(data): X, y = data smote = KMeansSMOTE(cluster_balance_threshold=10, random_state=42) with pytest.raises(RuntimeError): smote.fit_resample(X, y) @pytest.mark.parametrize("density_exponent", ["auto", 10]) @pytest.mark.parametrize("cluster_balance_threshold", ["auto", 0.1]) def test_sample_kmeans_density_estimation(density_exponent, cluster_balance_threshold): X, y = make_classification( n_samples=10_000, n_classes=2, weights=[0.3, 0.7], random_state=42 ) smote = KMeansSMOTE( kmeans_estimator=MiniBatchKMeans(n_init=1, random_state=42), random_state=0, density_exponent=density_exponent, cluster_balance_threshold=cluster_balance_threshold, ) smote.fit_resample(X, y) imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/tests/test_smote.py000066400000000000000000000116661460233407600266170ustar00rootroot00000000000000"""Test the module SMOTE.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_allclose, assert_array_equal from imblearn.over_sampling import SMOTE RND_SEED = 0 X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 
0.93259929], [1.70580611, -0.11219234], ] ) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 def test_sample_regular(): smote = SMOTE(random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.29307743, -0.14670439], [0.84976473, -0.15570176], [0.61319159, -0.11571668], [0.66052536, -0.28246517], ] ) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_half(): sampling_strategy = {0: 9, 1: 12} smote = SMOTE(sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.36784496, -0.1953161], ] ) y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) 
assert_array_equal(y_resampled, y_gt) def test_sample_regular_with_nn(): nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.29307743, -0.14670439], [0.84976473, -0.15570176], [0.61319159, -0.11571668], [0.66052536, -0.28246517], ] ) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/tests/test_smote_nc.py000066400000000000000000000344021460233407600272700ustar00rootroot00000000000000"""Test the module SMOTENC.""" # Authors: Guillaume Lemaitre # Christos Aridas # Dzianis Dudnik # License: MIT from collections import Counter import numpy as np import pytest import sklearn from scipy import sparse from sklearn.datasets import make_classification from sklearn.preprocessing import OneHotEncoder from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.utils.fixes import parse_version from imblearn.over_sampling import SMOTENC from imblearn.utils.estimator_checks import ( _set_checking_parameters, check_param_validation, ) sklearn_version = parse_version(sklearn.__version__) def data_heterogneous_ordered(): rng = np.random.RandomState(42) X = np.empty((30, 4), dtype=object) # create 2 random continuous feature X[:, :2] = 
rng.randn(30, 2) # create a categorical feature using some string X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) # return the categories return X, y, [2, 3] def data_heterogneous_unordered(): rng = np.random.RandomState(42) X = np.empty((30, 4), dtype=object) # create 2 random continuous feature X[:, [1, 2]] = rng.randn(30, 2) # create a categorical feature using some string X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) # return the categories return X, y, [0, 3] def data_heterogneous_masked(): rng = np.random.RandomState(42) X = np.empty((30, 4), dtype=object) # create 2 random continuous feature X[:, [1, 2]] = rng.randn(30, 2) # create a categorical feature using some string X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) # return the categories return X, y, [True, False, False, True] def data_heterogneous_unordered_multiclass(): rng = np.random.RandomState(42) X = np.empty((50, 4), dtype=object) # create 2 random continuous feature X[:, [1, 2]] = rng.randn(50, 2) # create a categorical feature using some string X[:, 0] = rng.choice(["a", "b", "c"], size=50).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=50) y = np.array([0] * 10 + [1] * 15 + [2] * 25) # return the categories return X, y, [0, 3] def data_sparse(format): rng = np.random.RandomState(42) X = np.empty((30, 4), dtype=np.float64) # create 2 random continuous feature X[:, [1, 2]] = rng.randn(30, 2) # create a categorical feature using some string X[:, 0] = rng.randint(3, size=30) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) y 
= np.array([0] * 10 + [1] * 20) X = sparse.csr_matrix(X) if format == "csr" else sparse.csc_matrix(X) return X, y, [0, 3] def test_smotenc_error(): X, y, _ = data_heterogneous_unordered() categorical_features = [0, 10] smote = SMOTENC(random_state=0, categorical_features=categorical_features) with pytest.raises(ValueError, match="all features must be in"): smote.fit_resample(X, y) @pytest.mark.parametrize( "data", [ data_heterogneous_ordered(), data_heterogneous_unordered(), data_heterogneous_masked(), data_sparse("csr"), data_sparse("csc"), ], ) def test_smotenc(data): X, y, categorical_features = data smote = SMOTENC(random_state=0, categorical_features=categorical_features) X_resampled, y_resampled = smote.fit_resample(X, y) assert X_resampled.dtype == X.dtype categorical_features = np.array(categorical_features) if categorical_features.dtype == bool: categorical_features = np.flatnonzero(categorical_features) for cat_idx in categorical_features: if sparse.issparse(X): assert set(X[:, cat_idx].data) == set(X_resampled[:, cat_idx].data) assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype else: assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx]) assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype assert isinstance(smote.median_std_, dict) # part of the common test which apply to SMOTE-NC even if it is not default # constructible def test_smotenc_check_target_type(): X, _, categorical_features = data_heterogneous_unordered() y = np.linspace(0, 1, 30) smote = SMOTENC(categorical_features=categorical_features, random_state=0) with pytest.raises(ValueError, match="Unknown label type"): smote.fit_resample(X, y) rng = np.random.RandomState(42) y = rng.randint(2, size=(20, 3)) msg = "Multilabel and multioutput targets are not supported." 
with pytest.raises(ValueError, match=msg): smote.fit_resample(X, y) def test_smotenc_samplers_one_label(): X, _, categorical_features = data_heterogneous_unordered() y = np.zeros(30) smote = SMOTENC(categorical_features=categorical_features, random_state=0) with pytest.raises(ValueError, match="needs to have more than 1 class"): smote.fit(X, y) def test_smotenc_fit(): X, y, categorical_features = data_heterogneous_unordered() smote = SMOTENC(categorical_features=categorical_features, random_state=0) smote.fit_resample(X, y) assert hasattr( smote, "sampling_strategy_" ), "No fitted attribute sampling_strategy_" def test_smotenc_fit_resample(): X, y, categorical_features = data_heterogneous_unordered() target_stats = Counter(y) smote = SMOTENC(categorical_features=categorical_features, random_state=0) _, y_res = smote.fit_resample(X, y) _ = Counter(y_res) n_samples = max(target_stats.values()) assert all(value >= n_samples for value in Counter(y_res).values()) def test_smotenc_fit_resample_sampling_strategy(): X, y, categorical_features = data_heterogneous_unordered_multiclass() expected_stat = Counter(y)[1] smote = SMOTENC(categorical_features=categorical_features, random_state=0) sampling_strategy = {2: 25, 0: 25} smote.set_params(sampling_strategy=sampling_strategy) X_res, y_res = smote.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat def test_smotenc_pandas(): pd = pytest.importorskip("pandas") # Check that the samplers handle pandas dataframe and pandas series X, y, categorical_features = data_heterogneous_unordered_multiclass() X_pd = pd.DataFrame(X) smote = SMOTENC(categorical_features=categorical_features, random_state=0) X_res_pd, y_res_pd = smote.fit_resample(X_pd, y) X_res, y_res = smote.fit_resample(X, y) assert_array_equal(X_res_pd.to_numpy(), X_res) assert_allclose(y_res_pd, y_res) assert set(smote.median_std_.keys()) == {0, 1} def test_smotenc_preserve_dtype(): X, y = make_classification( n_samples=50, n_classes=3, n_informative=4, 
weights=[0.2, 0.3, 0.5], random_state=0, ) # Cast X and y to not default dtype X = X.astype(np.float32) y = y.astype(np.int32) smote = SMOTENC(categorical_features=[1], random_state=0) X_res, y_res = smote.fit_resample(X, y) assert X.dtype == X_res.dtype, "X dtype is not preserved" assert y.dtype == y_res.dtype, "y dtype is not preserved" @pytest.mark.parametrize("categorical_features", [[True, True, True], [0, 1, 2]]) def test_smotenc_raising_error_all_categorical(categorical_features): X, y = make_classification( n_features=3, n_informative=1, n_redundant=1, n_repeated=0, n_clusters_per_class=1, ) smote = SMOTENC(categorical_features=categorical_features) err_msg = "SMOTE-NC is not designed to work only with categorical features" with pytest.raises(ValueError, match=err_msg): smote.fit_resample(X, y) def test_smote_nc_with_null_median_std(): # Non-regression test for #662 # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/662 data = np.array( [ [1, 2, 1, "A"], [2, 1, 2, "A"], [2, 1, 2, "A"], [1, 2, 3, "B"], [1, 2, 4, "C"], [1, 2, 5, "C"], [1, 2, 4, "C"], [1, 2, 4, "C"], [1, 2, 4, "C"], ], dtype="object", ) labels = np.array( [ "class_1", "class_1", "class_1", "class_1", "class_2", "class_2", "class_3", "class_3", "class_3", ], dtype=object, ) smote = SMOTENC(categorical_features=[3], k_neighbors=1, random_state=0) X_res, y_res = smote.fit_resample(data, labels) # check that the categorical feature is not random but correspond to the # categories seen in the minority class samples assert_array_equal(X_res[-3:, -1], np.array(["C", "C", "C"], dtype=object)) assert smote.median_std_ == {"class_2": 0.0, "class_3": 0.0} def test_smotenc_categorical_encoder(): """Check that we can pass our own categorical encoder.""" # TODO: only use `sparse_output` when sklearn >= 1.2 param = "sparse" if sklearn_version < parse_version("1.2") else "sparse_output" X, y, categorical_features = data_heterogneous_unordered() smote = 
SMOTENC(categorical_features=categorical_features, random_state=0) smote.fit_resample(X, y) assert getattr(smote.categorical_encoder_, param) is True encoder = OneHotEncoder() encoder.set_params(**{param: False}) smote.set_params(categorical_encoder=encoder).fit_resample(X, y) assert smote.categorical_encoder is encoder assert smote.categorical_encoder_ is not encoder assert getattr(smote.categorical_encoder_, param) is False # TODO(0.13): remove this test def test_smotenc_deprecation_ohe_(): """Check that we raise a deprecation warning when using `ohe_`.""" X, y, categorical_features = data_heterogneous_unordered() smote = SMOTENC(categorical_features=categorical_features, random_state=0) smote.fit_resample(X, y) with pytest.warns(FutureWarning, match="'ohe_' attribute has been deprecated"): smote.ohe_ def test_smotenc_param_validation(): """Check that we validate the parameters correctly since this estimator requires a specific parameter. """ categorical_features = [0] smote = SMOTENC(categorical_features=categorical_features, random_state=0) name = smote.__class__.__name__ _set_checking_parameters(smote) check_param_validation(name, smote) def test_smotenc_bool_categorical(): """Check that we don't try to early convert the full input data to numeric when handling a pandas dataframe. 
Non-regression test for: https://github.com/scikit-learn-contrib/imbalanced-learn/issues/974 """ pd = pytest.importorskip("pandas") X = pd.DataFrame( { "c": pd.Categorical([x for x in "abbacaba" * 3]), "f": [0.3, 0.5, 0.1, 0.2] * 6, "b": [False, False, True] * 8, } ) y = pd.DataFrame({"out": [1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0] * 2}) smote = SMOTENC(categorical_features=[0]) X_res, y_res = smote.fit_resample(X, y) pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) assert len(X_res) == len(y_res) smote.set_params(categorical_features=[0, 2]) X_res, y_res = smote.fit_resample(X, y) pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) assert len(X_res) == len(y_res) X = X.astype({"b": "category"}) X_res, y_res = smote.fit_resample(X, y) pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) assert len(X_res) == len(y_res) def test_smotenc_categorical_features_str(): """Check that we support array-like of strings for `categorical_features` using pandas dataframe. """ pd = pytest.importorskip("pandas") X = pd.DataFrame( { "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "B": ["a", "b"] * 5, "C": ["a", "b", "c"] * 3 + ["a"], } ) X = pd.concat([X] * 10, ignore_index=True) y = np.array([0] * 70 + [1] * 30) smote = SMOTENC(categorical_features=["B", "C"], random_state=0) X_res, y_res = smote.fit_resample(X, y) assert X_res["B"].isin(["a", "b"]).all() assert X_res["C"].isin(["a", "b", "c"]).all() counter = Counter(y_res) assert counter[0] == counter[1] == 70 assert_array_equal(smote.categorical_features_, [1, 2]) assert_array_equal(smote.continuous_features_, [0]) def test_smotenc_categorical_features_auto(): """Check that we can automatically detect categorical features based on pandas dataframe. 
""" pd = pytest.importorskip("pandas") X = pd.DataFrame( { "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "B": ["a", "b"] * 5, "C": ["a", "b", "c"] * 3 + ["a"], } ) X = pd.concat([X] * 10, ignore_index=True) X["B"] = X["B"].astype("category") X["C"] = X["C"].astype("category") y = np.array([0] * 70 + [1] * 30) smote = SMOTENC(categorical_features="auto", random_state=0) X_res, y_res = smote.fit_resample(X, y) assert X_res["B"].isin(["a", "b"]).all() assert X_res["C"].isin(["a", "b", "c"]).all() counter = Counter(y_res) assert counter[0] == counter[1] == 70 assert_array_equal(smote.categorical_features_, [1, 2]) assert_array_equal(smote.continuous_features_, [0]) def test_smote_nc_categorical_features_auto_error(): """Check that we raise a proper error when we cannot use the `'auto'` mode.""" pd = pytest.importorskip("pandas") X = pd.DataFrame( { "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "B": ["a", "b"] * 5, "C": ["a", "b", "c"] * 3 + ["a"], } ) y = np.array([0] * 70 + [1] * 30) smote = SMOTENC(categorical_features="auto", random_state=0) with pytest.raises(ValueError, match="the input data should be a pandas.DataFrame"): smote.fit_resample(X.to_numpy(), y) err_msg = "SMOTE-NC is not designed to work only with numerical features" with pytest.raises(ValueError, match=err_msg): smote.fit_resample(X, y) imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/tests/test_smoten.py000066400000000000000000000062511460233407600267670ustar00rootroot00000000000000import numpy as np import pytest from sklearn.exceptions import DataConversionWarning from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from sklearn.utils._testing import _convert_container from imblearn.over_sampling import SMOTEN @pytest.fixture def data(): rng = np.random.RandomState(0) feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 feature_2 = ["A"] * 40 + ["B"] * 20 feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 X = np.array([feature_1, feature_2, feature_3], dtype=object).T rng.shuffle(X) y = 
np.array([0] * 20 + [1] * 40, dtype=np.int32) y_labels = np.array(["not apple", "apple"], dtype=object) y = y_labels[y] return X, y def test_smoten(data): # overall check for SMOTEN X, y = data sampler = SMOTEN(random_state=0) X_res, y_res = sampler.fit_resample(X, y) assert X_res.shape == (80, 3) assert y_res.shape == (80,) assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) def test_smoten_resampling(): # check if the SMOTEN resample data as expected # we generate data such that "not apple" will be the minority class and # samples from this class will be generated. We will force the "blue" # category to be associated with this class. Therefore, the new generated # samples should as well be from the "blue" category. X = np.array(["green"] * 5 + ["red"] * 10 + ["blue"] * 7, dtype=object).reshape( -1, 1 ) y = np.array( ["apple"] * 5 + ["not apple"] * 3 + ["apple"] * 7 + ["not apple"] * 5 + ["apple"] * 2, dtype=object, ) sampler = SMOTEN(random_state=0) X_res, y_res = sampler.fit_resample(X, y) X_generated, y_generated = X_res[X.shape[0] :], y_res[X.shape[0] :] np.testing.assert_array_equal(X_generated, "blue") np.testing.assert_array_equal(y_generated, "not apple") @pytest.mark.parametrize("sparse_format", ["sparse_csr", "sparse_csc"]) def test_smoten_sparse_input(data, sparse_format): """Check that we handle sparse input in SMOTEN even if it is not efficient. 
Non-regression test for: https://github.com/scikit-learn-contrib/imbalanced-learn/issues/971 """ X, y = data X = OneHotEncoder().fit_transform(X).toarray() X = _convert_container(X, sparse_format) with pytest.warns(DataConversionWarning, match="is not really efficient"): X_res, y_res = SMOTEN(random_state=0).fit_resample(X, y) assert X_res.format == X.format assert X_res.shape[0] == len(y_res) def test_smoten_categorical_encoder(data): """Check that `categorical_encoder` is used when provided.""" X, y = data sampler = SMOTEN(random_state=0) sampler.fit_resample(X, y) assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) assert sampler.categorical_encoder_.dtype == np.int32 encoder = OrdinalEncoder(dtype=np.int64) sampler.set_params(categorical_encoder=encoder).fit_resample(X, y) assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) assert sampler.categorical_encoder is encoder assert sampler.categorical_encoder_ is not encoder assert sampler.categorical_encoder_.dtype == np.int64 imbalanced-learn-0.12.2/imblearn/over_sampling/_smote/tests/test_svm_smote.py000066400000000000000000000054541460233407600275020ustar00rootroot00000000000000import numpy as np import pytest from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression from sklearn.neighbors import NearestNeighbors from sklearn.svm import SVC from sklearn.utils._testing import assert_allclose, assert_array_equal from imblearn.over_sampling import SVMSMOTE @pytest.fixture def data(): X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], 
[-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], ] ) y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) return X, y def test_svm_smote(data): svm_smote = SVMSMOTE(random_state=42) svm_smote_nn = SVMSMOTE( random_state=42, k_neighbors=NearestNeighbors(n_neighbors=6), m_neighbors=NearestNeighbors(n_neighbors=11), svm_estimator=SVC(gamma="scale", random_state=42), ) X_res_1, y_res_1 = svm_smote.fit_resample(*data) X_res_2, y_res_2 = svm_smote_nn.fit_resample(*data) assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) def test_svm_smote_not_svm(data): """Check that we raise a proper error if passing an estimator that does not expose a `support_` fitted attribute.""" err_msg = "`svm_estimator` is required to exposed a `support_` fitted attribute." with pytest.raises(RuntimeError, match=err_msg): SVMSMOTE(svm_estimator=LogisticRegression()).fit_resample(*data) def test_svm_smote_all_noise(data): """Check that we raise a proper error message when all support vectors are detected as noise and there is nothing that we can do. Non-regression test for: https://github.com/scikit-learn-contrib/imbalanced-learn/issues/742 """ X, y = make_classification( n_classes=3, class_sep=0.001, weights=[0.004, 0.451, 0.545], n_informative=3, n_redundant=0, flip_y=0, n_features=3, n_clusters_per_class=2, n_samples=1000, random_state=10, ) with pytest.raises(ValueError, match="SVM-SMOTE is not adapted to your dataset"): SVMSMOTE(k_neighbors=4, random_state=42).fit_resample(X, y) imbalanced-learn-0.12.2/imblearn/over_sampling/base.py000066400000000000000000000047271460233407600227130ustar00rootroot00000000000000""" Base class for the over-sampling method. 
""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numbers from collections.abc import Mapping from ..base import BaseSampler from ..utils._param_validation import Interval, StrOptions class BaseOverSampler(BaseSampler): """Base class for over-sampling algorithms. Warning: This class should not be used directly. Use the derive classes instead. """ _sampling_type = "over-sampling" _sampling_strategy_docstring = """sampling_strategy : float, str, dict or callable, default='auto' Sampling information to resample the data set. - When ``float``, it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling. Therefore, the ratio is expressed as :math:`\\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the number of samples in the minority class after resampling and :math:`N_{M}` is the number of samples in the majority class. .. warning:: ``float`` is only available for **binary** classification. An error is raised for multi-class classification. - When ``str``, specify the class targeted by the resampling. The number of samples in the different classes will be equalized. Possible choices are: ``'minority'``: resample only the minority class; ``'not minority'``: resample all classes but the minority class; ``'not majority'``: resample all classes but the majority class; ``'all'``: resample all classes; ``'auto'``: equivalent to ``'not majority'``. - When ``dict``, the keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted class. - When callable, function taking ``y`` and returns a ``dict``. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class. 
""".strip() # noqa: E501 _parameter_constraints: dict = { "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "minority", "not minority", "not majority", "all"}), Mapping, callable, ], "random_state": ["random_state"], } imbalanced-learn-0.12.2/imblearn/over_sampling/tests/000077500000000000000000000000001460233407600225575ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/over_sampling/tests/__init__.py000066400000000000000000000000001460233407600246560ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/over_sampling/tests/test_adasyn.py000066400000000000000000000076011460233407600254530ustar00rootroot00000000000000"""Test the module under sampler.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_allclose, assert_array_equal from imblearn.over_sampling import ADASYN RND_SEED = 0 X = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], ] ) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 def test_ada_init(): sampling_strategy = "auto" ada = ADASYN(sampling_strategy=sampling_strategy, random_state=RND_SEED) assert ada.random_state == RND_SEED def test_ada_fit_resample(): ada = ADASYN(random_state=RND_SEED) X_resampled, y_resampled = ada.fit_resample(X, Y) X_gt = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, 
-0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.88161986, -0.2829741], [0.35681689, -0.18814597], [1.4148276, 0.05308106], [0.3136591, -0.31327875], ] ) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_ada_fit_resample_nn_obj(): nn = NearestNeighbors(n_neighbors=6) ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) X_resampled, y_resampled = ada.fit_resample(X, Y) X_gt = np.array( [ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.88161986, -0.2829741], [0.35681689, -0.18814597], [1.4148276, 0.05308106], [0.3136591, -0.31327875], ] ) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) imbalanced-learn-0.12.2/imblearn/over_sampling/tests/test_common.py000066400000000000000000000102131460233407600254550ustar00rootroot00000000000000from collections import Counter import numpy as np import pytest from sklearn.cluster import MiniBatchKMeans from 
imblearn.over_sampling import ( ADASYN, SMOTE, SMOTEN, SMOTENC, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE, ) from imblearn.utils.testing import _CustomNearestNeighbors @pytest.fixture def numerical_data(): rng = np.random.RandomState(0) X = rng.randn(100, 2) y = np.repeat([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0], 5) return X, y @pytest.fixture def categorical_data(): rng = np.random.RandomState(0) feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 feature_2 = ["A"] * 40 + ["B"] * 20 feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 X = np.array([feature_1, feature_2, feature_3], dtype=object).T rng.shuffle(X) y = np.array([0] * 20 + [1] * 40, dtype=np.int32) y_labels = np.array(["not apple", "apple"], dtype=object) y = y_labels[y] return X, y @pytest.fixture def heterogeneous_data(): rng = np.random.RandomState(42) X = np.empty((30, 4), dtype=object) X[:, :2] = rng.randn(30, 2) X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) return X, y, [2, 3] @pytest.mark.parametrize( "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"] ) def test_smote_m_neighbors(numerical_data, smote): # check that m_neighbors is properly set. 
Regression test for: # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568 X, y = numerical_data _ = smote.fit_resample(X, y) assert smote.nn_k_.n_neighbors == 6 assert smote.nn_m_.n_neighbors == 11 @pytest.mark.parametrize( "smote, neighbor_estimator_name", [ (ADASYN(random_state=0), "n_neighbors"), (BorderlineSMOTE(random_state=0), "k_neighbors"), ( KMeansSMOTE( kmeans_estimator=MiniBatchKMeans(n_init=1, random_state=0), random_state=1, ), "k_neighbors", ), (SMOTE(random_state=0), "k_neighbors"), (SVMSMOTE(random_state=0), "k_neighbors"), ], ids=["adasyn", "borderline", "kmeans", "smote", "svm"], ) def test_numerical_smote_custom_nn(numerical_data, smote, neighbor_estimator_name): X, y = numerical_data params = { neighbor_estimator_name: _CustomNearestNeighbors(n_neighbors=5), } smote.set_params(**params) X_res, _ = smote.fit_resample(X, y) assert X_res.shape[0] >= 120 def test_categorical_smote_k_custom_nn(categorical_data): X, y = categorical_data smote = SMOTEN(k_neighbors=_CustomNearestNeighbors(n_neighbors=5)) X_res, y_res = smote.fit_resample(X, y) assert X_res.shape == (80, 3) assert Counter(y_res) == {"apple": 40, "not apple": 40} def test_heterogeneous_smote_k_custom_nn(heterogeneous_data): X, y, categorical_features = heterogeneous_data smote = SMOTENC( categorical_features, k_neighbors=_CustomNearestNeighbors(n_neighbors=5) ) X_res, y_res = smote.fit_resample(X, y) assert X_res.shape == (40, 4) assert Counter(y_res) == {0: 20, 1: 20} @pytest.mark.parametrize( "smote", [BorderlineSMOTE(random_state=0), SVMSMOTE(random_state=0)], ids=["borderline", "svm"], ) def test_numerical_smote_extra_custom_nn(numerical_data, smote): X, y = numerical_data smote.set_params(m_neighbors=_CustomNearestNeighbors(n_neighbors=5)) X_res, y_res = smote.fit_resample(X, y) assert X_res.shape == (120, 2) assert Counter(y_res) == {0: 60, 1: 60} # FIXME: to be removed in 0.12 @pytest.mark.parametrize( "sampler", [ ADASYN(random_state=0), 
BorderlineSMOTE(random_state=0), SMOTE(random_state=0), SMOTEN(random_state=0), SMOTENC([0], random_state=0), SVMSMOTE(random_state=0), ], ) def test_n_jobs_deprecation_warning(numerical_data, sampler): X, y = numerical_data sampler.set_params(n_jobs=2) warning_msg = "The parameter `n_jobs` has been deprecated" with pytest.warns(FutureWarning, match=warning_msg): sampler.fit_resample(X, y) imbalanced-learn-0.12.2/imblearn/over_sampling/tests/test_random_over_sampler.py000066400000000000000000000235221460233407600302320ustar00rootroot00000000000000"""Test the module under sampler.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter from datetime import datetime import numpy as np import pytest from sklearn.datasets import make_classification from sklearn.utils._testing import ( _convert_container, assert_allclose, assert_array_equal, ) from imblearn.over_sampling import RandomOverSampler RND_SEED = 0 @pytest.fixture def data(): X = np.array( [ [0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982], ] ) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) return X, Y def test_ros_init(): sampling_strategy = "auto" ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=RND_SEED) assert ros.random_state == RND_SEED @pytest.mark.parametrize( "params", [{"shrinkage": None}, {"shrinkage": 0}, {"shrinkage": {0: 0}}] ) @pytest.mark.parametrize("X_type", ["array", "dataframe"]) def test_ros_fit_resample(X_type, data, params): X, Y = data X_ = _convert_container(X, X_type) ros = RandomOverSampler(**params, random_state=RND_SEED) X_resampled, y_resampled = ros.fit_resample(X_, Y) X_gt = np.array( [ [0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 
0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982], [0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.92923648, 0.76103773], [0.47104475, 0.44386323], ] ) y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) if X_type == "dataframe": assert hasattr(X_resampled, "loc") # FIXME: we should use to_numpy with pandas >= 0.25 X_resampled = X_resampled.values assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) if params["shrinkage"] is None: assert ros.shrinkage_ is None else: assert ros.shrinkage_ == {0: 0} @pytest.mark.parametrize("params", [{"shrinkage": None}, {"shrinkage": 0}]) def test_ros_fit_resample_half(data, params): X, Y = data sampling_strategy = {0: 3, 1: 7} ros = RandomOverSampler( **params, sampling_strategy=sampling_strategy, random_state=RND_SEED ) X_resampled, y_resampled = ros.fit_resample(X, Y) X_gt = np.array( [ [0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982], ] ) y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) if params["shrinkage"] is None: assert ros.shrinkage_ is None else: assert ros.shrinkage_ == {0: 0, 1: 0} @pytest.mark.parametrize("params", [{"shrinkage": None}, {"shrinkage": 0}]) def test_multiclass_fit_resample(data, params): # check the random over-sampling with a multiclass problem X, Y = data y = Y.copy() y[5] = 2 y[6] = 2 ros = RandomOverSampler(**params, random_state=RND_SEED) X_resampled, y_resampled = ros.fit_resample(X, y) count_y_res = Counter(y_resampled) assert count_y_res[0] == 5 assert count_y_res[1] == 5 assert count_y_res[2] == 5 if params["shrinkage"] is None: assert ros.shrinkage_ is None else: assert ros.shrinkage_ == {0: 0, 
2: 0} def test_random_over_sampling_heterogeneous_data(): # check that resampling with heterogeneous dtype is working with basic # resampling X_hetero = np.array( [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=object ) y = np.array([0, 0, 1]) ros = RandomOverSampler(random_state=RND_SEED) X_res, y_res = ros.fit_resample(X_hetero, y) assert X_res.shape[0] == 4 assert y_res.shape[0] == 4 assert X_res.dtype == object assert X_res[-1, 0] in X_hetero[:, 0] def test_random_over_sampling_nan_inf(data): # check that we can oversample even with missing or infinite data # regression tests for #605 X, Y = data rng = np.random.RandomState(42) n_not_finite = X.shape[0] // 3 row_indices = rng.choice(np.arange(X.shape[0]), size=n_not_finite) col_indices = rng.randint(0, X.shape[1], size=n_not_finite) not_finite_values = rng.choice([np.nan, np.inf], size=n_not_finite) X_ = X.copy() X_[row_indices, col_indices] = not_finite_values ros = RandomOverSampler(random_state=0) X_res, y_res = ros.fit_resample(X_, Y) assert y_res.shape == (14,) assert X_res.shape == (14, 2) assert np.any(~np.isfinite(X_res)) def test_random_over_sampling_heterogeneous_data_smoothed_bootstrap(): # check that we raise an error when heterogeneous dtype data are given # and a smoothed bootstrap is requested X_hetero = np.array( [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=object ) y = np.array([0, 0, 1]) ros = RandomOverSampler(shrinkage=1, random_state=RND_SEED) err_msg = "When shrinkage is not None, X needs to contain only numerical" with pytest.raises(ValueError, match=err_msg): ros.fit_resample(X_hetero, y) @pytest.mark.parametrize("X_type", ["dataframe", "array", "sparse_csr", "sparse_csc"]) def test_random_over_sampler_smoothed_bootstrap(X_type, data): # check that smoothed bootstrap is working for numerical array X, y = data sampler = RandomOverSampler(shrinkage=1) X = _convert_container(X, X_type) X_res, y_res = sampler.fit_resample(X, y) assert y_res.shape == (14,) assert 
X_res.shape == (14, 2) if X_type == "dataframe": assert hasattr(X_res, "loc") def test_random_over_sampler_equivalence_shrinkage(data): # check that a shrinkage factor of 0 is equivalent to not create a smoothed # bootstrap X, y = data ros_not_shrink = RandomOverSampler(shrinkage=0, random_state=0) ros_hard_bootstrap = RandomOverSampler(shrinkage=None, random_state=0) X_res_not_shrink, y_res_not_shrink = ros_not_shrink.fit_resample(X, y) X_res, y_res = ros_hard_bootstrap.fit_resample(X, y) assert_allclose(X_res_not_shrink, X_res) assert_allclose(y_res_not_shrink, y_res) assert y_res.shape == (14,) assert X_res.shape == (14, 2) assert y_res_not_shrink.shape == (14,) assert X_res_not_shrink.shape == (14, 2) def test_random_over_sampler_shrinkage_behaviour(data): # check the behaviour of the shrinkage parameter # the covariance of the data generated with the larger shrinkage factor # should also be larger. X, y = data ros = RandomOverSampler(shrinkage=1, random_state=0) X_res_shink_1, y_res_shrink_1 = ros.fit_resample(X, y) ros.set_params(shrinkage=5) X_res_shink_5, y_res_shrink_5 = ros.fit_resample(X, y) disperstion_shrink_1 = np.linalg.det(np.cov(X_res_shink_1[y_res_shrink_1 == 0].T)) disperstion_shrink_5 = np.linalg.det(np.cov(X_res_shink_5[y_res_shrink_5 == 0].T)) assert disperstion_shrink_1 < disperstion_shrink_5 @pytest.mark.parametrize( "shrinkage, err_msg", [ ({}, "`shrinkage` should contain a shrinkage factor for each class"), ({0: -1}, "The shrinkage factor needs to be >= 0"), ], ) def test_random_over_sampler_shrinkage_error(data, shrinkage, err_msg): # check the validation of the shrinkage parameter X, y = data ros = RandomOverSampler(shrinkage=shrinkage) with pytest.raises(ValueError, match=err_msg): ros.fit_resample(X, y) @pytest.mark.parametrize( "sampling_strategy", ["auto", "minority", "not minority", "not majority", "all"] ) def test_random_over_sampler_strings(sampling_strategy): """Check that we support all supposed strings as `sampling_strategy` 
in a sampler inheriting from `BaseOverSampler`.""" X, y = make_classification( n_samples=100, n_clusters_per_class=1, n_classes=3, weights=[0.1, 0.3, 0.6], random_state=0, ) RandomOverSampler(sampling_strategy=sampling_strategy).fit_resample(X, y) def test_random_over_sampling_datetime(): """Check that we don't convert input data and only sample from it.""" pd = pytest.importorskip("pandas") X = pd.DataFrame({"label": [0, 0, 0, 1], "td": [datetime.now()] * 4}) y = X["label"] ros = RandomOverSampler(random_state=0) X_res, y_res = ros.fit_resample(X, y) pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) pd.testing.assert_index_equal(X_res.index, y_res.index) assert_array_equal(y_res.to_numpy(), np.array([0, 0, 0, 1, 1, 1])) def test_random_over_sampler_full_nat(): """Check that we can return timedelta columns full of NaT. Non-regression test for: https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1055 """ pd = pytest.importorskip("pandas") X = pd.DataFrame( { "col_str": ["abc", "def", "xyz"], "col_timedelta": pd.to_timedelta([np.nan, np.nan, np.nan]), } ) y = np.array([0, 0, 1]) X_res, y_res = RandomOverSampler().fit_resample(X, y) assert X_res.shape == (4, 2) assert y_res.shape == (4,) assert X_res["col_timedelta"].dtype == "timedelta64[ns]" imbalanced-learn-0.12.2/imblearn/pipeline.py000066400000000000000000001313121460233407600207300ustar00rootroot00000000000000""" The :mod:`imblearn.pipeline` module implements utilities to build a composite estimator, as a chain of transforms, samples and estimators. 
""" # Adapted from scikit-learn # Author: Edouard Duchesnay # Gael Varoquaux # Virgile Fritsch # Alexandre Gramfort # Lars Buitinck # Christos Aridas # Guillaume Lemaitre # License: BSD from sklearn import pipeline from sklearn.base import clone from sklearn.utils import Bunch, _print_elapsed_time from sklearn.utils.metaestimators import available_if from sklearn.utils.validation import check_memory from .base import _ParamsValidationMixin from .utils._metadata_requests import ( METHODS, MetadataRouter, MethodMapping, _raise_for_params, _routing_enabled, process_routing, ) from .utils._param_validation import HasMethods, validate_params from .utils.fixes import _fit_context METHODS.append("fit_resample") __all__ = ["Pipeline", "make_pipeline"] class Pipeline(_ParamsValidationMixin, pipeline.Pipeline): """Pipeline of transforms and resamples with a final estimator. Sequentially apply a list of transforms, sampling, and a final estimator. Intermediate steps of the pipeline must be transformers or resamplers, that is, they must implement fit, transform and sample methods. The samplers are only applied during fit. The final estimator only needs to implement fit. The transformers and samplers in the pipeline can be cached using ``memory`` argument. The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. For this, it enables setting parameters of the various steps using their names and the parameter name separated by a '__', as in the example below. A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting it to 'passthrough' or ``None``. Parameters ---------- steps : list List of (name, transform) tuples (implementing fit/transform/fit_resample) that are chained, in the order in which they are chained, with the last object an estimator. 
memory : Instance of joblib.Memory or str, default=None Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed. Attributes ---------- named_steps : :class:`~sklearn.utils.Bunch` Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters. classes_ : ndarray of shape (n_classes,) The classes labels. n_features_in_ : int Number of features seen during first step `fit` method. See Also -------- make_pipeline : Helper function to make pipeline. Notes ----- See :ref:`sphx_glr_auto_examples_pipeline_plot_pipeline_classification.py` .. warning:: A surprising behaviour of the `imbalanced-learn` pipeline is that it breaks the `scikit-learn` contract where one expects `estimmator.fit_transform(X, y)` to be equivalent to `estimator.fit(X, y).transform(X)`. The semantic of `fit_resample` is to be applied only during the fit stage. Therefore, resampling will happen when calling `fit_transform` while it will only happen on the `fit` stage when calling `fit` and `transform` separately. Practically, `fit_transform` will lead to a resampled dataset while `fit` and `transform` will not. 
Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from sklearn.model_selection import train_test_split as tts >>> from sklearn.decomposition import PCA >>> from sklearn.neighbors import KNeighborsClassifier as KNN >>> from sklearn.metrics import classification_report >>> from imblearn.over_sampling import SMOTE >>> from imblearn.pipeline import Pipeline >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print(f'Original dataset shape {Counter(y)}') Original dataset shape Counter({1: 900, 0: 100}) >>> pca = PCA() >>> smt = SMOTE(random_state=42) >>> knn = KNN() >>> pipeline = Pipeline([('smt', smt), ('pca', pca), ('knn', knn)]) >>> X_train, X_test, y_train, y_test = tts(X, y, random_state=42) >>> pipeline.fit(X_train, y_train) Pipeline(...) >>> y_hat = pipeline.predict(X_test) >>> print(classification_report(y_test, y_hat)) precision recall f1-score support 0 0.87 1.00 0.93 26 1 1.00 0.98 0.99 224 accuracy 0.98 250 macro avg 0.93 0.99 0.96 250 weighted avg 0.99 0.98 0.98 250 """ _parameter_constraints: dict = { "steps": "no_validation", # validated in `_validate_steps` "memory": [None, str, HasMethods(["cache"])], "verbose": ["boolean"], } # BaseEstimator interface def _validate_steps(self): names, estimators = zip(*self.steps) # validate names self._validate_names(names) # validate estimators transformers = estimators[:-1] estimator = estimators[-1] for t in transformers: if t is None or t == "passthrough": continue if not ( hasattr(t, "fit") or hasattr(t, "fit_transform") or hasattr(t, "fit_resample") ) or not (hasattr(t, "transform") or hasattr(t, "fit_resample")): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or " "fit_resample (but not both) or be a string 'passthrough' " "'%s' (type %s) 
doesn't)" % (t, type(t)) ) if hasattr(t, "fit_resample") and ( hasattr(t, "fit_transform") or hasattr(t, "transform") ): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or " "fit_resample." " '%s' implements both)" % (t) ) if isinstance(t, pipeline.Pipeline): raise TypeError( "All intermediate steps of the chain should not be Pipelines" ) # We allow last estimator to be None as an identity transformation if ( estimator is not None and estimator != "passthrough" and not hasattr(estimator, "fit") ): raise TypeError( "Last step of Pipeline should implement fit or be " "the string 'passthrough'. '%s' (type %s) doesn't" % (estimator, type(estimator)) ) def _iter(self, with_final=True, filter_passthrough=True, filter_resample=True): """Generate (idx, (name, trans)) tuples from self.steps. When `filter_passthrough` is `True`, 'passthrough' and None transformers are filtered out. When `filter_resample` is `True`, estimator with a method `fit_resample` are filtered out. 
""" it = super()._iter(with_final, filter_passthrough) if filter_resample: return filter(lambda x: not hasattr(x[-1], "fit_resample"), it) else: return it # Estimator interface # def _fit(self, X, y=None, **fit_params_steps): def _fit(self, X, y=None, routed_params=None): self.steps = list(self.steps) self._validate_steps() # Setup the memory memory = check_memory(self.memory) fit_transform_one_cached = memory.cache(_fit_transform_one) fit_resample_one_cached = memory.cache(_fit_resample_one) for step_idx, name, transformer in self._iter( with_final=False, filter_passthrough=False, filter_resample=False ): if transformer is None or transformer == "passthrough": with _print_elapsed_time("Pipeline", self._log_message(step_idx)): continue if hasattr(memory, "location") and memory.location is None: # we do not clone when caching is disabled to # preserve backward compatibility cloned_transformer = transformer else: cloned_transformer = clone(transformer) # Fit or load from cache the current transformer if hasattr(cloned_transformer, "transform") or hasattr( cloned_transformer, "fit_transform" ): X, fitted_transformer = fit_transform_one_cached( cloned_transformer, X, y, None, message_clsname="Pipeline", message=self._log_message(step_idx), params=routed_params[name], ) elif hasattr(cloned_transformer, "fit_resample"): X, y, fitted_transformer = fit_resample_one_cached( cloned_transformer, X, y, message_clsname="Pipeline", message=self._log_message(step_idx), params=routed_params[name], ) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) return X, y # The `fit_*` methods need to be overridden to support the samplers. @_fit_context( # estimators in Pipeline.steps are not validated yet prefer_skip_nested_validation=False ) def fit(self, X, y=None, **params): """Fit the model. 
Fit all the transforms/samplers one after the other and transform/sample the data, then fit the transformed/sampled data using the final estimator. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **params : dict of str -> object - If `enable_metadata_routing=False` (default): Parameters passed to the ``fit`` method of each step, where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``. - If `enable_metadata_routing=True`: Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionchanged:: 1.4 Parameters are now passed to the ``transform`` method of the intermediate steps as well, if requested, and if `enable_metadata_routing=True` is set via :func:`~sklearn.set_config`. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- self : Pipeline This estimator. """ routed_params = self._check_method_params(method="fit", props=params) Xt, yt = self._fit(X, y, routed_params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": last_step_params = routed_params[self.steps[-1][0]] self._final_estimator.fit(Xt, yt, **last_step_params["fit"]) return self def _can_fit_transform(self): return ( self._final_estimator == "passthrough" or hasattr(self._final_estimator, "transform") or hasattr(self._final_estimator, "fit_transform") ) @available_if(_can_fit_transform) @_fit_context( # estimators in Pipeline.steps are not validated yet prefer_skip_nested_validation=False ) def fit_transform(self, X, y=None, **params): """Fit the model and transform with the final estimator. 
Fits all the transformers/samplers one after the other and transform/sample the data, then uses fit_transform on transformed data with the final estimator. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **params : dict of str -> object - If `enable_metadata_routing=False` (default): Parameters passed to the ``fit`` method of each step, where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``. - If `enable_metadata_routing=True`: Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionchanged:: 1.4 Parameters are now passed to the ``transform`` method of the intermediate steps as well, if requested, and if `enable_metadata_routing=True`. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- Xt : array-like of shape (n_samples, n_transformed_features) Transformed samples. """ routed_params = self._check_method_params(method="fit_transform", props=params) Xt, yt = self._fit(X, y, routed_params) last_step = self._final_estimator with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if last_step == "passthrough": return Xt last_step_params = routed_params[self.steps[-1][0]] if hasattr(last_step, "fit_transform"): return last_step.fit_transform( Xt, yt, **last_step_params["fit_transform"] ) else: return last_step.fit(Xt, y, **last_step_params["fit"]).transform( Xt, **last_step_params["transform"] ) @available_if(pipeline._final_estimator_has("predict")) def predict(self, X, **params): """Transform the data, and apply `predict` with the final estimator. Call `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `predict` method. 
Only valid if the final estimator implements `predict`. Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. **params : dict of str -> object - If `enable_metadata_routing=False` (default): Parameters to the ``predict`` called at the end of all transformations in the pipeline. - If `enable_metadata_routing=True`: Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionadded:: 0.20 .. versionchanged:: 1.4 Parameters are now passed to the ``transform`` method of the intermediate steps as well, if requested, and if `enable_metadata_routing=True` is set via :func:`~sklearn.set_config`. See :ref:`Metadata Routing User Guide ` for more details. Note that while this may be used to return uncertainties from some models with ``return_std`` or ``return_cov``, uncertainties that are generated by the transformations in the pipeline are not propagated to the final estimator. Returns ------- y_pred : ndarray Result of calling `predict` on the final estimator. """ Xt = X if not _routing_enabled(): for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) return self.steps[-1][1].predict(Xt, **params) # metadata routing enabled routed_params = process_routing(self, "predict", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict(Xt, **routed_params[self.steps[-1][0]].predict) def _can_fit_resample(self): return self._final_estimator == "passthrough" or hasattr( self._final_estimator, "fit_resample" ) @available_if(_can_fit_resample) @_fit_context( # estimators in Pipeline.steps are not validated yet prefer_skip_nested_validation=False ) def fit_resample(self, X, y=None, **params): """Fit the model and sample with the final estimator. 
Fits all the transformers/samplers one after the other and transform/sample the data, then uses fit_resample on transformed data with the final estimator. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **params : dict of str -> object - If `enable_metadata_routing=False` (default): Parameters passed to the ``fit`` method of each step, where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``. - If `enable_metadata_routing=True`: Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionchanged:: 1.4 Parameters are now passed to the ``transform`` method of the intermediate steps as well, if requested, and if `enable_metadata_routing=True`. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- Xt : array-like of shape (n_samples, n_transformed_features) Transformed samples. yt : array-like of shape (n_samples, n_transformed_features) Transformed target. """ routed_params = self._check_method_params(method="fit_resample", props=params) Xt, yt = self._fit(X, y, routed_params) last_step = self._final_estimator with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if last_step == "passthrough": return Xt last_step_params = routed_params[self.steps[-1][0]] if hasattr(last_step, "fit_resample"): return last_step.fit_resample( Xt, yt, **last_step_params["fit_resample"] ) @available_if(pipeline._final_estimator_has("fit_predict")) @_fit_context( # estimators in Pipeline.steps are not validated yet prefer_skip_nested_validation=False ) def fit_predict(self, X, y=None, **params): """Apply `fit_predict` of last step in pipeline after transforms. 
Applies fit_transforms of a pipeline to the data, followed by the fit_predict method of the final estimator in the pipeline. Valid only if the final estimator implements fit_predict. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **params : dict of str -> object - If `enable_metadata_routing=False` (default): Parameters to the ``predict`` called at the end of all transformations in the pipeline. - If `enable_metadata_routing=True`: Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionadded:: 0.20 .. versionchanged:: 1.4 Parameters are now passed to the ``transform`` method of the intermediate steps as well, if requested, and if `enable_metadata_routing=True`. See :ref:`Metadata Routing User Guide ` for more details. Note that while this may be used to return uncertainties from some models with ``return_std`` or ``return_cov``, uncertainties that are generated by the transformations in the pipeline are not propagated to the final estimator. Returns ------- y_pred : ndarray of shape (n_samples,) The predicted target. """ routed_params = self._check_method_params(method="fit_predict", props=params) Xt, yt = self._fit(X, y, routed_params) params_last_step = routed_params[self.steps[-1][0]] with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): y_pred = self.steps[-1][-1].fit_predict( Xt, yt, **params_last_step.get("fit_predict", {}) ) return y_pred # TODO: remove the following methods when the minimum scikit-learn >= 1.4 # They do not depend on resampling but we need to redefine them for the # compatibility with the metadata routing framework. 
@available_if(pipeline._final_estimator_has("predict_proba")) def predict_proba(self, X, **params): """Transform the data, and apply `predict_proba` with the final estimator. Call `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `predict_proba` method. Only valid if the final estimator implements `predict_proba`. Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. **params : dict of str -> object - If `enable_metadata_routing=False` (default): Parameters to the `predict_proba` called at the end of all transformations in the pipeline. - If `enable_metadata_routing=True`: Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionadded:: 0.20 .. versionchanged:: 1.4 Parameters are now passed to the ``transform`` method of the intermediate steps as well, if requested, and if `enable_metadata_routing=True`. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- y_proba : ndarray of shape (n_samples, n_classes) Result of calling `predict_proba` on the final estimator. """ Xt = X if not _routing_enabled(): for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) return self.steps[-1][1].predict_proba(Xt, **params) # metadata routing enabled routed_params = process_routing(self, "predict_proba", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict_proba( Xt, **routed_params[self.steps[-1][0]].predict_proba ) @available_if(pipeline._final_estimator_has("decision_function")) def decision_function(self, X, **params): """Transform the data, and apply `decision_function` with the final estimator. Call `transform` of each transformer in the pipeline. 
The transformed data are finally passed to the final estimator that calls `decision_function` method. Only valid if the final estimator implements `decision_function`. Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. **params : dict of string -> object Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionadded:: 1.4 Only available if `enable_metadata_routing=True`. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- y_score : ndarray of shape (n_samples, n_classes) Result of calling `decision_function` on the final estimator. """ _raise_for_params(params, self, "decision_function") # not branching here since params is only available if # enable_metadata_routing=True routed_params = process_routing(self, "decision_function", **params) Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform( Xt, **routed_params.get(name, {}).get("transform", {}) ) return self.steps[-1][1].decision_function( Xt, **routed_params.get(self.steps[-1][0], {}).get("decision_function", {}) ) @available_if(pipeline._final_estimator_has("score_samples")) def score_samples(self, X): """Transform the data, and apply `score_samples` with the final estimator. Call `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `score_samples` method. Only valid if the final estimator implements `score_samples`. Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. Returns ------- y_score : ndarray of shape (n_samples,) Result of calling `score_samples` on the final estimator. 
""" Xt = X for _, _, transformer in self._iter(with_final=False): Xt = transformer.transform(Xt) return self.steps[-1][1].score_samples(Xt) @available_if(pipeline._final_estimator_has("predict_log_proba")) def predict_log_proba(self, X, **params): """Transform the data, and apply `predict_log_proba` with the final estimator. Call `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `predict_log_proba` method. Only valid if the final estimator implements `predict_log_proba`. Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. **params : dict of str -> object - If `enable_metadata_routing=False` (default): Parameters to the `predict_log_proba` called at the end of all transformations in the pipeline. - If `enable_metadata_routing=True`: Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionadded:: 0.20 .. versionchanged:: 1.4 Parameters are now passed to the ``transform`` method of the intermediate steps as well, if requested, and if `enable_metadata_routing=True`. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- y_log_proba : ndarray of shape (n_samples, n_classes) Result of calling `predict_log_proba` on the final estimator. 
""" Xt = X if not _routing_enabled(): for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) return self.steps[-1][1].predict_log_proba(Xt, **params) # metadata routing enabled routed_params = process_routing(self, "predict_log_proba", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict_log_proba( Xt, **routed_params[self.steps[-1][0]].predict_log_proba ) def _can_transform(self): return self._final_estimator == "passthrough" or hasattr( self._final_estimator, "transform" ) @available_if(_can_transform) def transform(self, X, **params): """Transform the data, and apply `transform` with the final estimator. Call `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `transform` method. Only valid if the final estimator implements `transform`. This also works where final estimator is `None` in which case all prior transformations are applied. Parameters ---------- X : iterable Data to transform. Must fulfill input requirements of first step of the pipeline. **params : dict of str -> object Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionadded:: 1.4 Only available if `enable_metadata_routing=True`. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- Xt : ndarray of shape (n_samples, n_transformed_features) Transformed data. 
""" _raise_for_params(params, self, "transform") # not branching here since params is only available if # enable_metadata_routing=True routed_params = process_routing(self, "transform", **params) Xt = X for _, name, transform in self._iter(): Xt = transform.transform(Xt, **routed_params[name].transform) return Xt def _can_inverse_transform(self): return all(hasattr(t, "inverse_transform") for _, _, t in self._iter()) @available_if(_can_inverse_transform) def inverse_transform(self, Xt, **params): """Apply `inverse_transform` for each step in a reverse order. All estimators in the pipeline must support `inverse_transform`. Parameters ---------- Xt : array-like of shape (n_samples, n_transformed_features) Data samples, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. Must fulfill input requirements of last step of pipeline's ``inverse_transform`` method. **params : dict of str -> object Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionadded:: 1.4 Only available if `enable_metadata_routing=True`. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- Xt : ndarray of shape (n_samples, n_features) Inverse transformed data, that is, data in the original feature space. """ _raise_for_params(params, self, "inverse_transform") # we don't have to branch here, since params is only non-empty if # enable_metadata_routing=True. routed_params = process_routing(self, "inverse_transform", **params) reverse_iter = reversed(list(self._iter())) for _, name, transform in reverse_iter: Xt = transform.inverse_transform( Xt, **routed_params[name].inverse_transform ) return Xt @available_if(pipeline._final_estimator_has("score")) def score(self, X, y=None, sample_weight=None, **params): """Transform the data, and apply `score` with the final estimator. Call `transform` of each transformer in the pipeline. 
The transformed data are finally passed to the final estimator that calls `score` method. Only valid if the final estimator implements `score`. Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Targets used for scoring. Must fulfill label requirements for all steps of the pipeline. sample_weight : array-like, default=None If not None, this argument is passed as ``sample_weight`` keyword argument to the ``score`` method of the final estimator. **params : dict of str -> object Parameters requested and accepted by steps. Each step must have requested certain metadata for these parameters to be forwarded to them. .. versionadded:: 1.4 Only available if `enable_metadata_routing=True`. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- score : float Result of calling `score` on the final estimator. """ Xt = X if not _routing_enabled(): for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) score_params = {} if sample_weight is not None: score_params["sample_weight"] = sample_weight return self.steps[-1][1].score(Xt, y, **score_params) # metadata routing is enabled. routed_params = process_routing( self, "score", sample_weight=sample_weight, **params ) Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].score(Xt, y, **routed_params[self.steps[-1][0]].score) # TODO: once scikit-learn >= 1.4, the following function should be simplified by # calling `super().get_metadata_routing()` def get_metadata_routing(self): """Get metadata routing of this object. Please check :ref:`User Guide ` on how the routing mechanism works. Returns ------- routing : MetadataRouter A :class:`~utils.metadata_routing.MetadataRouter` encapsulating routing information. 
""" router = MetadataRouter(owner=self.__class__.__name__) # first we add all steps except the last one for _, name, trans in self._iter(with_final=False, filter_passthrough=True): method_mapping = MethodMapping() # fit, fit_predict, and fit_transform call fit_transform if it # exists, or else fit and transform if hasattr(trans, "fit_transform"): ( method_mapping.add(caller="fit", callee="fit_transform") .add(caller="fit_transform", callee="fit_transform") .add(caller="fit_predict", callee="fit_transform") .add(caller="fit_resample", callee="fit_transform") ) else: ( method_mapping.add(caller="fit", callee="fit") .add(caller="fit", callee="transform") .add(caller="fit_transform", callee="fit") .add(caller="fit_transform", callee="transform") .add(caller="fit_predict", callee="fit") .add(caller="fit_predict", callee="transform") .add(caller="fit_resample", callee="fit") .add(caller="fit_resample", callee="transform") ) ( method_mapping.add(caller="predict", callee="transform") .add(caller="predict", callee="transform") .add(caller="predict_proba", callee="transform") .add(caller="decision_function", callee="transform") .add(caller="predict_log_proba", callee="transform") .add(caller="transform", callee="transform") .add(caller="inverse_transform", callee="inverse_transform") .add(caller="score", callee="transform") .add(caller="fit_resample", callee="transform") ) router.add(method_mapping=method_mapping, **{name: trans}) final_name, final_est = self.steps[-1] if final_est is None or final_est == "passthrough": return router # then we add the last step method_mapping = MethodMapping() if hasattr(final_est, "fit_transform"): ( method_mapping.add(caller="fit_transform", callee="fit_transform").add( caller="fit_resample", callee="fit_transform" ) ) else: ( method_mapping.add(caller="fit", callee="fit") .add(caller="fit", callee="transform") .add(caller="fit_resample", callee="fit") .add(caller="fit_resample", callee="transform") ) ( method_mapping.add(caller="fit", 
callee="fit") .add(caller="predict", callee="predict") .add(caller="fit_predict", callee="fit_predict") .add(caller="predict_proba", callee="predict_proba") .add(caller="decision_function", callee="decision_function") .add(caller="predict_log_proba", callee="predict_log_proba") .add(caller="transform", callee="transform") .add(caller="inverse_transform", callee="inverse_transform") .add(caller="score", callee="score") .add(caller="fit_resample", callee="fit_resample") ) router.add(method_mapping=method_mapping, **{final_name: final_est}) return router def _check_method_params(self, method, props, **kwargs): if _routing_enabled(): routed_params = process_routing(self, method, **props, **kwargs) return routed_params else: fit_params_steps = Bunch( **{ name: Bunch(**{method: {} for method in METHODS}) for name, step in self.steps if step is not None } ) for pname, pval in props.items(): if "__" not in pname: raise ValueError( "Pipeline.fit does not accept the {} parameter. " "You can pass parameters to specific steps of your " "pipeline using the stepname__parameter format, e.g. " "`Pipeline.fit(X, y, logisticregression__sample_weight" "=sample_weight)`.".format(pname) ) step, param = pname.split("__", 1) fit_params_steps[step]["fit"][param] = pval # without metadata routing, fit_transform and fit_predict # get all the same params and pass it to the last fit. fit_params_steps[step]["fit_transform"][param] = pval fit_params_steps[step]["fit_predict"][param] = pval return fit_params_steps def _fit_resample_one(sampler, X, y, message_clsname="", message=None, params=None): with _print_elapsed_time(message_clsname, message): X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {})) return X_res, y_res, sampler def _transform_one(transformer, X, y, weight, params): """Call transform and apply weight to output. Parameters ---------- transformer : estimator Estimator to be used for transformation. 
X : {array-like, sparse matrix} of shape (n_samples, n_features) Input data to be transformed. y : ndarray of shape (n_samples,) Ignored. weight : float Weight to be applied to the output of the transformation. params : dict Parameters to be passed to the transformer's ``transform`` method. This should be of the form ``process_routing()["step_name"]``. """ res = transformer.transform(X, **params.transform) # if we have a weight for this transformer, multiply output if weight is None: return res return res * weight def _fit_transform_one( transformer, X, y, weight, message_clsname="", message=None, params=None ): """ Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned with the fitted transformer. If ``weight`` is not ``None``, the result will be multiplied by ``weight``. ``params`` needs to be of the form ``process_routing()["step_name"]``. """ params = params or {} with _print_elapsed_time(message_clsname, message): if hasattr(transformer, "fit_transform"): res = transformer.fit_transform(X, y, **params.get("fit_transform", {})) else: res = transformer.fit(X, y, **params.get("fit", {})).transform( X, **params.get("transform", {}) ) if weight is None: return res, transformer return res * weight, transformer @validate_params( {"memory": [None, str, HasMethods(["cache"])], "verbose": ["boolean"]}, prefer_skip_nested_validation=True, ) def make_pipeline(*steps, memory=None, verbose=False): """Construct a Pipeline from the given estimators. This is a shorthand for the Pipeline constructor; it does not require, and does not permit, naming the estimators. Instead, their names will be set to the lowercase of their types automatically. Parameters ---------- *steps : list of estimators A list of estimators. memory : None, str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. 
Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed. Returns ------- p : Pipeline Returns an imbalanced-learn `Pipeline` instance that handles samplers. See Also -------- imblearn.pipeline.Pipeline : Class for creating a pipeline of transforms with a final estimator. Examples -------- >>> from sklearn.naive_bayes import GaussianNB >>> from sklearn.preprocessing import StandardScaler >>> make_pipeline(StandardScaler(), GaussianNB(priors=None)) Pipeline(steps=[('standardscaler', StandardScaler()), ('gaussiannb', GaussianNB())]) """ return Pipeline(pipeline._name_estimators(steps), memory=memory, verbose=verbose) imbalanced-learn-0.12.2/imblearn/tensorflow/000077500000000000000000000000001460233407600207525ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/tensorflow/__init__.py000066400000000000000000000003011460233407600230550ustar00rootroot00000000000000"""The :mod:`imblearn.tensorflow` provides utilities to deal with imbalanced dataset in tensorflow.""" from ._generator import balanced_batch_generator __all__ = ["balanced_batch_generator"] imbalanced-learn-0.12.2/imblearn/tensorflow/_generator.py000066400000000000000000000063361460233407600234610ustar00rootroot00000000000000"""Implement generators for ``tensorflow`` which will balance the data.""" from scipy.sparse import issparse from sklearn.base import clone from sklearn.utils import _safe_indexing, check_random_state from ..under_sampling import RandomUnderSampler from ..utils import Substitution from ..utils._docstring import _random_state_docstring @Substitution(random_state=_random_state_docstring) def 
balanced_batch_generator( X, y, *, sample_weight=None, sampler=None, batch_size=32, keep_sparse=False, random_state=None, ): """Create a balanced batch generator to train tensorflow model. Returns a generator --- as well as the number of step per epoch --- to iterate to get the mini-batches. The sampler defines the sampling strategy used to balance the dataset ahead of creating the batch. The sampler should have an attribute ``sample_indices_``. .. versionadded:: 0.4 Parameters ---------- X : ndarray of shape (n_samples, n_features) Original imbalanced dataset. y : ndarray of shape (n_samples,) or (n_samples, n_classes) Associated targets. sample_weight : ndarray of shape (n_samples,), default=None Sample weight. sampler : sampler object, default=None A sampler instance which has an attribute ``sample_indices_``. By default, the sampler used is a :class:`~imblearn.under_sampling.RandomUnderSampler`. batch_size : int, default=32 Number of samples per gradient update. keep_sparse : bool, default=False Either or not to conserve or not the sparsity of the input ``X``. By default, the returned batches will be dense. {random_state} Returns ------- generator : generator of tuple Generate batch of data. The tuple generated are either (X_batch, y_batch) or (X_batch, y_batch, sampler_weight_batch). steps_per_epoch : int The number of samples per epoch. 
""" random_state = check_random_state(random_state) if sampler is None: sampler_ = RandomUnderSampler(random_state=random_state) else: sampler_ = clone(sampler) sampler_.fit_resample(X, y) if not hasattr(sampler_, "sample_indices_"): raise ValueError("'sampler' needs to have an attribute 'sample_indices_'.") indices = sampler_.sample_indices_ # shuffle the indices since the sampler are packing them by class random_state.shuffle(indices) def generator(X, y, sample_weight, indices, batch_size): while True: for index in range(0, len(indices), batch_size): X_res = _safe_indexing(X, indices[index : index + batch_size]) y_res = _safe_indexing(y, indices[index : index + batch_size]) if issparse(X_res) and not keep_sparse: X_res = X_res.toarray() if sample_weight is None: yield X_res, y_res else: sw_res = _safe_indexing( sample_weight, indices[index : index + batch_size] ) yield X_res, y_res, sw_res return ( generator(X, y, sample_weight, indices, batch_size), int(indices.size // batch_size), ) imbalanced-learn-0.12.2/imblearn/tensorflow/tests/000077500000000000000000000000001460233407600221145ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/tensorflow/tests/__init__.py000066400000000000000000000000001460233407600242130ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/tensorflow/tests/test_generator.py000066400000000000000000000124641460233407600255220ustar00rootroot00000000000000import numpy as np import pytest from scipy import sparse from sklearn.datasets import load_iris from sklearn.utils.fixes import parse_version from imblearn.datasets import make_imbalance from imblearn.over_sampling import RandomOverSampler from imblearn.tensorflow import balanced_batch_generator from imblearn.under_sampling import NearMiss tf = pytest.importorskip("tensorflow") @pytest.fixture def data(): X, y = load_iris(return_X_y=True) X, y = make_imbalance(X, y, sampling_strategy={0: 30, 1: 50, 2: 40}) X = X.astype(np.float32) return X, y def 
check_balanced_batch_generator_tf_1_X_X(dataset, sampler): X, y = dataset batch_size = 10 training_generator, steps_per_epoch = balanced_batch_generator( X, y, sample_weight=None, sampler=sampler, batch_size=batch_size, random_state=42, ) learning_rate = 0.01 epochs = 10 input_size = X.shape[1] output_size = 3 # helper functions def init_weights(shape): return tf.Variable(tf.random_normal(shape, stddev=0.01)) def accuracy(y_true, y_pred): return np.mean(np.argmax(y_pred, axis=1) == y_true) # input and output data = tf.placeholder("float32", shape=[None, input_size]) targets = tf.placeholder("int32", shape=[None]) # build the model and weights W = init_weights([input_size, output_size]) b = init_weights([output_size]) out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) # build the loss, predict, and train operator cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=out_act, labels=targets ) loss = tf.reduce_sum(cross_entropy) optimizer = tf.train.GradientDescentOptimizer(learning_rate) train_op = optimizer.minimize(loss) predict = tf.nn.softmax(out_act) # Initialization of all variables in the graph init = tf.global_variables_initializer() with tf.Session() as sess: sess.run(init) for e in range(epochs): for i in range(steps_per_epoch): X_batch, y_batch = next(training_generator) sess.run( [train_op, loss], feed_dict={data: X_batch, targets: y_batch}, ) # For each epoch, run accuracy on train and test predicts_train = sess.run(predict, feed_dict={data: X}) print(f"epoch: {e} train accuracy: {accuracy(y, predicts_train):.3f}") def check_balanced_batch_generator_tf_2_X_X_compat_1_X_X(dataset, sampler): tf.compat.v1.disable_eager_execution() X, y = dataset batch_size = 10 training_generator, steps_per_epoch = balanced_batch_generator( X, y, sample_weight=None, sampler=sampler, batch_size=batch_size, random_state=42, ) learning_rate = 0.01 epochs = 10 input_size = X.shape[1] output_size = 3 # helper functions def init_weights(shape): return 
tf.Variable(tf.random.normal(shape, stddev=0.01)) def accuracy(y_true, y_pred): return np.mean(np.argmax(y_pred, axis=1) == y_true) # input and output data = tf.compat.v1.placeholder("float32", shape=[None, input_size]) targets = tf.compat.v1.placeholder("int32", shape=[None]) # build the model and weights W = init_weights([input_size, output_size]) b = init_weights([output_size]) out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) # build the loss, predict, and train operator cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=out_act, labels=targets ) loss = tf.reduce_sum(input_tensor=cross_entropy) optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate) train_op = optimizer.minimize(loss) predict = tf.nn.softmax(out_act) # Initialization of all variables in the graph init = tf.compat.v1.global_variables_initializer() with tf.compat.v1.Session() as sess: sess.run(init) for e in range(epochs): for i in range(steps_per_epoch): X_batch, y_batch = next(training_generator) sess.run( [train_op, loss], feed_dict={data: X_batch, targets: y_batch}, ) # For each epoch, run accuracy on train and test predicts_train = sess.run(predict, feed_dict={data: X}) print(f"epoch: {e} train accuracy: {accuracy(y, predicts_train):.3f}") @pytest.mark.parametrize("sampler", [None, NearMiss(), RandomOverSampler()]) def test_balanced_batch_generator(data, sampler): if parse_version(tf.__version__) < parse_version("2.0.0"): check_balanced_batch_generator_tf_1_X_X(data, sampler) else: check_balanced_batch_generator_tf_2_X_X_compat_1_X_X(data, sampler) @pytest.mark.parametrize("keep_sparse", [True, False]) def test_balanced_batch_generator_function_sparse(data, keep_sparse): X, y = data training_generator, steps_per_epoch = balanced_batch_generator( sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10, random_state=42, ) for idx in range(steps_per_epoch): X_batch, y_batch = next(training_generator) if keep_sparse: assert sparse.issparse(X_batch) else: 
assert not sparse.issparse(X_batch) imbalanced-learn-0.12.2/imblearn/tests/000077500000000000000000000000001460233407600177125ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/tests/__init__.py000066400000000000000000000000001460233407600220110ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/tests/test_base.py000066400000000000000000000065051460233407600222430ustar00rootroot00000000000000"""Test for miscellaneous samplers objects.""" # Authors: Guillaume Lemaitre # License: MIT import numpy as np import pytest from scipy import sparse from sklearn.datasets import load_iris, make_regression from sklearn.linear_model import LinearRegression from sklearn.utils import _safe_indexing from sklearn.utils._testing import assert_allclose_dense_sparse, assert_array_equal from sklearn.utils.multiclass import type_of_target from imblearn import FunctionSampler from imblearn.datasets import make_imbalance from imblearn.pipeline import make_pipeline from imblearn.under_sampling import RandomUnderSampler iris = load_iris() X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 10, 1: 25}, random_state=0 ) def test_function_sampler_reject_sparse(): X_sparse = sparse.csr_matrix(X) sampler = FunctionSampler(accept_sparse=False) err_msg = "dense data is required" with pytest.raises( TypeError, match=err_msg, ): sampler.fit_resample(X_sparse, y) @pytest.mark.parametrize( "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] ) def test_function_sampler_identity(X, y): sampler = FunctionSampler() X_res, y_res = sampler.fit_resample(X, y) assert_allclose_dense_sparse(X_res, X) assert_array_equal(y_res, y) @pytest.mark.parametrize( "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] ) def test_function_sampler_func(X, y): def func(X, y): return X[:10], y[:10] sampler = FunctionSampler(func=func) X_res, y_res = sampler.fit_resample(X, y) assert_allclose_dense_sparse(X_res, X[:10]) assert_array_equal(y_res, y[:10]) 
@pytest.mark.parametrize( "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] ) def test_function_sampler_func_kwargs(X, y): def func(X, y, sampling_strategy, random_state): rus = RandomUnderSampler( sampling_strategy=sampling_strategy, random_state=random_state ) return rus.fit_resample(X, y) sampler = FunctionSampler( func=func, kw_args={"sampling_strategy": "auto", "random_state": 0} ) X_res, y_res = sampler.fit_resample(X, y) X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y) assert_allclose_dense_sparse(X_res, X_res_2) assert_array_equal(y_res, y_res_2) def test_function_sampler_validate(): # check that we can let a pass a regression variable by turning down the # validation X, y = make_regression() def dummy_sampler(X, y): indices = np.random.choice(np.arange(X.shape[0]), size=100) return _safe_indexing(X, indices), _safe_indexing(y, indices) sampler = FunctionSampler(func=dummy_sampler, validate=False) pipeline = make_pipeline(sampler, LinearRegression()) y_pred = pipeline.fit(X, y).predict(X) assert type_of_target(y_pred) == "continuous" def test_function_resampler_fit(): # Check that the validation is bypass when calling `fit` # Non-regression test for: # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/782 X = np.array([[1, np.nan], [2, 3], [np.inf, 4]]) y = np.array([0, 1, 1]) def func(X, y): return X[:1], y[:1] sampler = FunctionSampler(func=func, validate=False) sampler.fit(X, y) sampler.fit_resample(X, y) imbalanced-learn-0.12.2/imblearn/tests/test_common.py000066400000000000000000000073231460233407600226200ustar00rootroot00000000000000"""Common tests""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import warnings from collections import OrderedDict import numpy as np import pytest from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing import SkipTest, ignore_warnings, set_random_state from sklearn.utils.estimator_checks 
import _construct_instance, _get_check_estimator_ids from sklearn.utils.estimator_checks import ( parametrize_with_checks as parametrize_with_checks_sklearn, ) from imblearn.over_sampling import RandomOverSampler from imblearn.under_sampling import NearMiss, RandomUnderSampler from imblearn.utils.estimator_checks import ( _set_checking_parameters, check_dataframe_column_names_consistency, check_param_validation, parametrize_with_checks, ) from imblearn.utils.testing import all_estimators @pytest.mark.parametrize("name, Estimator", all_estimators()) def test_all_estimator_no_base_class(name, Estimator): # test that all_estimators doesn't find abstract classes. msg = f"Base estimators such as {name} should not be included" f" in all_estimators" assert not name.lower().startswith("base"), msg def _tested_estimators(): for name, Estimator in all_estimators(): try: estimator = _construct_instance(Estimator) set_random_state(estimator) except SkipTest: continue if isinstance(estimator, NearMiss): # For NearMiss, let's check the three algorithms for version in (1, 2, 3): yield clone(estimator).set_params(version=version) else: yield estimator @parametrize_with_checks_sklearn(list(_tested_estimators())) def test_estimators_compatibility_sklearn(estimator, check, request): _set_checking_parameters(estimator) check(estimator) @parametrize_with_checks(list(_tested_estimators())) def test_estimators_imblearn(estimator, check, request): # Common tests for estimator instances with ignore_warnings( category=( FutureWarning, ConvergenceWarning, UserWarning, FutureWarning, ) ): _set_checking_parameters(estimator) check(estimator) @pytest.mark.parametrize( "estimator", _tested_estimators(), ids=_get_check_estimator_ids ) def test_check_param_validation(estimator): name = estimator.__class__.__name__ _set_checking_parameters(estimator) check_param_validation(name, estimator) @pytest.mark.parametrize("Sampler", [RandomOverSampler, RandomUnderSampler]) def 
test_strategy_as_ordered_dict(Sampler): """Check that it is possible to pass an `OrderedDict` as strategy.""" rng = np.random.RandomState(42) X, y = rng.randn(30, 2), np.array([0] * 10 + [1] * 20) sampler = Sampler(random_state=42) if isinstance(sampler, RandomOverSampler): strategy = OrderedDict({0: 20, 1: 20}) else: strategy = OrderedDict({0: 10, 1: 10}) sampler.set_params(sampling_strategy=strategy) X_res, y_res = sampler.fit_resample(X, y) assert X_res.shape[0] == sum(strategy.values()) assert y_res.shape[0] == sum(strategy.values()) @pytest.mark.parametrize( "estimator", _tested_estimators(), ids=_get_check_estimator_ids ) def test_pandas_column_name_consistency(estimator): _set_checking_parameters(estimator) with ignore_warnings(category=(FutureWarning)): with warnings.catch_warnings(record=True) as record: check_dataframe_column_names_consistency( estimator.__class__.__name__, estimator ) for warning in record: assert "was fitted without feature names" not in str(warning.message) imbalanced-learn-0.12.2/imblearn/tests/test_docstring_parameters.py000066400000000000000000000214111460233407600255410ustar00rootroot00000000000000# Authors: Alexandre Gramfort # Raghav RV # License: BSD 3 clause import importlib import inspect import warnings from inspect import signature from pkgutil import walk_packages import pytest from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression from sklearn.utils import IS_PYPY from sklearn.utils._testing import ( _get_func_name, check_docstring_parameters, ignore_warnings, ) from sklearn.utils.estimator_checks import _enforce_estimator_tags_y try: from sklearn.utils.estimator_checks import _enforce_estimator_tags_x except ImportError: # scikit-learn >= 1.2 from sklearn.utils.estimator_checks import ( _enforce_estimator_tags_X as _enforce_estimator_tags_x, ) from sklearn.utils.deprecation import _is_deprecated from sklearn.utils.estimator_checks import _construct_instance import imblearn 
from imblearn.base import is_sampler
from imblearn.utils.estimator_checks import _set_checking_parameters
from imblearn.utils.testing import all_estimators

# walk_packages() ignores DeprecationWarnings, now we need to ignore
# FutureWarnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    # mypy error: Module has no attribute "__path__"
    imblearn_path = imblearn.__path__  # type: ignore  # mypy issue #1422
    # Every importable public submodule of imblearn (private and test
    # packages excluded) — the docstring checks below iterate over this set.
    PUBLIC_MODULES = set(
        [
            pckg[1]
            for pckg in walk_packages(prefix="imblearn.", path=imblearn_path)
            if not ("._" in pckg[1] or ".tests." in pckg[1])
        ]
    )

# functions to ignore args / docstring of
_DOCSTRING_IGNORES = [
    "RUSBoostClassifier",  # TODO remove after releasing scikit-learn 1.0.1
    "ValueDifferenceMetric",
]

# Methods where y param should be ignored if y=None by default
_METHODS_IGNORE_NONE_Y = [
    "fit",
    "score",
    "fit_predict",
    "fit_transform",
    "partial_fit",
    "predict",
]


# numpydoc 0.8.0's docscrape tool raises because of collections.abc under
# Python 3.7
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@pytest.mark.skipif(IS_PYPY, reason="test segfaults on PyPy")
def test_docstring_parameters():
    # Test module docstring formatting
    # Skip test if numpydoc is not found
    pytest.importorskip(
        "numpydoc", reason="numpydoc is required to test the docstrings"
    )

    # XXX unreached code as of v0.22
    from numpydoc import docscrape

    # Accumulates one message per docstring/signature mismatch found.
    incorrect = []
    for name in PUBLIC_MODULES:
        if name.endswith(".conftest"):
            # pytest tooling, not part of the scikit-learn API
            continue
        with warnings.catch_warnings(record=True):
            module = importlib.import_module(name)
        classes = inspect.getmembers(module, inspect.isclass)
        # Exclude non-scikit-learn classes
        classes = [cls for cls in classes if cls[1].__module__.startswith("imblearn")]
        for cname, cls in classes:
            this_incorrect = []
            # skip explicitly ignored and private classes
            if cname in _DOCSTRING_IGNORES or cname.startswith("_"):
                continue
            if inspect.isabstract(cls):
                continue

            with warnings.catch_warnings(record=True) as w:
                cdoc = docscrape.ClassDoc(cls)
            if len(w):
                raise RuntimeError(
                    "Error for __init__ of %s in %s:\n%s" % (cls, name, w[0])
                )

            cls_init = getattr(cls, "__init__", None)

            if _is_deprecated(cls_init):
                continue
            elif cls_init is not None:
                # the constructor signature is documented in the class docstring
                this_incorrect += check_docstring_parameters(cls.__init__, cdoc)

            for method_name in cdoc.methods:
                method = getattr(cls, method_name)
                if _is_deprecated(method):
                    continue
                param_ignore = None
                # Now skip docstring test for y when y is None
                # by default for API reason
                if method_name in _METHODS_IGNORE_NONE_Y:
                    sig = signature(method)
                    if "y" in sig.parameters and sig.parameters["y"].default is None:
                        param_ignore = ["y"]  # ignore y for fit and score
                result = check_docstring_parameters(method, ignore=param_ignore)
                this_incorrect += result

            incorrect += this_incorrect

        functions = inspect.getmembers(module, inspect.isfunction)
        # Exclude imported functions
        functions = [fn for fn in functions if fn[1].__module__ == name]
        for fname, func in functions:
            # Don't test private methods / functions
            if fname.startswith("_"):
                continue
            if fname == "configuration" and name.endswith("setup"):
                continue
            name_ = _get_func_name(func)
            if not any(d in name_ for d in _DOCSTRING_IGNORES) and not _is_deprecated(
                func
            ):
                incorrect += check_docstring_parameters(func)

    msg = "\n".join(incorrect)
    if len(incorrect) > 0:
        raise AssertionError("Docstring Error:\n" + msg)


# NOTE(review): test_tabs() is cut at the chunk boundary — its loop body
# continues on the next chunk line.
@ignore_warnings(category=FutureWarning)
def test_tabs():
    # Test that there are no tabs in our source files
    for importer, modname, ispkg in walk_packages(
        imblearn.__path__, prefix="imblearn."
): if IS_PYPY: continue # because we don't import mod = importlib.import_module(modname) try: source = inspect.getsource(mod) except IOError: # user probably should have run "make clean" continue assert "\t" not in source, ( '"%s" has tabs, please remove them ', "or add it to the ignore list" % modname, ) def _construct_compose_pipeline_instance(Estimator): # Minimal / degenerate instances: only useful to test the docstrings. if Estimator.__name__ == "Pipeline": return Estimator(steps=[("clf", LogisticRegression())]) @pytest.mark.parametrize("name, Estimator", all_estimators()) def test_fit_docstring_attributes(name, Estimator): pytest.importorskip("numpydoc") from numpydoc import docscrape if Estimator.__name__ in _DOCSTRING_IGNORES: return doc = docscrape.ClassDoc(Estimator) attributes = doc["Attributes"] if Estimator.__name__ == "Pipeline": est = _construct_compose_pipeline_instance(Estimator) else: est = _construct_instance(Estimator) _set_checking_parameters(est) X, y = make_classification( n_samples=20, n_features=3, n_redundant=0, n_classes=2, random_state=2, ) y = _enforce_estimator_tags_y(est, y) X = _enforce_estimator_tags_x(est, X) if "oob_score" in est.get_params(): est.set_params(bootstrap=True, oob_score=True) if is_sampler(est): est.fit_resample(X, y) else: est.fit(X, y) skipped_attributes = set( [ "base_estimator_", # this attribute exist with old version of sklearn ] ) for attr in attributes: if attr.name in skipped_attributes: continue desc = " ".join(attr.desc).lower() # As certain attributes are present "only" if a certain parameter is # provided, this checks if the word "only" is present in the attribute # description, and if not the attribute is required to be present. 
if "only " in desc: continue # ignore deprecation warnings with ignore_warnings(category=FutureWarning): assert hasattr(est, attr.name) fit_attr = _get_all_fitted_attributes(est) fit_attr_names = [attr.name for attr in attributes] undocumented_attrs = set(fit_attr).difference(fit_attr_names) undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes) if undocumented_attrs: raise AssertionError( f"Undocumented attributes for {Estimator.__name__}: {undocumented_attrs}" ) def _get_all_fitted_attributes(estimator): "Get all the fitted attributes of an estimator including properties" # attributes fit_attr = list(estimator.__dict__.keys()) # properties with warnings.catch_warnings(): warnings.filterwarnings("error", category=FutureWarning) for name in dir(estimator.__class__): obj = getattr(estimator.__class__, name) if not isinstance(obj, property): continue # ignore properties that raises an AttributeError and deprecated # properties try: getattr(estimator, name) except (AttributeError, FutureWarning): continue fit_attr.append(name) return [k for k in fit_attr if k.endswith("_") and not k.startswith("_")] imbalanced-learn-0.12.2/imblearn/tests/test_exceptions.py000066400000000000000000000005671460233407600235140ustar00rootroot00000000000000"""Test for the exceptions modules""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from pytest import raises from imblearn.exceptions import raise_isinstance_error def test_raise_isinstance_error(): var = 10.0 with raises(ValueError, match="has to be one of"): raise_isinstance_error("var", [int], var) imbalanced-learn-0.12.2/imblearn/tests/test_pipeline.py000066400000000000000000001257071460233407600231440ustar00rootroot00000000000000""" Test the pipeline module. 
""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import itertools import re import shutil import time from tempfile import mkdtemp import numpy as np import pytest import sklearn from joblib import Memory from pytest import raises from sklearn.base import BaseEstimator, clone from sklearn.cluster import KMeans from sklearn.datasets import load_iris, make_classification from sklearn.decomposition import PCA from sklearn.feature_selection import SelectKBest, f_classif from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.neighbors import LocalOutlierFactor from sklearn.pipeline import FeatureUnion from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC from sklearn.utils._testing import ( assert_allclose, assert_array_almost_equal, assert_array_equal, ) from sklearn.utils.fixes import parse_version from imblearn.datasets import make_imbalance from imblearn.pipeline import Pipeline, make_pipeline from imblearn.under_sampling import EditedNearestNeighbours as ENN from imblearn.under_sampling import RandomUnderSampler from imblearn.utils.estimator_checks import check_param_validation sklearn_version = parse_version(sklearn.__version__) JUNK_FOOD_DOCS = ( "the pizza pizza beer copyright", "the pizza burger beer copyright", "the the pizza beer beer copyright", "the burger beer beer copyright", "the coke burger coke copyright", "the coke burger burger", ) R_TOL = 1e-4 class NoFit: """Small class to test parameter dispatching.""" def __init__(self, a=None, b=None): self.a = a self.b = b class NoTrans(NoFit): def fit(self, X, y): return self def get_params(self, deep=False): return {"a": self.a, "b": self.b} def set_params(self, **params): self.a = params["a"] return self class NoInvTransf(NoTrans): def transform(self, X, y=None): return X class Transf(NoInvTransf): def transform(self, X, y=None): return X def inverse_transform(self, X): return X class TransfFitParams(Transf): def fit(self, X, y, 
**fit_params): self.fit_params = fit_params return self class Mult(BaseEstimator): def __init__(self, mult=1): self.mult = mult def fit(self, X, y): return self def transform(self, X): return np.asarray(X) * self.mult def inverse_transform(self, X): return np.asarray(X) / self.mult def predict(self, X): return (np.asarray(X) * self.mult).sum(axis=1) predict_proba = predict_log_proba = decision_function = predict def score(self, X, y=None): return np.sum(X) class FitParamT(BaseEstimator): """Mock classifier""" def __init__(self): self.successful = False def fit(self, X, y, should_succeed=False): self.successful = should_succeed def predict(self, X): return self.successful def fit_predict(self, X, y, should_succeed=False): self.fit(X, y, should_succeed=should_succeed) return self.predict(X) def score(self, X, y=None, sample_weight=None): if sample_weight is not None: X = X * sample_weight return np.sum(X) class DummyTransf(Transf): """Transformer which store the column means""" def fit(self, X, y): self.means_ = np.mean(X, axis=0) # store timestamp to figure out whether the result of 'fit' has been # cached or not self.timestamp_ = time.time() return self class DummyEstimatorParams(BaseEstimator): """Mock classifier that takes params on predict""" def fit(self, X, y): return self def predict(self, X, got_attribute=False): self.got_attribute = got_attribute return self class DummySampler(NoTrans): """Samplers which returns a balanced number of samples""" def fit_resample(self, X, y): self.means_ = np.mean(X, axis=0) # store timestamp to figure out whether the result of 'fit' has been # cached or not self.timestamp_ = time.time() return X, y class FitTransformSample(NoTrans): """Estimator implementing both transform and sample""" def fit(self, X, y, should_succeed=False): pass def fit_resample(self, X, y=None): return X, y def fit_transform(self, X, y=None): return self.fit(X, y).transform(X) def transform(self, X, y=None): return X def test_pipeline_init_tuple(): # 
# NOTE(review): continuation of test_pipeline_init_tuple() — its "def" line and
# the "#" opening this comment sit at the end of the previous chunk line.
# Pipeline accepts steps as tuple
    X = np.array([[1, 2]])
    pipe = Pipeline((("transf", Transf()), ("clf", FitParamT())))
    pipe.fit(X, y=None)
    pipe.score(X)

    # a step may be replaced by the literal string "passthrough"
    pipe.set_params(transf="passthrough")
    pipe.fit(X, y=None)
    pipe.score(X)


def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()

    # Check that we can't instantiate pipelines with objects without fit
    # method
    X, y = load_iris(return_X_y=True)
    error_regex = (
        "Last step of Pipeline should implement fit or be the string 'passthrough'"
    )
    with raises(TypeError, match=error_regex):
        model = Pipeline([("clf", NoFit())])
        model.fit(X, y)

    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([("svc", clf)])
    expected = dict(svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC(gamma="scale")
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([("anova", filter1), ("svc", clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = "implement fit and transform or fit_resample"
    with raises(TypeError, match=error_regex):
        model = Pipeline([("t", NoTrans()), ("svc", clf)])
        model.fit(X, y)

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps["svc"] is not pipe2.named_steps["svc"]

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that where copied
    params.pop("svc")
    params.pop("anova")
    params2.pop("svc")
    params2.pop("anova")
    assert params == params2


def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression(solver="lbfgs", multi_class="auto")
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([("anova", filter1), ("logistic", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)


def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert pipe.predict(None)
    # and transformer params should not be changed
    assert pipe.named_steps["transf"].a is None
    assert pipe.named_steps["transf"].b is None
    # invalid parameters should raise an error message
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.fit(None, None, clf__bad=True)


def test_pipeline_sample_weight_supported():
    # Pipeline should pass sample_weight
    X = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, y=None) == 3
    assert pipe.score(X, y=None, sample_weight=None) == 3
    # FitParamT.score multiplies X by the weights before summing
    assert pipe.score(X, sample_weight=np.array([2, 3])) == 8


def test_pipeline_sample_weight_unsupported():
    # When sample_weight is None it shouldn't be passed
    X = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", Mult())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, sample_weight=None) == 3
    # Mult.score does not accept sample_weight at all
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.score(X, sample_weight=np.array([2, 3]))


# NOTE(review): cut at the chunk boundary — the body of this test continues on
# the next chunk line.
def test_pipeline_raise_set_params_error():
    # Test pipeline raises set params error message for nested models.
pipe = Pipeline([("cls", LinearRegression())]) with raises(ValueError, match="Invalid parameter"): pipe.set_params(fake="nope") # nested model check with raises(ValueError, match="Invalid parameter"): pipe.set_params(fake__estimator="nope") def test_pipeline_methods_pca_svm(): # Test the various methods of the pipeline (pca + svm). iris = load_iris() X = iris.data y = iris.target # Test with PCA + SVC clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA(svd_solver="full", n_components="mle", whiten=True) pipe = Pipeline([("pca", pca), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_methods_preprocessing_svm(): # Test the various methods of the pipeline (preprocessing + svm). iris = load_iris() X = iris.data y = iris.target n_samples = X.shape[0] n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2, svd_solver="randomized", whiten=True) clf = SVC( gamma="scale", probability=True, random_state=0, decision_function_shape="ovr", ) for preprocessing in [scaler, pca]: pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)]) pipe.fit(X, y) # check shapes of various prediction functions predict = pipe.predict(X) assert predict.shape == (n_samples,) proba = pipe.predict_proba(X) assert proba.shape == (n_samples, n_classes) log_proba = pipe.predict_log_proba(X) assert log_proba.shape == (n_samples, n_classes) decision_function = pipe.decision_function(X) assert decision_function.shape == (n_samples, n_classes) pipe.score(X, y) def test_fit_predict_on_pipeline(): # test that the fit_predict method is implemented on a pipeline # test that the fit_predict on pipeline yields same results as applying # transform and clustering steps separately iris = load_iris() scaler = StandardScaler() km = KMeans(random_state=0, n_init=10) # As pipeline doesn't clone estimators on construction, # it must have its own estimators scaler_for_pipeline = 
StandardScaler() km_for_pipeline = KMeans(random_state=0, n_init=10) # first compute the transform and clustering step separately scaled = scaler.fit_transform(iris.data) separate_pred = km.fit_predict(scaled) # use a pipeline to do the transform and clustering in one step pipe = Pipeline([("scaler", scaler_for_pipeline), ("Kmeans", km_for_pipeline)]) pipeline_pred = pipe.fit_predict(iris.data) assert_array_almost_equal(pipeline_pred, separate_pred) def test_fit_predict_on_pipeline_without_fit_predict(): # tests that a pipeline does not have fit_predict method when final # step of pipeline does not have fit_predict defined scaler = StandardScaler() pca = PCA(svd_solver="full") pipe = Pipeline([("scaler", scaler), ("pca", pca)]) error_regex = "has no attribute 'fit_predict'" with raises(AttributeError, match=error_regex): getattr(pipe, "fit_predict") def test_fit_predict_with_intermediate_fit_params(): # tests that Pipeline passes fit_params to intermediate steps # when fit_predict is invoked pipe = Pipeline([("transf", TransfFitParams()), ("clf", FitParamT())]) pipe.fit_predict( X=None, y=None, transf__should_get_this=True, clf__should_succeed=True ) assert pipe.named_steps["transf"].fit_params["should_get_this"] assert pipe.named_steps["clf"].successful assert "should_succeed" not in pipe.named_steps["transf"].fit_params def test_pipeline_transform(): # Test whether pipeline works with a transformer at the end. 
# Also test pipeline.transform and pipeline.inverse_transform iris = load_iris() X = iris.data pca = PCA(n_components=2, svd_solver="full") pipeline = Pipeline([("pca", pca)]) # test transform and fit_transform: X_trans = pipeline.fit(X).transform(X) X_trans2 = pipeline.fit_transform(X) X_trans3 = pca.fit_transform(X) assert_array_almost_equal(X_trans, X_trans2) assert_array_almost_equal(X_trans, X_trans3) X_back = pipeline.inverse_transform(X_trans) X_back2 = pca.inverse_transform(X_trans) assert_array_almost_equal(X_back, X_back2) def test_pipeline_fit_transform(): # Test whether pipeline works with a transformer missing fit_transform iris = load_iris() X = iris.data y = iris.target transf = Transf() pipeline = Pipeline([("mock", transf)]) # test fit_transform: X_trans = pipeline.fit_transform(X, y) X_trans2 = transf.fit(X, y).transform(X) assert_array_almost_equal(X_trans, X_trans2) def test_set_pipeline_steps(): transf1 = Transf() transf2 = Transf() pipeline = Pipeline([("mock", transf1)]) assert pipeline.named_steps["mock"] is transf1 # Directly setting attr pipeline.steps = [("mock2", transf2)] assert "mock" not in pipeline.named_steps assert pipeline.named_steps["mock2"] is transf2 assert [("mock2", transf2)] == pipeline.steps # Using set_params pipeline.set_params(steps=[("mock", transf1)]) assert [("mock", transf1)] == pipeline.steps # Using set_params to replace single step pipeline.set_params(mock=transf2) assert [("mock", transf2)] == pipeline.steps # With invalid data pipeline.set_params(steps=[("junk", ())]) with raises(TypeError): pipeline.fit([[1]], [1]) with raises(AttributeError): pipeline.fit_transform([[1]], [1]) @pytest.mark.parametrize("passthrough", [None, "passthrough"]) def test_pipeline_correctly_adjusts_steps(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) pipeline = Pipeline( [("m2", mult2), ("bad", passthrough), ("m3", mult3), ("m5", mult5)] ) pipeline.fit(X, y) 
expected_names = ["m2", "bad", "m3", "m5"] actual_names = [name for name, _ in pipeline.steps] assert expected_names == actual_names @pytest.mark.parametrize("passthrough", [None, "passthrough"]) def test_set_pipeline_step_passthrough(passthrough): # Test setting Pipeline steps to None X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) def make(): return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)]) pipeline = make() exp = 2 * 3 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline.set_params(m3=passthrough) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) expected_params = { "steps": pipeline.steps, "m2": mult2, "m3": passthrough, "last": mult5, "memory": None, "m2__mult": 2, "last__mult": 5, "verbose": False, } assert pipeline.get_params(deep=True) == expected_params pipeline.set_params(m2=passthrough) exp = 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: other_methods = [ "predict_proba", "predict_log_proba", "decision_function", "transform", "score", ] for method in other_methods: getattr(pipeline, method)(X) pipeline.set_params(m2=mult2) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline = make() pipeline.set_params(last=passthrough) # mult2 and mult3 are active exp = 6 pipeline.fit(X, y) pipeline.transform(X) assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, 
y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) with raises(AttributeError, match="has no attribute 'predict'"): getattr(pipeline, "predict") # Check 'passthrough' step at construction time exp = 2 * 5 pipeline = Pipeline([("m2", mult2), ("m3", passthrough), ("last", mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) def test_pipeline_ducktyping(): pipeline = make_pipeline(Mult(5)) pipeline.predict pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(Transf()) assert not hasattr(pipeline, "predict") pipeline.transform pipeline.inverse_transform pipeline = make_pipeline("passthrough") assert pipeline.steps[0] == ("passthrough", "passthrough") assert not hasattr(pipeline, "predict") pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(Transf(), NoInvTransf()) assert not hasattr(pipeline, "predict") pipeline.transform assert not hasattr(pipeline, "inverse_transform") pipeline = make_pipeline(NoInvTransf(), Transf()) assert not hasattr(pipeline, "predict") pipeline.transform assert not hasattr(pipeline, "inverse_transform") def test_make_pipeline(): t1 = Transf() t2 = Transf() pipe = make_pipeline(t1, t2) assert isinstance(pipe, Pipeline) assert pipe.steps[0][0] == "transf-1" assert pipe.steps[1][0] == "transf-2" pipe = make_pipeline(t1, t2, FitParamT()) assert isinstance(pipe, Pipeline) assert pipe.steps[0][0] == "transf-1" assert pipe.steps[1][0] == "transf-2" assert pipe.steps[2][0] == "fitparamt" def test_classes_property(): iris = load_iris() X = iris.data y = iris.target reg = make_pipeline(SelectKBest(k=1), LinearRegression()) reg.fit(X, y) with raises(AttributeError): getattr(reg, "classes_") clf = make_pipeline( SelectKBest(k=1), LogisticRegression(solver="lbfgs", multi_class="auto", random_state=0), ) with raises(AttributeError): getattr(clf, "classes_") clf.fit(X, y) 
assert_array_equal(clf.classes_, np.unique(y)) def test_pipeline_memory_transformer(): iris = load_iris() X = iris.data y = iris.target cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(gamma="scale", probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline expected_ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(gamma="scale", probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline( [("transf_2", transf_2), ("svc", clf_2)], memory=memory ) 
cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal( pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X) ) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe_2.named_steps["transf_2"].means_, ) assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts finally: shutil.rmtree(cachedir) def test_pipeline_memory_sampler(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(gamma="scale", probability=True, random_state=0) transf = DummySampler() pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline expected_ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) 
assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(gamma="scale", probability=True, random_state=0) transf_2 = DummySampler() cached_pipe_2 = Pipeline( [("transf_2", transf_2), ("svc", clf_2)], memory=memory ) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal( pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X) ) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe_2.named_steps["transf_2"].means_, ) assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts finally: shutil.rmtree(cachedir) def test_pipeline_methods_pca_rus_svm(): # Test the various methods of the pipeline (pca + svm). X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) # Test with PCA + SVC clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) pipe = Pipeline([("pca", pca), ("rus", rus), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_methods_rus_pca_svm(): # Test the various methods of the pipeline (pca + svm). 
X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) # Test with PCA + SVC clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) pipe = Pipeline([("rus", rus), ("pca", pca), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_sample(): # Test whether pipeline works with a sampler at the end. # Also test pipeline.sampler X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) rus = RandomUnderSampler(random_state=0) pipeline = Pipeline([("rus", rus)]) # test transform and fit_transform: X_trans, y_trans = pipeline.fit_resample(X, y) X_trans2, y_trans2 = rus.fit_resample(X, y) assert_allclose(X_trans, X_trans2, rtol=R_TOL) assert_allclose(y_trans, y_trans2, rtol=R_TOL) pca = PCA() pipeline = Pipeline([("pca", PCA()), ("rus", rus)]) X_trans, y_trans = pipeline.fit_resample(X, y) X_pca = pca.fit_transform(X) X_trans2, y_trans2 = rus.fit_resample(X_pca, y) # We round the value near to zero. It seems that PCA has some issue # with that X_trans[np.bitwise_and(X_trans < R_TOL, X_trans > -R_TOL)] = 0 X_trans2[np.bitwise_and(X_trans2 < R_TOL, X_trans2 > -R_TOL)] = 0 assert_allclose(X_trans, X_trans2, rtol=R_TOL) assert_allclose(y_trans, y_trans2, rtol=R_TOL) def test_pipeline_sample_transform(): # Test whether pipeline works with a sampler at the end. 
# Also test pipeline.sampler X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) rus = RandomUnderSampler(random_state=0) pca = PCA() pca2 = PCA() pipeline = Pipeline([("pca", pca), ("rus", rus), ("pca2", pca2)]) pipeline.fit(X, y).transform(X) def test_pipeline_none_classifier(): # Test pipeline using None as preprocessing step and a classifier X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) clf = LogisticRegression(solver="lbfgs", random_state=0) pipe = make_pipeline(None, clf) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.decision_function(X) pipe.score(X, y) def test_pipeline_none_sampler_classifier(): # Test pipeline using None, RUS and a classifier X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) clf = LogisticRegression(solver="lbfgs", random_state=0) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(None, rus, clf) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.decision_function(X) pipe.score(X, y) def test_pipeline_sampler_none_classifier(): # Test pipeline using RUS, None and a classifier X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) clf = LogisticRegression(solver="lbfgs", random_state=0) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(rus, None, clf) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.decision_function(X) pipe.score(X, y) def test_pipeline_none_sampler_sample(): # Test pipeline using None step and a sampler X, y = 
make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(None, rus) pipe.fit_resample(X, y) def test_pipeline_none_transformer(): # Test pipeline using None and a transformer that implements transform and # inverse_transform X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) pca = PCA(whiten=True) pipe = make_pipeline(None, pca) pipe.fit(X, y) X_trans = pipe.transform(X) X_inversed = pipe.inverse_transform(X_trans) assert_array_almost_equal(X, X_inversed) def test_pipeline_methods_anova_rus(): # Test the various methods of the pipeline (anova). X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) # Test with RandomUnderSampling + Anova + LogisticRegression clf = LogisticRegression(solver="lbfgs") rus = RandomUnderSampler(random_state=0) filter1 = SelectKBest(f_classif, k=2) pipe = Pipeline([("rus", rus), ("anova", filter1), ("logistic", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y) def test_pipeline_with_step_that_implements_both_sample_and_transform(): # Test the various methods of the pipeline (anova). X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) clf = LogisticRegression(solver="lbfgs") with raises(TypeError): pipeline = Pipeline([("step", FitTransformSample()), ("logistic", clf)]) pipeline.fit(X, y) def test_pipeline_with_step_that_it_is_pipeline(): # Test the various methods of the pipeline (anova). 
X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) # Test with RandomUnderSampling + Anova + LogisticRegression clf = LogisticRegression(solver="lbfgs") rus = RandomUnderSampler(random_state=0) filter1 = SelectKBest(f_classif, k=2) pipe1 = Pipeline([("rus", rus), ("anova", filter1)]) with raises(TypeError): pipe2 = Pipeline([("pipe1", pipe1), ("logistic", clf)]) pipe2.fit(X, y) def test_pipeline_fit_then_sample_with_sampler_last_estimator(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=50000, random_state=0, ) rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn) X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample(X, y) pipeline = make_pipeline(rus, enn) pipeline.fit(X, y) X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) assert_array_equal(X_fit_resample_resampled, X_fit_then_sample_res) assert_array_equal(y_fit_resample_resampled, y_fit_then_sample_res) def test_pipeline_fit_then_sample_3_samplers_with_sampler_last_estimator(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=50000, random_state=0, ) rus = RandomUnderSampler(random_state=42) enn = ENN() pipeline = make_pipeline(rus, enn, rus) X_fit_resample_resampled, y_fit_resample_resampled = pipeline.fit_resample(X, y) pipeline = make_pipeline(rus, enn, rus) pipeline.fit(X, y) X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) assert_array_equal(X_fit_resample_resampled, X_fit_then_sample_res) assert_array_equal(y_fit_resample_resampled, y_fit_then_sample_res) def test_make_pipeline_memory(): cachedir = mkdtemp() try: memory = 
Memory(cachedir, verbose=10) pipeline = make_pipeline(DummyTransf(), SVC(gamma="scale"), memory=memory) assert pipeline.memory is memory pipeline = make_pipeline(DummyTransf(), SVC(gamma="scale")) assert pipeline.memory is None finally: shutil.rmtree(cachedir) def test_predict_with_predict_params(): # tests that Pipeline passes predict_params to the final estimator # when predict is invoked pipe = Pipeline([("transf", Transf()), ("clf", DummyEstimatorParams())]) pipe.fit(None, None) pipe.predict(X=None, got_attribute=True) assert pipe.named_steps["clf"].got_attribute def test_resampler_last_stage_passthrough(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=50000, random_state=0, ) rus = RandomUnderSampler(random_state=42) pipe = make_pipeline(rus, None) pipe.fit_resample(X, y) def test_pipeline_score_samples_pca_lof_binary(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.3, 0.7], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0, ) # Test that the score_samples method is implemented on a pipeline. # Test that the score_samples method on pipeline yields same results as # applying transform and score_samples steps separately. 
rus = RandomUnderSampler(random_state=42) pca = PCA(svd_solver="full", n_components="mle", whiten=True) lof = LocalOutlierFactor(novelty=True) pipe = Pipeline([("rus", rus), ("pca", pca), ("lof", lof)]) pipe.fit(X, y) # Check the shapes assert pipe.score_samples(X).shape == (X.shape[0],) # Check the values X_res, _ = rus.fit_resample(X, y) lof.fit(pca.fit_transform(X_res)) assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X))) def test_score_samples_on_pipeline_without_score_samples(): X = np.array([[1], [2]]) y = np.array([1, 2]) # Test that a pipeline does not have score_samples method when the final # step of the pipeline does not have score_samples defined. pipe = make_pipeline(LogisticRegression()) pipe.fit(X, y) with pytest.raises( AttributeError, match="has no attribute 'score_samples'", ): pipe.score_samples(X) def test_pipeline_param_error(): clf = make_pipeline(LogisticRegression()) with pytest.raises( ValueError, match="Pipeline.fit does not accept the sample_weight parameter", ): clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1]) parameter_grid_test_verbose = ( (est, pattern, method) for (est, pattern), method in itertools.product( [ ( Pipeline([("transf", Transf()), ("clf", FitParamT())]), r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$", ), ( Pipeline([("transf", Transf()), ("noop", None), ("clf", FitParamT())]), r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n" r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$", ), ( Pipeline( [ ("transf", Transf()), ("noop", "passthrough"), ("clf", FitParamT()), ] ), r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n" r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$", ), ( Pipeline([("transf", Transf()), ("clf", None)]), r"\[Pipeline\].*\(step 1 of 
2\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$", ), ( Pipeline([("transf", None), ("mult", Mult())]), r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$", ), ( Pipeline([("transf", "passthrough"), ("mult", Mult())]), r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" r"\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$", ), ( FeatureUnion([("mult1", Mult()), ("mult2", Mult())]), r"\[FeatureUnion\].*\(step 1 of 2\) Processing mult1.* total=.*\n" r"\[FeatureUnion\].*\(step 2 of 2\) Processing mult2.* total=.*\n$", ), ( FeatureUnion([("mult1", "drop"), ("mult2", Mult()), ("mult3", "drop")]), r"\[FeatureUnion\].*\(step 1 of 1\) Processing mult2.* total=.*\n$", ), ], ["fit", "fit_transform", "fit_predict"], ) if hasattr(est, method) and not ( method == "fit_transform" and hasattr(est, "steps") and isinstance(est.steps[-1][1], FitParamT) ) ) @pytest.mark.parametrize("est, pattern, method", parameter_grid_test_verbose) def test_verbose(est, method, pattern, capsys): func = getattr(est, method) X = [[1, 2, 3], [4, 5, 6]] y = [[7], [8]] est.set_params(verbose=False) func(X, y) assert not capsys.readouterr().out, "Got output for verbose=False" est.set_params(verbose=True) func(X, y) assert re.match(pattern, capsys.readouterr().out) def test_pipeline_score_samples_pca_lof_multiclass(): X, y = load_iris(return_X_y=True) sampling_strategy = {0: 50, 1: 30, 2: 20} X, y = make_imbalance(X, y, sampling_strategy=sampling_strategy) # Test that the score_samples method is implemented on a pipeline. # Test that the score_samples method on pipeline yields same results as # applying transform and score_samples steps separately. 
rus = RandomUnderSampler() pca = PCA(svd_solver="full", n_components="mle", whiten=True) lof = LocalOutlierFactor(novelty=True) pipe = Pipeline([("rus", rus), ("pca", pca), ("lof", lof)]) pipe.fit(X, y) # Check the shapes assert pipe.score_samples(X).shape == (X.shape[0],) # Check the values lof.fit(pca.fit_transform(X)) assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X))) def test_pipeline_param_validation(): model = Pipeline( [("sampler", RandomUnderSampler()), ("classifier", LogisticRegression())] ) check_param_validation("Pipeline", model) @pytest.mark.skipif( sklearn_version < parse_version("1.2"), reason="requires scikit-learn >= 1.2" ) def test_pipeline_with_set_output(): pd = pytest.importorskip("pandas") X, y = load_iris(return_X_y=True, as_frame=True) pipeline = make_pipeline( StandardScaler(), RandomUnderSampler(), LogisticRegression() ).set_output(transform="default") pipeline.fit(X, y) X_res, y_res = pipeline[:-1].fit_resample(X, y) assert isinstance(X_res, np.ndarray) # transformer will not change `y` and sampler will always preserve the type of `y` assert isinstance(y_res, type(y)) pipeline.set_output(transform="pandas") X_res, y_res = pipeline[:-1].fit_resample(X, y) assert isinstance(X_res, pd.DataFrame) # transformer will not change `y` and sampler will always preserve the type of `y` assert isinstance(y_res, type(y)) imbalanced-learn-0.12.2/imblearn/tests/test_public_functions.py000066400000000000000000000077061460233407600247030ustar00rootroot00000000000000"""This is a copy of sklearn/tests/test_public_functions.py. It can be removed when we support scikit-learn >= 1.2. 
""" from importlib import import_module from inspect import signature import pytest from imblearn.utils._param_validation import ( generate_invalid_param_val, generate_valid_param, make_constraint, ) PARAM_VALIDATION_FUNCTION_LIST = [ "imblearn.datasets.fetch_datasets", "imblearn.datasets.make_imbalance", "imblearn.metrics.classification_report_imbalanced", "imblearn.metrics.geometric_mean_score", "imblearn.metrics.macro_averaged_mean_absolute_error", "imblearn.metrics.make_index_balanced_accuracy", "imblearn.metrics.sensitivity_specificity_support", "imblearn.metrics.sensitivity_score", "imblearn.metrics.specificity_score", "imblearn.pipeline.make_pipeline", ] @pytest.mark.parametrize("func_module", PARAM_VALIDATION_FUNCTION_LIST) def test_function_param_validation(func_module): """Check that an informative error is raised when the value of a parameter does not have an appropriate type or value. """ module_name, func_name = func_module.rsplit(".", 1) module = import_module(module_name) func = getattr(module, func_name) func_sig = signature(func) func_params = [ p.name for p in func_sig.parameters.values() if p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD) ] parameter_constraints = getattr(func, "_skl_parameter_constraints") # Generate valid values for the required parameters # The parameters `*args` and `**kwargs` are ignored since we cannot generate # constraints. 
required_params = [ p.name for p in func_sig.parameters.values() if p.default is p.empty and p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD) ] valid_required_params = {} for param_name in required_params: if parameter_constraints[param_name] == "no_validation": valid_required_params[param_name] = 1 else: valid_required_params[param_name] = generate_valid_param( make_constraint(parameter_constraints[param_name][0]) ) # check that there is a constraint for each parameter if func_params: validation_params = parameter_constraints.keys() unexpected_params = set(validation_params) - set(func_params) missing_params = set(func_params) - set(validation_params) err_msg = ( "Mismatch between _parameter_constraints and the parameters of" f" {func_name}.\nConsider the unexpected parameters {unexpected_params} and" f" expected but missing parameters {missing_params}\n" ) assert set(validation_params) == set(func_params), err_msg # this object does not have a valid type for sure for all params param_with_bad_type = type("BadType", (), {})() for param_name in func_params: constraints = parameter_constraints[param_name] if constraints == "no_validation": # This parameter is not validated continue match = ( rf"The '{param_name}' parameter of {func_name} must be .* Got .* instead." ) # First, check that the error is raised if param doesn't match any valid type. with pytest.raises(ValueError, match=match): func(**{**valid_required_params, param_name: param_with_bad_type}) # Then, for constraints that are more than a type constraint, check that the # error is raised if param does match a valid type but does not match any valid # value for this type. 
constraints = [make_constraint(constraint) for constraint in constraints] for constraint in constraints: try: bad_value = generate_invalid_param_val(constraint) except NotImplementedError: continue with pytest.raises(ValueError, match=match): func(**{**valid_required_params, param_name: bad_value}) imbalanced-learn-0.12.2/imblearn/under_sampling/000077500000000000000000000000001460233407600215575ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/__init__.py000066400000000000000000000013351460233407600236720ustar00rootroot00000000000000""" The :mod:`imblearn.under_sampling` provides methods to under-sample a dataset. """ from ._prototype_generation import ClusterCentroids from ._prototype_selection import ( AllKNN, CondensedNearestNeighbour, EditedNearestNeighbours, InstanceHardnessThreshold, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, RandomUnderSampler, RepeatedEditedNearestNeighbours, TomekLinks, ) __all__ = [ "ClusterCentroids", "RandomUnderSampler", "InstanceHardnessThreshold", "NearMiss", "TomekLinks", "EditedNearestNeighbours", "RepeatedEditedNearestNeighbours", "AllKNN", "OneSidedSelection", "CondensedNearestNeighbour", "NeighbourhoodCleaningRule", ] imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_generation/000077500000000000000000000000001460233407600261765ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_generation/__init__.py000066400000000000000000000003501460233407600303050ustar00rootroot00000000000000""" The :mod:`imblearn.under_sampling.prototype_generation` submodule contains methods that generate new samples in order to balance the dataset. 
""" from ._cluster_centroids import ClusterCentroids __all__ = ["ClusterCentroids"] imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py000066400000000000000000000165501460233407600324510ustar00rootroot00000000000000"""Class to perform under-sampling by generating centroids based on clustering.""" # Authors: Guillaume Lemaitre # Fernando Nogueira # Christos Aridas # License: MIT import numpy as np from scipy import sparse from sklearn.base import clone from sklearn.cluster import KMeans from sklearn.neighbors import NearestNeighbors from sklearn.utils import _safe_indexing from ...utils import Substitution from ...utils._docstring import _random_state_docstring from ...utils._param_validation import HasMethods, StrOptions from ..base import BaseUnderSampler VOTING_KIND = ("auto", "hard", "soft") @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, random_state=_random_state_docstring, ) class ClusterCentroids(BaseUnderSampler): """Undersample by generating centroids based on clustering methods. Method that under samples the majority class by replacing a cluster of majority samples by the cluster centroid of a KMeans algorithm. This algorithm keeps N majority samples by fitting the KMeans algorithm with N cluster to the majority class and using the coordinates of the N cluster centroids as the new majority samples. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} estimator : estimator object, default=None A scikit-learn compatible clustering method that exposes a `n_clusters` parameter and a `cluster_centers_` fitted attribute. By default, it will be a default :class:`~sklearn.cluster.KMeans` estimator. voting : {{"hard", "soft", "auto"}}, default='auto' Voting strategy to generate the new samples: - If ``'hard'``, the nearest-neighbors of the centroids found using the clustering algorithm will be used. 
- If ``'soft'``, the centroids found by the clustering algorithm will be used. - If ``'auto'``, if the input is sparse, it will default on ``'hard'`` otherwise, ``'soft'`` will be used. .. versionadded:: 0.3.0 Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. estimator_ : estimator object The validated estimator created from the `estimator` parameter. voting_ : str The validated voting strategy. n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- EditedNearestNeighbours : Under-sampling by editing samples. CondensedNearestNeighbour: Under-sampling by condensing samples. Notes ----- Supports multi-class resampling by sampling each class independently. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from sklearn.cluster import MiniBatchKMeans >>> from imblearn.under_sampling import ClusterCentroids >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> cc = ClusterCentroids( ... estimator=MiniBatchKMeans(n_init=1, random_state=0), random_state=42 ... 
) >>> X_res, y_res = cc.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{...}}) """ _parameter_constraints: dict = { **BaseUnderSampler._parameter_constraints, "estimator": [HasMethods(["fit", "predict"]), None], "voting": [StrOptions({"auto", "hard", "soft"})], "random_state": ["random_state"], } def __init__( self, *, sampling_strategy="auto", random_state=None, estimator=None, voting="auto", ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator self.voting = voting def _validate_estimator(self): """Private function to create the KMeans estimator""" if self.estimator is None: self.estimator_ = KMeans(random_state=self.random_state) else: self.estimator_ = clone(self.estimator) if "n_clusters" not in self.estimator_.get_params(): raise ValueError( "`estimator` should be a clustering estimator exposing a parameter" " `n_clusters` and a fitted parameter `cluster_centers_`." 
) def _generate_sample(self, X, y, centroids, target_class): if self.voting_ == "hard": nearest_neighbors = NearestNeighbors(n_neighbors=1) nearest_neighbors.fit(X, y) indices = nearest_neighbors.kneighbors(centroids, return_distance=False) X_new = _safe_indexing(X, np.squeeze(indices)) else: if sparse.issparse(X): X_new = sparse.csr_matrix(centroids, dtype=X.dtype) else: X_new = centroids y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype) return X_new, y_new def _fit_resample(self, X, y): self._validate_estimator() if self.voting == "auto": self.voting_ = "hard" if sparse.issparse(X) else "soft" else: self.voting_ = self.voting X_resampled, y_resampled = [], [] for target_class in np.unique(y): target_class_indices = np.flatnonzero(y == target_class) if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] self.estimator_.set_params(**{"n_clusters": n_samples}) self.estimator_.fit(_safe_indexing(X, target_class_indices)) if not hasattr(self.estimator_, "cluster_centers_"): raise RuntimeError( "`estimator` should be a clustering estimator exposing a " "fitted parameter `cluster_centers_`." 
) X_new, y_new = self._generate_sample( _safe_indexing(X, target_class_indices), _safe_indexing(y, target_class_indices), self.estimator_.cluster_centers_, target_class, ) X_resampled.append(X_new) y_resampled.append(y_new) else: X_resampled.append(_safe_indexing(X, target_class_indices)) y_resampled.append(_safe_indexing(y, target_class_indices)) if sparse.issparse(X): X_resampled = sparse.vstack(X_resampled) else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) return X_resampled, np.array(y_resampled, dtype=y.dtype) def _more_tags(self): return {"sample_indices": False} imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_generation/tests/000077500000000000000000000000001460233407600273405ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_generation/tests/__init__.py000066400000000000000000000000001460233407600314370ustar00rootroot00000000000000test_cluster_centroids.py000066400000000000000000000122511460233407600344260ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_generation/tests"""Test the module cluster centroids.""" from collections import Counter import numpy as np import pytest from scipy import sparse from sklearn.cluster import KMeans from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression from imblearn.under_sampling import ClusterCentroids from imblearn.utils.testing import _CustomClusterer RND_SEED = 0 X = np.array( [ [0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982], ] ) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) R_TOL = 1e-4 @pytest.mark.parametrize( "X, expected_voting", [(X, "soft"), (sparse.csr_matrix(X), "hard")] ) @pytest.mark.filterwarnings("ignore:The default value of `n_init` will 
change") def test_fit_resample_check_voting(X, expected_voting): cc = ClusterCentroids(random_state=RND_SEED) cc.fit_resample(X, Y) assert cc.voting_ == expected_voting @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") def test_fit_resample_auto(): sampling_strategy = "auto" cc = ClusterCentroids(sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (6, 2) assert y_resampled.shape == (6,) @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") def test_fit_resample_half(): sampling_strategy = {0: 3, 1: 6} cc = ClusterCentroids(sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (9, 2) assert y_resampled.shape == (9,) @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") def test_multiclass_fit_resample(): y = Y.copy() y[5] = 2 y[6] = 2 cc = ClusterCentroids(random_state=RND_SEED) _, y_resampled = cc.fit_resample(X, y) count_y_res = Counter(y_resampled) assert count_y_res[0] == 2 assert count_y_res[1] == 2 assert count_y_res[2] == 2 def test_fit_resample_object(): sampling_strategy = "auto" cluster = KMeans(random_state=RND_SEED, n_init=1) cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED, estimator=cluster, ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (6, 2) assert y_resampled.shape == (6,) def test_fit_hard_voting(): sampling_strategy = "auto" voting = "hard" cluster = KMeans(random_state=RND_SEED, n_init=1) cc = ClusterCentroids( sampling_strategy=sampling_strategy, random_state=RND_SEED, estimator=cluster, voting=voting, ) X_resampled, y_resampled = cc.fit_resample(X, Y) assert X_resampled.shape == (6, 2) assert y_resampled.shape == (6,) for x in X_resampled: assert np.any(np.all(x == X, axis=1)) @pytest.mark.filterwarnings("ignore:The default value of 
`n_init` will change") def test_cluster_centroids_hard_target_class(): # check that the samples selecting by the hard voting corresponds to the # targeted class # non-regression test for: # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/738 X, y = make_classification( n_samples=1000, n_features=2, n_informative=1, n_redundant=0, n_repeated=0, n_clusters_per_class=1, weights=[0.3, 0.7], class_sep=0.01, random_state=0, ) cc = ClusterCentroids(voting="hard", random_state=0) X_res, y_res = cc.fit_resample(X, y) minority_class_indices = np.flatnonzero(y == 0) X_minority_class = X[minority_class_indices] resampled_majority_class_indices = np.flatnonzero(y_res == 1) X_res_majority = X_res[resampled_majority_class_indices] sample_from_minority_in_majority = [ np.all(np.isclose(selected_sample, minority_sample)) for selected_sample in X_res_majority for minority_sample in X_minority_class ] assert sum(sample_from_minority_in_majority) == 0 def test_cluster_centroids_custom_clusterer(): clusterer = _CustomClusterer() cc = ClusterCentroids(estimator=clusterer, random_state=RND_SEED) cc.fit_resample(X, Y) assert isinstance(cc.estimator_.cluster_centers_, np.ndarray) clusterer = _CustomClusterer(expose_cluster_centers=False) cc = ClusterCentroids(estimator=clusterer, random_state=RND_SEED) err_msg = ( "`estimator` should be a clustering estimator exposing a fitted parameter " "`cluster_centers_`." ) with pytest.raises(RuntimeError, match=err_msg): cc.fit_resample(X, Y) clusterer = LogisticRegression() cc = ClusterCentroids(estimator=clusterer, random_state=RND_SEED) err_msg = ( "`estimator` should be a clustering estimator exposing a parameter " "`n_clusters` and a fitted parameter `cluster_centers_`." 
) with pytest.raises(ValueError, match=err_msg): cc.fit_resample(X, Y) imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/000077500000000000000000000000001460233407600260305ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/__init__.py000066400000000000000000000016401460233407600301420ustar00rootroot00000000000000""" The :mod:`imblearn.under_sampling.prototype_selection` submodule contains methods that select samples in order to balance the dataset. """ from ._condensed_nearest_neighbour import CondensedNearestNeighbour from ._edited_nearest_neighbours import ( AllKNN, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, ) from ._instance_hardness_threshold import InstanceHardnessThreshold from ._nearmiss import NearMiss from ._neighbourhood_cleaning_rule import NeighbourhoodCleaningRule from ._one_sided_selection import OneSidedSelection from ._random_under_sampler import RandomUnderSampler from ._tomek_links import TomekLinks __all__ = [ "RandomUnderSampler", "InstanceHardnessThreshold", "NearMiss", "TomekLinks", "EditedNearestNeighbours", "RepeatedEditedNearestNeighbours", "AllKNN", "OneSidedSelection", "CondensedNearestNeighbour", "NeighbourhoodCleaningRule", ] imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py000066400000000000000000000226251460233407600342750ustar00rootroot00000000000000"""Class to perform under-sampling based on the condensed nearest neighbour method.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numbers import warnings from collections import Counter import numpy as np from scipy.sparse import issparse from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import _safe_indexing, check_random_state from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation import 
@Substitution(
    sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
    random_state=_random_state_docstring,
)
class CondensedNearestNeighbour(BaseCleaningSampler):
    """Undersample based on the condensed nearest neighbour method.

    Read more in the :ref:`User Guide <condensed_nearest_neighbors>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    n_neighbors : int or estimator object, default=None
        If ``int``, size of the neighbourhood to consider to compute the
        nearest neighbors. If object, an estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the nearest-neighbors. If `None`, a
        :class:`~sklearn.neighbors.KNeighborsClassifier` with a 1-NN rules will
        be used.

    n_seeds_S : int, default=1
        Number of samples to extract in order to build the set S.

    {n_jobs}

    Attributes
    ----------
    sampling_strategy_ : dict
        Dictionary containing the information to sample the dataset. The keys
        corresponds to the class labels from which to sample and the values
        are the number of samples to sample.

    estimator_ : estimator object
        The validated K-nearest neighbor estimator created from `n_neighbors`
        parameter.

        .. deprecated:: 0.12
           `estimator_` is deprecated in 0.12 and will be removed in 0.14. Use
           `estimators_` instead that contains the list of all K-nearest
           neighbors estimator used for each pair of class.

    estimators_ : list of estimator objects of shape (n_resampled_classes - 1,)
        Contains the K-nearest neighbor estimator used for each pair of
        classes.

        .. versionadded:: 0.12

    sample_indices_ : ndarray of shape (n_new_samples,)
        Indices of the samples selected.

        .. versionadded:: 0.4

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 0.10

    See Also
    --------
    EditedNearestNeighbours : Undersample by editing samples.

    RepeatedEditedNearestNeighbours : Undersample by repeating ENN algorithm.

    AllKNN : Undersample using ENN and various number of neighbours.

    Notes
    -----
    The method is based on [1]_.

    Supports multi-class resampling: a strategy one (minority) vs. each other
    classes is applied.

    References
    ----------
    .. [1] P. Hart, "The condensed nearest neighbor rule,"
       In Information Theory, IEEE Transactions on, vol. 14(3),
       pp. 515-516, 1968.

    Examples
    --------
    >>> from collections import Counter # doctest: +SKIP
    >>> from sklearn.datasets import fetch_openml # doctest: +SKIP
    >>> from sklearn.preprocessing import scale # doctest: +SKIP
    >>> from imblearn.under_sampling import \
CondensedNearestNeighbour # doctest: +SKIP
    >>> X, y = fetch_openml('diabetes', version=1, return_X_y=True) # doctest: +SKIP
    >>> X = scale(X) # doctest: +SKIP
    >>> print('Original dataset shape %s' % Counter(y)) # doctest: +SKIP
    Original dataset shape Counter({{'tested_negative': 500, \
'tested_positive': 268}}) # doctest: +SKIP
    >>> cnn = CondensedNearestNeighbour(random_state=42) # doctest: +SKIP
    >>> X_res, y_res = cnn.fit_resample(X, y) #doctest: +SKIP
    >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +SKIP
    Resampled dataset shape Counter({{'tested_positive': 268, \
'tested_negative': 181}}) # doctest: +SKIP
    """

    _parameter_constraints: dict = {
        **BaseCleaningSampler._parameter_constraints,
        "n_neighbors": [
            Interval(numbers.Integral, 1, None, closed="left"),
            HasMethods(["kneighbors", "kneighbors_graph"]),
            None,
        ],
        "n_seeds_S": [Interval(numbers.Integral, 1, None, closed="left")],
        "n_jobs": [numbers.Integral, None],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        *,
        sampling_strategy="auto",
        random_state=None,
        n_neighbors=None,
        n_seeds_S=1,
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.random_state = random_state
        self.n_neighbors = n_neighbors
        self.n_seeds_S = n_seeds_S
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Create and return the 1-NN/k-NN classifier used by the algorithm.

        Returns
        -------
        estimator : :class:`~sklearn.neighbors.KNeighborsClassifier`
            An unfitted classifier; one clone is fitted per resampled class.

        Raises
        ------
        ValueError
            If `n_neighbors` is an estimator that is not a
            `KNeighborsClassifier`.
        """
        if self.n_neighbors is None:
            estimator = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs)
        elif isinstance(self.n_neighbors, numbers.Integral):
            estimator = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs
            )
        elif isinstance(self.n_neighbors, KNeighborsClassifier):
            estimator = clone(self.n_neighbors)
        else:
            # The parameter constraints accept any object exposing
            # `kneighbors`/`kneighbors_graph` (e.g. NearestNeighbors), but the
            # algorithm needs `fit`/`predict` from a classifier. Previously
            # this branch fell through and implicitly returned None, which
            # crashed later with an opaque AttributeError. Fail fast instead.
            raise ValueError(
                "`n_neighbors` must be an int, a KNeighborsClassifier "
                f"instance, or None. Got {type(self.n_neighbors)} instead."
            )
        return estimator

    def _fit_resample(self, X, y):
        """Condense each class to resample onto a consistent subset C."""
        estimator = self._validate_estimator()
        random_state = check_random_state(self.random_state)
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)
        idx_under = np.empty((0,), dtype=int)

        self.estimators_ = []
        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                # Randomly get one sample from the majority class
                # Generate the index to select
                idx_maj = np.flatnonzero(y == target_class)
                idx_maj_sample = idx_maj[
                    random_state.randint(
                        low=0,
                        high=target_stats[target_class],
                        size=self.n_seeds_S,
                    )
                ]

                # Create the set C - One majority samples and all minority
                C_indices = np.append(
                    np.flatnonzero(y == class_minority), idx_maj_sample
                )
                C_x = _safe_indexing(X, C_indices)
                C_y = _safe_indexing(y, C_indices)

                # Create the set S - all majority samples
                S_indices = np.flatnonzero(y == target_class)
                S_x = _safe_indexing(X, S_indices)
                S_y = _safe_indexing(y, S_indices)

                # fit knn on C
                self.estimators_.append(clone(estimator).fit(C_x, C_y))

                good_classif_label = idx_maj_sample.copy()
                # Check each sample in S if we keep it or drop it
                for idx_sam, (x_sam, y_sam) in enumerate(zip(S_x, S_y)):
                    # Do not select sample which are already well classified
                    # NOTE(review): `idx_sam` is a position within S while
                    # `good_classif_label` mixes absolute indices
                    # (`idx_maj_sample`) with positions — confirm intended.
                    if idx_sam in good_classif_label:
                        continue

                    # Classify on S
                    if not issparse(x_sam):
                        x_sam = x_sam.reshape(1, -1)
                    pred_y = self.estimators_[-1].predict(x_sam)

                    # If the prediction do not agree with the true label
                    # append it in C_x
                    if y_sam != pred_y:
                        # Keep the index for later
                        idx_maj_sample = np.append(idx_maj_sample, idx_maj[idx_sam])

                        # Update C
                        C_indices = np.append(C_indices, idx_maj[idx_sam])
                        C_x = _safe_indexing(X, C_indices)
                        C_y = _safe_indexing(y, C_indices)

                        # fit a knn on C
                        self.estimators_[-1].fit(C_x, C_y)

                        # This experimental to speed up the search
                        # Classify all the element in S and avoid to test the
                        # well classified elements
                        pred_S_y = self.estimators_[-1].predict(S_x)
                        good_classif_label = np.unique(
                            np.append(idx_maj_sample, np.flatnonzero(pred_S_y == S_y))
                        )

                idx_under = np.concatenate((idx_under, idx_maj_sample), axis=0)
            else:
                idx_under = np.concatenate(
                    (idx_under, np.flatnonzero(y == target_class)), axis=0
                )

        self.sample_indices_ = idx_under

        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)

    @property
    def estimator_(self):
        """Last fitted k-NN estimator."""
        warnings.warn(
            "`estimator_` attribute has been deprecated in 0.12 and will be "
            "removed in 0.14. Use `estimators_` instead.",
            FutureWarning,
        )
        return self.estimators_[-1]

    def _more_tags(self):
        return {"sample_indices": True}
@Substitution(
    sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
)
class EditedNearestNeighbours(BaseCleaningSampler):
    """Undersample based on the edited nearest neighbour method.

    This method cleans the dataset by removing samples close to the
    decision boundary. It removes observations from the majority class or
    classes when any or most of its closest neighbours are from a different
    class.

    Read more in the :ref:`User Guide <edited_nearest_neighbors>`.

    Parameters
    ----------
    {sampling_strategy}

    n_neighbors : int or object, default=3
        If ``int``, size of the neighbourhood to consider for the undersampling, i.e.,
        if `n_neighbors=3`, a sample will be removed when any or most of its 3 closest
        neighbours are from a different class. If object, an estimator that inherits
        from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the nearest-neighbors. Note that if you want to examine the 3 closest
        neighbours of a sample for the undersampling, you need to pass a 4-KNN.

    kind_sel : {{'all', 'mode'}}, default='all'
        Strategy to use to exclude samples.

        - If ``'all'``, all neighbours should be of the same class of the examined
          sample for it not be excluded.
        - If ``'mode'``, most neighbours should be of the same class of the examined
          sample for it not be excluded.

        The strategy `"all"` will be less conservative than `'mode'`. Thus, more
        samples will be removed when `kind_sel="all"`, generally.

    {n_jobs}

    Attributes
    ----------
    sampling_strategy_ : dict
        Dictionary containing the information to sample the dataset. The keys
        correspond to the class labels from which to sample and the values
        are the number of samples to sample.

    nn_ : estimator object
        Validated K-nearest Neighbours instance created from `n_neighbors` parameter.

    sample_indices_ : ndarray of shape (n_new_samples,)
        Indices of the samples selected.

        .. versionadded:: 0.4

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 0.10

    See Also
    --------
    CondensedNearestNeighbour : Undersample by condensing samples.

    RepeatedEditedNearestNeighbours : Undersample by repeating the ENN algorithm.

    AllKNN : Undersample using ENN with varying neighbours.

    Notes
    -----
    The method is based on [1]_.

    Supports multi-class resampling. A one-vs.-rest scheme is used when
    sampling a class as proposed in [1]_.

    References
    ----------
    .. [1] D. Wilson, "Asymptotic Properties of Nearest Neighbor Rules Using
       Edited Data," In IEEE Transactions on Systems, Man, and Cybernetics,
       vol. 2 (3), pp. 408-421, 1972.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import EditedNearestNeighbours
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> enn = EditedNearestNeighbours()
    >>> X_res, y_res = enn.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{1: 887, 0: 100}})
    """

    _parameter_constraints: dict = {
        **BaseCleaningSampler._parameter_constraints,
        "n_neighbors": [
            Interval(numbers.Integral, 1, None, closed="left"),
            HasMethods(["kneighbors", "kneighbors_graph"]),
        ],
        "kind_sel": [StrOptions({"all", "mode"})],
        "n_jobs": [numbers.Integral, None],
    }

    def __init__(
        self,
        *,
        sampling_strategy="auto",
        n_neighbors=3,
        kind_sel="all",
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.n_neighbors = n_neighbors
        self.kind_sel = kind_sel
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Validate the estimator created in the ENN."""
        # One extra neighbor is requested because each query point is its own
        # nearest neighbour and is stripped below with `[:, 1:]`.
        self.nn_ = check_neighbors_object(
            "n_neighbors", self.n_neighbors, additional_neighbor=1
        )
        self.nn_.set_params(**{"n_jobs": self.n_jobs})

    def _fit_resample(self, X, y):
        """Keep, per class to resample, only samples whose neighbourhood agrees
        with their own label (per `kind_sel`); other classes are kept whole."""
        self._validate_estimator()

        idx_under = np.empty((0,), dtype=int)

        self.nn_.fit(X)

        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                target_class_indices = np.flatnonzero(y == target_class)
                X_class = _safe_indexing(X, target_class_indices)
                y_class = _safe_indexing(y, target_class_indices)
                # Drop column 0: it is the sample itself.
                nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
                nnhood_label = y[nnhood_idx]
                if self.kind_sel == "mode":
                    # Keep a sample when the majority label among its
                    # neighbours matches its own label.
                    nnhood_label, _ = _mode(nnhood_label, axis=1)
                    nnhood_bool = np.ravel(nnhood_label) == y_class
                elif self.kind_sel == "all":
                    # Keep a sample only when all neighbours share its label.
                    nnhood_label = nnhood_label == target_class
                    nnhood_bool = np.all(nnhood_label, axis=1)
                index_target_class = np.flatnonzero(nnhood_bool)
            else:
                # Class not targeted for resampling: keep every sample.
                index_target_class = slice(None)
            idx_under = np.concatenate(
                (
                    idx_under,
                    np.flatnonzero(y == target_class)[index_target_class],
                ),
                axis=0,
            )

        self.sample_indices_ = idx_under

        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)

    def _more_tags(self):
        return {"sample_indices": True}
@Substitution(
    sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
)
class RepeatedEditedNearestNeighbours(BaseCleaningSampler):
    """Undersample based on the repeated edited nearest neighbour method.

    This method repeats the :class:`EditedNearestNeighbours` algorithm several times.
    The repetitions will stop when i) the maximum number of iterations is reached,
    or ii) no more observations are being removed, or iii) one of the majority classes
    becomes a minority class or iv) one of the majority classes disappears
    during undersampling.

    Read more in the :ref:`User Guide <edited_nearest_neighbors>`.

    Parameters
    ----------
    {sampling_strategy}

    n_neighbors : int or object, default=3
        If ``int``, size of the neighbourhood to consider for the undersampling, i.e.,
        if `n_neighbors=3`, a sample will be removed when any or most of its 3 closest
        neighbours are from a different class. If object, an estimator that inherits
        from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the nearest-neighbors. Note that if you want to examine the 3 closest
        neighbours of a sample for the undersampling, you need to pass a 4-KNN.

    max_iter : int, default=100
        Maximum number of iterations of the edited nearest neighbours.

    kind_sel : {{'all', 'mode'}}, default='all'
        Strategy to use to exclude samples.

        - If ``'all'``, all neighbours should be of the same class of the examined
          sample for it not be excluded.
        - If ``'mode'``, most neighbours should be of the same class of the examined
          sample for it not be excluded.

        The strategy `"all"` will be less conservative than `'mode'`. Thus, more
        samples will be removed when `kind_sel="all"`, generally.

    {n_jobs}

    Attributes
    ----------
    sampling_strategy_ : dict
        Dictionary containing the information to sample the dataset. The keys
        correspond to the class labels from which to sample and the values
        are the number of samples to sample.

    nn_ : estimator object
        Validated K-nearest Neighbours estimator linked to the parameter `n_neighbors`.

    enn_ : sampler object
        The validated :class:`~imblearn.under_sampling.EditedNearestNeighbours`
        instance.

    sample_indices_ : ndarray of shape (n_new_samples,)
        Indices of the samples selected.

        .. versionadded:: 0.4

    n_iter_ : int
        Number of iterations run.

        .. versionadded:: 0.6

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 0.10

    See Also
    --------
    CondensedNearestNeighbour : Undersample by condensing samples.

    EditedNearestNeighbours : Undersample by editing samples.

    AllKNN : Undersample using ENN with varying neighbours.

    Notes
    -----
    The method is based on [1]_. A one-vs.-rest scheme is used when
    sampling a class as proposed in [1]_.

    Supports multi-class resampling.

    References
    ----------
    .. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor
       Rule," IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6),
       pp. 448-452, June 1976.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> renn = RepeatedEditedNearestNeighbours()
    >>> X_res, y_res = renn.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{1: 887, 0: 100}})
    """

    _parameter_constraints: dict = {
        **BaseCleaningSampler._parameter_constraints,
        "n_neighbors": [
            Interval(numbers.Integral, 1, None, closed="left"),
            HasMethods(["kneighbors", "kneighbors_graph"]),
        ],
        "max_iter": [Interval(numbers.Integral, 1, None, closed="left")],
        "kind_sel": [StrOptions({"all", "mode"})],
        "n_jobs": [numbers.Integral, None],
    }

    def __init__(
        self,
        *,
        sampling_strategy="auto",
        n_neighbors=3,
        max_iter=100,
        kind_sel="all",
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.n_neighbors = n_neighbors
        self.kind_sel = kind_sel
        self.n_jobs = n_jobs
        self.max_iter = max_iter

    def _validate_estimator(self):
        """Private function to create the NN estimator"""
        self.nn_ = check_neighbors_object(
            "n_neighbors", self.n_neighbors, additional_neighbor=1
        )

        self.enn_ = EditedNearestNeighbours(
            sampling_strategy=self.sampling_strategy,
            n_neighbors=self.nn_,
            kind_sel=self.kind_sel,
            n_jobs=self.n_jobs,
        )

    def _fit_resample(self, X, y):
        """Apply ENN repeatedly until convergence or a stopping criterion.

        `sample_indices_` is composed across iterations: each ENN pass returns
        indices relative to the previous pass's output, so the running index
        array is re-indexed at every iteration.
        """
        self._validate_estimator()

        X_, y_ = X, y
        self.sample_indices_ = np.arange(X.shape[0], dtype=int)
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        for n_iter in range(self.max_iter):
            prev_len = y_.shape[0]
            X_enn, y_enn = self.enn_.fit_resample(X_, y_)

            # Check the stopping criterion
            # 1. If there is no changes for the vector y
            # 2. If the number of samples in the other class become inferior to
            # the number of samples in the majority class
            # 3. If one of the class is disappearing

            # Case 1
            b_conv = prev_len == y_enn.shape[0]

            # Case 2
            stats_enn = Counter(y_enn)
            count_non_min = np.array(
                [
                    val
                    for val, key in zip(stats_enn.values(), stats_enn.keys())
                    if key != class_minority
                ]
            )
            b_min_bec_maj = np.any(count_non_min < target_stats[class_minority])

            # Case 3
            b_remove_maj_class = len(stats_enn) < len(target_stats)

            (
                X_,
                y_,
            ) = (
                X_enn,
                y_enn,
            )
            self.sample_indices_ = self.sample_indices_[self.enn_.sample_indices_]

            if b_conv or b_min_bec_maj or b_remove_maj_class:
                if b_conv:
                    # Converged: the last pass removed nothing, so keeping its
                    # (identical) output is safe.
                    (
                        X_,
                        y_,
                    ) = (
                        X_enn,
                        y_enn,
                    )
                    self.sample_indices_ = self.sample_indices_[
                        self.enn_.sample_indices_
                    ]
                break

        self.n_iter_ = n_iter + 1
        X_resampled, y_resampled = X_, y_

        return X_resampled, y_resampled

    def _more_tags(self):
        return {"sample_indices": True}
@Substitution(
    sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
)
class AllKNN(BaseCleaningSampler):
    """Undersample based on the AllKNN method.

    This method will apply :class:`EditedNearestNeighbours` several times varying the
    number of nearest neighbours at each round. It begins by examining 1 closest
    neighbour, and it increases the neighbourhood by 1 at each round.

    The algorithm stops when the maximum number of neighbours are examined or
    when the majority class becomes the minority class, whichever comes first.

    Read more in the :ref:`User Guide <edited_nearest_neighbors>`.

    Parameters
    ----------
    {sampling_strategy}

    n_neighbors : int or estimator object, default=3
        If ``int``, size of the maximum neighbourhood to examine for the
        undersampling. If `n_neighbors=3`, in the first iteration the
        algorithm will examine 1 closest neighbour, in the second round 2,
        and in the final round 3. If object, an estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the nearest-neighbors. Note that if you want to examine the 3 closest
        neighbours of a sample, you need to pass a 4-KNN.

    kind_sel : {{'all', 'mode'}}, default='all'
        Strategy to use to exclude samples.

        - If ``'all'``, all neighbours should be of the same class of the examined
          sample for it not be excluded.
        - If ``'mode'``, most neighbours should be of the same class of the examined
          sample for it not be excluded.

        The strategy `"all"` will be less conservative than `'mode'`. Thus, more
        samples will be removed when `kind_sel="all"`, generally.

    allow_minority : bool, default=False
        If ``True``, it allows the majority classes to become the minority
        class without early stopping.

        .. versionadded:: 0.3

    {n_jobs}

    Attributes
    ----------
    sampling_strategy_ : dict
        Dictionary containing the information to sample the dataset. The keys
        correspond to the class labels from which to sample and the values
        are the number of samples to sample.

    nn_ : estimator object
        Validated K-nearest Neighbours estimator linked to the parameter `n_neighbors`.

    enn_ : sampler object
        The validated :class:`~imblearn.under_sampling.EditedNearestNeighbours`
        instance.

    sample_indices_ : ndarray of shape (n_new_samples,)
        Indices of the samples selected.

        .. versionadded:: 0.4

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 0.10

    See Also
    --------
    CondensedNearestNeighbour: Under-sampling by condensing samples.

    EditedNearestNeighbours: Under-sampling by editing samples.

    RepeatedEditedNearestNeighbours: Under-sampling by repeating ENN.

    Notes
    -----
    The method is based on [1]_.

    Supports multi-class resampling. A one-vs.-rest scheme is used when
    sampling a class as proposed in [1]_.

    References
    ----------
    .. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor
       Rule," IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6),
       pp. 448-452, June 1976.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import AllKNN
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> allknn = AllKNN()
    >>> X_res, y_res = allknn.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{1: 887, 0: 100}})
    """

    _parameter_constraints: dict = {
        **BaseCleaningSampler._parameter_constraints,
        "n_neighbors": [
            Interval(numbers.Integral, 1, None, closed="left"),
            HasMethods(["kneighbors", "kneighbors_graph"]),
        ],
        "kind_sel": [StrOptions({"all", "mode"})],
        "allow_minority": ["boolean"],
        "n_jobs": [numbers.Integral, None],
    }

    def __init__(
        self,
        *,
        sampling_strategy="auto",
        n_neighbors=3,
        kind_sel="all",
        allow_minority=False,
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.n_neighbors = n_neighbors
        self.kind_sel = kind_sel
        self.allow_minority = allow_minority
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Create objects required by AllKNN"""
        self.nn_ = check_neighbors_object(
            "n_neighbors", self.n_neighbors, additional_neighbor=1
        )

        self.enn_ = EditedNearestNeighbours(
            sampling_strategy=self.sampling_strategy,
            n_neighbors=self.nn_,
            kind_sel=self.kind_sel,
            n_jobs=self.n_jobs,
        )

    def _fit_resample(self, X, y):
        """Apply ENN with a growing neighbourhood (1 up to `n_neighbors`),
        re-indexing `sample_indices_` after every pass."""
        self._validate_estimator()

        X_, y_ = X, y
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        self.sample_indices_ = np.arange(X.shape[0], dtype=int)

        for curr_size_ngh in range(1, self.nn_.n_neighbors):
            self.enn_.n_neighbors = curr_size_ngh

            X_enn, y_enn = self.enn_.fit_resample(X_, y_)

            # Check the stopping criterion
            # 1. If the number of samples in the other class become inferior to
            # the number of samples in the majority class
            # 2. If one of the class is disappearing

            # Case 1
            stats_enn = Counter(y_enn)
            count_non_min = np.array(
                [
                    val
                    for val, key in zip(stats_enn.values(), stats_enn.keys())
                    if key != class_minority
                ]
            )
            b_min_bec_maj = np.any(count_non_min < target_stats[class_minority])
            if self.allow_minority:
                # overwrite b_min_bec_maj
                b_min_bec_maj = False

            # Case 2
            b_remove_maj_class = len(stats_enn) < len(target_stats)

            (
                X_,
                y_,
            ) = (
                X_enn,
                y_enn,
            )
            self.sample_indices_ = self.sample_indices_[self.enn_.sample_indices_]

            if b_min_bec_maj or b_remove_maj_class:
                break

        X_resampled, y_resampled = X_, y_

        return X_resampled, y_resampled

    def _more_tags(self):
        return {"sample_indices": True}
"""Undersample based on the instance hardness threshold. Read more in the :ref:`User Guide `. Parameters ---------- estimator : estimator object, default=None Classifier to be used to estimate instance hardness of the samples. This classifier should implement `predict_proba`. {sampling_strategy} {random_state} cv : int, default=5 Number of folds to be used when estimating samples' instance hardness. {n_jobs} Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys correspond to the class labels from which to sample and the values are the number of samples to sample. estimator_ : estimator object The validated classifier used to estimate the instance hardness of the samples. sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- NearMiss : Undersample based on near-miss search. RandomUnderSampler : Random under-sampling. Notes ----- The method is based on [1]_. Supports multi-class resampling: from each class to be under-sampled, it retains the observations with the highest probability of being correctly classified. References ---------- .. [1] D. Smith, Michael R., Tony Martinez, and Christophe Giraud-Carrier. "An instance level analysis of data complexity." Machine learning 95.2 (2014): 225-256. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import InstanceHardnessThreshold >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> iht = InstanceHardnessThreshold(random_state=42) >>> X_res, y_res = iht.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 5..., 0: 100}}) """ _parameter_constraints: dict = { **BaseUnderSampler._parameter_constraints, "estimator": [ HasMethods(["fit", "predict_proba"]), None, ], "cv": ["cv_object"], "n_jobs": [numbers.Integral, None], "random_state": ["random_state"], } def __init__( self, *, estimator=None, sampling_strategy="auto", random_state=None, cv=5, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.estimator = estimator self.cv = cv self.n_jobs = n_jobs def _validate_estimator(self, random_state): """Private function to create the classifier""" if ( self.estimator is not None and is_classifier(self.estimator) and hasattr(self.estimator, "predict_proba") ): self.estimator_ = clone(self.estimator) _set_random_states(self.estimator_, random_state) elif self.estimator is None: self.estimator_ = RandomForestClassifier( n_estimators=100, random_state=self.random_state, n_jobs=self.n_jobs, ) def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) self._validate_estimator(random_state) target_stats = Counter(y) skf = StratifiedKFold( n_splits=self.cv, shuffle=True, random_state=random_state, ) probabilities = cross_val_predict( self.estimator_, X, y, cv=skf, n_jobs=self.n_jobs, method="predict_proba", ) probabilities = probabilities[range(len(y)), y] idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] threshold = np.percentile( probabilities[y == target_class], (1.0 - (n_samples / target_stats[target_class])) * 100.0, ) 
index_target_class = np.flatnonzero( probabilities[y == target_class] >= threshold ) else: index_target_class = slice(None) idx_under = np.concatenate( ( idx_under, np.flatnonzero(y == target_class)[index_target_class], ), axis=0, ) self.sample_indices_ = idx_under return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return {"sample_indices": True} imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/_nearmiss.py000066400000000000000000000255611460233407600303730ustar00rootroot00000000000000"""Class to perform under-sampling based on nearmiss methods.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numbers import warnings from collections import Counter import numpy as np from sklearn.utils import _safe_indexing from ...utils import Substitution, check_neighbors_object from ...utils._docstring import _n_jobs_docstring from ...utils._param_validation import HasMethods, Interval from ..base import BaseUnderSampler @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, ) class NearMiss(BaseUnderSampler): """Class to perform under-sampling based on NearMiss methods. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} version : int, default=1 Version of the NearMiss to use. Possible values are 1, 2 or 3. n_neighbors : int or estimator object, default=3 If ``int``, size of the neighbourhood to consider to compute the average distance to the minority point samples. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. By default, it will be a 3-NN. n_neighbors_ver3 : int or estimator object, default=3 If ``int``, NearMiss-3 algorithm start by a phase of re-sampling. This parameter correspond to the number of neighbours selected create the subset in which the selection will be performed. 
@Substitution(
    sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
)
class NearMiss(BaseUnderSampler):
    """Class to perform under-sampling based on NearMiss methods.

    Read more in the :ref:`User Guide <controlled_under_sampling>`.

    Parameters
    ----------
    {sampling_strategy}

    version : int, default=1
        Version of the NearMiss to use. Possible values are 1, 2 or 3.

    n_neighbors : int or estimator object, default=3
        If ``int``, size of the neighbourhood to consider to compute the
        average distance to the minority point samples.  If object, an
        estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors. By default, it will be a 3-NN.

    n_neighbors_ver3 : int or estimator object, default=3
        If ``int``, NearMiss-3 algorithm start by a phase of re-sampling. This
        parameter correspond to the number of neighbours selected to create
        the subset in which the selection will be performed.  If object, an
        estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors. By default, it will be a 3-NN.

    {n_jobs}

    Attributes
    ----------
    sampling_strategy_ : dict
        Dictionary containing the information to sample the dataset. The keys
        corresponds to the class labels from which to sample and the values
        are the number of samples to sample.

    nn_ : estimator object
        Validated K-nearest Neighbours object created from `n_neighbors` parameter.

    sample_indices_ : ndarray of shape (n_new_samples,)
        Indices of the samples selected.

        .. versionadded:: 0.4

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 0.10

    See Also
    --------
    RandomUnderSampler : Random undersample the dataset.

    InstanceHardnessThreshold : Use of classifier to undersample a dataset.

    Notes
    -----
    The methods are based on [1]_.

    Supports multi-class resampling.

    References
    ----------
    .. [1] I. Mani, I. Zhang. "kNN approach to unbalanced data distributions:
       a case study involving information extraction," In Proceedings of
       workshop on learning from imbalanced datasets, 2003.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import NearMiss
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> nm = NearMiss()
    >>> X_res, y_res = nm.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 100, 1: 100}})
    """

    _parameter_constraints: dict = {
        **BaseUnderSampler._parameter_constraints,
        "version": [Interval(numbers.Integral, 1, 3, closed="both")],
        "n_neighbors": [
            Interval(numbers.Integral, 1, None, closed="left"),
            HasMethods(["kneighbors", "kneighbors_graph"]),
        ],
        "n_neighbors_ver3": [
            Interval(numbers.Integral, 1, None, closed="left"),
            HasMethods(["kneighbors", "kneighbors_graph"]),
        ],
        "n_jobs": [numbers.Integral, None],
    }

    def __init__(
        self,
        *,
        sampling_strategy="auto",
        version=1,
        n_neighbors=3,
        n_neighbors_ver3=3,
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.version = version
        self.n_neighbors = n_neighbors
        self.n_neighbors_ver3 = n_neighbors_ver3
        self.n_jobs = n_jobs

    def _selection_dist_based(
        self, X, y, dist_vec, num_samples, key, sel_strategy="nearest"
    ):
        """Select the appropriate samples depending of the strategy selected.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Original samples.

        y : array-like, shape (n_samples,)
            Associated label to X.

        dist_vec : ndarray, shape (n_samples, )
            The distance matrix to the nearest neighbour.

        num_samples: int
            The desired number of samples to select.

        key : str or int,
            The target class.

        sel_strategy : str, optional (default='nearest')
            Strategy to select the samples. Either 'nearest' or 'farthest'

        Returns
        -------
        idx_sel : ndarray, shape (num_samples,)
            The list of the indices of the selected samples.
        """
        # Compute the distance considering the farthest neighbour
        dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors :], axis=1)

        target_class_indices = np.flatnonzero(y == key)
        if dist_vec.shape[0] != _safe_indexing(X, target_class_indices).shape[0]:
            raise RuntimeError(
                "The samples to be selected do not correspond"
                " to the distance matrix given. Ensure that"
                " both `X[y == key]` and `dist_vec` are"
                " related."
            )

        # Sort the list of distance and get the index
        if sel_strategy == "nearest":
            sort_way = False
        else:  # sel_strategy == "farthest":
            sort_way = True

        sorted_idx = sorted(
            range(len(dist_avg_vec)),
            key=dist_avg_vec.__getitem__,
            reverse=sort_way,
        )

        # Throw a warning to tell the user that we did not have enough samples
        # to select and that we just select everything
        if len(sorted_idx) < num_samples:
            warnings.warn(
                "The number of the samples to be selected is larger"
                " than the number of samples available. The"
                " balancing ratio cannot be ensure and all samples"
                " will be returned."
            )

        # Select the desired number of samples
        return sorted_idx[:num_samples]

    def _validate_estimator(self):
        """Private function to create the NN estimator"""
        self.nn_ = check_neighbors_object("n_neighbors", self.n_neighbors)
        self.nn_.set_params(**{"n_jobs": self.n_jobs})

        if self.version == 3:
            self.nn_ver3_ = check_neighbors_object(
                "n_neighbors_ver3", self.n_neighbors_ver3
            )
            self.nn_ver3_.set_params(**{"n_jobs": self.n_jobs})

    def _fit_resample(self, X, y):
        """Dispatch to NearMiss-1/2/3 depending on `self.version`.

        The k-NN is always fitted on the minority class; each class to
        resample is then ranked by its average distance to minority samples.
        """
        self._validate_estimator()

        idx_under = np.empty((0,), dtype=int)

        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)
        minority_class_indices = np.flatnonzero(y == class_minority)

        self.nn_.fit(_safe_indexing(X, minority_class_indices))

        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                target_class_indices = np.flatnonzero(y == target_class)
                X_class = _safe_indexing(X, target_class_indices)
                y_class = _safe_indexing(y, target_class_indices)

                if self.version == 1:
                    # NearMiss-1: smallest average distance to the k closest
                    # minority samples.
                    dist_vec, idx_vec = self.nn_.kneighbors(
                        X_class, n_neighbors=self.nn_.n_neighbors
                    )
                    index_target_class = self._selection_dist_based(
                        X,
                        y,
                        dist_vec,
                        n_samples,
                        target_class,
                        sel_strategy="nearest",
                    )
                elif self.version == 2:
                    # NearMiss-2: smallest average distance to *all* minority
                    # samples (neighbourhood = minority class size).
                    dist_vec, idx_vec = self.nn_.kneighbors(
                        X_class, n_neighbors=target_stats[class_minority]
                    )
                    index_target_class = self._selection_dist_based(
                        X,
                        y,
                        dist_vec,
                        n_samples,
                        target_class,
                        sel_strategy="nearest",
                    )
                elif self.version == 3:
                    # NearMiss-3: pre-select the neighbours of every minority
                    # sample, then keep those with the largest average
                    # distance to their closest minority samples.
                    self.nn_ver3_.fit(X_class)
                    dist_vec, idx_vec = self.nn_ver3_.kneighbors(
                        _safe_indexing(X, minority_class_indices)
                    )
                    idx_vec_farthest = np.unique(idx_vec.reshape(-1))
                    X_class_selected = _safe_indexing(X_class, idx_vec_farthest)
                    y_class_selected = _safe_indexing(y_class, idx_vec_farthest)

                    dist_vec, idx_vec = self.nn_.kneighbors(
                        X_class_selected, n_neighbors=self.nn_.n_neighbors
                    )
                    index_target_class = self._selection_dist_based(
                        X_class_selected,
                        y_class_selected,
                        dist_vec,
                        n_samples,
                        target_class,
                        sel_strategy="farthest",
                    )
                    # idx_tmp is relative to the feature selected in the
                    # previous step and we need to find the indirection
                    index_target_class = idx_vec_farthest[index_target_class]
            else:
                index_target_class = slice(None)

            idx_under = np.concatenate(
                (
                    idx_under,
                    np.flatnonzero(y == target_class)[index_target_class],
                ),
                axis=0,
            )

        self.sample_indices_ = idx_under

        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)

    # fmt: off
    def _more_tags(self):
        return {
            "sample_indices": True,
            "_xfail_checks": {
                "check_samplers_fit_resample":
                "Fails for NearMiss-3 with less samples than expected"
            }
        }
    # fmt: on
import Counter import numpy as np from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors from sklearn.utils import _safe_indexing from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring from ...utils._param_validation import HasMethods, Hidden, Interval, StrOptions from ..base import BaseCleaningSampler from ._edited_nearest_neighbours import EditedNearestNeighbours SEL_KIND = ("all", "mode") @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, ) class NeighbourhoodCleaningRule(BaseCleaningSampler): """Undersample based on the neighbourhood cleaning rule. This class uses ENN and a k-NN to remove noisy samples from the datasets. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} edited_nearest_neighbours : estimator object, default=None The :class:`~imblearn.under_sampling.EditedNearestNeighbours` (ENN) object to clean the dataset. If `None`, a default ENN is created with `kind_sel="mode"` and `n_neighbors=n_neighbors`. n_neighbors : int or estimator object, default=3 If ``int``, size of the neighbourhood to consider to compute the K-nearest neighbors. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to find the nearest-neighbors. By default, it will be a 3-NN. kind_sel : {{"all", "mode"}}, default='all' Strategy to use in order to exclude samples in the ENN sampling. - If ``'all'``, all neighbours will have to agree with the samples of interest to not be excluded. - If ``'mode'``, the majority vote of the neighbours will be used in order to exclude a sample. The strategy `"all"` will be less conservative than `'mode'`. Thus, more samples will be removed when `kind_sel="all"` generally. .. deprecated:: 0.12 `kind_sel` is deprecated in 0.12 and will be removed in 0.14. Currently the parameter has no effect and corresponds always to the `"all"` strategy. 
threshold_cleaning : float, default=0.5 Threshold used to whether consider a class or not during the cleaning after applying ENN. A class will be considered during cleaning when: Ci > C x T , where Ci and C is the number of samples in the class and the data set, respectively and theta is the threshold. {n_jobs} Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. edited_nearest_neighbours_ : estimator object The edited nearest neighbour object used to make the first resampling. nn_ : estimator object Validated K-nearest Neighbours object created from `n_neighbors` parameter. classes_to_clean_ : list The classes considered with under-sampling by `nn_` in the second cleaning phase. sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- EditedNearestNeighbours : Undersample by editing noisy samples. Notes ----- See the original paper: [1]_. Supports multi-class resampling. A one-vs.-rest scheme is used when sampling a class as proposed in [1]_. References ---------- .. [1] J. Laurikkala, "Improving identification of difficult small classes by balancing class distribution," Springer Berlin Heidelberg, 2001. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import NeighbourhoodCleaningRule >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> ncr = NeighbourhoodCleaningRule() >>> X_res, y_res = ncr.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 888, 0: 100}}) """ _parameter_constraints: dict = { **BaseCleaningSampler._parameter_constraints, "edited_nearest_neighbours": [ HasMethods(["fit_resample"]), None, ], "n_neighbors": [ Interval(numbers.Integral, 1, None, closed="left"), HasMethods(["kneighbors", "kneighbors_graph"]), ], "kind_sel": [StrOptions({"all", "mode"}), Hidden(StrOptions({"deprecated"}))], "threshold_cleaning": [Interval(numbers.Real, 0, None, closed="neither")], "n_jobs": [numbers.Integral, None], } def __init__( self, *, sampling_strategy="auto", edited_nearest_neighbours=None, n_neighbors=3, kind_sel="deprecated", threshold_cleaning=0.5, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.edited_nearest_neighbours = edited_nearest_neighbours self.n_neighbors = n_neighbors self.kind_sel = kind_sel self.threshold_cleaning = threshold_cleaning self.n_jobs = n_jobs def _validate_estimator(self): """Create the objects required by NCR.""" if isinstance(self.n_neighbors, numbers.Integral): self.nn_ = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) elif isinstance(self.n_neighbors, NearestNeighbors): # backward compatibility when passing a NearestNeighbors object self.nn_ = KNeighborsClassifier( n_neighbors=self.n_neighbors.n_neighbors - 1, n_jobs=self.n_jobs ) else: self.nn_ = clone(self.n_neighbors) if self.edited_nearest_neighbours is None: self.edited_nearest_neighbours_ = EditedNearestNeighbours( sampling_strategy=self.sampling_strategy, n_neighbors=self.n_neighbors, kind_sel="mode", n_jobs=self.n_jobs, ) else: self.edited_nearest_neighbours_ = clone(self.edited_nearest_neighbours) def 
_fit_resample(self, X, y): if self.kind_sel != "deprecated": warnings.warn( "`kind_sel` is deprecated in 0.12 and will be removed in 0.14. " "It already has not effect and corresponds to the `'all'` option.", FutureWarning, ) self._validate_estimator() self.edited_nearest_neighbours_.fit_resample(X, y) index_not_a1 = self.edited_nearest_neighbours_.sample_indices_ index_a1 = np.ones(y.shape, dtype=bool) index_a1[index_not_a1] = False index_a1 = np.flatnonzero(index_a1) # clean the neighborhood target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) # compute which classes to consider for cleaning for the A2 group self.classes_to_clean_ = [ c for c, n_samples in target_stats.items() if ( c in self.sampling_strategy_.keys() and (n_samples > target_stats[class_minority] * self.threshold_cleaning) ) ] self.nn_.fit(X, y) class_minority_indices = np.flatnonzero(y == class_minority) X_minority = _safe_indexing(X, class_minority_indices) y_minority = _safe_indexing(y, class_minority_indices) y_pred_minority = self.nn_.predict(X_minority) # add an additional sample since the query points contains the original dataset neighbors_to_minority_indices = self.nn_.kneighbors( X_minority, n_neighbors=self.nn_.n_neighbors + 1, return_distance=False )[:, 1:] mask_misclassified_minority = y_pred_minority != y_minority index_a2 = np.ravel(neighbors_to_minority_indices[mask_misclassified_minority]) index_a2 = np.array( [ index for index in np.unique(index_a2) if y[index] in self.classes_to_clean_ ] ) union_a1_a2 = np.union1d(index_a1, index_a2).astype(int) selected_samples = np.ones(y.shape, dtype=bool) selected_samples[union_a1_a2] = False self.sample_indices_ = np.flatnonzero(selected_samples) return ( _safe_indexing(X, self.sample_indices_), _safe_indexing(y, self.sample_indices_), ) def _more_tags(self): return {"sample_indices": True} 
imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py000066400000000000000000000201031460233407600325330ustar00rootroot00000000000000"""Class to perform under-sampling based on one-sided selection method.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numbers import warnings from collections import Counter import numpy as np from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import _safe_indexing, check_random_state from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation import HasMethods, Interval from ..base import BaseCleaningSampler from ._tomek_links import TomekLinks @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring, ) class OneSidedSelection(BaseCleaningSampler): """Class to perform under-sampling based on one-sided selection method. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} n_neighbors : int or estimator object, default=None If ``int``, size of the neighbourhood to consider to compute the nearest neighbors. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to find the nearest-neighbors. If `None`, a :class:`~sklearn.neighbors.KNeighborsClassifier` with a 1-NN rules will be used. n_seeds_S : int, default=1 Number of samples to extract in order to build the set S. {n_jobs} Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. estimator_ : estimator object Validated K-nearest neighbors estimator created from parameter `n_neighbors`. .. 
deprecated:: 0.12 `estimator_` is deprecated in 0.12 and will be removed in 0.14. Use `estimators_` instead that contains the list of all K-nearest neighbors estimator used for each pair of class. estimators_ : list of estimator objects of shape (n_resampled_classes - 1,) Contains the K-nearest neighbor estimator used for per of classes. .. versionadded:: 0.12 sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- EditedNearestNeighbours : Undersample by editing noisy samples. Notes ----- The method is based on [1]_. Supports multi-class resampling. A one-vs.-one scheme is used when sampling a class as proposed in [1]_. For each class to be sampled, all samples of this class and the minority class are used during the sampling procedure. References ---------- .. [1] M. Kubat, S. Matwin, "Addressing the curse of imbalanced training sets: one-sided selection," In ICML, vol. 97, pp. 179-186, 1997. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import OneSidedSelection >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> oss = OneSidedSelection(random_state=42) >>> X_res, y_res = oss.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 496, 0: 100}}) """ _parameter_constraints: dict = { **BaseCleaningSampler._parameter_constraints, "n_neighbors": [ Interval(numbers.Integral, 1, None, closed="left"), HasMethods(["kneighbors", "kneighbors_graph"]), None, ], "n_seeds_S": [Interval(numbers.Integral, 1, None, closed="left")], "n_jobs": [numbers.Integral, None], "random_state": ["random_state"], } def __init__( self, *, sampling_strategy="auto", random_state=None, n_neighbors=None, n_seeds_S=1, n_jobs=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.n_neighbors = n_neighbors self.n_seeds_S = n_seeds_S self.n_jobs = n_jobs def _validate_estimator(self): """Private function to create the NN estimator""" if self.n_neighbors is None: estimator = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs) elif isinstance(self.n_neighbors, int): estimator = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) elif isinstance(self.n_neighbors, KNeighborsClassifier): estimator = clone(self.n_neighbors) return estimator def _fit_resample(self, X, y): estimator = self._validate_estimator() random_state = check_random_state(self.random_state) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) idx_under = np.empty((0,), dtype=int) self.estimators_ = [] for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): # select a sample from the current class idx_maj = np.flatnonzero(y == target_class) sel_idx_maj = random_state.randint( low=0, high=target_stats[target_class], size=self.n_seeds_S ) idx_maj_sample = idx_maj[sel_idx_maj] 
minority_class_indices = np.flatnonzero(y == class_minority) C_indices = np.append(minority_class_indices, idx_maj_sample) # create the set composed of all minority samples and one # sample from the current class. C_x = _safe_indexing(X, C_indices) C_y = _safe_indexing(y, C_indices) # create the set S with removing the seed from S # since that it will be added anyway idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0) S_x = _safe_indexing(X, idx_maj_extracted) S_y = _safe_indexing(y, idx_maj_extracted) self.estimators_.append(clone(estimator).fit(C_x, C_y)) pred_S_y = self.estimators_[-1].predict(S_x) S_misclassified_indices = np.flatnonzero(pred_S_y != S_y) idx_tmp = idx_maj_extracted[S_misclassified_indices] idx_under = np.concatenate((idx_under, idx_maj_sample, idx_tmp), axis=0) else: idx_under = np.concatenate( (idx_under, np.flatnonzero(y == target_class)), axis=0 ) X_resampled = _safe_indexing(X, idx_under) y_resampled = _safe_indexing(y, idx_under) # apply Tomek cleaning tl = TomekLinks(sampling_strategy=list(self.sampling_strategy_.keys())) X_cleaned, y_cleaned = tl.fit_resample(X_resampled, y_resampled) self.sample_indices_ = _safe_indexing(idx_under, tl.sample_indices_) return X_cleaned, y_cleaned @property def estimator_(self): """Last fitted k-NN estimator.""" warnings.warn( "`estimator_` attribute has been deprecated in 0.12 and will be " "removed in 0.14. 
Use `estimators_` instead.", FutureWarning, ) return self.estimators_[-1] def _more_tags(self): return {"sample_indices": True} imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py000066400000000000000000000107711460233407600327470ustar00rootroot00000000000000"""Class to perform random under-sampling.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from sklearn.utils import _safe_indexing, check_random_state from ...utils import Substitution, check_target_type from ...utils._docstring import _random_state_docstring from ...utils._validation import _check_X from ..base import BaseUnderSampler @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, random_state=_random_state_docstring, ) class RandomUnderSampler(BaseUnderSampler): """Class to perform random under-sampling. Under-sample the majority class(es) by randomly picking samples with or without replacement. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {random_state} replacement : bool, default=False Whether the sample is with or without replacement. Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- NearMiss : Undersample using near-miss samples. Notes ----- Supports multi-class resampling by sampling each class independently. Supports heterogeneous data as object array containing string and numeric data. 
Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import RandomUnderSampler >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> rus = RandomUnderSampler(random_state=42) >>> X_res, y_res = rus.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{0: 100, 1: 100}}) """ _parameter_constraints: dict = { **BaseUnderSampler._parameter_constraints, "replacement": ["boolean"], "random_state": ["random_state"], } def __init__( self, *, sampling_strategy="auto", random_state=None, replacement=False ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state self.replacement = replacement def _check_X_y(self, X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = _check_X(X) self._check_n_features(X, reset=True) self._check_feature_names(X, reset=True) return X, y, binarize_y def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) idx_under = np.empty((0,), dtype=int) for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] index_target_class = random_state.choice( range(np.count_nonzero(y == target_class)), size=n_samples, replace=self.replacement, ) else: index_target_class = slice(None) idx_under = np.concatenate( ( idx_under, np.flatnonzero(y == target_class)[index_target_class], ), axis=0, ) self.sample_indices_ = idx_under return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): return { "X_types": ["2darray", "string", "sparse", "dataframe"], "sample_indices": True, "allow_nan": True, 
"_xfail_checks": { "check_complex_data": "Robust to this type of data.", }, } imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/_tomek_links.py000066400000000000000000000117731460233407600310710ustar00rootroot00000000000000"""Class to perform under-sampling by removing Tomek's links.""" # Authors: Guillaume Lemaitre # Fernando Nogueira # Christos Aridas # License: MIT import numbers import numpy as np from sklearn.neighbors import NearestNeighbors from sklearn.utils import _safe_indexing from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring from ..base import BaseCleaningSampler @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, ) class TomekLinks(BaseCleaningSampler): """Under-sampling by removing Tomek's links. Read more in the :ref:`User Guide `. Parameters ---------- {sampling_strategy} {n_jobs} Attributes ---------- sampling_strategy_ : dict Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sample. sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. .. versionadded:: 0.4 n_features_in_ : int Number of features in the input dataset. .. versionadded:: 0.9 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 0.10 See Also -------- EditedNearestNeighbours : Undersample by samples edition. CondensedNearestNeighbour : Undersample by samples condensation. RandomUnderSampler : Randomly under-sample the dataset. Notes ----- This method is based on [1]_. Supports multi-class resampling. A one-vs.-rest scheme is used as originally proposed in [1]_. References ---------- .. [1] I. Tomek, "Two modifications of CNN," In Systems, Man, and Cybernetics, IEEE Transactions on, vol. 6, pp 769-772, 1976. 
Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import TomekLinks >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) >>> tl = TomekLinks() >>> X_res, y_res = tl.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) Resampled dataset shape Counter({{1: 897, 0: 100}}) """ _parameter_constraints: dict = { **BaseCleaningSampler._parameter_constraints, "n_jobs": [numbers.Integral, None], } def __init__(self, *, sampling_strategy="auto", n_jobs=None): super().__init__(sampling_strategy=sampling_strategy) self.n_jobs = n_jobs @staticmethod def is_tomek(y, nn_index, class_type): """Detect if samples are Tomek's link. More precisely, it uses the target vector and the first neighbour of every sample point and looks for Tomek pairs. Returning a boolean vector with True for majority Tomek links. Parameters ---------- y : ndarray of shape (n_samples,) Target vector of the data set, necessary to keep track of whether a sample belongs to minority or not. nn_index : ndarray of shape (len(y),) The index of the closes nearest neighbour to a sample point. class_type : int or str The label of the minority class. Returns ------- is_tomek : ndarray of shape (len(y), ) Boolean vector on len( # samples ), with True for majority samples that are Tomek links. """ links = np.zeros(len(y), dtype=bool) # find which class to not consider class_excluded = [c for c in np.unique(y) if c not in class_type] # there is a Tomek link between two samples if they are both nearest # neighbors of each others. 
for index_sample, target_sample in enumerate(y): if target_sample in class_excluded: continue if y[nn_index[index_sample]] != target_sample: if nn_index[nn_index[index_sample]] == index_sample: links[index_sample] = True return links def _fit_resample(self, X, y): # Find the nearest neighbour of every point nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) nn.fit(X) nns = nn.kneighbors(X, return_distance=False)[:, 1] links = self.is_tomek(y, nns, self.sampling_strategy_) self.sample_indices_ = np.flatnonzero(np.logical_not(links)) return ( _safe_indexing(X, self.sample_indices_), _safe_indexing(y, self.sample_indices_), ) def _more_tags(self): return {"sample_indices": True} imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests/000077500000000000000000000000001460233407600271725ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests/__init__.py000066400000000000000000000000001460233407600312710ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests/test_allknn.py000066400000000000000000000211051460233407600320610ustar00rootroot00000000000000"""Test the module repeated edited nearest neighbour.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np import pytest from sklearn.datasets import make_classification from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_allclose, assert_array_equal from imblearn.under_sampling import AllKNN X = np.array( [ [-0.12840393, 0.66446571], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.83631853, 0.18569783], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.53171468, -0.53735182], [1.3381556, 0.35956356], [-0.35946678, 0.72510189], [1.32326943, 0.28393874], [2.94290565, -0.13986434], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [-0.88864036, -0.33782387], [-1.10146139, 0.91782682], [-0.7969716, -0.50493969], 
[0.73489726, 0.43915195], [0.2096964, -0.61814058], [-0.28479268, 0.70459548], [1.84864913, 0.14729596], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.57356906, 0.30390519], [1.0304995, -0.16955962], [1.67314371, 0.19231498], [0.98382284, 0.37184502], [0.48921682, -1.38504507], [-0.46226554, -0.50481004], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], [0.69804044, 0.44810796], [-0.5506368, -0.42072426], [-0.34474418, 0.21969797], ] ) Y = np.array( [ 1, 2, 2, 2, 1, 1, 0, 2, 1, 1, 1, 2, 2, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 0, 2, 2, 2, 2, 1, 2, 0, ] ) R_TOL = 1e-4 def test_allknn_fit_resample(): allknn = AllKNN() X_resampled, y_resampled = allknn.fit_resample(X, Y) X_gt = np.array( [ [-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [0.69804044, 0.44810796], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], ] ) y_gt = np.array( [ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_allclose(y_resampled, y_gt, rtol=R_TOL) def test_all_knn_allow_minority(): X, y = make_classification( n_samples=10000, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3, 
n_clusters_per_class=1, weights=[0.2, 0.3, 0.5], class_sep=0.4, random_state=0, ) allknn = AllKNN(allow_minority=True) X_res_1, y_res_1 = allknn.fit_resample(X, y) allknn = AllKNN() X_res_2, y_res_2 = allknn.fit_resample(X, y) assert len(y_res_1) < len(y_res_2) def test_allknn_fit_resample_mode(): allknn = AllKNN(kind_sel="mode") X_resampled, y_resampled = allknn.fit_resample(X, Y) X_gt = np.array( [ [-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], ] ) y_gt = np.array( [ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ] ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_allknn_fit_resample_with_nn_object(): nn = NearestNeighbors(n_neighbors=4) allknn = AllKNN(n_neighbors=nn, kind_sel="mode") X_resampled, y_resampled = allknn.fit_resample(X, Y) X_gt = np.array( [ [-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 
0.46600596], [0.98382284, 0.37184502], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], ] ) y_gt = np.array( [ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ] ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_alknn_not_good_object(): nn = "rnd" allknn = AllKNN(n_neighbors=nn, kind_sel="mode") with pytest.raises(ValueError): allknn.fit_resample(X, Y) test_condensed_nearest_neighbour.py000066400000000000000000000102031460233407600362450ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests"""Test the module condensed nearest neighbour.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np import pytest from sklearn.datasets import make_classification from sklearn.neighbors import KNeighborsClassifier from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import CondensedNearestNeighbour RND_SEED = 0 X = np.array( [ [2.59928271, 0.93323465], [0.25738379, 0.95564169], [1.42772181, 0.526027], [1.92365863, 0.82718767], [-0.10903849, -0.12085181], [-0.284881, -0.62730973], [0.57062627, 1.19528323], [0.03394306, 0.03986753], [0.78318102, 2.59153329], [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [0.01936241, 0.17799828], [-1.25020462, -0.40402054], [-0.09816301, -0.74662486], [-0.01252787, 0.34102657], [0.52726792, -0.38735648], [0.2821046, -0.07862747], [0.05230552, 0.09043907], [0.15198585, 0.12512646], [0.70524765, 0.39816382], ] ) Y = np.array([1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 
1]) def test_cnn_init(): cnn = CondensedNearestNeighbour(random_state=RND_SEED) assert cnn.n_seeds_S == 1 assert cnn.n_jobs is None def test_cnn_fit_resample(): cnn = CondensedNearestNeighbour(random_state=RND_SEED) X_resampled, y_resampled = cnn.fit_resample(X, Y) X_gt = np.array( [ [-0.10903849, -0.12085181], [0.01936241, 0.17799828], [0.05230552, 0.09043907], [-1.25020462, -0.40402054], [0.70524765, 0.39816382], [0.35831463, 1.33483198], [-0.284881, -0.62730973], [0.03394306, 0.03986753], [-0.01252787, 0.34102657], [0.15198585, 0.12512646], ] ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @pytest.mark.parametrize("n_neighbors", [1, KNeighborsClassifier(n_neighbors=1)]) def test_cnn_fit_resample_with_object(n_neighbors): cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=n_neighbors) X_resampled, y_resampled = cnn.fit_resample(X, Y) X_gt = np.array( [ [-0.10903849, -0.12085181], [0.01936241, 0.17799828], [0.05230552, 0.09043907], [-1.25020462, -0.40402054], [0.70524765, 0.39816382], [0.35831463, 1.33483198], [-0.284881, -0.62730973], [0.03394306, 0.03986753], [-0.01252787, 0.34102657], [0.15198585, 0.12512646], ] ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=1) X_resampled, y_resampled = cnn.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_condensed_nearest_neighbour_multiclass(): """Check the validity of the fitted attributes `estimators_`.""" X, y = make_classification( n_samples=1_000, n_classes=4, weights=[0.1, 0.2, 0.2, 0.5], n_clusters_per_class=1, random_state=0, ) cnn = CondensedNearestNeighbour(random_state=RND_SEED) cnn.fit_resample(X, y) assert len(cnn.estimators_) == len(cnn.sampling_strategy_) other_classes = [] for est in cnn.estimators_: assert 
est.classes_[0] == 0 # minority class assert est.classes_[1] in {1, 2, 3} # other classes other_classes.append(est.classes_[1]) assert len(set(other_classes)) == len(other_classes) # TODO: remove in 0.14 def test_condensed_nearest_neighbors_deprecation(): """Check that we raise a FutureWarning when accessing the parameter `estimator_`.""" cnn = CondensedNearestNeighbour(random_state=RND_SEED) cnn.fit_resample(X, Y) warn_msg = "`estimator_` attribute has been deprecated" with pytest.warns(FutureWarning, match=warn_msg): cnn.estimator_ test_edited_nearest_neighbours.py000066400000000000000000000101641460233407600357320ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests"""Test the module edited nearest neighbour.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from sklearn.datasets import make_classification from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import EditedNearestNeighbours X = np.array( [ [2.59928271, 0.93323465], [0.25738379, 0.95564169], [1.42772181, 0.526027], [1.92365863, 0.82718767], [-0.10903849, -0.12085181], [-0.284881, -0.62730973], [0.57062627, 1.19528323], [0.03394306, 0.03986753], [0.78318102, 2.59153329], [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [0.01936241, 0.17799828], [-1.25020462, -0.40402054], [-0.09816301, -0.74662486], [-0.01252787, 0.34102657], [0.52726792, -0.38735648], [0.2821046, -0.07862747], [0.05230552, 0.09043907], [0.15198585, 0.12512646], [0.70524765, 0.39816382], ] ) Y = np.array([1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 1]) def test_enn_init(): enn = EditedNearestNeighbours() assert enn.n_neighbors == 3 assert enn.kind_sel == "all" assert enn.n_jobs is None def test_enn_fit_resample(): enn = EditedNearestNeighbours() X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array( [ [-0.10903849, -0.12085181], [0.01936241, 
0.17799828], [2.59928271, 0.93323465], [1.92365863, 0.82718767], [0.25738379, 0.95564169], [0.78318102, 2.59153329], [0.52726792, -0.38735648], ] ) y_gt = np.array([0, 0, 1, 1, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_enn_fit_resample_mode(): enn = EditedNearestNeighbours(kind_sel="mode") X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array( [ [-0.10903849, -0.12085181], [0.01936241, 0.17799828], [2.59928271, 0.93323465], [1.42772181, 0.526027], [1.92365863, 0.82718767], [0.25738379, 0.95564169], [-0.284881, -0.62730973], [0.57062627, 1.19528323], [0.78318102, 2.59153329], [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [-0.09816301, -0.74662486], [0.52726792, -0.38735648], [0.2821046, -0.07862747], ] ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_enn_fit_resample_with_nn_object(): nn = NearestNeighbors(n_neighbors=4) enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode") X_resampled, y_resampled = enn.fit_resample(X, Y) X_gt = np.array( [ [-0.10903849, -0.12085181], [0.01936241, 0.17799828], [2.59928271, 0.93323465], [1.42772181, 0.526027], [1.92365863, 0.82718767], [0.25738379, 0.95564169], [-0.284881, -0.62730973], [0.57062627, 1.19528323], [0.78318102, 2.59153329], [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [-0.09816301, -0.74662486], [0.52726792, -0.38735648], [0.2821046, -0.07862747], ] ) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_enn_check_kind_selection(): """Check that `check_sel="all"` is more conservative than `check_sel="mode"`.""" X, y = make_classification( n_samples=1000, n_classes=2, weights=[0.3, 0.7], random_state=0, ) enn_all = EditedNearestNeighbours(kind_sel="all") enn_mode = EditedNearestNeighbours(kind_sel="mode") enn_all.fit_resample(X, y) 
enn_mode.fit_resample(X, y) assert enn_all.sample_indices_.size < enn_mode.sample_indices_.size test_instance_hardness_threshold.py000066400000000000000000000072561460233407600363050ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests"""Test the module .""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from sklearn.naive_bayes import GaussianNB as NB from sklearn.pipeline import make_pipeline from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import InstanceHardnessThreshold RND_SEED = 0 X = np.array( [ [-0.3879569, 0.6894251], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], [0.91542919, -0.65453327], [-0.03852113, 0.40910479], [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], [-0.30126957, -0.66268378], [-0.65571327, 0.42412021], [-0.28305528, 0.30284991], [0.20246714, -0.34727125], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [-0.00717161, 0.00318087], ] ) Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]) ESTIMATOR = GradientBoostingClassifier(random_state=RND_SEED) def test_iht_init(): sampling_strategy = "auto" iht = InstanceHardnessThreshold( estimator=ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED, ) assert iht.sampling_strategy == sampling_strategy assert iht.random_state == RND_SEED def test_iht_fit_resample(): iht = InstanceHardnessThreshold(estimator=ESTIMATOR, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_resample(X, Y) assert X_resampled.shape == (12, 2) assert y_resampled.shape == (12,) def test_iht_fit_resample_half(): sampling_strategy = {0: 3, 1: 3} iht = InstanceHardnessThreshold( estimator=NB(), sampling_strategy=sampling_strategy, random_state=RND_SEED, ) X_resampled, y_resampled = iht.fit_resample(X, Y) assert X_resampled.shape == (6, 2) assert 
y_resampled.shape == (6,) def test_iht_fit_resample_class_obj(): est = GradientBoostingClassifier(random_state=RND_SEED) iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_resample(X, Y) assert X_resampled.shape == (12, 2) assert y_resampled.shape == (12,) def test_iht_reproducibility(): from sklearn.datasets import load_digits X_digits, y_digits = load_digits(return_X_y=True) idx_sampled = [] for seed in range(5): est = RandomForestClassifier(n_estimators=10, random_state=seed) iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) iht.fit_resample(X_digits, y_digits) idx_sampled.append(iht.sample_indices_.copy()) for idx_1, idx_2 in zip(idx_sampled, idx_sampled[1:]): assert_array_equal(idx_1, idx_2) def test_iht_fit_resample_default_estimator(): iht = InstanceHardnessThreshold(estimator=None, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_resample(X, Y) assert isinstance(iht.estimator_, RandomForestClassifier) assert X_resampled.shape == (12, 2) assert y_resampled.shape == (12,) def test_iht_estimator_pipeline(): """Check that we can pass a pipeline containing a classifier. Checking if we have a classifier should not be based on inheriting from `ClassifierMixin`. 
Non-regression test for: https://github.com/scikit-learn-contrib/imbalanced-learn/pull/1049 """ model = make_pipeline(GradientBoostingClassifier(random_state=RND_SEED)) iht = InstanceHardnessThreshold(estimator=model, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_resample(X, Y) assert X_resampled.shape == (12, 2) assert y_resampled.shape == (12,) imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py000066400000000000000000000155151460233407600324330ustar00rootroot00000000000000"""Test the module nearmiss.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import NearMiss X = np.array( [ [1.17737838, -0.2002118], [0.4960075, 0.86130762], [-0.05903827, 0.10947647], [0.91464286, 1.61369212], [-0.54619583, 1.73009918], [-0.60413357, 0.24628718], [0.45713638, 1.31069295], [-0.04032409, 3.01186964], [0.03142011, 0.12323596], [0.50701028, -0.17636928], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [0.99272351, -0.11631728], [-1.95581933, 0.69609604], [1.15157493, -1.2981518], ] ) Y = np.array([1, 2, 1, 0, 2, 1, 2, 2, 1, 2, 0, 0, 2, 1, 2]) VERSION_NEARMISS = (1, 2, 3) def test_nm_fit_resample_auto(): sampling_strategy = "auto" X_gt = [ np.array( [ [0.91464286, 1.61369212], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], [-0.60413357, 0.24628718], [0.50701028, -0.17636928], [0.4960075, 0.86130762], [0.45713638, 1.31069295], ] ), np.array( [ [0.91464286, 1.61369212], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], [-0.60413357, 0.24628718], [0.50701028, -0.17636928], [0.4960075, 0.86130762], [0.45713638, 1.31069295], ] ), np.array( [ [0.91464286, 1.61369212], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [1.17737838, 
-0.2002118], [-0.60413357, 0.24628718], [0.03142011, 0.12323596], [1.15157493, -1.2981518], [-0.54619583, 1.73009918], [0.99272351, -0.11631728], ] ), ] y_gt = [ np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), ] for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss(sampling_strategy=sampling_strategy, version=version) X_resampled, y_resampled = nm.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) def test_nm_fit_resample_float_sampling_strategy(): sampling_strategy = {0: 3, 1: 4, 2: 4} X_gt = [ np.array( [ [-0.20497017, -0.26630228], [-0.80809175, -1.09917302], [0.91464286, 1.61369212], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], [-0.60413357, 0.24628718], [1.17737838, -0.2002118], [0.50701028, -0.17636928], [0.4960075, 0.86130762], [0.45713638, 1.31069295], [0.99272351, -0.11631728], ] ), np.array( [ [-0.20497017, -0.26630228], [-0.80809175, -1.09917302], [0.91464286, 1.61369212], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], [-0.60413357, 0.24628718], [1.17737838, -0.2002118], [0.50701028, -0.17636928], [0.4960075, 0.86130762], [0.45713638, 1.31069295], [0.99272351, -0.11631728], ] ), np.array( [ [0.91464286, 1.61369212], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], [0.03142011, 0.12323596], [-0.05903827, 0.10947647], [1.15157493, -1.2981518], [-0.54619583, 1.73009918], [0.99272351, -0.11631728], [0.45713638, 1.31069295], ] ), ] y_gt = [ np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), ] for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss(sampling_strategy=sampling_strategy, version=version) X_resampled, y_resampled = nm.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, 
y_gt[version_idx]) def test_nm_fit_resample_nn_obj(): sampling_strategy = "auto" nn = NearestNeighbors(n_neighbors=3) X_gt = [ np.array( [ [0.91464286, 1.61369212], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], [-0.60413357, 0.24628718], [0.50701028, -0.17636928], [0.4960075, 0.86130762], [0.45713638, 1.31069295], ] ), np.array( [ [0.91464286, 1.61369212], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], [-0.60413357, 0.24628718], [0.50701028, -0.17636928], [0.4960075, 0.86130762], [0.45713638, 1.31069295], ] ), np.array( [ [0.91464286, 1.61369212], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], [0.03142011, 0.12323596], [1.15157493, -1.2981518], [-0.54619583, 1.73009918], [0.99272351, -0.11631728], ] ), ] y_gt = [ np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), ] for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss( sampling_strategy=sampling_strategy, version=version, n_neighbors=nn, ) X_resampled, y_resampled = nm.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) test_neighbourhood_cleaning_rule.py000066400000000000000000000050161460233407600362510ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests"""Test the module neighbourhood cleaning rule.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter import numpy as np import pytest from sklearn.datasets import make_classification from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import EditedNearestNeighbours, NeighbourhoodCleaningRule @pytest.fixture(scope="module") def data(): return make_classification( n_samples=200, n_features=2, n_informative=2, 
n_redundant=0, n_repeated=0, n_clusters_per_class=1, n_classes=3, weights=[0.1, 0.3, 0.6], random_state=0, ) def test_ncr_threshold_cleaning(data): """Test the effect of the `threshold_cleaning` parameter.""" X, y = data # with a large `threshold_cleaning`, the algorithm is equivalent to ENN enn = EditedNearestNeighbours() ncr = NeighbourhoodCleaningRule( edited_nearest_neighbours=enn, n_neighbors=10, threshold_cleaning=10 ) enn.fit_resample(X, y) ncr.fit_resample(X, y) assert_array_equal(np.sort(enn.sample_indices_), np.sort(ncr.sample_indices_)) assert ncr.classes_to_clean_ == [] # set a threshold that we should consider only the class #2 counter = Counter(y) threshold = counter[1] / counter[0] ncr.set_params(threshold_cleaning=threshold) ncr.fit_resample(X, y) assert set(ncr.classes_to_clean_) == {2} # making the threshold slightly smaller to take into account class #1 ncr.set_params(threshold_cleaning=threshold - np.finfo(np.float32).eps) ncr.fit_resample(X, y) assert set(ncr.classes_to_clean_) == {1, 2} def test_ncr_n_neighbors(data): """Check the effect of the NN on the cleaning of the second phase.""" X, y = data enn = EditedNearestNeighbours() ncr = NeighbourhoodCleaningRule(edited_nearest_neighbours=enn, n_neighbors=3) ncr.fit_resample(X, y) sample_indices_3_nn = ncr.sample_indices_ ncr.set_params(n_neighbors=10).fit_resample(X, y) sample_indices_10_nn = ncr.sample_indices_ # we should have a more aggressive cleaning with n_neighbors is larger assert len(sample_indices_3_nn) > len(sample_indices_10_nn) # TODO: remove in 0.14 @pytest.mark.parametrize("kind_sel", ["all", "mode"]) def test_ncr_deprecate_kind_sel(data, kind_sel): X, y = data with pytest.warns(FutureWarning, match="`kind_sel` is deprecated"): NeighbourhoodCleaningRule(kind_sel=kind_sel).fit_resample(X, y) test_one_sided_selection.py000066400000000000000000000101311460233407600345160ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests"""Test the 
module one-sided selection.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np import pytest from sklearn.datasets import make_classification from sklearn.neighbors import KNeighborsClassifier from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import OneSidedSelection RND_SEED = 0 X = np.array( [ [-0.3879569, 0.6894251], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], [0.91542919, -0.65453327], [-0.03852113, 0.40910479], [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], [-0.30126957, -0.66268378], [-0.65571327, 0.42412021], [-0.28305528, 0.30284991], [0.20246714, -0.34727125], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [-0.00717161, 0.00318087], ] ) Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]) def test_oss_init(): oss = OneSidedSelection(random_state=RND_SEED) assert oss.n_seeds_S == 1 assert oss.n_jobs is None assert oss.random_state == RND_SEED def test_oss_fit_resample(): oss = OneSidedSelection(random_state=RND_SEED) X_resampled, y_resampled = oss.fit_resample(X, Y) X_gt = np.array( [ [-0.3879569, 0.6894251], [0.91542919, -0.65453327], [-0.65571327, 0.42412021], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [-0.00717161, 0.00318087], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], [-0.30126957, -0.66268378], [0.20246714, -0.34727125], ] ) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @pytest.mark.parametrize("n_neighbors", [1, KNeighborsClassifier(n_neighbors=1)]) def test_oss_with_object(n_neighbors): oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=n_neighbors) X_resampled, y_resampled = oss.fit_resample(X, Y) X_gt = np.array( [ [-0.3879569, 0.6894251], [0.91542919, -0.65453327], [-0.65571327, 0.42412021], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [-0.00717161, 
0.00318087], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], [-0.30126957, -0.66268378], [0.20246714, -0.34727125], ] ) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) knn = 1 oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) X_resampled, y_resampled = oss.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_one_sided_selection_multiclass(): """Check the validity of the fitted attributes `estimators_`.""" X, y = make_classification( n_samples=1_000, n_classes=4, weights=[0.1, 0.2, 0.2, 0.5], n_clusters_per_class=1, random_state=0, ) oss = OneSidedSelection(random_state=RND_SEED) oss.fit_resample(X, y) assert len(oss.estimators_) == len(oss.sampling_strategy_) other_classes = [] for est in oss.estimators_: assert est.classes_[0] == 0 # minority class assert est.classes_[1] in {1, 2, 3} # other classes other_classes.append(est.classes_[1]) assert len(set(other_classes)) == len(other_classes) # TODO: remove in 0.14 def test_one_sided_selection_deprecation(): """Check that we raise a FutureWarning when accessing the parameter `estimator_`.""" oss = OneSidedSelection(random_state=RND_SEED) oss.fit_resample(X, Y) warn_msg = "`estimator_` attribute has been deprecated" with pytest.warns(FutureWarning, match=warn_msg): oss.estimator_ test_random_under_sampler.py000066400000000000000000000127761460233407600347410ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests"""Test the module random under sampler.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter from datetime import datetime import numpy as np import pytest from sklearn.datasets import make_classification from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import RandomUnderSampler 
RND_SEED = 0 X = np.array( [ [0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982], ] ) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) @pytest.mark.parametrize("as_frame", [True, False], ids=["dataframe", "array"]) def test_rus_fit_resample(as_frame): if as_frame: pd = pytest.importorskip("pandas") X_ = pd.DataFrame(X) else: X_ = X rus = RandomUnderSampler(random_state=RND_SEED, replacement=True) X_resampled, y_resampled = rus.fit_resample(X_, Y) X_gt = np.array( [ [0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.04352327, -0.20515826], ] ) y_gt = np.array([0, 0, 0, 1, 1, 1]) if as_frame: assert hasattr(X_resampled, "loc") # FIXME: we should use to_numpy with pandas >= 0.25 X_resampled = X_resampled.values assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_rus_fit_resample_half(): sampling_strategy = {0: 3, 1: 6} rus = RandomUnderSampler( sampling_strategy=sampling_strategy, random_state=RND_SEED, replacement=True, ) X_resampled, y_resampled = rus.fit_resample(X, Y) X_gt = np.array( [ [0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.92923648, 0.76103773], [0.15490546, 0.3130677], [0.15490546, 0.3130677], [0.15490546, 0.3130677], [0.20792588, 1.49407907], [0.15490546, 0.3130677], [0.12372842, 0.6536186], ] ) y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_multiclass_fit_resample(): y = Y.copy() y[5] = 2 y[6] = 2 rus = RandomUnderSampler(random_state=RND_SEED) X_resampled, y_resampled = rus.fit_resample(X, y) count_y_res = Counter(y_resampled) assert count_y_res[0] == 2 assert count_y_res[1] == 2 assert count_y_res[2] == 2 def 
test_random_under_sampling_heterogeneous_data(): X_hetero = np.array( [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=object ) y = np.array([0, 0, 1]) rus = RandomUnderSampler(random_state=RND_SEED) X_res, y_res = rus.fit_resample(X_hetero, y) assert X_res.shape[0] == 2 assert y_res.shape[0] == 2 assert X_res.dtype == object def test_random_under_sampling_nan_inf(): # check that we can undersample even with missing or infinite data # regression tests for #605 rng = np.random.RandomState(42) n_not_finite = X.shape[0] // 3 row_indices = rng.choice(np.arange(X.shape[0]), size=n_not_finite) col_indices = rng.randint(0, X.shape[1], size=n_not_finite) not_finite_values = rng.choice([np.nan, np.inf], size=n_not_finite) X_ = X.copy() X_[row_indices, col_indices] = not_finite_values rus = RandomUnderSampler(random_state=0) X_res, y_res = rus.fit_resample(X_, Y) assert y_res.shape == (6,) assert X_res.shape == (6, 2) assert np.any(~np.isfinite(X_res)) @pytest.mark.parametrize( "sampling_strategy", ["auto", "majority", "not minority", "not majority", "all"] ) def test_random_under_sampler_strings(sampling_strategy): """Check that we support all supposed strings as `sampling_strategy` in a sampler inheriting from `BaseUnderSampler`.""" X, y = make_classification( n_samples=100, n_clusters_per_class=1, n_classes=3, weights=[0.1, 0.3, 0.6], random_state=0, ) RandomUnderSampler(sampling_strategy=sampling_strategy).fit_resample(X, y) def test_random_under_sampling_datetime(): """Check that we don't convert input data and only sample from it.""" pd = pytest.importorskip("pandas") X = pd.DataFrame({"label": [0, 0, 0, 1], "td": [datetime.now()] * 4}) y = X["label"] rus = RandomUnderSampler(random_state=0) X_res, y_res = rus.fit_resample(X, y) pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) pd.testing.assert_index_equal(X_res.index, y_res.index) assert_array_equal(y_res.to_numpy(), np.array([0, 1])) def test_random_under_sampler_full_nat(): """Check that we can 
return timedelta columns full of NaT. Non-regression test for: https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1055 """ pd = pytest.importorskip("pandas") X = pd.DataFrame( { "col_str": ["abc", "def", "xyz"], "col_timedelta": pd.to_timedelta([np.nan, np.nan, np.nan]), } ) y = np.array([0, 0, 1]) X_res, y_res = RandomUnderSampler().fit_resample(X, y) assert X_res.shape == (2, 2) assert y_res.shape == (2,) assert X_res["col_timedelta"].dtype == "timedelta64[ns]" test_repeated_edited_nearest_neighbours.py000066400000000000000000000210271460233407600376030ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests"""Test the module repeated edited nearest neighbour.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np import pytest from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import RepeatedEditedNearestNeighbours X = np.array( [ [-0.12840393, 0.66446571], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.83631853, 0.18569783], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.53171468, -0.53735182], [1.3381556, 0.35956356], [-0.35946678, 0.72510189], [1.32326943, 0.28393874], [2.94290565, -0.13986434], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [-0.88864036, -0.33782387], [-1.10146139, 0.91782682], [-0.7969716, -0.50493969], [0.73489726, 0.43915195], [0.2096964, -0.61814058], [-0.28479268, 0.70459548], [1.84864913, 0.14729596], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.57356906, 0.30390519], [1.0304995, -0.16955962], [1.67314371, 0.19231498], [0.98382284, 0.37184502], [0.48921682, -1.38504507], [-0.46226554, -0.50481004], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], 
[0.69804044, 0.44810796], [-0.5506368, -0.42072426], [-0.34474418, 0.21969797], ] ) Y = np.array( [ 1, 2, 2, 2, 1, 1, 0, 2, 1, 1, 1, 2, 2, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 0, 2, 2, 2, 2, 1, 2, 0, ] ) def test_renn_init(): renn = RepeatedEditedNearestNeighbours() assert renn.n_neighbors == 3 assert renn.kind_sel == "all" assert renn.n_jobs is None def test_renn_iter_wrong(): max_iter = -1 renn = RepeatedEditedNearestNeighbours(max_iter=max_iter) with pytest.raises(ValueError): renn.fit_resample(X, Y) def test_renn_fit_resample(): renn = RepeatedEditedNearestNeighbours() X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array( [ [-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [0.73489726, 0.43915195], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [0.69804044, 0.44810796], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], ] ) y_gt = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert 0 < renn.n_iter_ <= renn.max_iter def test_renn_fit_resample_mode_object(): renn = RepeatedEditedNearestNeighbours(kind_sel="mode") X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array( [ [-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [2.94290565, -0.13986434], 
[-1.10146139, 0.91782682], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [1.84864913, 0.14729596], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [1.67314371, 0.19231498], [0.98382284, 0.37184502], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172], ] ) y_gt = np.array( [ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ] ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert 0 < renn.n_iter_ <= renn.max_iter def test_renn_fit_resample_mode(): nn = NearestNeighbors(n_neighbors=4) renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel="mode") X_resampled, y_resampled = renn.fit_resample(X, Y) X_gt = np.array( [ [-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [1.02956816, 0.36061601], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [2.94290565, -0.13986434], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [1.84864913, 0.14729596], [0.50307437, 0.498805], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [1.67314371, 0.19231498], [0.98382284, 0.37184502], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, 
-0.34465185], [0.1732627, -1.61323172], ] ) y_gt = np.array( [ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ] ) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert 0 < renn.n_iter_ <= renn.max_iter @pytest.mark.parametrize( "max_iter, n_iter", [(2, 2), (5, 3)], ) def test_renn_iter_attribute(max_iter, n_iter): renn = RepeatedEditedNearestNeighbours(max_iter=max_iter) renn.fit_resample(X, Y) assert renn.n_iter_ == n_iter imbalanced-learn-0.12.2/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py000066400000000000000000000052121460233407600331220ustar00rootroot00000000000000"""Test the module Tomek's links.""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np import pytest from sklearn.datasets import make_classification from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import TomekLinks X = np.array( [ [0.31230513, 0.1216318], [0.68481731, 0.51935141], [1.34192108, -0.13367336], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [-0.37162401, -2.19400981], [0.74680821, 1.63827342], [0.2184254, 0.24299982], [0.61472253, -0.82309052], [0.19893132, -0.47761769], [1.06514042, -0.0770537], [0.97407872, 0.44454207], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [-0.27410027, -0.54194484], [0.8381014, 0.44085498], [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], ] ) Y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) def test_tl_init(): tl = TomekLinks() assert tl.n_jobs is None def test_tl_fit_resample(): tl = TomekLinks() X_resampled, y_resampled = tl.fit_resample(X, Y) X_gt = np.array( [ [0.31230513, 0.1216318], [0.68481731, 0.51935141], [1.34192108, -0.13367336], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [-0.37162401, -2.19400981], [0.74680821, 1.63827342], [0.2184254, 0.24299982], [0.61472253, 
-0.82309052], [0.19893132, -0.47761769], [0.97407872, 0.44454207], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], ] ) y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @pytest.mark.parametrize( "sampling_strategy", ["auto", "majority", "not minority", "not majority", "all"] ) def test_tomek_links_strings(sampling_strategy): """Check that we support all supposed strings as `sampling_strategy` in a sampler inheriting from `BaseCleaningSampler`.""" X, y = make_classification( n_samples=100, n_clusters_per_class=1, n_classes=3, weights=[0.1, 0.3, 0.6], random_state=0, ) TomekLinks(sampling_strategy=sampling_strategy).fit_resample(X, y) imbalanced-learn-0.12.2/imblearn/under_sampling/base.py000066400000000000000000000075001460233407600230450ustar00rootroot00000000000000""" Base class for the under-sampling method. """ # Authors: Guillaume Lemaitre # License: MIT import numbers from collections.abc import Mapping from ..base import BaseSampler from ..utils._param_validation import Interval, StrOptions class BaseUnderSampler(BaseSampler): """Base class for under-sampling algorithms. Warning: This class should not be used directly. Use the derive classes instead. """ _sampling_type = "under-sampling" _sampling_strategy_docstring = """sampling_strategy : float, str, dict, callable, default='auto' Sampling information to sample the data set. - When ``float``, it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling. Therefore, the ratio is expressed as :math:`\\alpha_{us} = N_{m} / N_{rM}` where :math:`N_{m}` is the number of samples in the minority class and :math:`N_{rM}` is the number of samples in the majority class after resampling. .. 
warning:: ``float`` is only available for **binary** classification. An error is raised for multi-class classification. - When ``str``, specify the class targeted by the resampling. The number of samples in the different classes will be equalized. Possible choices are: ``'majority'``: resample only the majority class; ``'not minority'``: resample all classes but the minority class; ``'not majority'``: resample all classes but the majority class; ``'all'``: resample all classes; ``'auto'``: equivalent to ``'not minority'``. - When ``dict``, the keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted class. - When callable, function taking ``y`` and returns a ``dict``. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class. """.rstrip() # noqa: E501 _parameter_constraints: dict = { "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "majority", "not minority", "not majority", "all"}), Mapping, callable, ], } class BaseCleaningSampler(BaseSampler): """Base class for under-sampling algorithms. Warning: This class should not be used directly. Use the derive classes instead. """ _sampling_type = "clean-sampling" _sampling_strategy_docstring = """sampling_strategy : str, list or callable Sampling information to sample the data set. - When ``str``, specify the class targeted by the resampling. Note the the number of samples will not be equal in each. Possible choices are: ``'majority'``: resample only the majority class; ``'not minority'``: resample all classes but the minority class; ``'not majority'``: resample all classes but the majority class; ``'all'``: resample all classes; ``'auto'``: equivalent to ``'not minority'``. - When ``list``, the list contains the classes targeted by the resampling. - When callable, function taking ``y`` and returns a ``dict``. The keys correspond to the targeted classes. 
The values correspond to the desired number of samples for each class. """.rstrip() _parameter_constraints: dict = { "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "majority", "not minority", "not majority", "all"}), list, callable, ], } imbalanced-learn-0.12.2/imblearn/utils/000077500000000000000000000000001460233407600177105ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/utils/__init__.py000066400000000000000000000005211460233407600220170ustar00rootroot00000000000000""" The :mod:`imblearn.utils` module includes various utilities. """ from ._docstring import Substitution from ._validation import ( check_neighbors_object, check_sampling_strategy, check_target_type, ) __all__ = [ "check_neighbors_object", "check_sampling_strategy", "check_target_type", "Substitution", ] imbalanced-learn-0.12.2/imblearn/utils/_available_if.py000066400000000000000000000063721460233407600230270ustar00rootroot00000000000000"""This is a copy of sklearn/utils/_available_if.py. It can be removed when we support scikit-learn >= 1.1. """ # mypy: ignore-errors from functools import update_wrapper, wraps from types import MethodType import sklearn from sklearn.utils.fixes import parse_version sklearn_version = parse_version(sklearn.__version__) if sklearn_version < parse_version("1.1"): class _AvailableIfDescriptor: """Implements a conditional property using the descriptor protocol. Using this class to create a decorator will raise an ``AttributeError`` if check(self) returns a falsey value. Note that if check raises an error this will also result in hasattr returning false. See https://docs.python.org/3/howto/descriptor.html for an explanation of descriptors. 
""" def __init__(self, fn, check, attribute_name): self.fn = fn self.check = check self.attribute_name = attribute_name # update the docstring of the descriptor update_wrapper(self, fn) def __get__(self, obj, owner=None): attr_err = AttributeError( f"This {owner.__name__!r} has no attribute {self.attribute_name!r}" ) if obj is not None: # delegate only on instances, not the classes. # this is to allow access to the docstrings. if not self.check(obj): raise attr_err out = MethodType(self.fn, obj) else: # This makes it possible to use the decorated method as an # unbound method, for instance when monkeypatching. @wraps(self.fn) def out(*args, **kwargs): if not self.check(args[0]): raise attr_err return self.fn(*args, **kwargs) return out def available_if(check): """An attribute that is available only if check returns a truthy value. Parameters ---------- check : callable When passed the object with the decorated method, this should return a truthy value if the attribute is available, and either return False or raise an AttributeError if not available. Returns ------- callable Callable makes the decorated method available if `check` returns a truthy value, otherwise the decorated method is unavailable. Examples -------- >>> from sklearn.utils.metaestimators import available_if >>> class HelloIfEven: ... def __init__(self, x): ... self.x = x ... ... def _x_is_even(self): ... return self.x % 2 == 0 ... ... @available_if(_x_is_even) ... def say_hello(self): ... print("Hello") ... 
>>> obj = HelloIfEven(1) >>> hasattr(obj, "say_hello") False >>> obj.x = 2 >>> hasattr(obj, "say_hello") True >>> obj.say_hello() Hello """ return lambda fn: _AvailableIfDescriptor(fn, check, attribute_name=fn.__name__) else: from sklearn.utils.metaestimators import available_if # noqa imbalanced-learn-0.12.2/imblearn/utils/_docstring.py000066400000000000000000000027501460233407600224210ustar00rootroot00000000000000"""Utilities for docstring in imbalanced-learn.""" # Authors: Guillaume Lemaitre # License: MIT class Substitution: """Decorate a function's or a class' docstring to perform string substitution on it. This decorator should be robust even if obj.__doc__ is None (for example, if -OO was passed to the interpreter) """ def __init__(self, *args, **kwargs): if args and kwargs: raise AssertionError("Only positional or keyword args are allowed") self.params = args or kwargs def __call__(self, obj): if obj.__doc__: obj.__doc__ = obj.__doc__.format(**self.params) return obj _random_state_docstring = """random_state : int, RandomState instance, default=None Control the randomization of the algorithm. - If int, ``random_state`` is the seed used by the random number generator; - If ``RandomState`` instance, random_state is the random number generator; - If ``None``, the random number generator is the ``RandomState`` instance used by ``np.random``. """.rstrip() _n_jobs_docstring = """n_jobs : int, default=None Number of CPU cores used during the cross-validation loop. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See `Glossary `_ for more details. """.rstrip() imbalanced-learn-0.12.2/imblearn/utils/_metadata_requests.py000066400000000000000000001654361460233407600241530ustar00rootroot00000000000000""" This is a copy of sklearn/utils/_metadata_requests.py. It can be removed once we support scikit-learn >= 1.4. 
Metadata Routing Utility In order to better understand the components implemented in this file, one needs to understand their relationship to one another. The only relevant public API for end users are the ``set_{method}_request``, e.g. ``estimator.set_fit_request(sample_weight=True)``. However, third-party developers and users who implement custom meta-estimators, need to deal with the objects implemented in this file. All estimators (should) implement a ``get_metadata_routing`` method, returning the routing requests set for the estimator. This method is automatically implemented via ``BaseEstimator`` for all simple estimators, but needs a custom implementation for meta-estimators. In non-routing consumers, i.e. the simplest case, e.g. ``SVM``, ``get_metadata_routing`` returns a ``MetadataRequest`` object. In routers, e.g. meta-estimators and a multi metric scorer, ``get_metadata_routing`` returns a ``MetadataRouter`` object. An object which is both a router and a consumer, e.g. a meta-estimator which consumes ``sample_weight`` and routes ``sample_weight`` to its sub-estimators, routing information includes both information about the object itself (added via ``MetadataRouter.add_self_request``), as well as the routing information for its sub-estimators. A ``MetadataRequest`` instance includes one ``MethodMetadataRequest`` per method in ``METHODS``, which includes ``fit``, ``score``, etc. Request values are added to the routing mechanism by adding them to ``MethodMetadataRequest`` instances, e.g. ``metadatarequest.fit.add(param="sample_weight", alias="my_weights")``. This is used in ``set_{method}_request`` which are automatically generated, so users and developers almost never need to directly call methods on a ``MethodMetadataRequest``. The ``alias`` above in the ``add`` method has to be either a string (an alias), or a {True (requested), False (unrequested), None (error if passed)}``. 
There are some other special values such as ``UNUSED`` and ``WARN`` which are used for purposes such as warning of removing a metadata in a child class, but not used by the end users. ``MetadataRouter`` includes information about sub-objects' routing and how methods are mapped together. For instance, the information about which methods of a sub-estimator are called in which methods of the meta-estimator are all stored here. Conceptually, this information looks like: ``` { "sub_estimator1": ( mapping=[(caller="fit", callee="transform"), ...], router=MetadataRequest(...), # or another MetadataRouter ), ... } ``` To give the above representation some structure, we use the following objects: - ``(caller, callee)`` is a namedtuple called ``MethodPair`` - The list of ``MethodPair`` stored in the ``mapping`` field is a ``MethodMapping`` object - ``(mapping=..., router=...)`` is a namedtuple called ``RouterMappingPair`` The ``set_{method}_request`` methods are dynamically generated for estimators which inherit from the ``BaseEstimator``. This is done by attaching instances of the ``RequestMethod`` descriptor to classes, which is done in the ``_MetadataRequester`` class, and ``BaseEstimator`` inherits from this mixin. This mixin also implements the ``get_metadata_routing``, which meta-estimators need to override, but it works for simple consumers as is. """ # Author: Adrin Jalali # License: BSD 3 clause import inspect from collections import namedtuple from copy import deepcopy from typing import TYPE_CHECKING, Optional, Union from warnings import warn from sklearn import __version__, get_config from sklearn.utils import Bunch from sklearn.utils.fixes import parse_version sklearn_version = parse_version(__version__) if parse_version(sklearn_version.base_version) < parse_version("1.4"): # Only the following methods are supported in the routing mechanism. Adding new # methods at the moment involves monkeypatching this list. 
# Note that if this list is changed or monkeypatched, the corresponding method # needs to be added under a TYPE_CHECKING condition like the one done here in # _MetadataRequester SIMPLE_METHODS = [ "fit", "partial_fit", "predict", "predict_proba", "predict_log_proba", "decision_function", "score", "split", "transform", "inverse_transform", ] # These methods are a composite of other methods and one cannot set their # requests directly. Instead they should be set by setting the requests of the # simple methods which make the composite ones. COMPOSITE_METHODS = { "fit_transform": ["fit", "transform"], "fit_predict": ["fit", "predict"], } METHODS = SIMPLE_METHODS + list(COMPOSITE_METHODS.keys()) def _routing_enabled(): """Return whether metadata routing is enabled. .. versionadded:: 1.3 Returns ------- enabled : bool Whether metadata routing is enabled. If the config is not set, it defaults to False. """ return get_config().get("enable_metadata_routing", False) def _raise_for_params(params, owner, method): """Raise an error if metadata routing is not enabled and params are passed. .. versionadded:: 1.4 Parameters ---------- params : dict The metadata passed to a method. owner : object The object to which the method belongs. method : str The name of the method, e.g. "fit". Raises ------ ValueError If metadata routing is not enabled and params are passed. """ caller = ( f"{owner.__class__.__name__}.{method}" if method else owner.__class__.__name__ ) if not _routing_enabled() and params: raise ValueError( f"Passing extra keyword arguments to {caller} is only supported if" " enable_metadata_routing=True, which you can set using" " `sklearn.set_config`. See the User Guide" " for more" f" details. Extra parameters passed are: {set(params)}" ) def _raise_for_unsupported_routing(obj, method, **kwargs): """Raise when metadata routing is enabled and metadata is passed. This is used in meta-estimators which have not implemented metadata routing to prevent silent bugs. 
There is no need to use this function if the meta-estimator is not accepting any metadata, especially in `fit`, since if a meta-estimator accepts any metadata, they would do that in `fit` as well. Parameters ---------- obj : estimator The estimator for which we're raising the error. method : str The method where the error is raised. **kwargs : dict The metadata passed to the method. """ kwargs = {key: value for key, value in kwargs.items() if value is not None} if _routing_enabled() and kwargs: cls_name = obj.__class__.__name__ raise NotImplementedError( f"{cls_name}.{method} cannot accept given metadata " f"({set(kwargs.keys())}) since metadata routing is not yet implemented " f"for {cls_name}." ) class _RoutingNotSupportedMixin: """A mixin to be used to remove the default `get_metadata_routing`. This is used in meta-estimators where metadata routing is not yet implemented. This also makes it clear in our rendered documentation that this method cannot be used. """ def get_metadata_routing(self): """Raise `NotImplementedError`. This estimator does not support metadata routing yet.""" raise NotImplementedError( f"{self.__class__.__name__} has not implemented metadata routing yet." ) # Request values # ============== # Each request value needs to be one of the following values, or an alias. # this is used in `__metadata_request__*` attributes to indicate that a # metadata is not present even though it may be present in the # corresponding method's signature. UNUSED = "$UNUSED$" # this is used whenever a default value is changed, and therefore the user # should explicitly set the value, otherwise a warning is shown. An example # is when a meta-estimator is only a router, but then becomes also a # consumer in a new release. WARN = "$WARN$" # this is the default used in `set_{method}_request` methods to indicate no # change requested by the user. 
UNCHANGED = "$UNCHANGED$" VALID_REQUEST_VALUES = [False, True, None, UNUSED, WARN] def request_is_alias(item): """Check if an item is a valid alias. Values in ``VALID_REQUEST_VALUES`` are not considered aliases in this context. Only a string which is a valid identifier is. Parameters ---------- item : object The given item to be checked if it can be an alias. Returns ------- result : bool Whether the given item is a valid alias. """ if item in VALID_REQUEST_VALUES: return False # item is only an alias if it's a valid identifier return isinstance(item, str) and item.isidentifier() def request_is_valid(item): """Check if an item is a valid request value (and not an alias). Parameters ---------- item : object The given item to be checked. Returns ------- result : bool Whether the given item is valid. """ return item in VALID_REQUEST_VALUES # Metadata Request for Simple Consumers # ===================================== # This section includes MethodMetadataRequest and MetadataRequest which are # used in simple consumers. class MethodMetadataRequest: """A prescription of how metadata is to be passed to a single method. Refer to :class:`MetadataRequest` for how this class is used. .. versionadded:: 1.3 Parameters ---------- owner : str A display name for the object owning these requests. method : str The name of the method to which these requests belong. requests : dict of {str: bool, None or str}, default=None The initial requests for this method. """ def __init__(self, owner, method, requests=None): self._requests = requests or dict() self.owner = owner self.method = method @property def requests(self): """Dictionary of the form: ``{key: alias}``.""" return self._requests def add_request( self, *, param, alias, ): """Add request info for a metadata. Parameters ---------- param : str The property for which a request is set. 
alias : str, or {True, False, None} Specifies which metadata should be routed to `param` - str: the name (or alias) of metadata given to a meta-estimator that should be routed to this parameter. - True: requested - False: not requested - None: error if passed """ if not request_is_alias(alias) and not request_is_valid(alias): raise ValueError( f"The alias you're setting for `{param}` should be either a " "valid identifier or one of {None, True, False}, but given " f"value is: `{alias}`" ) if alias == param: alias = True if alias == UNUSED: if param in self._requests: del self._requests[param] else: raise ValueError( f"Trying to remove parameter {param} with UNUSED which doesn't" " exist." ) else: self._requests[param] = alias return self def _get_param_names(self, return_alias): """Get names of all metadata that can be consumed or routed by this method. This method returns the names of all metadata, even the ``False`` ones. Parameters ---------- return_alias : bool Controls whether original or aliased names should be returned. If ``False``, aliases are ignored and original names are returned. Returns ------- names : set of str A set of strings with the names of all parameters. """ return set( alias if return_alias and not request_is_valid(alias) else prop for prop, alias in self._requests.items() if not request_is_valid(alias) or alias is not False ) def _check_warnings(self, *, params): """Check whether metadata is passed which is marked as WARN. If any metadata is passed which is marked as WARN, a warning is raised. Parameters ---------- params : dict The metadata passed to a method. """ params = {} if params is None else params warn_params = { prop for prop, alias in self._requests.items() if alias == WARN and prop in params } for param in warn_params: warn( f"Support for {param} has recently been added to this class. " "To maintain backward compatibility, it is ignored now. 
" "You can set the request value to False to silence this " "warning, or to True to consume and use the metadata." ) def _route_params(self, params): """Prepare the given parameters to be passed to the method. The output of this method can be used directly as the input to the corresponding method as extra props. Parameters ---------- params : dict A dictionary of provided metadata. Returns ------- params : Bunch A :class:`~sklearn.utils.Bunch` of {prop: value} which can be given to the corresponding method. """ self._check_warnings(params=params) unrequested = dict() args = {arg: value for arg, value in params.items() if value is not None} res = Bunch() for prop, alias in self._requests.items(): if alias is False or alias == WARN: continue elif alias is True and prop in args: res[prop] = args[prop] elif alias is None and prop in args: unrequested[prop] = args[prop] elif alias in args: res[prop] = args[alias] if unrequested: raise UnsetMetadataPassedError( message=( f"[{', '.join([key for key in unrequested])}] are passed but " "are not explicitly set as requested or not for" f" {self.owner}.{self.method}" ), unrequested_params=unrequested, routed_params=res, ) return res def _consumes(self, params): """Check whether the given parameters are consumed by this method. Parameters ---------- params : iterable of str An iterable of parameters to check. Returns ------- consumed : set of str A set of parameters which are consumed by this method. """ params = set(params) res = set() for prop, alias in self._requests.items(): if alias is True and prop in params: res.add(prop) elif isinstance(alias, str) and alias in params: res.add(alias) return res def _serialize(self): """Serialize the object. Returns ------- obj : dict A serialized version of the instance in the form of a dictionary. 
""" return self._requests def __repr__(self): return str(self._serialize()) def __str__(self): return str(repr(self)) class MetadataRequest: """Contains the metadata request info of a consumer. Instances of `MethodMetadataRequest` are used in this class for each available method under `metadatarequest.{method}`. Consumer-only classes such as simple estimators return a serialized version of this class as the output of `get_metadata_routing()`. .. versionadded:: 1.3 Parameters ---------- owner : str The name of the object to which these requests belong. """ # this is here for us to use this attribute's value instead of doing # `isinstance` in our checks, so that we avoid issues when people vendor # this file instead of using it directly from scikit-learn. _type = "metadata_request" def __init__(self, owner): self.owner = owner for method in SIMPLE_METHODS: setattr( self, method, MethodMetadataRequest(owner=owner, method=method), ) def consumes(self, method, params): """Check whether the given parameters are consumed by the given method. .. versionadded:: 1.4 Parameters ---------- method : str The name of the method to check. params : iterable of str An iterable of parameters to check. Returns ------- consumed : set of str A set of parameters which are consumed by the given method. """ return getattr(self, method)._consumes(params=params) def __getattr__(self, name): # Called when the default attribute access fails with an AttributeError # (either __getattribute__() raises an AttributeError because name is # not an instance attribute or an attribute in the class tree for self; # or __get__() of a name property raises AttributeError). This method # should either return the (computed) attribute value or raise an # AttributeError exception. 
# https://docs.python.org/3/reference/datamodel.html#object.__getattr__ if name not in COMPOSITE_METHODS: raise AttributeError( f"'{self.__class__.__name__}' object has no attribute '{name}'" ) requests = {} for method in COMPOSITE_METHODS[name]: mmr = getattr(self, method) existing = set(requests.keys()) upcoming = set(mmr.requests.keys()) common = existing & upcoming conflicts = [ key for key in common if requests[key] != mmr._requests[key] ] if conflicts: raise ValueError( f"Conflicting metadata requests for {', '.join(conflicts)} " f"while composing the requests for {name}. Metadata with the " f"same name for methods {', '.join(COMPOSITE_METHODS[name])} " "should have the same request value." ) requests.update(mmr._requests) return MethodMetadataRequest( owner=self.owner, method=name, requests=requests ) def _get_param_names(self, method, return_alias, ignore_self_request=None): """Get names of all metadata that can be consumed or routed by specified \ method. This method returns the names of all metadata, even the ``False`` ones. Parameters ---------- method : str The name of the method for which metadata names are requested. return_alias : bool Controls whether original or aliased names should be returned. If ``False``, aliases are ignored and original names are returned. ignore_self_request : bool Ignored. Present for API compatibility. Returns ------- names : set of str A set of strings with the names of all parameters. """ return getattr(self, method)._get_param_names(return_alias=return_alias) def _route_params(self, *, method, params): """Prepare the given parameters to be passed to the method. The output of this method can be used directly as the input to the corresponding method as extra keyword arguments to pass metadata. Parameters ---------- method : str The name of the method for which the parameters are requested and routed. params : dict A dictionary of provided metadata. 
Returns ------- params : Bunch A :class:`~sklearn.utils.Bunch` of {prop: value} which can be given to the corresponding method. """ return getattr(self, method)._route_params(params=params) def _check_warnings(self, *, method, params): """Check whether metadata is passed which is marked as WARN. If any metadata is passed which is marked as WARN, a warning is raised. Parameters ---------- method : str The name of the method for which the warnings should be checked. params : dict The metadata passed to a method. """ getattr(self, method)._check_warnings(params=params) def _serialize(self): """Serialize the object. Returns ------- obj : dict A serialized version of the instance in the form of a dictionary. """ output = dict() for method in SIMPLE_METHODS: mmr = getattr(self, method) if len(mmr.requests): output[method] = mmr._serialize() return output def __repr__(self): return str(self._serialize()) def __str__(self): return str(repr(self)) # Metadata Request for Routers # ============================ # This section includes all objects required for MetadataRouter which is used # in routers, returned by their ``get_metadata_routing``. # This namedtuple is used to store a (mapping, routing) pair. Mapping is a # MethodMapping object, and routing is the output of `get_metadata_routing`. # MetadataRouter stores a collection of these namedtuples. RouterMappingPair = namedtuple("RouterMappingPair", ["mapping", "router"]) # A namedtuple storing a single method route. A collection of these namedtuples # is stored in a MetadataRouter. MethodPair = namedtuple("MethodPair", ["callee", "caller"]) class MethodMapping: """Stores the mapping between callee and caller methods for a router. This class is primarily used in a ``get_metadata_routing()`` of a router object when defining the mapping between a sub-object (a sub-estimator or a scorer) to the router's methods. It stores a collection of ``Route`` namedtuples. 
Iterating through an instance of this class will yield named ``MethodPair(callee, caller)`` tuples. .. versionadded:: 1.3 """ def __init__(self): self._routes = [] def __iter__(self): return iter(self._routes) def add(self, *, callee, caller): """Add a method mapping. Parameters ---------- callee : str Child object's method name. This method is called in ``caller``. caller : str Parent estimator's method name in which the ``callee`` is called. Returns ------- self : MethodMapping Returns self. """ if callee not in METHODS: raise ValueError( f"Given callee:{callee} is not a valid method. Valid methods are:" f" {METHODS}" ) if caller not in METHODS: raise ValueError( f"Given caller:{caller} is not a valid method. Valid methods are:" f" {METHODS}" ) self._routes.append(MethodPair(callee=callee, caller=caller)) return self def _serialize(self): """Serialize the object. Returns ------- obj : list A serialized version of the instance in the form of a list. """ result = list() for route in self._routes: result.append({"callee": route.callee, "caller": route.caller}) return result @classmethod def from_str(cls, route): """Construct an instance from a string. Parameters ---------- route : str A string representing the mapping, it can be: - `"one-to-one"`: a one to one mapping for all methods. - `"method"`: the name of a single method, such as ``fit``, ``transform``, ``score``, etc. Returns ------- obj : MethodMapping A :class:`~sklearn.utils.metadata_routing.MethodMapping` instance constructed from the given string. """ routing = cls() if route == "one-to-one": for method in METHODS: routing.add(callee=method, caller=method) elif route in METHODS: routing.add(callee=route, caller=route) else: raise ValueError("route should be 'one-to-one' or a single method!") return routing def __repr__(self): return str(self._serialize()) def __str__(self): return str(repr(self)) class MetadataRouter: """Stores and handles metadata routing for a router object. 
This class is used by router objects to store and handle metadata routing. Routing information is stored as a dictionary of the form ``{"object_name": RouteMappingPair(method_mapping, routing_info)}``, where ``method_mapping`` is an instance of :class:`~sklearn.utils.metadata_routing.MethodMapping` and ``routing_info`` is either a :class:`~sklearn.utils.metadata_routing.MetadataRequest` or a :class:`~sklearn.utils.metadata_routing.MetadataRouter` instance. .. versionadded:: 1.3 Parameters ---------- owner : str The name of the object to which these requests belong. """ # this is here for us to use this attribute's value instead of doing # `isinstance`` in our checks, so that we avoid issues when people vendor # this file instead of using it directly from scikit-learn. _type = "metadata_router" def __init__(self, owner): self._route_mappings = dict() # `_self_request` is used if the router is also a consumer. # _self_request, (added using `add_self_request()`) is treated # differently from the other objects which are stored in # _route_mappings. self._self_request = None self.owner = owner def add_self_request(self, obj): """Add `self` (as a consumer) to the routing. This method is used if the router is also a consumer, and hence the router itself needs to be included in the routing. The passed object can be an estimator or a :class:`~sklearn.utils.metadata_routing.MetadataRequest`. A router should add itself using this method instead of `add` since it should be treated differently than the other objects to which metadata is routed by the router. Parameters ---------- obj : object This is typically the router instance, i.e. `self` in a ``get_metadata_routing()`` implementation. It can also be a ``MetadataRequest`` instance. Returns ------- self : MetadataRouter Returns `self`. 
""" if getattr(obj, "_type", None) == "metadata_request": self._self_request = deepcopy(obj) elif hasattr(obj, "_get_metadata_request"): self._self_request = deepcopy(obj._get_metadata_request()) else: raise ValueError( "Given `obj` is neither a `MetadataRequest` nor does it implement " "the required API. Inheriting from `BaseEstimator` implements the " "required API." ) return self def add(self, *, method_mapping, **objs): """Add named objects with their corresponding method mapping. Parameters ---------- method_mapping : MethodMapping or str The mapping between the child and the parent's methods. If str, the output of :func:`~sklearn.utils.metadata_routing.MethodMapping.from_str` is used. **objs : dict A dictionary of objects from which metadata is extracted by calling :func:`~sklearn.utils.metadata_routing.get_routing_for_object` on them. Returns ------- self : MetadataRouter Returns `self`. """ if isinstance(method_mapping, str): method_mapping = MethodMapping.from_str(method_mapping) else: method_mapping = deepcopy(method_mapping) for name, obj in objs.items(): self._route_mappings[name] = RouterMappingPair( mapping=method_mapping, router=get_routing_for_object(obj) ) return self def consumes(self, method, params): """Check whether the given parameters are consumed by the given method. .. versionadded:: 1.4 Parameters ---------- method : str The name of the method to check. params : iterable of str An iterable of parameters to check. Returns ------- consumed : set of str A set of parameters which are consumed by the given method. 
""" res = set() if self._self_request: res = res | self._self_request.consumes(method=method, params=params) for _, route_mapping in self._route_mappings.items(): for callee, caller in route_mapping.mapping: if caller == method: res = res | route_mapping.router.consumes( method=callee, params=params ) return res def _get_param_names(self, *, method, return_alias, ignore_self_request): """Get names of all metadata that can be consumed or routed by specified \ method. This method returns the names of all metadata, even the ``False`` ones. Parameters ---------- method : str The name of the method for which metadata names are requested. return_alias : bool Controls whether original or aliased names should be returned, which only applies to the stored `self`. If no `self` routing object is stored, this parameter has no effect. ignore_self_request : bool If `self._self_request` should be ignored. This is used in `_route_params`. If ``True``, ``return_alias`` has no effect. Returns ------- names : set of str A set of strings with the names of all parameters. """ res = set() if self._self_request and not ignore_self_request: res = res.union( self._self_request._get_param_names( method=method, return_alias=return_alias ) ) for name, route_mapping in self._route_mappings.items(): for callee, caller in route_mapping.mapping: if caller == method: res = res.union( route_mapping.router._get_param_names( method=callee, return_alias=True, ignore_self_request=False, ) ) return res def _route_params(self, *, params, method): """Prepare the given parameters to be passed to the method. This is used when a router is used as a child object of another router. The parent router then passes all parameters understood by the child object to it and delegates their validation to the child. The output of this method can be used directly as the input to the corresponding method as extra props. 
Parameters ---------- method : str The name of the method for which the parameters are requested and routed. params : dict A dictionary of provided metadata. Returns ------- params : Bunch A :class:`~sklearn.utils.Bunch` of {prop: value} which can be given to the corresponding method. """ res = Bunch() if self._self_request: res.update( self._self_request._route_params(params=params, method=method) ) param_names = self._get_param_names( method=method, return_alias=True, ignore_self_request=True ) child_params = { key: value for key, value in params.items() if key in param_names } for key in set(res.keys()).intersection(child_params.keys()): # conflicts are okay if the passed objects are the same, but it's # an issue if they're different objects. if child_params[key] is not res[key]: raise ValueError( f"In {self.owner}, there is a conflict on {key} between what is" " requested for this estimator and what is requested by its" " children. You can resolve this conflict by using an alias for" " the child estimator(s) requested metadata." ) res.update(child_params) return res def route_params(self, *, caller, params): """Return the input parameters requested by child objects. The output of this method is a bunch, which includes the inputs for all methods of each child object that are used in the router's `caller` method. If the router is also a consumer, it also checks for warnings of `self`'s/consumer's requested metadata. Parameters ---------- caller : str The name of the method for which the parameters are requested and routed. If called inside the :term:`fit` method of a router, it would be `"fit"`. params : dict A dictionary of provided metadata. Returns ------- params : Bunch A :class:`~sklearn.utils.Bunch` of the form ``{"object_name": {"method_name": {prop: value}}}`` which can be used to pass the required metadata to corresponding methods or corresponding child objects. 
""" if self._self_request: self._self_request._check_warnings(params=params, method=caller) res = Bunch() for name, route_mapping in self._route_mappings.items(): router, mapping = route_mapping.router, route_mapping.mapping res[name] = Bunch() for _callee, _caller in mapping: if _caller == caller: res[name][_callee] = router._route_params( params=params, method=_callee ) return res def validate_metadata(self, *, method, params): """Validate given metadata for a method. This raises a ``TypeError`` if some of the passed metadata are not understood by child objects. Parameters ---------- method : str The name of the method for which the parameters are requested and routed. If called inside the :term:`fit` method of a router, it would be `"fit"`. params : dict A dictionary of provided metadata. """ param_names = self._get_param_names( method=method, return_alias=False, ignore_self_request=False ) if self._self_request: self_params = self._self_request._get_param_names( method=method, return_alias=False ) else: self_params = set() extra_keys = set(params.keys()) - param_names - self_params if extra_keys: raise TypeError( f"{self.owner}.{method} got unexpected argument(s) {extra_keys}, " "which are not requested metadata in any object." ) def _serialize(self): """Serialize the object. Returns ------- obj : dict A serialized version of the instance in the form of a dictionary. 
""" res = dict() if self._self_request: res["$self_request"] = self._self_request._serialize() for name, route_mapping in self._route_mappings.items(): res[name] = dict() res[name]["mapping"] = route_mapping.mapping._serialize() res[name]["router"] = route_mapping.router._serialize() return res def __iter__(self): if self._self_request: yield "$self_request", RouterMappingPair( mapping=MethodMapping.from_str("one-to-one"), router=self._self_request, ) for name, route_mapping in self._route_mappings.items(): yield (name, route_mapping) def __repr__(self): return str(self._serialize()) def __str__(self): return str(repr(self)) def get_routing_for_object(obj=None): """Get a ``Metadata{Router, Request}`` instance from the given object. This function returns a :class:`~sklearn.utils.metadata_routing.MetadataRouter` or a :class:`~sklearn.utils.metadata_routing.MetadataRequest` from the given input. This function always returns a copy or an instance constructed from the input, such that changing the output of this function will not change the original object. .. versionadded:: 1.3 Parameters ---------- obj : object - If the object is already a :class:`~sklearn.utils.metadata_routing.MetadataRequest` or a :class:`~sklearn.utils.metadata_routing.MetadataRouter`, return a copy of that. - If the object provides a `get_metadata_routing` method, return a copy of the output of that method. - Returns an empty :class:`~sklearn.utils.metadata_routing.MetadataRequest` otherwise. Returns ------- obj : MetadataRequest or MetadataRouting A ``MetadataRequest`` or a ``MetadataRouting`` taken or created from the given object. """ # doing this instead of a try/except since an AttributeError could be raised # for other reasons. 
if hasattr(obj, "get_metadata_routing"): return deepcopy(obj.get_metadata_routing()) elif getattr(obj, "_type", None) in ["metadata_request", "metadata_router"]: return deepcopy(obj) return MetadataRequest(owner=None) # Request method # ============== # This section includes what's needed for the request method descriptor and # their dynamic generation in a meta class. # These strings are used to dynamically generate the docstrings for # set_{method}_request methods. REQUESTER_DOC = """ Request metadata passed to the ``{method}`` method. Note that this method is only relevant if ``enable_metadata_routing=True`` (see :func:`sklearn.set_config`). Please see :ref:`User Guide ` on how the routing mechanism works. The options for each parameter are: - ``True``: metadata is requested, and \ passed to ``{method}`` if provided. The request is ignored if \ metadata is not provided. - ``False``: metadata is not requested and the meta-estimator \ will not pass it to ``{method}``. - ``None``: metadata is not requested, and the meta-estimator \ will raise an error if the user provides it. - ``str``: metadata should be passed to the meta-estimator with \ this given alias instead of the original name. The default (``sklearn.utils.metadata_routing.UNCHANGED``) retains the existing request. This allows you to change the request for some parameters and not others. .. versionadded:: 1.3 .. note:: This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. used inside a :class:`~sklearn.pipeline.Pipeline`. Otherwise it has no effect. Parameters ---------- """ REQUESTER_DOC_PARAM = """ {metadata} : str, True, False, or None, \ default=sklearn.utils.metadata_routing.UNCHANGED Metadata routing for ``{metadata}`` parameter in ``{method}``. """ REQUESTER_DOC_RETURN = """ Returns ------- self : object The updated object. """ class RequestMethod: """ A descriptor for request methods. .. 
versionadded:: 1.3 Parameters ---------- name : str The name of the method for which the request function should be created, e.g. ``"fit"`` would create a ``set_fit_request`` function. keys : list of str A list of strings which are accepted parameters by the created function, e.g. ``["sample_weight"]`` if the corresponding method accepts it as a metadata. validate_keys : bool, default=True Whether to check if the requested parameters fit the actual parameters of the method. Notes ----- This class is a descriptor [1]_ and uses PEP-362 to set the signature of the returned function [2]_. References ---------- .. [1] https://docs.python.org/3/howto/descriptor.html .. [2] https://www.python.org/dev/peps/pep-0362/ """ def __init__(self, name, keys, validate_keys=True): self.name = name self.keys = keys self.validate_keys = validate_keys def __get__(self, instance, owner): # we would want to have a method which accepts only the expected args def func(**kw): """Updates the request for provided parameters This docstring is overwritten below. See REQUESTER_DOC for expected functionality """ if not _routing_enabled(): raise RuntimeError( "This method is only available when metadata routing is " "enabled. You can enable it using" " sklearn.set_config(enable_metadata_routing=True)." ) if self.validate_keys and (set(kw) - set(self.keys)): raise TypeError( f"Unexpected args: {set(kw) - set(self.keys)}. Accepted " f"arguments are: {set(self.keys)}" ) requests = instance._get_metadata_request() method_metadata_request = getattr(requests, self.name) for prop, alias in kw.items(): if alias is not UNCHANGED: method_metadata_request.add_request(param=prop, alias=alias) instance._metadata_request = requests return instance # Now we set the relevant attributes of the function so that it seems # like a normal method to the end user, with known expected arguments. 
func.__name__ = f"set_{self.name}_request" params = [ inspect.Parameter( name="self", kind=inspect.Parameter.POSITIONAL_OR_KEYWORD, annotation=owner, ) ] params.extend( [ inspect.Parameter( k, inspect.Parameter.KEYWORD_ONLY, default=UNCHANGED, annotation=Optional[Union[bool, None, str]], ) for k in self.keys ] ) func.__signature__ = inspect.Signature( params, return_annotation=owner, ) doc = REQUESTER_DOC.format(method=self.name) for metadata in self.keys: doc += REQUESTER_DOC_PARAM.format(metadata=metadata, method=self.name) doc += REQUESTER_DOC_RETURN func.__doc__ = doc return func class _MetadataRequester: """Mixin class for adding metadata request functionality. ``BaseEstimator`` inherits from this Mixin. .. versionadded:: 1.3 """ if TYPE_CHECKING: # pragma: no cover # This code is never run in runtime, but it's here for type checking. # Type checkers fail to understand that the `set_{method}_request` # methods are dynamically generated, and they complain that they are # not defined. We define them here to make type checkers happy. # During type checking analyzers assume this to be True. # The following list of defined methods mirrors the list of methods # in SIMPLE_METHODS. # fmt: off def set_fit_request(self, **kwargs): pass def set_partial_fit_request(self, **kwargs): pass def set_predict_request(self, **kwargs): pass def set_predict_proba_request(self, **kwargs): pass def set_predict_log_proba_request(self, **kwargs): pass def set_decision_function_request(self, **kwargs): pass def set_score_request(self, **kwargs): pass def set_split_request(self, **kwargs): pass def set_transform_request(self, **kwargs): pass def set_inverse_transform_request(self, **kwargs): pass # fmt: on def __init_subclass__(cls, **kwargs): """Set the ``set_{method}_request`` methods. This uses PEP-487 [1]_ to set the ``set_{method}_request`` methods. 
It looks for the information available in the set default values which are set using ``__metadata_request__*`` class attributes, or inferred from method signatures. The ``__metadata_request__*`` class attributes are used when a method does not explicitly accept a metadata through its arguments or if the developer would like to specify a request value for those metadata which are different from the default ``None``. References ---------- .. [1] https://www.python.org/dev/peps/pep-0487 """ try: requests = cls._get_default_requests() except Exception: # if there are any issues in the default values, it will be raised # when ``get_metadata_routing`` is called. Here we are going to # ignore all the issues such as bad defaults etc. super().__init_subclass__(**kwargs) return for method in SIMPLE_METHODS: mmr = getattr(requests, method) # set ``set_{method}_request``` methods if not len(mmr.requests): continue setattr( cls, f"set_{method}_request", RequestMethod(method, sorted(mmr.requests.keys())), ) super().__init_subclass__(**kwargs) @classmethod def _build_request_for_signature(cls, router, method): """Build the `MethodMetadataRequest` for a method using its signature. This method takes all arguments from the method signature and uses ``None`` as their default request value, except ``X``, ``y``, ``Y``, ``Xt``, ``yt``, ``*args``, and ``**kwargs``. Parameters ---------- router : MetadataRequest The parent object for the created `MethodMetadataRequest`. method : str The name of the method. Returns ------- method_request : MethodMetadataRequest The prepared request using the method's signature. """ mmr = MethodMetadataRequest(owner=cls.__name__, method=method) # Here we use `isfunction` instead of `ismethod` because calling `getattr` # on a class instead of an instance returns an unbound function. 
if not hasattr(cls, method) or not inspect.isfunction(getattr(cls, method)): return mmr # ignore the first parameter of the method, which is usually "self" params = list(inspect.signature(getattr(cls, method)).parameters.items())[ 1: ] for pname, param in params: if pname in {"X", "y", "Y", "Xt", "yt"}: continue if param.kind in {param.VAR_POSITIONAL, param.VAR_KEYWORD}: continue mmr.add_request( param=pname, alias=None, ) return mmr @classmethod def _get_default_requests(cls): """Collect default request values. This method combines the information present in ``__metadata_request__*`` class attributes, as well as determining request keys from method signatures. """ requests = MetadataRequest(owner=cls.__name__) for method in SIMPLE_METHODS: setattr( requests, method, cls._build_request_for_signature(router=requests, method=method), ) # Then overwrite those defaults with the ones provided in # __metadata_request__* attributes. Defaults set in # __metadata_request__* attributes take precedence over signature # sniffing. # need to go through the MRO since this is a class attribute and # ``vars`` doesn't report the parent class attributes. We go through # the reverse of the MRO so that child classes have precedence over # their parents. defaults = dict() for base_class in reversed(inspect.getmro(cls)): base_defaults = { attr: value for attr, value in vars(base_class).items() if "__metadata_request__" in attr } defaults.update(base_defaults) defaults = dict(sorted(defaults.items())) for attr, value in defaults.items(): # we don't check for attr.startswith() since python prefixes attrs # starting with __ with the `_ClassName`. substr = "__metadata_request__" method = attr[attr.index(substr) + len(substr) :] for prop, alias in value.items(): getattr(requests, method).add_request(param=prop, alias=alias) return requests def _get_metadata_request(self): """Get requested data properties. Please check :ref:`User Guide ` on how the routing mechanism works. 
Returns ------- request : MetadataRequest A :class:`~sklearn.utils.metadata_routing.MetadataRequest` instance. """ if hasattr(self, "_metadata_request"): requests = get_routing_for_object(self._metadata_request) else: requests = self._get_default_requests() return requests def get_metadata_routing(self): """Get metadata routing of this object. Please check :ref:`User Guide ` on how the routing mechanism works. Returns ------- routing : MetadataRequest A :class:`~sklearn.utils.metadata_routing.MetadataRequest` encapsulating routing information. """ return self._get_metadata_request() # Process Routing in Routers # ========================== # This is almost always the only method used in routers to process and route # given metadata. This is to minimize the boilerplate required in routers. # Here the first two arguments are positional only which makes everything # passed as keyword argument a metadata. The first two args also have an `_` # prefix to reduce the chances of name collisions with the passed metadata, and # since they're positional only, users will never type those underscores. def process_routing(_obj, _method, /, **kwargs): """Validate and route input parameters. This function is used inside a router's method, e.g. :term:`fit`, to validate the metadata and handle the routing. Assuming this signature: ``fit(self, X, y, sample_weight=None, **fit_params)``, a call to this function would be: ``process_routing(self, sample_weight=sample_weight, **fit_params)``. Note that if routing is not enabled and ``kwargs`` is empty, then it returns an empty routing where ``process_routing(...).ANYTHING.ANY_METHOD`` is always an empty dictionary. .. versionadded:: 1.3 Parameters ---------- _obj : object An object implementing ``get_metadata_routing``. Typically a meta-estimator. _method : str The name of the router's method in which this function is called. **kwargs : dict Metadata to be routed. 
        Returns
        -------
        routed_params : Bunch
            A :class:`~sklearn.utils.Bunch` of the form
            ``{"object_name": {"method_name": {prop: value}}}`` which can be used
            to pass the required metadata to corresponding methods or
            corresponding child objects. The object names are those defined in
            `obj.get_metadata_routing()`.
        """
        if not _routing_enabled() and not kwargs:
            # If routing is not enabled and kwargs are empty, then we don't have to
            # try doing any routing, we can simply return a structure which returns
            # an empty dict on routed_params.ANYTHING.ANY_METHOD.
            class EmptyRequest:
                def get(self, name, default=None):
                    # NOTE(review): any falsy `default` ({}, 0, "", None) is
                    # replaced by {} here, not only None.
                    return default if default else {}

                def __getitem__(self, name):
                    return Bunch(**{method: dict() for method in METHODS})

                def __getattr__(self, name):
                    return Bunch(**{method: dict() for method in METHODS})

            return EmptyRequest()

        if not (
            hasattr(_obj, "get_metadata_routing") or isinstance(_obj, MetadataRouter)
        ):
            raise AttributeError(
                f"The given object ({repr(_obj.__class__.__name__)}) needs to either"
                " implement the routing method `get_metadata_routing` or be a"
                " `MetadataRouter` instance."
            )
        if _method not in METHODS:
            raise TypeError(
                f"Can only route and process input on these methods: {METHODS}, "
                f"while the passed method is: {_method}."
            )

        request_routing = get_routing_for_object(_obj)
        # First reject metadata that no object requested, then split the
        # remaining kwargs per child object and per child method.
        request_routing.validate_metadata(params=kwargs, method=_method)
        routed_params = request_routing.route_params(params=kwargs, caller=_method)

        return routed_params

else:
    # Recent scikit-learn: reuse the upstream implementations instead of the
    # vendored copies defined above in the `if` branch.
    from sklearn.exceptions import UnsetMetadataPassedError
    from sklearn.utils._metadata_requests import (  # type: ignore[no-redef]
        COMPOSITE_METHODS,  # noqa
        METHODS,  # noqa
        SIMPLE_METHODS,  # noqa
        UNCHANGED,
        UNUSED,
        WARN,
        MetadataRequest,
        MetadataRouter,
        MethodMapping,
        _MetadataRequester,  # noqa
        _raise_for_params,  # noqa
        _raise_for_unsupported_routing,  # noqa
        _routing_enabled,
        _RoutingNotSupportedMixin,  # noqa
        get_routing_for_object,
        process_routing,  # noqa
    )
# --- tar archive member boundary (extraction residue, not Python source) ---
imbalanced-learn-0.12.2/imblearn/utils/_param_validation.py000066400000000000000000000774271460233407600237500ustar00rootroot00000000000000"""This is a copy of sklearn/utils/_param_validation.py.

It can be removed when we support scikit-learn >= 1.2.
"""
# mypy: ignore-errors
import functools
import math
import operator
import re
from abc import ABC, abstractmethod
from collections.abc import Iterable
from inspect import signature
from numbers import Integral, Real

import numpy as np
import sklearn
from scipy.sparse import csr_matrix, issparse
from sklearn.utils.fixes import parse_version

from .._config import config_context, get_config
from ..utils.fixes import _is_arraylike_not_scalar

sklearn_version = parse_version(sklearn.__version__)

if sklearn_version < parse_version("1.4"):

    class InvalidParameterError(ValueError, TypeError):
        """Custom exception to be raised when the parameter of a class/method/function
        does not have a valid type or value.
        """

        # Inherits from ValueError and TypeError to keep backward compatibility.


def validate_parameter_constraints(parameter_constraints, params, caller_name):
    """Validate types and values of given parameters.

    Parameters
    ----------
    parameter_constraints : dict or {"no_validation"}
        If "no_validation", validation is skipped for this parameter.
        If a dict, it must be a dictionary `param_name: list of constraints`.
        A parameter is valid if it satisfies one of the constraints from the list.
        Constraints can be:

        - an Interval object, representing a continuous or discrete range of numbers
        - the string "array-like"
        - the string "sparse matrix"
        - the string "random_state"
        - callable
        - None, meaning that None is a valid value for the parameter
        - any type, meaning that any instance of this type is valid
        - an Options object, representing a set of elements of a given type
        - a StrOptions object, representing a set of strings
        - the string "boolean"
        - the string "verbose"
        - the string "cv_object"
        - the string "nan"
        - a MissingValues object representing markers for missing values
        - a HasMethods object, representing method(s) an object must have
        - a Hidden object, representing a constraint not meant to be exposed to the user

    params : dict
        A dictionary `param_name: param_value`. The parameters to validate against
        the constraints.

    caller_name : str
        The name of the estimator or function or method that called this function.
    """
    for param_name, param_val in params.items():
        # We allow parameters to not have a constraint so that third party
        # estimators can inherit from sklearn estimators without having to
        # necessarily use the validation tools.
        if param_name not in parameter_constraints:
            continue

        constraints = parameter_constraints[param_name]

        if constraints == "no_validation":
            continue

        constraints = [make_constraint(constraint) for constraint in constraints]

        # A parameter is valid as soon as ONE constraint accepts it; the
        # `for/else` runs the error path only when no `break` happened.
        for constraint in constraints:
            if constraint.is_satisfied_by(param_val):
                # this constraint is satisfied, no need to check further.
                break
        else:
            # No constraint is satisfied, raise with an informative message.

            # Ignore constraints that we don't want to expose in the error
            # message, i.e. options that are for internal purpose or not
            # officially supported.
            constraints = [
                constraint for constraint in constraints if not constraint.hidden
            ]

            if len(constraints) == 1:
                constraints_str = f"{constraints[0]}"
            else:
                constraints_str = (
                    f"{', '.join([str(c) for c in constraints[:-1]])} or"
                    f" {constraints[-1]}"
                )

            raise InvalidParameterError(
                f"The {param_name!r} parameter of {caller_name} must be"
                f" {constraints_str}. Got {param_val!r} instead."
            )


def make_constraint(constraint):
    """Convert the constraint into the appropriate Constraint object.

    Parameters
    ----------
    constraint : object
        The constraint to convert.

    Returns
    -------
    constraint : instance of _Constraint
        The converted constraint.
    """
    if isinstance(constraint, str) and constraint == "array-like":
        return _ArrayLikes()
    if isinstance(constraint, str) and constraint == "sparse matrix":
        return _SparseMatrices()
    if isinstance(constraint, str) and constraint == "random_state":
        return _RandomStates()
    # Identity check on purpose: the builtin `callable` itself is used as a
    # sentinel meaning "any callable is accepted".
    if constraint is callable:
        return _Callables()
    if constraint is None:
        return _NoneConstraint()
    if isinstance(constraint, type):
        return _InstancesOf(constraint)
    if isinstance(
        constraint, (Interval, StrOptions, Options, HasMethods, MissingValues)
    ):
        return constraint
    if isinstance(constraint, str) and constraint == "boolean":
        return _Booleans()
    if isinstance(constraint, str) and constraint == "verbose":
        return _VerboseHelper()
    if isinstance(constraint, str) and constraint == "cv_object":
        return _CVObjects()
    if isinstance(constraint, Hidden):
        # Unwrap, convert the inner constraint, and mark it so the error
        # message in `validate_parameter_constraints` omits it.
        constraint = make_constraint(constraint.constraint)
        constraint.hidden = True
        return constraint
    if isinstance(constraint, str) and constraint == "nan":
        return _NanConstraint()
    raise ValueError(f"Unknown constraint type: {constraint}")


def validate_params(parameter_constraints, *, prefer_skip_nested_validation):
    """Decorator to validate types and values of functions and methods.

    Parameters
    ----------
    parameter_constraints : dict
        A dictionary `param_name: list of constraints`.
        See the docstring of `validate_parameter_constraints` for a description of the
        accepted constraints.

        Note that the *args and **kwargs parameters are not validated and must not
        be present in the parameter_constraints dictionary.

    prefer_skip_nested_validation : bool
        If True, the validation of parameters of inner estimators or functions
        called by the decorated function will be skipped.

        This is useful to avoid validating many times the parameters passed by the
        user from the public facing API. It's also useful to avoid validating
        parameters that we pass internally to inner functions that are guaranteed to
        be valid by the test suite.

        It should be set to True for most functions, except for those that receive
        non-validated objects as parameters or that are just wrappers around classes
        because they only perform a partial validation.

    Returns
    -------
    decorated_function : function or method
        The decorated function.
    """

    def decorator(func):
        # The dict of parameter constraints is set as an attribute of the function
        # to make it possible to dynamically introspect the constraints for
        # automatic testing.
        setattr(func, "_skl_parameter_constraints", parameter_constraints)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Global escape hatch: skip all validation when the config flag is
            # set (e.g. by an outer, already-validated public entry point).
            global_skip_validation = get_config()["skip_parameter_validation"]
            if global_skip_validation:
                return func(*args, **kwargs)

            func_sig = signature(func)

            # Map *args/**kwargs to the function signature
            params = func_sig.bind(*args, **kwargs)
            params.apply_defaults()

            # ignore self/cls and positional/keyword markers
            to_ignore = [
                p.name
                for p in func_sig.parameters.values()
                if p.kind in (p.VAR_POSITIONAL, p.VAR_KEYWORD)
            ]
            to_ignore += ["self", "cls"]
            params = {k: v for k, v in params.arguments.items() if k not in to_ignore}

            validate_parameter_constraints(
                parameter_constraints, params, caller_name=func.__qualname__
            )

            try:
                with config_context(
                    skip_parameter_validation=(
                        prefer_skip_nested_validation or global_skip_validation
                    )
                ):
                    return func(*args, **kwargs)
            except InvalidParameterError as e:
                # When the function is just a wrapper around an estimator, we allow
                # the function to delegate validation to the estimator, but we
                # replace the name of the estimator by the name of the function in
                # the error message to avoid confusion.
                msg = re.sub(
                    r"parameter of \w+ must be",
                    f"parameter of {func.__qualname__} must be",
                    str(e),
                )
                raise InvalidParameterError(msg) from e

        return wrapper

    return decorator


class RealNotInt(Real):
    """A type that represents reals that are not instances of int.

    Behaves like float, but also works with values extracted from numpy arrays.
    isinstance(1, RealNotInt) -> False
    isinstance(1.0, RealNotInt) -> True
    """


# Register float as a virtual subclass so plain Python floats (and numpy
# floats, which are Real but not int) satisfy isinstance(x, RealNotInt).
RealNotInt.register(float)


def _type_name(t):
    """Convert type into human readable string."""
    module = t.__module__
    qualname = t.__qualname__
    if module == "builtins":
        return qualname
    elif t == Real:
        return "float"
    elif t == Integral:
        return "int"
    return f"{module}.{qualname}"


class _Constraint(ABC):
    """Base class for the constraint objects."""

    def __init__(self):
        # `hidden` constraints are excluded from user-facing error messages.
        self.hidden = False

    @abstractmethod
    def is_satisfied_by(self, val):
        """Whether or not a value satisfies the constraint.

        Parameters
        ----------
        val : object
            The value to check.

        Returns
        -------
        is_satisfied : bool
            Whether or not the constraint is satisfied by this value.
        """

    @abstractmethod
    def __str__(self):
        """A human readable representational string of the constraint."""


class _InstancesOf(_Constraint):
    """Constraint representing instances of a given type.

    Parameters
    ----------
    type : type
        The valid type.
    """

    def __init__(self, type):
        super().__init__()
        self.type = type

    def is_satisfied_by(self, val):
        return isinstance(val, self.type)

    def __str__(self):
        return f"an instance of {_type_name(self.type)!r}"


class _NoneConstraint(_Constraint):
    """Constraint representing the None singleton."""

    def is_satisfied_by(self, val):
        return val is None

    def __str__(self):
        return "None"


class _NanConstraint(_Constraint):
    """Constraint representing the indicator `np.nan`."""

    def is_satisfied_by(self, val):
        # Exclude Integral first: math.isnan would raise on non-float types,
        # and integers can never be NaN.
        return (
            not isinstance(val, Integral)
            and isinstance(val, Real)
            and math.isnan(val)
        )

    def __str__(self):
        return "numpy.nan"


class _PandasNAConstraint(_Constraint):
    """Constraint representing the indicator `pd.NA`."""

    def is_satisfied_by(self, val):
        # pandas is an optional dependency: import lazily and treat its
        # absence as "not pd.NA".
        try:
            import pandas as pd

            return isinstance(val, type(pd.NA)) and pd.isna(val)
        except ImportError:
            return False

    def __str__(self):
        return "pandas.NA"


class Options(_Constraint):
    """Constraint representing a finite set of instances of a given type.

    Parameters
    ----------
    type : type

    options : set
        The set of valid scalars.

    deprecated : set or None, default=None
        A subset of the `options` to mark as deprecated in the string
        representation of the constraint.
    """

    def __init__(self, type, options, *, deprecated=None):
        super().__init__()
        self.type = type
        self.options = options
        self.deprecated = deprecated or set()

        if self.deprecated - self.options:
            raise ValueError("The deprecated options must be a subset of the options.")

    def is_satisfied_by(self, val):
        return isinstance(val, self.type) and val in self.options

    def _mark_if_deprecated(self, option):
        """Add a deprecated mark to an option if needed."""
        option_str = f"{option!r}"
        if option in self.deprecated:
            option_str = f"{option_str} (deprecated)"
        return option_str

    def __str__(self):
        options_str = (
            f"{', '.join([self._mark_if_deprecated(o) for o in self.options])}"
        )
        return f"a {_type_name(self.type)} among {{{options_str}}}"


class StrOptions(Options):
    """Constraint representing a finite set of strings.

    Parameters
    ----------
    options : set of str
        The set of valid strings.

    deprecated : set of str or None, default=None
        A subset of the `options` to mark as deprecated in the string
        representation of the constraint.
    """

    def __init__(self, options, *, deprecated=None):
        super().__init__(type=str, options=options, deprecated=deprecated)


class Interval(_Constraint):
    """Constraint representing a typed interval.

    Parameters
    ----------
    type : {numbers.Integral, numbers.Real, RealNotInt}
        The set of numbers in which to set the interval.

        If RealNotInt, only reals that don't have the integer type
        are allowed. For example 1.0 is allowed but 1 is not.

    left : float or int or None
        The left bound of the interval. None means left bound is -∞.

    right : float, int or None
        The right bound of the interval. None means right bound is +∞.

    closed : {"left", "right", "both", "neither"}
        Whether the interval is open or closed.
        Possible choices are:

        - `"left"`: the interval is closed on the left and open on the right.
          It is equivalent to the interval `[ left, right )`.
        - `"right"`: the interval is closed on the right and open on the left.
          It is equivalent to the interval `( left, right ]`.
        - `"both"`: the interval is closed.
          It is equivalent to the interval `[ left, right ]`.
        - `"neither"`: the interval is open.
          It is equivalent to the interval `( left, right )`.

    Notes
    -----
    Setting a bound to `None` and setting the interval closed is valid. For instance,
    strictly speaking, `Interval(Real, 0, None, closed="both")` corresponds to
    `[0, +∞) U {+∞}`.
    """

    def __init__(self, type, left, right, *, closed):
        super().__init__()
        self.type = type
        self.left = left
        self.right = right
        self.closed = closed

        self._check_params()

    def _check_params(self):
        # Validate the constraint's own construction arguments; raises
        # ValueError/TypeError on inconsistent type/bounds/closed choices.
        if self.type not in (Integral, Real, RealNotInt):
            raise ValueError(
                "type must be either numbers.Integral, numbers.Real or RealNotInt."
                f" Got {self.type} instead."
            )

        if self.closed not in ("left", "right", "both", "neither"):
            raise ValueError(
                "closed must be either 'left', 'right', 'both' or 'neither'. "
                f"Got {self.closed} instead."
            )

        if self.type is Integral:
            suffix = "for an interval over the integers."
            if self.left is not None and not isinstance(self.left, Integral):
                raise TypeError(f"Expecting left to be an int {suffix}")
            if self.right is not None and not isinstance(self.right, Integral):
                raise TypeError(f"Expecting right to be an int {suffix}")
            # An integer interval closed on an infinite bound would be
            # meaningless, so require finite bounds on closed sides.
            if self.left is None and self.closed in ("left", "both"):
                raise ValueError(
                    f"left can't be None when closed == {self.closed} {suffix}"
                )
            if self.right is None and self.closed in ("right", "both"):
                raise ValueError(
                    f"right can't be None when closed == {self.closed} {suffix}"
                )
        else:
            if self.left is not None and not isinstance(self.left, Real):
                raise TypeError("Expecting left to be a real number.")
            if self.right is not None and not isinstance(self.right, Real):
                raise TypeError("Expecting right to be a real number.")

        # NOTE(review): this also rejects right == left (degenerate interval)
        # although the message only says "less than".
        if self.right is not None and self.left is not None and self.right <= self.left:
            raise ValueError(
                f"right can't be less than left. Got left={self.left} and "
                f"right={self.right}"
            )

    def __contains__(self, val):
        # NaN is never inside an interval (any comparison with NaN is False,
        # but we make it explicit and cheap here).
        if not isinstance(val, Integral) and np.isnan(val):
            return False

        left_cmp = operator.lt if self.closed in ("left", "both") else operator.le
        right_cmp = operator.gt if self.closed in ("right", "both") else operator.ge

        left = -np.inf if self.left is None else self.left
        right = np.inf if self.right is None else self.right

        if left_cmp(val, left):
            return False
        if right_cmp(val, right):
            return False
        return True

    def is_satisfied_by(self, val):
        if not isinstance(val, self.type):
            return False

        return val in self

    def __str__(self):
        type_str = "an int" if self.type is Integral else "a float"
        left_bracket = "[" if self.closed in ("left", "both") else "("
        left_bound = "-inf" if self.left is None else self.left
        right_bound = "inf" if self.right is None else self.right
        right_bracket = "]" if self.closed in ("right", "both") else ")"

        # better repr if the bounds were given as integers
        if not self.type == Integral and isinstance(self.left, Real):
            left_bound = float(left_bound)
        if not self.type == Integral and isinstance(self.right, Real):
            right_bound = float(right_bound)

        return (
            f"{type_str} in the range "
            f"{left_bracket}{left_bound}, {right_bound}{right_bracket}"
        )


class _ArrayLikes(_Constraint):
    """Constraint representing array-likes"""

    def is_satisfied_by(self, val):
        return _is_arraylike_not_scalar(val)

    def __str__(self):
        return "an array-like"


class _SparseMatrices(_Constraint):
    """Constraint representing sparse matrices."""

    def is_satisfied_by(self, val):
        return issparse(val)

    def __str__(self):
        return "a sparse matrix"


class _Callables(_Constraint):
    """Constraint representing callables."""

    def is_satisfied_by(self, val):
        return callable(val)

    def __str__(self):
        return "a callable"


class _RandomStates(_Constraint):
    """Constraint representing random states.

    Convenience class for
    [Interval(Integral, 0, 2**32 - 1, closed="both"), np.random.RandomState, None]
    """

    def __init__(self):
        super().__init__()
        self._constraints = [
            Interval(Integral, 0, 2**32 - 1, closed="both"),
            _InstancesOf(np.random.RandomState),
            _NoneConstraint(),
        ]

    def is_satisfied_by(self, val):
        return any(c.is_satisfied_by(val) for c in self._constraints)

    def __str__(self):
        return (
            f"{', '.join([str(c) for c in self._constraints[:-1]])} or"
            f" {self._constraints[-1]}"
        )


class _Booleans(_Constraint):
    """Constraint representing boolean likes.

    Convenience class for
    [bool, np.bool_, Integral (deprecated)]
    """

    def __init__(self):
        super().__init__()
        self._constraints = [
            _InstancesOf(bool),
            _InstancesOf(np.bool_),
        ]

    def is_satisfied_by(self, val):
        return any(c.is_satisfied_by(val) for c in self._constraints)

    def __str__(self):
        return (
            f"{', '.join([str(c) for c in self._constraints[:-1]])} or"
            f" {self._constraints[-1]}"
        )


class _VerboseHelper(_Constraint):
    """Helper constraint for the verbose parameter.

    Convenience class for
    [Interval(Integral, 0, None, closed="left"), bool, numpy.bool_]
    """

    def __init__(self):
        super().__init__()
        self._constraints = [
            Interval(Integral, 0, None, closed="left"),
            _InstancesOf(bool),
            _InstancesOf(np.bool_),
        ]

    def is_satisfied_by(self, val):
        return any(c.is_satisfied_by(val) for c in self._constraints)

    def __str__(self):
        return (
            f"{', '.join([str(c) for c in self._constraints[:-1]])} or"
            f" {self._constraints[-1]}"
        )


class MissingValues(_Constraint):
    """Helper constraint for the `missing_values` parameters.

    Convenience for
    [
        Integral,
        Interval(Real, None, None, closed="both"),
        str,   # when numeric_only is False
        None,  # when numeric_only is False
        _NanConstraint(),
        _PandasNAConstraint(),
    ]

    Parameters
    ----------
    numeric_only : bool, default=False
        Whether to consider only numeric missing value markers.
    """

    def __init__(self, numeric_only=False):
        super().__init__()

        self.numeric_only = numeric_only

        self._constraints = [
            _InstancesOf(Integral),
            # we use an interval of Real to ignore np.nan that has its own
            # constraint
            Interval(Real, None, None, closed="both"),
            _NanConstraint(),
            _PandasNAConstraint(),
        ]
        if not self.numeric_only:
            self._constraints.extend([_InstancesOf(str), _NoneConstraint()])

    def is_satisfied_by(self, val):
        return any(c.is_satisfied_by(val) for c in self._constraints)

    def __str__(self):
        return (
            f"{', '.join([str(c) for c in self._constraints[:-1]])} or"
            f" {self._constraints[-1]}"
        )


class HasMethods(_Constraint):
    """Constraint representing objects that expose specific methods.

    It is useful for parameters following a protocol and where we don't want to
    impose an affiliation to a specific module or class.

    Parameters
    ----------
    methods : str or list of str
        The method(s) that the object is expected to expose.
""" @validate_params( {"methods": [str, list]}, prefer_skip_nested_validation=True, ) def __init__(self, methods): super().__init__() if isinstance(methods, str): methods = [methods] self.methods = methods def is_satisfied_by(self, val): return all(callable(getattr(val, method, None)) for method in self.methods) def __str__(self): if len(self.methods) == 1: methods = f"{self.methods[0]!r}" else: methods = ( f"{', '.join([repr(m) for m in self.methods[:-1]])} and" f" {self.methods[-1]!r}" ) return f"an object implementing {methods}" class _IterablesNotString(_Constraint): """Constraint representing iterables that are not strings.""" def is_satisfied_by(self, val): return isinstance(val, Iterable) and not isinstance(val, str) def __str__(self): return "an iterable" class _CVObjects(_Constraint): """Constraint representing cv objects. Convenient class for [ Interval(Integral, 2, None, closed="left"), HasMethods(["split", "get_n_splits"]), _IterablesNotString(), None, ] """ def __init__(self): super().__init__() self._constraints = [ Interval(Integral, 2, None, closed="left"), HasMethods(["split", "get_n_splits"]), _IterablesNotString(), _NoneConstraint(), ] def is_satisfied_by(self, val): return any(c.is_satisfied_by(val) for c in self._constraints) def __str__(self): return ( f"{', '.join([str(c) for c in self._constraints[:-1]])} or" f" {self._constraints[-1]}" ) class Hidden: """Class encapsulating a constraint not meant to be exposed to the user. Parameters ---------- constraint : str or _Constraint instance The constraint to be used internally. """ def __init__(self, constraint): self.constraint = constraint def generate_invalid_param_val(constraint): """Return a value that does not satisfy the constraint. Raises a NotImplementedError if there exists no invalid value for this constraint. This is only useful for testing purpose. Parameters ---------- constraint : _Constraint instance The constraint to generate a value for. 
Returns ------- val : object A value that does not satisfy the constraint. """ if isinstance(constraint, StrOptions): return f"not {' or '.join(constraint.options)}" if isinstance(constraint, MissingValues): return np.array([1, 2, 3]) if isinstance(constraint, _VerboseHelper): return -1 if isinstance(constraint, HasMethods): return type("HasNotMethods", (), {})() if isinstance(constraint, _IterablesNotString): return "a string" if isinstance(constraint, _CVObjects): return "not a cv object" if isinstance(constraint, Interval) and constraint.type is Integral: if constraint.left is not None: return constraint.left - 1 if constraint.right is not None: return constraint.right + 1 # There's no integer outside (-inf, +inf) raise NotImplementedError if isinstance(constraint, Interval) and constraint.type in (Real, RealNotInt): if constraint.left is not None: return constraint.left - 1e-6 if constraint.right is not None: return constraint.right + 1e-6 # bounds are -inf, +inf if constraint.closed in ("right", "neither"): return -np.inf if constraint.closed in ("left", "neither"): return np.inf # interval is [-inf, +inf] return np.nan raise NotImplementedError def generate_valid_param(constraint): """Return a value that does satisfy a constraint. This is only useful for testing purpose. Parameters ---------- constraint : Constraint instance The constraint to generate a value for. Returns ------- val : object A value that does satisfy the constraint. 
""" if isinstance(constraint, _ArrayLikes): return np.array([1, 2, 3]) if isinstance(constraint, _SparseMatrices): return csr_matrix([[0, 1], [1, 0]]) if isinstance(constraint, _RandomStates): return np.random.RandomState(42) if isinstance(constraint, _Callables): return lambda x: x if isinstance(constraint, _NoneConstraint): return None if isinstance(constraint, _InstancesOf): if constraint.type is np.ndarray: # special case for ndarray since it can't be instantiated without # arguments return np.array([1, 2, 3]) if constraint.type in (Integral, Real): # special case for Integral and Real since they are abstract classes return 1 return constraint.type() if isinstance(constraint, _Booleans): return True if isinstance(constraint, _VerboseHelper): return 1 if isinstance(constraint, MissingValues) and constraint.numeric_only: return np.nan if isinstance(constraint, MissingValues) and not constraint.numeric_only: return "missing" if isinstance(constraint, HasMethods): return type( "ValidHasMethods", (), {m: lambda self: None for m in constraint.methods}, )() if isinstance(constraint, _IterablesNotString): return [1, 2, 3] if isinstance(constraint, _CVObjects): return 5 if isinstance(constraint, Options): # includes StrOptions for option in constraint.options: return option if isinstance(constraint, Interval): interval = constraint if interval.left is None and interval.right is None: return 0 elif interval.left is None: return interval.right - 1 elif interval.right is None: return interval.left + 1 else: if interval.type is Real: return (interval.left + interval.right) / 2 else: return interval.left + 1 raise ValueError(f"Unknown constraint type: {constraint}") else: from sklearn.utils._param_validation import generate_invalid_param_val # noqa from sklearn.utils._param_validation import generate_valid_param # noqa from sklearn.utils._param_validation import validate_parameter_constraints # noqa from sklearn.utils._param_validation import ( HasMethods, Hidden, Interval, 
InvalidParameterError, MissingValues, Options, RealNotInt, StrOptions, _ArrayLikes, _Booleans, _Callables, _CVObjects, _InstancesOf, _IterablesNotString, _NanConstraint, _NoneConstraint, _PandasNAConstraint, _RandomStates, _SparseMatrices, _VerboseHelper, make_constraint, validate_params, ) imbalanced-learn-0.12.2/imblearn/utils/_show_versions.py000066400000000000000000000042001460233407600233250ustar00rootroot00000000000000""" Utility method which prints system info to help with debugging, and filing issues on GitHub. Adapted from :func:`sklearn.show_versions`, which was adapted from :func:`pandas.show_versions` """ # Author: Alexander L. Hayes # License: MIT from .. import __version__ def _get_deps_info(): """Overview of the installed version of main dependencies Returns ------- deps_info: dict version information on relevant Python libraries """ deps = [ "imbalanced-learn", "pip", "setuptools", "numpy", "scipy", "scikit-learn", "Cython", "pandas", "keras", "tensorflow", "joblib", ] deps_info = { "imbalanced-learn": __version__, } from importlib.metadata import PackageNotFoundError, version for modname in deps: try: deps_info[modname] = version(modname) except PackageNotFoundError: deps_info[modname] = None return deps_info def show_versions(github=False): """Print debugging information. .. versionadded:: 0.5 Parameters ---------- github : bool, If true, wrap system info with GitHub markup. """ from sklearn.utils._show_versions import _get_sys_info _sys_info = _get_sys_info() _deps_info = _get_deps_info() _github_markup = ( "
" "System, Dependency Information\n\n" "**System Information**\n\n" "{0}\n" "**Python Dependencies**\n\n" "{1}\n" "
" ) if github: _sys_markup = "" _deps_markup = "" for k, stat in _sys_info.items(): _sys_markup += f"* {k:<10}: `{stat}`\n" for k, stat in _deps_info.items(): _deps_markup += f"* {k:<10}: `{stat}`\n" print(_github_markup.format(_sys_markup, _deps_markup)) else: print("\nSystem:") for k, stat in _sys_info.items(): print(f"{k:>11}: {stat}") print("\nPython dependencies:") for k, stat in _deps_info.items(): print(f"{k:>11}: {stat}") imbalanced-learn-0.12.2/imblearn/utils/_validation.py000066400000000000000000000566371460233407600225740ustar00rootroot00000000000000"""Utilities for input validation""" # Authors: Guillaume Lemaitre # License: MIT import warnings from collections import OrderedDict from functools import wraps from inspect import Parameter, signature from numbers import Integral, Real import numpy as np from scipy.sparse import issparse from sklearn.base import clone from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_array, column_or_1d from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _num_samples from .fixes import _is_pandas_df SAMPLING_KIND = ( "over-sampling", "under-sampling", "clean-sampling", "ensemble", "bypass", ) TARGET_KIND = ("binary", "multiclass", "multilabel-indicator") class ArraysTransformer: """A class to convert sampler output arrays to their original types.""" def __init__(self, X, y): self.x_props = self._gets_props(X) self.y_props = self._gets_props(y) def transform(self, X, y): X = self._transfrom_one(X, self.x_props) y = self._transfrom_one(y, self.y_props) if self.x_props["type"].lower() == "dataframe" and self.y_props[ "type" ].lower() in {"series", "dataframe"}: # We lost the y.index during resampling. We can safely use X.index to align # them. 
y.index = X.index return X, y def _gets_props(self, array): props = {} props["type"] = array.__class__.__name__ props["columns"] = getattr(array, "columns", None) props["name"] = getattr(array, "name", None) props["dtypes"] = getattr(array, "dtypes", None) return props def _transfrom_one(self, array, props): type_ = props["type"].lower() if type_ == "list": ret = array.tolist() elif type_ == "dataframe": import pandas as pd if issparse(array): ret = pd.DataFrame.sparse.from_spmatrix(array, columns=props["columns"]) else: ret = pd.DataFrame(array, columns=props["columns"]) try: ret = ret.astype(props["dtypes"]) except TypeError: # We special case the following error: # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1055 # There is no easy way to have a generic workaround. Here, we detect # that we have a column with only null values that is datetime64 # (resulting from the np.vstack of the resampling). for col in ret.columns: if ( ret[col].isnull().all() and ret[col].dtype == "datetime64[ns]" and props["dtypes"][col] == "timedelta64[ns]" ): ret[col] = pd.to_timedelta(["NaT"] * len(ret[col])) # try again ret = ret.astype(props["dtypes"]) elif type_ == "series": import pandas as pd ret = pd.Series(array, dtype=props["dtypes"], name=props["name"]) else: ret = array return ret def _is_neighbors_object(estimator): """Check that the estimator exposes a KNeighborsMixin-like API. A KNeighborsMixin-like API exposes the following methods: (i) `kneighbors`, (ii) `kneighbors_graph`. Parameters ---------- estimator : object A scikit-learn compatible estimator. Returns ------- is_neighbors_object : bool True if the estimator exposes a KNeighborsMixin-like API. """ neighbors_attributes = ["kneighbors", "kneighbors_graph"] return all(hasattr(estimator, attr) for attr in neighbors_attributes) def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a k nearest neighbors. 
Several methods in `imblearn` relies on k nearest neighbors. These objects can be passed at initialisation as an integer or as an object that has KNeighborsMixin-like attributes. This utility will create or clone said object, ensuring it is KNeighbors-like. Parameters ---------- nn_name : str The name associated to the object to raise an error if needed. nn_object : int or KNeighborsMixin The object to be checked. additional_neighbor : int, default=0 Sometimes, some algorithm need an additional neighbors. Returns ------- nn_object : KNeighborsMixin The k-NN object. """ if isinstance(nn_object, Integral): return NearestNeighbors(n_neighbors=nn_object + additional_neighbor) # _is_neighbors_object(nn_object) return clone(nn_object) def _count_class_sample(y): unique, counts = np.unique(y, return_counts=True) return dict(zip(unique, counts)) def check_target_type(y, indicate_one_vs_all=False): """Check the target types to be conform to the current samplers. The current samplers should be compatible with ``'binary'``, ``'multilabel-indicator'`` and ``'multiclass'`` targets only. Parameters ---------- y : ndarray The array containing the target. indicate_one_vs_all : bool, default=False Either to indicate if the targets are encoded in a one-vs-all fashion. Returns ------- y : ndarray The returned target. is_one_vs_all : bool, optional Indicate if the target was originally encoded in a one-vs-all fashion. Only returned if ``indicate_multilabel=True``. """ type_y = type_of_target(y) if type_y == "multilabel-indicator": if np.any(y.sum(axis=1) > 1): raise ValueError( "Imbalanced-learn currently supports binary, multiclass and " "binarized encoded multiclasss targets. Multilabel and " "multioutput targets are not supported." 
) y = y.argmax(axis=1) else: y = column_or_1d(y) return (y, type_y == "multilabel-indicator") if indicate_one_vs_all else y def _sampling_strategy_all(y, sampling_type): """Returns sampling target by targeting all classes.""" target_stats = _count_class_sample(y) if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) sampling_strategy = { key: n_sample_majority - value for (key, value) in target_stats.items() } elif sampling_type == "under-sampling" or sampling_type == "clean-sampling": n_sample_minority = min(target_stats.values()) sampling_strategy = {key: n_sample_minority for key in target_stats.keys()} else: raise NotImplementedError return sampling_strategy def _sampling_strategy_majority(y, sampling_type): """Returns sampling target by targeting the majority class only.""" if sampling_type == "over-sampling": raise ValueError( "'sampling_strategy'='majority' cannot be used with over-sampler." ) elif sampling_type == "under-sampling" or sampling_type == "clean-sampling": target_stats = _count_class_sample(y) class_majority = max(target_stats, key=target_stats.get) n_sample_minority = min(target_stats.values()) sampling_strategy = { key: n_sample_minority for key in target_stats.keys() if key == class_majority } else: raise NotImplementedError return sampling_strategy def _sampling_strategy_not_majority(y, sampling_type): """Returns sampling target by targeting all classes but not the majority.""" target_stats = _count_class_sample(y) if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) class_majority = max(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_majority - value for (key, value) in target_stats.items() if key != class_majority } elif sampling_type == "under-sampling" or sampling_type == "clean-sampling": n_sample_minority = min(target_stats.values()) class_majority = max(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_minority for key in 
target_stats.keys() if key != class_majority } else: raise NotImplementedError return sampling_strategy def _sampling_strategy_not_minority(y, sampling_type): """Returns sampling target by targeting all classes but not the minority.""" target_stats = _count_class_sample(y) if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) class_minority = min(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_majority - value for (key, value) in target_stats.items() if key != class_minority } elif sampling_type == "under-sampling" or sampling_type == "clean-sampling": n_sample_minority = min(target_stats.values()) class_minority = min(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_minority for key in target_stats.keys() if key != class_minority } else: raise NotImplementedError return sampling_strategy def _sampling_strategy_minority(y, sampling_type): """Returns sampling target by targeting the minority class only.""" target_stats = _count_class_sample(y) if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) class_minority = min(target_stats, key=target_stats.get) sampling_strategy = { key: n_sample_majority - value for (key, value) in target_stats.items() if key == class_minority } elif sampling_type == "under-sampling" or sampling_type == "clean-sampling": raise ValueError( "'sampling_strategy'='minority' cannot be used with" " under-sampler and clean-sampler." 
) else: raise NotImplementedError return sampling_strategy def _sampling_strategy_auto(y, sampling_type): """Returns sampling target auto for over-sampling and not-minority for under-sampling.""" if sampling_type == "over-sampling": return _sampling_strategy_not_majority(y, sampling_type) elif sampling_type == "under-sampling" or sampling_type == "clean-sampling": return _sampling_strategy_not_minority(y, sampling_type) def _sampling_strategy_dict(sampling_strategy, y, sampling_type): """Returns sampling target by converting the dictionary depending of the sampling.""" target_stats = _count_class_sample(y) # check that all keys in sampling_strategy are also in y set_diff_sampling_strategy_target = set(sampling_strategy.keys()) - set( target_stats.keys() ) if len(set_diff_sampling_strategy_target) > 0: raise ValueError( f"The {set_diff_sampling_strategy_target} target class is/are not " f"present in the data." ) # check that there is no negative number if any(n_samples < 0 for n_samples in sampling_strategy.values()): raise ValueError( f"The number of samples in a class cannot be negative." f"'sampling_strategy' contains some negative value: {sampling_strategy}" ) sampling_strategy_ = {} if sampling_type == "over-sampling": max(target_stats.values()) max(target_stats, key=target_stats.get) for class_sample, n_samples in sampling_strategy.items(): if n_samples < target_stats[class_sample]: raise ValueError( f"With over-sampling methods, the number" f" of samples in a class should be greater" f" or equal to the original number of samples." f" Originally, there is {target_stats[class_sample]} " f"samples and {n_samples} samples are asked." 
) sampling_strategy_[class_sample] = n_samples - target_stats[class_sample] elif sampling_type == "under-sampling": for class_sample, n_samples in sampling_strategy.items(): if n_samples > target_stats[class_sample]: raise ValueError( f"With under-sampling methods, the number of" f" samples in a class should be less or equal" f" to the original number of samples." f" Originally, there is {target_stats[class_sample]} " f"samples and {n_samples} samples are asked." ) sampling_strategy_[class_sample] = n_samples elif sampling_type == "clean-sampling": raise ValueError( "'sampling_strategy' as a dict for cleaning methods is " "not supported. Please give a list of the classes to be " "targeted by the sampling." ) else: raise NotImplementedError return sampling_strategy_ def _sampling_strategy_list(sampling_strategy, y, sampling_type): """With cleaning methods, sampling_strategy can be a list to target the class of interest.""" if sampling_type != "clean-sampling": raise ValueError( "'sampling_strategy' cannot be a list for samplers " "which are not cleaning methods." ) target_stats = _count_class_sample(y) # check that all keys in sampling_strategy are also in y set_diff_sampling_strategy_target = set(sampling_strategy) - set( target_stats.keys() ) if len(set_diff_sampling_strategy_target) > 0: raise ValueError( f"The {set_diff_sampling_strategy_target} target class is/are not " f"present in the data." ) return { class_sample: min(target_stats.values()) for class_sample in sampling_strategy } def _sampling_strategy_float(sampling_strategy, y, sampling_type): """Take a proportion of the majority (over-sampling) or minority (under-sampling) class in binary classification.""" type_y = type_of_target(y) if type_y != "binary": raise ValueError( '"sampling_strategy" can be a float only when the type ' "of target is binary. For multi-class, use a dict." 
) target_stats = _count_class_sample(y) if sampling_type == "over-sampling": n_sample_majority = max(target_stats.values()) class_majority = max(target_stats, key=target_stats.get) sampling_strategy_ = { key: int(n_sample_majority * sampling_strategy - value) for (key, value) in target_stats.items() if key != class_majority } if any([n_samples <= 0 for n_samples in sampling_strategy_.values()]): raise ValueError( "The specified ratio required to remove samples " "from the minority class while trying to " "generate new samples. Please increase the " "ratio." ) elif sampling_type == "under-sampling": n_sample_minority = min(target_stats.values()) class_minority = min(target_stats, key=target_stats.get) sampling_strategy_ = { key: int(n_sample_minority / sampling_strategy) for (key, value) in target_stats.items() if key != class_minority } if any( [ n_samples > target_stats[target] for target, n_samples in sampling_strategy_.items() ] ): raise ValueError( "The specified ratio required to generate new " "sample in the majority class while trying to " "remove samples. Please increase the ratio." ) else: raise ValueError( "'clean-sampling' methods do let the user specify the sampling ratio." ) return sampling_strategy_ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): """Sampling target validation for samplers. Checks that ``sampling_strategy`` is of consistent type and return a dictionary containing each targeted class with its corresponding number of sample. It is used in :class:`~imblearn.base.BaseSampler`. Parameters ---------- sampling_strategy : float, str, dict, list or callable, Sampling information to sample the data set. 
- When ``float``: For **under-sampling methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where :math:`N_{rM}` and :math:`N_{m}` are the number of samples in the majority class after resampling and the number of samples in the minority class, respectively; For **over-sampling methods**, it correspond to the ratio :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{m}` where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the minority class after resampling and the number of samples in the majority class, respectively. .. warning:: ``float`` is only available for **binary** classification. An error is raised for multi-class classification and with cleaning samplers. - When ``str``, specify the class targeted by the resampling. For **under- and over-sampling methods**, the number of samples in the different classes will be equalized. For **cleaning methods**, the number of samples will not be equal. Possible choices are: ``'minority'``: resample only the minority class; ``'majority'``: resample only the majority class; ``'not minority'``: resample all classes but the minority class; ``'not majority'``: resample all classes but the majority class; ``'all'``: resample all classes; ``'auto'``: for under-sampling methods, equivalent to ``'not minority'`` and for over-sampling methods, equivalent to ``'not majority'``. - When ``dict``, the keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted class. .. warning:: ``dict`` is available for both **under- and over-sampling methods**. An error is raised with **cleaning methods**. Use a ``list`` instead. - When ``list``, the list contains the targeted classes. It used only for **cleaning methods**. .. warning:: ``list`` is available for **cleaning methods**. An error is raised with **under- and over-sampling methods**. 
- When callable, function taking ``y`` and returns a ``dict``. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class. y : ndarray of shape (n_samples,) The target array. sampling_type : {{'over-sampling', 'under-sampling', 'clean-sampling'}} The type of sampling. Can be either ``'over-sampling'``, ``'under-sampling'``, or ``'clean-sampling'``. **kwargs : dict Dictionary of additional keyword arguments to pass to ``sampling_strategy`` when this is a callable. Returns ------- sampling_strategy_converted : dict The converted and validated sampling target. Returns a dictionary with the key being the class target and the value being the desired number of samples. """ if sampling_type not in SAMPLING_KIND: raise ValueError( f"'sampling_type' should be one of {SAMPLING_KIND}. " f"Got '{sampling_type} instead." ) if np.unique(y).size <= 1: raise ValueError( f"The target 'y' needs to have more than 1 class. " f"Got {np.unique(y).size} class instead" ) if sampling_type in ("ensemble", "bypass"): return sampling_strategy if isinstance(sampling_strategy, str): if sampling_strategy not in SAMPLING_TARGET_KIND.keys(): raise ValueError( f"When 'sampling_strategy' is a string, it needs" f" to be one of {SAMPLING_TARGET_KIND}. Got '{sampling_strategy}' " f"instead." ) return OrderedDict( sorted(SAMPLING_TARGET_KIND[sampling_strategy](y, sampling_type).items()) ) elif isinstance(sampling_strategy, dict): return OrderedDict( sorted(_sampling_strategy_dict(sampling_strategy, y, sampling_type).items()) ) elif isinstance(sampling_strategy, list): return OrderedDict( sorted(_sampling_strategy_list(sampling_strategy, y, sampling_type).items()) ) elif isinstance(sampling_strategy, Real): if sampling_strategy <= 0 or sampling_strategy > 1: raise ValueError( f"When 'sampling_strategy' is a float, it should be " f"in the range (0, 1]. Got {sampling_strategy} instead." 
) return OrderedDict( sorted( _sampling_strategy_float(sampling_strategy, y, sampling_type).items() ) ) elif callable(sampling_strategy): sampling_strategy_ = sampling_strategy(y, **kwargs) return OrderedDict( sorted( _sampling_strategy_dict(sampling_strategy_, y, sampling_type).items() ) ) SAMPLING_TARGET_KIND = { "minority": _sampling_strategy_minority, "majority": _sampling_strategy_majority, "not minority": _sampling_strategy_not_minority, "not majority": _sampling_strategy_not_majority, "all": _sampling_strategy_all, "auto": _sampling_strategy_auto, } def _deprecate_positional_args(f): """Decorator for methods that issues warnings for positional arguments Using the keyword-only argument syntax in pep 3102, arguments after the * will issue a warning when passed as a positional argument. Parameters ---------- f : function function to check arguments on. """ sig = signature(f) kwonly_args = [] all_args = [] for name, param in sig.parameters.items(): if param.kind == Parameter.POSITIONAL_OR_KEYWORD: all_args.append(name) elif param.kind == Parameter.KEYWORD_ONLY: kwonly_args.append(name) @wraps(f) def inner_f(*args, **kwargs): extra_args = len(args) - len(all_args) if extra_args > 0: # ignore first 'self' argument for instance methods args_msg = [ f"{name}={arg}" for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:]) ] warnings.warn( f"Pass {', '.join(args_msg)} as keyword args. From version 0.9 " f"passing these as positional arguments will " f"result in an error", FutureWarning, ) kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) return f(**kwargs) return inner_f def _check_X(X): """Check X and do not check it if a dataframe.""" n_samples = _num_samples(X) if n_samples < 1: raise ValueError( f"Found array with {n_samples} sample(s) while a minimum of 1 is " "required." 
) if _is_pandas_df(X): return X return check_array( X, dtype=None, accept_sparse=["csr", "csc"], force_all_finite=False ) imbalanced-learn-0.12.2/imblearn/utils/deprecation.py000066400000000000000000000031371460233407600225630ustar00rootroot00000000000000"""Utilities for deprecation""" # Authors: Guillaume Lemaitre # License: MIT import warnings def deprecate_parameter(sampler, version_deprecation, param_deprecated, new_param=None): """Helper to deprecate a parameter by another one. Parameters ---------- sampler : sampler object, The object which will be inspected. version_deprecation : str, The version from which the parameter will be deprecated. The format should be ``'x.y'``. param_deprecated : str, The parameter being deprecated. new_param : str, The parameter used instead of the deprecated parameter. By default, no parameter is expected. """ x, y = version_deprecation.split(".") version_removed = x + "." + str(int(y) + 2) if new_param is None: if getattr(sampler, param_deprecated) is not None: warnings.warn( f"'{param_deprecated}' is deprecated from {version_deprecation} and " f" will be removed in {version_removed} for the estimator " f"{sampler.__class__}.", category=FutureWarning, ) else: if getattr(sampler, param_deprecated) is not None: warnings.warn( f"'{param_deprecated}' is deprecated from {version_deprecation} and " f"will be removed in {version_removed} for the estimator " f"{sampler.__class__}. 
Use '{new_param}' instead.", category=FutureWarning, ) setattr(sampler, new_param, getattr(sampler, param_deprecated)) imbalanced-learn-0.12.2/imblearn/utils/estimator_checks.py000066400000000000000000000706701460233407600236230ustar00rootroot00000000000000"""Utils to check the samplers and compatibility with scikit-learn""" # Adapated from scikit-learn # Authors: Guillaume Lemaitre # License: MIT import re import sys import traceback import warnings from collections import Counter from functools import partial import numpy as np import pytest import sklearn from scipy import sparse from sklearn.base import clone, is_classifier, is_regressor from sklearn.cluster import KMeans from sklearn.datasets import ( # noqa load_iris, make_blobs, make_classification, make_multilabel_classification, ) from sklearn.exceptions import SkipTestWarning from sklearn.preprocessing import StandardScaler, label_binarize from sklearn.utils._tags import _safe_tags from sklearn.utils._testing import ( SkipTest, assert_allclose, assert_array_equal, assert_raises_regex, raises, set_random_state, ) from sklearn.utils.estimator_checks import ( _enforce_estimator_tags_y, _get_check_estimator_ids, _maybe_mark_xfail, ) try: from sklearn.utils.estimator_checks import _enforce_estimator_tags_x except ImportError: # scikit-learn >= 1.2 from sklearn.utils.estimator_checks import ( _enforce_estimator_tags_X as _enforce_estimator_tags_x, ) from sklearn.utils.fixes import parse_version from sklearn.utils.multiclass import type_of_target from imblearn.datasets import make_imbalance from imblearn.over_sampling.base import BaseOverSampler from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler from imblearn.utils._param_validation import generate_invalid_param_val, make_constraint sklearn_version = parse_version(sklearn.__version__) def sample_dataset_generator(): X, y = make_classification( n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0, ) 
return X, y @pytest.fixture(name="sample_dataset_generator") def sample_dataset_generator_fixture(): return sample_dataset_generator() def _set_checking_parameters(estimator): params = estimator.get_params() name = estimator.__class__.__name__ if "n_estimators" in params: estimator.set_params(n_estimators=min(5, estimator.n_estimators)) if name == "ClusterCentroids": if sklearn_version < parse_version("1.1"): algorithm = "full" else: algorithm = "lloyd" estimator.set_params( voting="soft", estimator=KMeans(random_state=0, algorithm=algorithm, n_init=1), ) if name == "KMeansSMOTE": estimator.set_params(kmeans_estimator=12) if name == "BalancedRandomForestClassifier": # TODO: remove in 0.13 # future default in 0.13 estimator.set_params(replacement=True, sampling_strategy="all", bootstrap=False) def _yield_sampler_checks(sampler): tags = sampler._get_tags() yield check_target_type yield check_samplers_one_label yield check_samplers_fit yield check_samplers_fit_resample yield check_samplers_sampling_strategy_fit_resample if "sparse" in tags["X_types"]: yield check_samplers_sparse if "dataframe" in tags["X_types"]: yield check_samplers_pandas yield check_samplers_pandas_sparse if "string" in tags["X_types"]: yield check_samplers_string if tags["allow_nan"]: yield check_samplers_nan yield check_samplers_list yield check_samplers_multiclass_ova yield check_samplers_preserve_dtype # we don't filter samplers based on their tag here because we want to make # sure that the fitted attribute does not exist if the tag is not # stipulated yield check_samplers_sample_indices yield check_samplers_2d_target yield check_sampler_get_feature_names_out yield check_sampler_get_feature_names_out_pandas def _yield_classifier_checks(classifier): yield check_classifier_on_multilabel_or_multioutput_targets yield check_classifiers_with_encoded_labels def _yield_all_checks(estimator): name = estimator.__class__.__name__ tags = estimator._get_tags() if tags["_skip_test"]: warnings.warn( 
f"Explicit SKIP via _skip_test tag for estimator {name}.", SkipTestWarning, ) return # trigger our checks if this is a SamplerMixin if hasattr(estimator, "fit_resample"): for check in _yield_sampler_checks(estimator): yield check if hasattr(estimator, "predict"): for check in _yield_classifier_checks(estimator): yield check def parametrize_with_checks(estimators): """Pytest specific decorator for parametrizing estimator checks. The `id` of each check is set to be a pprint version of the estimator and the name of the check with its keyword arguments. This allows to use `pytest -k` to specify which tests to run:: pytest test_check_estimators.py -k check_estimators_fit_returns_self Parameters ---------- estimators : list of estimators instances Estimators to generated checks for. Returns ------- decorator : `pytest.mark.parametrize` Examples -------- >>> from sklearn.utils.estimator_checks import parametrize_with_checks >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.tree import DecisionTreeRegressor >>> @parametrize_with_checks([LogisticRegression(), ... DecisionTreeRegressor()]) ... def test_sklearn_compatible_estimator(estimator, check): ... 
check(estimator) """ def checks_generator(): for estimator in estimators: name = type(estimator).__name__ for check in _yield_all_checks(estimator): check = partial(check, name) yield _maybe_mark_xfail(estimator, check, pytest) return pytest.mark.parametrize( "estimator, check", checks_generator(), ids=_get_check_estimator_ids ) def check_target_type(name, estimator_orig): estimator = clone(estimator_orig) # should raise warning if the target is continuous (we cannot raise error) X = np.random.random((20, 2)) y = np.linspace(0, 1, 20) msg = "Unknown label type:" assert_raises_regex( ValueError, msg, estimator.fit_resample, X, y, ) # if the target is multilabel then we should raise an error rng = np.random.RandomState(42) y = rng.randint(2, size=(20, 3)) msg = "Multilabel and multioutput targets are not supported." assert_raises_regex( ValueError, msg, estimator.fit_resample, X, y, ) def check_samplers_one_label(name, sampler_orig): sampler = clone(sampler_orig) error_string_fit = "Sampler can't balance when only one class is present." 
X = np.random.random((20, 2)) y = np.zeros(20) try: sampler.fit_resample(X, y) except ValueError as e: if "class" not in repr(e): print(error_string_fit, sampler.__class__.__name__, e) traceback.print_exc(file=sys.stdout) raise e else: return except Exception as exc: print(error_string_fit, traceback, exc) traceback.print_exc(file=sys.stdout) raise exc raise AssertionError(error_string_fit) def check_samplers_fit(name, sampler_orig): sampler = clone(sampler_orig) np.random.seed(42) # Make this test reproducible X = np.random.random((30, 2)) y = np.array([1] * 20 + [0] * 10) sampler.fit_resample(X, y) assert hasattr( sampler, "sampling_strategy_" ), "No fitted attribute sampling_strategy_" def check_samplers_fit_resample(name, sampler_orig): sampler = clone(sampler_orig) X, y = sample_dataset_generator() target_stats = Counter(y) X_res, y_res = sampler.fit_resample(X, y) if isinstance(sampler, BaseOverSampler): target_stats_res = Counter(y_res) n_samples = max(target_stats.values()) assert all(value >= n_samples for value in Counter(y_res).values()) elif isinstance(sampler, BaseUnderSampler): n_samples = min(target_stats.values()) if name == "InstanceHardnessThreshold": # IHT does not enforce the number of samples but provide a number # of samples the closest to the desired target. 
assert all( Counter(y_res)[k] <= target_stats[k] for k in target_stats.keys() ) else: assert all(value == n_samples for value in Counter(y_res).values()) elif isinstance(sampler, BaseCleaningSampler): target_stats_res = Counter(y_res) class_minority = min(target_stats, key=target_stats.get) assert all( target_stats[class_sample] > target_stats_res[class_sample] for class_sample in target_stats.keys() if class_sample != class_minority ) def check_samplers_sampling_strategy_fit_resample(name, sampler_orig): sampler = clone(sampler_orig) # in this test we will force all samplers to not change the class 1 X, y = sample_dataset_generator() expected_stat = Counter(y)[1] if isinstance(sampler, BaseOverSampler): sampling_strategy = {2: 498, 0: 498} sampler.set_params(sampling_strategy=sampling_strategy) X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat elif isinstance(sampler, BaseUnderSampler): sampling_strategy = {2: 201, 0: 201} sampler.set_params(sampling_strategy=sampling_strategy) X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat elif isinstance(sampler, BaseCleaningSampler): sampling_strategy = [2, 0] sampler.set_params(sampling_strategy=sampling_strategy) X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat def check_samplers_sparse(name, sampler_orig): sampler = clone(sampler_orig) # check that sparse matrices can be passed through the sampler leading to # the same results than dense X, y = sample_dataset_generator() X_sparse = sparse.csr_matrix(X) X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y) sampler = clone(sampler) X_res, y_res = sampler.fit_resample(X, y) assert sparse.issparse(X_res_sparse) assert_allclose(X_res_sparse.A, X_res, rtol=1e-5) assert_allclose(y_res_sparse, y_res) def check_samplers_pandas_sparse(name, sampler_orig): pd = pytest.importorskip("pandas") sampler = clone(sampler_orig) # Check that the samplers handle pandas dataframe and 
pandas series X, y = sample_dataset_generator() X_df = pd.DataFrame( X, columns=[str(i) for i in range(X.shape[1])], dtype=pd.SparseDtype(float, 0) ) y_s = pd.Series(y, name="class") X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) X_res, y_res = sampler.fit_resample(X, y) # check that we return the same type for dataframes or series types assert isinstance(X_res_df, pd.DataFrame) assert isinstance(y_res_s, pd.Series) for column_dtype in X_res_df.dtypes: assert isinstance(column_dtype, pd.SparseDtype) assert X_df.columns.tolist() == X_res_df.columns.tolist() assert y_s.name == y_res_s.name # FIXME: we should use to_numpy with pandas >= 0.25 assert_allclose(X_res_df.values, X_res) assert_allclose(y_res_s.values, y_res) def check_samplers_pandas(name, sampler_orig): pd = pytest.importorskip("pandas") sampler = clone(sampler_orig) # Check that the samplers handle pandas dataframe and pandas series X, y = sample_dataset_generator() X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) y_df = pd.DataFrame(y) y_s = pd.Series(y, name="class") X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) X_res_df, y_res_df = sampler.fit_resample(X_df, y_df) X_res, y_res = sampler.fit_resample(X, y) # check that we return the same type for dataframes or series types assert isinstance(X_res_df, pd.DataFrame) assert isinstance(y_res_df, pd.DataFrame) assert isinstance(y_res_s, pd.Series) assert X_df.columns.tolist() == X_res_df.columns.tolist() assert y_df.columns.tolist() == y_res_df.columns.tolist() assert y_s.name == y_res_s.name # FIXME: we should use to_numpy with pandas >= 0.25 assert_allclose(X_res_df.values, X_res) assert_allclose(y_res_df.values.ravel(), y_res) assert_allclose(y_res_s.values, y_res) def check_samplers_list(name, sampler_orig): sampler = clone(sampler_orig) # Check that the can samplers handle simple lists X, y = sample_dataset_generator() X_list = X.tolist() y_list = y.tolist() X_res, y_res = sampler.fit_resample(X, y) X_res_list, y_res_list = 
sampler.fit_resample(X_list, y_list) assert isinstance(X_res_list, list) assert isinstance(y_res_list, list) assert_allclose(X_res, X_res_list) assert_allclose(y_res, y_res_list) def check_samplers_multiclass_ova(name, sampler_orig): sampler = clone(sampler_orig) # Check that multiclass target lead to the same results than OVA encoding X, y = sample_dataset_generator() y_ova = label_binarize(y, classes=np.unique(y)) X_res, y_res = sampler.fit_resample(X, y) X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova) assert_allclose(X_res, X_res_ova) assert type_of_target(y_res_ova) == type_of_target(y_ova) assert_allclose(y_res, y_res_ova.argmax(axis=1)) def check_samplers_2d_target(name, sampler_orig): sampler = clone(sampler_orig) X, y = sample_dataset_generator() y = y.reshape(-1, 1) # Make the target 2d sampler.fit_resample(X, y) def check_samplers_preserve_dtype(name, sampler_orig): sampler = clone(sampler_orig) X, y = sample_dataset_generator() # Cast X and y to not default dtype X = X.astype(np.float32) y = y.astype(np.int32) X_res, y_res = sampler.fit_resample(X, y) assert X.dtype == X_res.dtype, "X dtype is not preserved" assert y.dtype == y_res.dtype, "y dtype is not preserved" def check_samplers_sample_indices(name, sampler_orig): sampler = clone(sampler_orig) X, y = sample_dataset_generator() sampler.fit_resample(X, y) sample_indices = sampler._get_tags().get("sample_indices", None) if sample_indices: assert hasattr(sampler, "sample_indices_") is sample_indices else: assert not hasattr(sampler, "sample_indices_") def check_samplers_string(name, sampler_orig): rng = np.random.RandomState(0) sampler = clone(sampler_orig) categories = np.array(["A", "B", "C"], dtype=object) n_samples = 30 X = rng.randint(low=0, high=3, size=n_samples).reshape(-1, 1) X = categories[X] y = rng.permutation([0] * 10 + [1] * 20) X_res, y_res = sampler.fit_resample(X, y) assert X_res.dtype == object assert X_res.shape[0] == y_res.shape[0] assert_array_equal(np.unique(X_res.ravel()), 
categories) def check_samplers_nan(name, sampler_orig): rng = np.random.RandomState(0) sampler = clone(sampler_orig) categories = np.array([0, 1, np.nan], dtype=np.float64) n_samples = 100 X = rng.randint(low=0, high=3, size=n_samples).reshape(-1, 1) X = categories[X] y = rng.permutation([0] * 40 + [1] * 60) X_res, y_res = sampler.fit_resample(X, y) assert X_res.dtype == np.float64 assert X_res.shape[0] == y_res.shape[0] assert np.any(np.isnan(X_res.ravel())) def check_classifier_on_multilabel_or_multioutput_targets(name, estimator_orig): estimator = clone(estimator_orig) X, y = make_multilabel_classification(n_samples=30) msg = "Multilabel and multioutput targets are not supported." with pytest.raises(ValueError, match=msg): estimator.fit(X, y) def check_classifiers_with_encoded_labels(name, classifier_orig): # Non-regression test for #709 # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709 pd = pytest.importorskip("pandas") classifier = clone(classifier_orig) iris = load_iris(as_frame=True) df, y = iris.data, iris.target y = pd.Series(iris.target_names[iris.target], dtype="category") df, y = make_imbalance( df, y, sampling_strategy={ "setosa": 30, "versicolor": 20, "virginica": 50, }, ) classifier.set_params(sampling_strategy={"setosa": 20, "virginica": 20}) classifier.fit(df, y) assert set(classifier.classes_) == set(y.cat.categories.tolist()) y_pred = classifier.predict(df) assert set(y_pred) == set(y.cat.categories.tolist()) def check_param_validation(name, estimator_orig): # Check that an informative error is raised when the value of a constructor # parameter does not have an appropriate type or value. 
rng = np.random.RandomState(0) X = rng.uniform(size=(20, 5)) y = rng.randint(0, 2, size=20) y = _enforce_estimator_tags_y(estimator_orig, y) estimator_params = estimator_orig.get_params(deep=False).keys() # check that there is a constraint for each parameter if estimator_params: validation_params = estimator_orig._parameter_constraints.keys() unexpected_params = set(validation_params) - set(estimator_params) missing_params = set(estimator_params) - set(validation_params) err_msg = ( f"Mismatch between _parameter_constraints and the parameters of {name}." f"\nConsider the unexpected parameters {unexpected_params} and expected but" f" missing parameters {missing_params}" ) assert validation_params == estimator_params, err_msg # this object does not have a valid type for sure for all params param_with_bad_type = type("BadType", (), {})() fit_methods = ["fit", "partial_fit", "fit_transform", "fit_predict", "fit_resample"] for param_name in estimator_params: constraints = estimator_orig._parameter_constraints[param_name] if constraints == "no_validation": # This parameter is not validated continue # pragma: no cover match = rf"The '{param_name}' parameter of {name} must be .* Got .* instead." err_msg = ( f"{name} does not raise an informative error message when the " f"parameter {param_name} does not have a valid type or value." ) estimator = clone(estimator_orig) # First, check that the error is raised if param doesn't match any valid type. 
estimator.set_params(**{param_name: param_with_bad_type}) for method in fit_methods: if not hasattr(estimator, method): # the method is not accessible with the current set of parameters continue with raises(ValueError, match=match, err_msg=err_msg): if any( isinstance(X_type, str) and X_type.endswith("labels") for X_type in _safe_tags(estimator, key="X_types") ): # The estimator is a label transformer and take only `y` getattr(estimator, method)(y) # pragma: no cover else: getattr(estimator, method)(X, y) # Then, for constraints that are more than a type constraint, check that the # error is raised if param does match a valid type but does not match any valid # value for this type. constraints = [make_constraint(constraint) for constraint in constraints] for constraint in constraints: try: bad_value = generate_invalid_param_val(constraint) except NotImplementedError: continue estimator.set_params(**{param_name: bad_value}) for method in fit_methods: if not hasattr(estimator, method): # the method is not accessible with the current set of parameters continue with raises(ValueError, match=match, err_msg=err_msg): if any( X_type.endswith("labels") for X_type in _safe_tags(estimator, key="X_types") ): # The estimator is a label transformer and take only `y` getattr(estimator, method)(y) # pragma: no cover else: getattr(estimator, method)(X, y) def check_dataframe_column_names_consistency(name, estimator_orig): try: import pandas as pd except ImportError: raise SkipTest( "pandas is not installed: not checking column name consistency for pandas" ) tags = _safe_tags(estimator_orig) is_supported_X_types = ( "2darray" in tags["X_types"] or "categorical" in tags["X_types"] ) if not is_supported_X_types or tags["no_validation"]: return rng = np.random.RandomState(0) estimator = clone(estimator_orig) set_random_state(estimator) X_orig = rng.normal(size=(150, 8)) X_orig = _enforce_estimator_tags_x(estimator, X_orig) n_samples, n_features = X_orig.shape names = 
np.array([f"col_{i}" for i in range(n_features)]) X = pd.DataFrame(X_orig, columns=names) if is_regressor(estimator): y = rng.normal(size=n_samples) else: y = rng.randint(low=0, high=2, size=n_samples) y = _enforce_estimator_tags_y(estimator, y) # Check that calling `fit` does not raise any warnings about feature names. with warnings.catch_warnings(): warnings.filterwarnings( "error", message="X does not have valid feature names", category=UserWarning, module="imblearn", ) estimator.fit(X, y) if not hasattr(estimator, "feature_names_in_"): raise ValueError( "Estimator does not have a feature_names_in_ " "attribute after fitting with a dataframe" ) assert isinstance(estimator.feature_names_in_, np.ndarray) assert estimator.feature_names_in_.dtype == object assert_array_equal(estimator.feature_names_in_, names) # Only check imblearn estimators for feature_names_in_ in docstring module_name = estimator_orig.__module__ if ( module_name.startswith("imblearn.") and not ("test_" in module_name or module_name.endswith("_testing")) and ("feature_names_in_" not in (estimator_orig.__doc__)) ): raise ValueError( f"Estimator {name} does not document its feature_names_in_ attribute" ) check_methods = [] for method in ( "predict", "transform", "decision_function", "predict_proba", "score", "score_samples", "predict_log_proba", ): if not hasattr(estimator, method): continue callable_method = getattr(estimator, method) if method == "score": callable_method = partial(callable_method, y=y) check_methods.append((method, callable_method)) for _, method in check_methods: with warnings.catch_warnings(): warnings.filterwarnings( "error", message="X does not have valid feature names", category=UserWarning, module="sklearn", ) method(X) # works without UserWarning for valid features invalid_names = [ (names[::-1], "Feature names must be in the same order as they were in fit."), ( [f"another_prefix_{i}" for i in range(n_features)], "Feature names unseen at fit time:\n- another_prefix_0\n-" " 
another_prefix_1\n", ), ( names[:3], f"Feature names seen at fit time, yet now missing:\n- {min(names[3:])}\n", ), ] params = { key: value for key, value in estimator.get_params().items() if "early_stopping" in key } early_stopping_enabled = any(value is True for value in params.values()) for invalid_name, additional_message in invalid_names: X_bad = pd.DataFrame(X, columns=invalid_name) for name, method in check_methods: if sklearn_version >= parse_version("1.2"): expected_msg = re.escape( "The feature names should match those that were passed during fit." f"\n{additional_message}" ) with raises( ValueError, match=expected_msg, err_msg=f"{name} did not raise" ): method(X_bad) else: expected_msg = re.escape( "The feature names should match those that were passed " "during fit. Starting version 1.2, an error will be raised.\n" f"{additional_message}" ) with warnings.catch_warnings(): warnings.filterwarnings( "error", category=FutureWarning, module="sklearn", ) with raises( FutureWarning, match=expected_msg, err_msg=f"{name} did not raise", ): method(X_bad) # partial_fit checks on second call # Do not call partial fit if early_stopping is on if not hasattr(estimator, "partial_fit") or early_stopping_enabled: continue estimator = clone(estimator_orig) if is_classifier(estimator): classes = np.unique(y) estimator.partial_fit(X, y, classes=classes) else: estimator.partial_fit(X, y) with raises(ValueError, match=expected_msg): estimator.partial_fit(X_bad, y) def check_sampler_get_feature_names_out(name, sampler_orig): tags = sampler_orig._get_tags() if "2darray" not in tags["X_types"] or tags["no_validation"]: return X, y = make_blobs( n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1, ) X = StandardScaler().fit_transform(X) sampler = clone(sampler_orig) X = _enforce_estimator_tags_x(sampler, X) n_features = X.shape[1] set_random_state(sampler) y_ = y X_res, y_res = sampler.fit_resample(X, y=y_) input_features = [f"feature{i}" 
for i in range(n_features)] # input_features names is not the same length as n_features_in_ with raises(ValueError, match="input_features should have length equal"): sampler.get_feature_names_out(input_features[::2]) feature_names_out = sampler.get_feature_names_out(input_features) assert feature_names_out is not None assert isinstance(feature_names_out, np.ndarray) assert feature_names_out.dtype == object assert all(isinstance(name, str) for name in feature_names_out) n_features_out = X_res.shape[1] assert ( len(feature_names_out) == n_features_out ), f"Expected {n_features_out} feature names, got {len(feature_names_out)}" def check_sampler_get_feature_names_out_pandas(name, sampler_orig): try: import pandas as pd except ImportError: raise SkipTest( "pandas is not installed: not checking column name consistency for pandas" ) tags = sampler_orig._get_tags() if "2darray" not in tags["X_types"] or tags["no_validation"]: return X, y = make_blobs( n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1, ) X = StandardScaler().fit_transform(X) sampler = clone(sampler_orig) X = _enforce_estimator_tags_x(sampler, X) n_features = X.shape[1] set_random_state(sampler) y_ = y feature_names_in = [f"col{i}" for i in range(n_features)] df = pd.DataFrame(X, columns=feature_names_in) X_res, y_res = sampler.fit_resample(df, y=y_) # error is raised when `input_features` do not match feature_names_in invalid_feature_names = [f"bad{i}" for i in range(n_features)] with raises(ValueError, match="input_features is not equal to feature_names_in_"): sampler.get_feature_names_out(invalid_feature_names) feature_names_out_default = sampler.get_feature_names_out() feature_names_in_explicit_names = sampler.get_feature_names_out(feature_names_in) assert_array_equal(feature_names_out_default, feature_names_in_explicit_names) n_features_out = X_res.shape[1] assert ( len(feature_names_out_default) == n_features_out ), f"Expected {n_features_out} feature names, 
got {len(feature_names_out_default)}" imbalanced-learn-0.12.2/imblearn/utils/fixes.py000066400000000000000000000121451460233407600214030ustar00rootroot00000000000000"""Compatibility fixes for older version of python, numpy, scipy, and scikit-learn. If you add content to this file, please give the version of the package at which the fix is no longer needed. """ import functools import sys import numpy as np import scipy import scipy.stats import sklearn from sklearn.utils.fixes import parse_version from .._config import config_context, get_config sp_version = parse_version(scipy.__version__) sklearn_version = parse_version(sklearn.__version__) # TODO: Remove when SciPy 1.9 is the minimum supported version def _mode(a, axis=0): if sp_version >= parse_version("1.9.0"): return scipy.stats.mode(a, axis=axis, keepdims=True) return scipy.stats.mode(a, axis=axis) # TODO: Remove when scikit-learn 1.1 is the minimum supported version if sklearn_version >= parse_version("1.1"): from sklearn.utils.validation import _is_arraylike_not_scalar else: from sklearn.utils.validation import _is_arraylike def _is_arraylike_not_scalar(array): """Return True if array is array-like and not a scalar""" return _is_arraylike(array) and not np.isscalar(array) # TODO: remove when scikit-learn minimum version is 1.3 if sklearn_version < parse_version("1.3"): def _fit_context(*, prefer_skip_nested_validation): """Decorator to run the fit methods of estimators within context managers. Parameters ---------- prefer_skip_nested_validation : bool If True, the validation of parameters of inner estimators or functions called during fit will be skipped. This is useful to avoid validating many times the parameters passed by the user from the public facing API. It's also useful to avoid validating parameters that we pass internally to inner functions that are guaranteed to be valid by the test suite. 
It should be set to True for most estimators, except for those that receive non-validated objects as parameters, such as meta-estimators that are given estimator objects. Returns ------- decorated_fit : method The decorated fit method. """ def decorator(fit_method): @functools.wraps(fit_method) def wrapper(estimator, *args, **kwargs): global_skip_validation = get_config()["skip_parameter_validation"] # we don't want to validate again for each call to partial_fit partial_fit_and_fitted = ( fit_method.__name__ == "partial_fit" and _is_fitted(estimator) ) if not global_skip_validation and not partial_fit_and_fitted: estimator._validate_params() with config_context( skip_parameter_validation=( prefer_skip_nested_validation or global_skip_validation ) ): return fit_method(estimator, *args, **kwargs) return wrapper return decorator else: from sklearn.base import _fit_context # type: ignore[no-redef] # noqa # TODO: remove when scikit-learn minimum version is 1.3 if sklearn_version < parse_version("1.3"): def _is_fitted(estimator, attributes=None, all_or_any=all): """Determine if an estimator is fitted Parameters ---------- estimator : estimator instance Estimator instance for which the check is performed. attributes : str, list or tuple of str, default=None Attribute name(s) given as string or a list/tuple of strings Eg.: ``["coef_", "estimator_", ...], "coef_"`` If `None`, `estimator` is considered fitted if there exist an attribute that ends with a underscore and does not start with double underscore. all_or_any : callable, {all, any}, default=all Specify whether all or any of the given attributes must exist. Returns ------- fitted : bool Whether the estimator is fitted. 
""" if attributes is not None: if not isinstance(attributes, (list, tuple)): attributes = [attributes] return all_or_any([hasattr(estimator, attr) for attr in attributes]) if hasattr(estimator, "__sklearn_is_fitted__"): return estimator.__sklearn_is_fitted__() fitted_attrs = [ v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") ] return len(fitted_attrs) > 0 else: from sklearn.utils.validation import _is_fitted # type: ignore[no-redef] try: from sklearn.utils.validation import _is_pandas_df except ImportError: def _is_pandas_df(X): """Return True if the X is a pandas dataframe.""" if hasattr(X, "columns") and hasattr(X, "iloc"): # Likely a pandas DataFrame, we explicitly check the type to confirm. try: pd = sys.modules["pandas"] except KeyError: return False return isinstance(X, pd.DataFrame) return False imbalanced-learn-0.12.2/imblearn/utils/testing.py000066400000000000000000000123121460233407600217360ustar00rootroot00000000000000"""Test utilities.""" # Adapted from scikit-learn # Authors: Guillaume Lemaitre # License: MIT import inspect import pkgutil from importlib import import_module from operator import itemgetter from pathlib import Path import numpy as np from scipy import sparse from sklearn.base import BaseEstimator from sklearn.neighbors import KDTree from sklearn.utils._testing import ignore_warnings def all_estimators( type_filter=None, ): """Get a list of all estimators from imblearn. This function crawls the module and gets all classes that inherit from BaseEstimator. Classes that are defined in test-modules are not included. By default meta_estimators are also not included. This function is adapted from sklearn. Parameters ---------- type_filter : str, list of str, or None, default=None Which kind of estimators should be returned. If None, no filter is applied and all estimators are returned. 
Possible values are 'sampler' to get estimators only of these specific types, or a list of these to get the estimators that fit at least one of the types. Returns ------- estimators : list of tuples List of (name, class), where ``name`` is the class name as string and ``class`` is the actual type of the class. """ from ..base import SamplerMixin def is_abstract(c): if not (hasattr(c, "__abstractmethods__")): return False if not len(c.__abstractmethods__): return False return True all_classes = [] modules_to_ignore = {"tests"} root = str(Path(__file__).parent.parent) # Ignore deprecation warnings triggered at import time and from walking # packages with ignore_warnings(category=FutureWarning): for importer, modname, ispkg in pkgutil.walk_packages( path=[root], prefix="imblearn." ): mod_parts = modname.split(".") if any(part in modules_to_ignore for part in mod_parts) or "._" in modname: continue module = import_module(modname) classes = inspect.getmembers(module, inspect.isclass) classes = [ (name, est_cls) for name, est_cls in classes if not name.startswith("_") ] all_classes.extend(classes) all_classes = set(all_classes) estimators = [ c for c in all_classes if (issubclass(c[1], BaseEstimator) and c[0] != "BaseEstimator") ] # get rid of abstract base classes estimators = [c for c in estimators if not is_abstract(c[1])] # get rid of sklearn estimators which have been imported in some classes estimators = [c for c in estimators if "sklearn" not in c[1].__module__] if type_filter is not None: if not isinstance(type_filter, list): type_filter = [type_filter] else: type_filter = list(type_filter) # copy filtered_estimators = [] filters = {"sampler": SamplerMixin} for name, mixin in filters.items(): if name in type_filter: type_filter.remove(name) filtered_estimators.extend( [est for est in estimators if issubclass(est[1], mixin)] ) estimators = filtered_estimators if type_filter: raise ValueError( "Parameter type_filter must be 'sampler' or " "None, got" " %s." 
% repr(type_filter) ) # drop duplicates, sort for reproducibility # itemgetter is used to ensure the sort does not extend to the 2nd item of # the tuple return sorted(set(estimators), key=itemgetter(0)) class _CustomNearestNeighbors(BaseEstimator): """Basic implementation of nearest neighbors not relying on scikit-learn. `kneighbors_graph` is ignored and `metric` does not have any impact. """ def __init__(self, n_neighbors=1, metric="euclidean"): self.n_neighbors = n_neighbors self.metric = metric def fit(self, X, y=None): X = X.toarray() if sparse.issparse(X) else X self._kd_tree = KDTree(X) return self def kneighbors(self, X, n_neighbors=None, return_distance=True): n_neighbors = n_neighbors if n_neighbors is not None else self.n_neighbors X = X.toarray() if sparse.issparse(X) else X distances, indices = self._kd_tree.query(X, k=n_neighbors) if return_distance: return distances, indices return indices def kneighbors_graph(X=None, n_neighbors=None, mode="connectivity"): """This method is not used within imblearn but it is required for duck-typing.""" pass class _CustomClusterer(BaseEstimator): """Class that mimics a cluster that does not expose `cluster_centers_`.""" def __init__(self, n_clusters=1, expose_cluster_centers=True): self.n_clusters = n_clusters self.expose_cluster_centers = expose_cluster_centers def fit(self, X, y=None): if self.expose_cluster_centers: self.cluster_centers_ = np.random.randn(self.n_clusters, X.shape[1]) return self def predict(self, X): return np.zeros(len(X), dtype=int) imbalanced-learn-0.12.2/imblearn/utils/tests/000077500000000000000000000000001460233407600210525ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/utils/tests/__init__.py000066400000000000000000000000001460233407600231510ustar00rootroot00000000000000imbalanced-learn-0.12.2/imblearn/utils/tests/test_deprecation.py000066400000000000000000000010521460233407600247560ustar00rootroot00000000000000"""Test for the deprecation helper""" # Authors: Guillaume 
Lemaitre # License: MIT import pytest from imblearn.utils.deprecation import deprecate_parameter class Sampler: def __init__(self): self.a = "something" self.b = "something" def test_deprecate_parameter(): with pytest.warns(FutureWarning, match="is deprecated from"): deprecate_parameter(Sampler(), "0.2", "a") with pytest.warns(FutureWarning, match="Use 'b' instead."): deprecate_parameter(Sampler(), "0.2", "a", "b") imbalanced-learn-0.12.2/imblearn/utils/tests/test_docstring.py000066400000000000000000000037151460233407600244650ustar00rootroot00000000000000"""Test utilities for docstring.""" # Authors: Guillaume Lemaitre # License: MIT import sys import textwrap import pytest from imblearn.utils import Substitution from imblearn.utils._docstring import _n_jobs_docstring, _random_state_docstring def _dedent_docstring(docstring): """Compatibility with Python 3.13+. xref: https://github.com/python/cpython/issues/81283 """ return "\n".join([textwrap.dedent(line) for line in docstring.split("\n")]) func_docstring = """A function. Parameters ---------- xxx yyy """ def func(param_1, param_2): """A function. Parameters ---------- {param_1} {param_2} """ return param_1, param_2 cls_docstring = """A class. Parameters ---------- xxx yyy """ class cls: """A class. 
Parameters ---------- {param_1} {param_2} """ def __init__(self, param_1, param_2): self.param_1 = param_1 self.param_2 = param_2 if sys.version_info >= (3, 13): func_docstring = _dedent_docstring(func_docstring) cls_docstring = _dedent_docstring(cls_docstring) @pytest.mark.parametrize( "obj, obj_docstring", [(func, func_docstring), (cls, cls_docstring)] ) def test_docstring_inject(obj, obj_docstring): obj_injected_docstring = Substitution(param_1="xxx", param_2="yyy")(obj) assert obj_injected_docstring.__doc__ == obj_docstring def test_docstring_template(): assert "random_state" in _random_state_docstring assert "n_jobs" in _n_jobs_docstring def test_docstring_with_python_OO(): """Check that we don't raise a warning if the code is executed with -OO. Non-regression test for: https://github.com/scikit-learn-contrib/imbalanced-learn/issues/945 """ instance = cls(param_1="xxx", param_2="yyy") instance.__doc__ = None # simulate -OO instance = Substitution(param_1="xxx", param_2="yyy")(instance) assert instance.__doc__ is None imbalanced-learn-0.12.2/imblearn/utils/tests/test_estimator_checks.py000066400000000000000000000067261460233407600260250ustar00rootroot00000000000000import numpy as np import pytest from sklearn.base import BaseEstimator from sklearn.utils.multiclass import check_classification_targets from imblearn.base import BaseSampler from imblearn.over_sampling.base import BaseOverSampler from imblearn.utils import check_target_type as target_check from imblearn.utils.estimator_checks import ( check_samplers_fit, check_samplers_nan, check_samplers_one_label, check_samplers_preserve_dtype, check_samplers_sparse, check_samplers_string, check_target_type, ) class BaseBadSampler(BaseEstimator): """Sampler without inputs checking.""" _sampling_type = "bypass" def fit(self, X, y): return self def fit_resample(self, X, y): check_classification_targets(y) self.fit(X, y) return X, y class SamplerSingleClass(BaseSampler): """Sampler that would sample even with a 
single class.""" _sampling_type = "bypass" def fit_resample(self, X, y): return self._fit_resample(X, y) def _fit_resample(self, X, y): return X, y class NotFittedSampler(BaseBadSampler): """Sampler without target checking.""" def fit(self, X, y): X, y = self._validate_data(X, y) return self class NoAcceptingSparseSampler(BaseBadSampler): """Sampler which does not accept sparse matrix.""" def fit(self, X, y): X, y = self._validate_data(X, y) self.sampling_strategy_ = "sampling_strategy_" return self class NotPreservingDtypeSampler(BaseSampler): _sampling_type = "bypass" _parameter_constraints: dict = {"sampling_strategy": "no_validation"} def _fit_resample(self, X, y): return X.astype(np.float64), y.astype(np.int64) class IndicesSampler(BaseOverSampler): def _check_X_y(self, X, y): y, binarize_y = target_check(y, indicate_one_vs_all=True) X, y = self._validate_data( X, y, reset=True, dtype=None, force_all_finite=False, ) return X, y, binarize_y def _fit_resample(self, X, y): n_max_count_class = np.bincount(y).max() indices = np.random.choice(np.arange(X.shape[0]), size=n_max_count_class * 2) return X[indices], y[indices] def test_check_samplers_string(): sampler = IndicesSampler() check_samplers_string(sampler.__class__.__name__, sampler) def test_check_samplers_nan(): sampler = IndicesSampler() check_samplers_nan(sampler.__class__.__name__, sampler) mapping_estimator_error = { "BaseBadSampler": (AssertionError, "ValueError not raised by fit"), "SamplerSingleClass": (AssertionError, "Sampler can't balance when only"), "NotFittedSampler": (AssertionError, "No fitted attribute"), "NoAcceptingSparseSampler": (TypeError, "dense data is required"), "NotPreservingDtypeSampler": (AssertionError, "X dtype is not preserved"), } def _test_single_check(Estimator, check): estimator = Estimator() name = estimator.__class__.__name__ err_type, err_msg = mapping_estimator_error[name] with pytest.raises(err_type, match=err_msg): check(name, estimator) def test_all_checks(): 
_test_single_check(BaseBadSampler, check_target_type) _test_single_check(SamplerSingleClass, check_samplers_one_label) _test_single_check(NotFittedSampler, check_samplers_fit) _test_single_check(NoAcceptingSparseSampler, check_samplers_sparse) _test_single_check(NotPreservingDtypeSampler, check_samplers_preserve_dtype) imbalanced-learn-0.12.2/imblearn/utils/tests/test_min_dependencies.py000066400000000000000000000030721460233407600257560ustar00rootroot00000000000000"""Tests for the minimum dependencies in the README.rst file.""" import os import platform import re from pathlib import Path import pytest from sklearn.utils.fixes import parse_version import imblearn from imblearn._min_dependencies import dependent_packages @pytest.mark.skipif( platform.system() == "Windows", reason="This test is enough on unix system" ) def test_min_dependencies_readme(): # Test that the minimum dependencies in the README.rst file are # consistent with the minimum dependencies defined at the file: # imblearn/_min_dependencies.py pattern = re.compile( r"(\.\. \|)" + r"(([A-Za-z]+\-?)+)" + r"(MinVersion\| replace::)" + r"( [0-9]+\.[0-9]+(\.[0-9]+)?)" ) readme_path = Path(imblearn.__path__[0]).parents[0] readme_file = readme_path / "README.rst" if not os.path.exists(readme_file): # Skip the test if the README.rst file is not available. 
# For instance, when installing scikit-learn from wheels pytest.skip("The README.rst file is not available.") with readme_file.open("r") as f: for line in f: matched = pattern.match(line) if not matched: continue package, version = matched.group(2), matched.group(5) package = package.lower() if package in dependent_packages: version = parse_version(version) min_version = parse_version(dependent_packages[package][0]) assert version == min_version, f"{package} has a mismatched version" imbalanced-learn-0.12.2/imblearn/utils/tests/test_param_validation.py000066400000000000000000000575621460233407600260140ustar00rootroot00000000000000"""This is a copy of sklearn/utils/tests/test_param_validation.py. It can be removed when we support scikit-learn >= 1.2. """ from numbers import Integral, Real import numpy as np import pytest from scipy.sparse import csr_matrix from sklearn.base import BaseEstimator from sklearn.model_selection import LeaveOneOut from sklearn.utils import deprecated from imblearn._config import config_context, get_config from imblearn.base import _ParamsValidationMixin from imblearn.utils._param_validation import ( HasMethods, Hidden, Interval, InvalidParameterError, MissingValues, Options, RealNotInt, StrOptions, _ArrayLikes, _Booleans, _Callables, _CVObjects, _InstancesOf, _IterablesNotString, _NanConstraint, _NoneConstraint, _PandasNAConstraint, _RandomStates, _SparseMatrices, _VerboseHelper, generate_invalid_param_val, generate_valid_param, make_constraint, validate_params, ) from imblearn.utils.fixes import _fit_context # Some helpers for the tests @validate_params( {"a": [Real], "b": [Real], "c": [Real], "d": [Real]}, prefer_skip_nested_validation=True, ) def _func(a, b=0, *args, c, d=0, **kwargs): """A function to test the validation of functions.""" class _Class: """A class to test the _InstancesOf constraint and the validation of methods.""" @validate_params({"a": [Real]}, prefer_skip_nested_validation=True) def _method(self, a): """A validated 
method""" @deprecated() @validate_params({"a": [Real]}, prefer_skip_nested_validation=True) def _deprecated_method(self, a): """A deprecated validated method""" class _Estimator(_ParamsValidationMixin, BaseEstimator): """An estimator to test the validation of estimator parameters.""" _parameter_constraints: dict = {"a": [Real]} def __init__(self, a): self.a = a @_fit_context(prefer_skip_nested_validation=True) def fit(self, X=None, y=None): pass @pytest.mark.parametrize("interval_type", [Integral, Real]) def test_interval_range(interval_type): """Check the range of values depending on closed.""" interval = Interval(interval_type, -2, 2, closed="left") assert -2 in interval assert 2 not in interval interval = Interval(interval_type, -2, 2, closed="right") assert -2 not in interval assert 2 in interval interval = Interval(interval_type, -2, 2, closed="both") assert -2 in interval assert 2 in interval interval = Interval(interval_type, -2, 2, closed="neither") assert -2 not in interval assert 2 not in interval @pytest.mark.parametrize("interval_type", [Integral, Real]) def test_interval_large_integers(interval_type): """Check that Interval constraint work with large integers. non-regression test for #26648. """ interval = Interval(interval_type, 0, 2, closed="neither") assert 2**65 not in interval assert 2**128 not in interval assert float(2**65) not in interval assert float(2**128) not in interval interval = Interval(interval_type, 0, 2**128, closed="neither") assert 2**65 in interval assert 2**128 not in interval assert float(2**65) in interval assert float(2**128) not in interval assert 2**1024 not in interval def test_interval_inf_in_bounds(): """Check that inf is included iff a bound is closed and set to None. Only valid for real intervals. 
""" interval = Interval(Real, 0, None, closed="right") assert np.inf in interval interval = Interval(Real, None, 0, closed="left") assert -np.inf in interval interval = Interval(Real, None, None, closed="neither") assert np.inf not in interval assert -np.inf not in interval @pytest.mark.parametrize( "interval", [Interval(Real, 0, 1, closed="left"), Interval(Real, None, None, closed="both")], ) def test_nan_not_in_interval(interval): """Check that np.nan is not in any interval.""" assert np.nan not in interval @pytest.mark.parametrize( "params, error, match", [ ( {"type": Integral, "left": 1.0, "right": 2, "closed": "both"}, TypeError, r"Expecting left to be an int for an interval over the integers", ), ( {"type": Integral, "left": 1, "right": 2.0, "closed": "neither"}, TypeError, "Expecting right to be an int for an interval over the integers", ), ( {"type": Integral, "left": None, "right": 0, "closed": "left"}, ValueError, r"left can't be None when closed == left", ), ( {"type": Integral, "left": 0, "right": None, "closed": "right"}, ValueError, r"right can't be None when closed == right", ), ( {"type": Integral, "left": 1, "right": -1, "closed": "both"}, ValueError, r"right can't be less than left", ), ], ) def test_interval_errors(params, error, match): """Check that informative errors are raised for invalid combination of parameters""" with pytest.raises(error, match=match): Interval(**params) def test_stroptions(): """Sanity check for the StrOptions constraint""" options = StrOptions({"a", "b", "c"}, deprecated={"c"}) assert options.is_satisfied_by("a") assert options.is_satisfied_by("c") assert not options.is_satisfied_by("d") assert "'c' (deprecated)" in str(options) def test_options(): """Sanity check for the Options constraint""" options = Options(Real, {-0.5, 0.5, np.inf}, deprecated={-0.5}) assert options.is_satisfied_by(-0.5) assert options.is_satisfied_by(np.inf) assert not options.is_satisfied_by(1.23) assert "-0.5 (deprecated)" in str(options) 
@pytest.mark.parametrize( "type, expected_type_name", [ (int, "int"), (Integral, "int"), (Real, "float"), (np.ndarray, "numpy.ndarray"), ], ) def test_instances_of_type_human_readable(type, expected_type_name): """Check the string representation of the _InstancesOf constraint.""" constraint = _InstancesOf(type) assert str(constraint) == f"an instance of '{expected_type_name}'" def test_hasmethods(): """Check the HasMethods constraint.""" constraint = HasMethods(["a", "b"]) class _Good: def a(self): pass # pragma: no cover def b(self): pass # pragma: no cover class _Bad: def a(self): pass # pragma: no cover assert constraint.is_satisfied_by(_Good()) assert not constraint.is_satisfied_by(_Bad()) assert str(constraint) == "an object implementing 'a' and 'b'" @pytest.mark.parametrize( "constraint", [ Interval(Real, None, 0, closed="left"), Interval(Real, 0, None, closed="left"), Interval(Real, None, None, closed="neither"), StrOptions({"a", "b", "c"}), MissingValues(), MissingValues(numeric_only=True), _VerboseHelper(), HasMethods("fit"), _IterablesNotString(), _CVObjects(), ], ) def test_generate_invalid_param_val(constraint): """Check that the value generated does not satisfy the constraint""" bad_value = generate_invalid_param_val(constraint) assert not constraint.is_satisfied_by(bad_value) @pytest.mark.parametrize( "integer_interval, real_interval", [ ( Interval(Integral, None, 3, closed="right"), Interval(RealNotInt, -5, 5, closed="both"), ), ( Interval(Integral, None, 3, closed="right"), Interval(RealNotInt, -5, 5, closed="neither"), ), ( Interval(Integral, None, 3, closed="right"), Interval(RealNotInt, 4, 5, closed="both"), ), ( Interval(Integral, None, 3, closed="right"), Interval(RealNotInt, 5, None, closed="left"), ), ( Interval(Integral, None, 3, closed="right"), Interval(RealNotInt, 4, None, closed="neither"), ), ( Interval(Integral, 3, None, closed="left"), Interval(RealNotInt, -5, 5, closed="both"), ), ( Interval(Integral, 3, None, closed="left"), 
Interval(RealNotInt, -5, 5, closed="neither"), ), ( Interval(Integral, 3, None, closed="left"), Interval(RealNotInt, 1, 2, closed="both"), ), ( Interval(Integral, 3, None, closed="left"), Interval(RealNotInt, None, -5, closed="left"), ), ( Interval(Integral, 3, None, closed="left"), Interval(RealNotInt, None, -4, closed="neither"), ), ( Interval(Integral, -5, 5, closed="both"), Interval(RealNotInt, None, 1, closed="right"), ), ( Interval(Integral, -5, 5, closed="both"), Interval(RealNotInt, 1, None, closed="left"), ), ( Interval(Integral, -5, 5, closed="both"), Interval(RealNotInt, -10, -4, closed="neither"), ), ( Interval(Integral, -5, 5, closed="both"), Interval(RealNotInt, -10, -4, closed="right"), ), ( Interval(Integral, -5, 5, closed="neither"), Interval(RealNotInt, 6, 10, closed="neither"), ), ( Interval(Integral, -5, 5, closed="neither"), Interval(RealNotInt, 6, 10, closed="left"), ), ( Interval(Integral, 2, None, closed="left"), Interval(RealNotInt, 0, 1, closed="both"), ), ( Interval(Integral, 1, None, closed="left"), Interval(RealNotInt, 0, 1, closed="both"), ), ], ) def test_generate_invalid_param_val_2_intervals(integer_interval, real_interval): """Check that the value generated for an interval constraint does not satisfy any of the interval constraints. 
""" bad_value = generate_invalid_param_val(constraint=real_interval) assert not real_interval.is_satisfied_by(bad_value) assert not integer_interval.is_satisfied_by(bad_value) bad_value = generate_invalid_param_val(constraint=integer_interval) assert not real_interval.is_satisfied_by(bad_value) assert not integer_interval.is_satisfied_by(bad_value) @pytest.mark.parametrize( "constraint", [ _ArrayLikes(), _InstancesOf(list), _Callables(), _NoneConstraint(), _RandomStates(), _SparseMatrices(), _Booleans(), Interval(Integral, None, None, closed="neither"), ], ) def test_generate_invalid_param_val_all_valid(constraint): """Check that the function raises NotImplementedError when there's no invalid value for the constraint. """ with pytest.raises(NotImplementedError): generate_invalid_param_val(constraint) @pytest.mark.parametrize( "constraint", [ _ArrayLikes(), _Callables(), _InstancesOf(list), _NoneConstraint(), _RandomStates(), _SparseMatrices(), _Booleans(), _VerboseHelper(), MissingValues(), MissingValues(numeric_only=True), StrOptions({"a", "b", "c"}), Options(Integral, {1, 2, 3}), Interval(Integral, None, None, closed="neither"), Interval(Integral, 0, 10, closed="neither"), Interval(Integral, 0, None, closed="neither"), Interval(Integral, None, 0, closed="neither"), Interval(Real, 0, 1, closed="neither"), Interval(Real, 0, None, closed="both"), Interval(Real, None, 0, closed="right"), HasMethods("fit"), _IterablesNotString(), _CVObjects(), ], ) def test_generate_valid_param(constraint): """Check that the value generated does satisfy the constraint.""" value = generate_valid_param(constraint) assert constraint.is_satisfied_by(value) @pytest.mark.parametrize( "constraint_declaration, value", [ (Interval(Real, 0, 1, closed="both"), 0.42), (Interval(Integral, 0, None, closed="neither"), 42), (StrOptions({"a", "b", "c"}), "b"), (Options(type, {np.float32, np.float64}), np.float64), (callable, lambda x: x + 1), (None, None), ("array-like", [[1, 2], [3, 4]]), 
("array-like", np.array([[1, 2], [3, 4]])), ("sparse matrix", csr_matrix([[1, 2], [3, 4]])), ("random_state", 0), ("random_state", np.random.RandomState(0)), ("random_state", None), (_Class, _Class()), (int, 1), (Real, 0.5), ("boolean", False), ("verbose", 1), ("nan", np.nan), (MissingValues(), -1), (MissingValues(), -1.0), (MissingValues(), 2**1028), (MissingValues(), None), (MissingValues(), float("nan")), (MissingValues(), np.nan), (MissingValues(), "missing"), (HasMethods("fit"), _Estimator(a=0)), ("cv_object", 5), ], ) def test_is_satisfied_by(constraint_declaration, value): """Sanity check for the is_satisfied_by method""" constraint = make_constraint(constraint_declaration) assert constraint.is_satisfied_by(value) @pytest.mark.parametrize( "constraint_declaration, expected_constraint_class", [ (Interval(Real, 0, 1, closed="both"), Interval), (StrOptions({"option1", "option2"}), StrOptions), (Options(Real, {0.42, 1.23}), Options), ("array-like", _ArrayLikes), ("sparse matrix", _SparseMatrices), ("random_state", _RandomStates), (None, _NoneConstraint), (callable, _Callables), (int, _InstancesOf), ("boolean", _Booleans), ("verbose", _VerboseHelper), (MissingValues(numeric_only=True), MissingValues), (HasMethods("fit"), HasMethods), ("cv_object", _CVObjects), ("nan", _NanConstraint), ], ) def test_make_constraint(constraint_declaration, expected_constraint_class): """Check that make_constraint dispatches to the appropriate constraint class""" constraint = make_constraint(constraint_declaration) assert constraint.__class__ is expected_constraint_class def test_make_constraint_unknown(): """Check that an informative error is raised when an unknown constraint is passed""" with pytest.raises(ValueError, match="Unknown constraint"): make_constraint("not a valid constraint") def test_validate_params(): """Check that validate_params works no matter how the arguments are passed""" with pytest.raises( InvalidParameterError, match="The 'a' parameter of _func must be" ): 
_func("wrong", c=1) with pytest.raises( InvalidParameterError, match="The 'b' parameter of _func must be" ): _func(*[1, "wrong"], c=1) with pytest.raises( InvalidParameterError, match="The 'c' parameter of _func must be" ): _func(1, **{"c": "wrong"}) with pytest.raises( InvalidParameterError, match="The 'd' parameter of _func must be" ): _func(1, c=1, d="wrong") # check in the presence of extra positional and keyword args with pytest.raises( InvalidParameterError, match="The 'b' parameter of _func must be" ): _func(0, *["wrong", 2, 3], c=4, **{"e": 5}) with pytest.raises( InvalidParameterError, match="The 'c' parameter of _func must be" ): _func(0, *[1, 2, 3], c="four", **{"e": 5}) def test_validate_params_missing_params(): """Check that no error is raised when there are parameters without constraints """ @validate_params({"a": [int]}, prefer_skip_nested_validation=True) def func(a, b): pass func(1, 2) def test_decorate_validated_function(): """Check that validate_params functions can be decorated""" decorated_function = deprecated()(_func) with pytest.warns(FutureWarning, match="Function _func is deprecated"): decorated_function(1, 2, c=3) # outer decorator does not interfere with validation with pytest.warns(FutureWarning, match="Function _func is deprecated"): with pytest.raises( InvalidParameterError, match=r"The 'c' parameter of _func must be" ): decorated_function(1, 2, c="wrong") def test_validate_params_method(): """Check that validate_params works with methods""" with pytest.raises( InvalidParameterError, match="The 'a' parameter of _Class._method must be" ): _Class()._method("wrong") # validated method can be decorated with pytest.warns(FutureWarning, match="Function _deprecated_method is deprecated"): with pytest.raises( InvalidParameterError, match="The 'a' parameter of _Class._deprecated_method must be", ): _Class()._deprecated_method("wrong") def test_validate_params_estimator(): """Check that validate_params works with Estimator instances""" # no 
validation in init est = _Estimator("wrong") with pytest.raises( InvalidParameterError, match="The 'a' parameter of _Estimator must be" ): est.fit() def test_stroptions_deprecated_subset(): """Check that the deprecated parameter must be a subset of options.""" with pytest.raises(ValueError, match="deprecated options must be a subset"): StrOptions({"a", "b", "c"}, deprecated={"a", "d"}) def test_hidden_constraint(): """Check that internal constraints are not exposed in the error message.""" @validate_params( {"param": [Hidden(list), dict]}, prefer_skip_nested_validation=True ) def f(param): pass # list and dict are valid params f({"a": 1, "b": 2, "c": 3}) f([1, 2, 3]) with pytest.raises( InvalidParameterError, match="The 'param' parameter" ) as exc_info: f(param="bad") # the list option is not exposed in the error message err_msg = str(exc_info.value) assert "an instance of 'dict'" in err_msg assert "an instance of 'list'" not in err_msg def test_hidden_stroptions(): """Check that we can have 2 StrOptions constraints, one being hidden.""" @validate_params( {"param": [StrOptions({"auto"}), Hidden(StrOptions({"warn"}))]}, prefer_skip_nested_validation=True, ) def f(param): pass # "auto" and "warn" are valid params f("auto") f("warn") with pytest.raises( InvalidParameterError, match="The 'param' parameter" ) as exc_info: f(param="bad") # the "warn" option is not exposed in the error message err_msg = str(exc_info.value) assert "auto" in err_msg assert "warn" not in err_msg def test_validate_params_set_param_constraints_attribute(): """Check that the validate_params decorator properly sets the parameter constraints as attribute of the decorated function/method. """ assert hasattr(_func, "_skl_parameter_constraints") assert hasattr(_Class()._method, "_skl_parameter_constraints") def test_boolean_constraint_deprecated_int(): """Check that validate_params raise a deprecation message but still passes validation when using an int for a parameter accepting a boolean. 
""" @validate_params({"param": ["boolean"]}, prefer_skip_nested_validation=True) def f(param): pass # True/False and np.bool_(True/False) are valid params f(True) f(np.bool_(False)) def test_no_validation(): """Check that validation can be skipped for a parameter.""" @validate_params( {"param1": [int, None], "param2": "no_validation"}, prefer_skip_nested_validation=True, ) def f(param1=None, param2=None): pass # param1 is validated with pytest.raises(InvalidParameterError, match="The 'param1' parameter"): f(param1="wrong") # param2 is not validated: any type is valid. class SomeType: pass f(param2=SomeType) f(param2=SomeType()) def test_pandas_na_constraint_with_pd_na(): """Add a specific test for checking support for `pandas.NA`.""" pd = pytest.importorskip("pandas") na_constraint = _PandasNAConstraint() assert na_constraint.is_satisfied_by(pd.NA) assert not na_constraint.is_satisfied_by(np.array([1, 2, 3])) def test_iterable_not_string(): """Check that a string does not satisfy the _IterableNotString constraint.""" constraint = _IterablesNotString() assert constraint.is_satisfied_by([1, 2, 3]) assert constraint.is_satisfied_by(range(10)) assert not constraint.is_satisfied_by("some string") def test_cv_objects(): """Check that the _CVObjects constraint accepts all current ways to pass cv objects.""" constraint = _CVObjects() assert constraint.is_satisfied_by(5) assert constraint.is_satisfied_by(LeaveOneOut()) assert constraint.is_satisfied_by([([1, 2], [3, 4]), ([3, 4], [1, 2])]) assert constraint.is_satisfied_by(None) assert not constraint.is_satisfied_by("not a CV object") def test_third_party_estimator(): """Check that the validation from a scikit-learn estimator inherited by a third party estimator does not impose a match between the dict of constraints and the parameters of the estimator. 
""" class ThirdPartyEstimator(_Estimator): def __init__(self, b): self.b = b super().__init__(a=0) def fit(self, X=None, y=None): super().fit(X, y) # does not raise, even though "b" is not in the constraints dict and "a" is not # a parameter of the estimator. ThirdPartyEstimator(b=0).fit() def test_interval_real_not_int(): """Check for the type RealNotInt in the Interval constraint.""" constraint = Interval(RealNotInt, 0, 1, closed="both") assert constraint.is_satisfied_by(1.0) assert not constraint.is_satisfied_by(1) def test_real_not_int(): """Check for the RealNotInt type.""" assert isinstance(1.0, RealNotInt) assert not isinstance(1, RealNotInt) assert isinstance(np.float64(1), RealNotInt) assert not isinstance(np.int64(1), RealNotInt) def test_skip_param_validation(): """Check that param validation can be skipped using config_context.""" @validate_params({"a": [int]}, prefer_skip_nested_validation=True) def f(a): pass with pytest.raises(InvalidParameterError, match="The 'a' parameter"): f(a="1") # does not raise with config_context(skip_parameter_validation=True): f(a="1") @pytest.mark.parametrize("prefer_skip_nested_validation", [True, False]) def test_skip_nested_validation(prefer_skip_nested_validation): """Check that nested validation can be skipped.""" @validate_params({"a": [int]}, prefer_skip_nested_validation=True) def f(a): pass @validate_params( {"b": [int]}, prefer_skip_nested_validation=prefer_skip_nested_validation, ) def g(b): # calls f with a bad parameter type return f(a="invalid_param_value") # Validation for g is never skipped. 
with pytest.raises(InvalidParameterError, match="The 'b' parameter"): g(b="invalid_param_value") if prefer_skip_nested_validation: g(b=1) # does not raise because inner f is not validated else: with pytest.raises(InvalidParameterError, match="The 'a' parameter"): g(b=1) @pytest.mark.parametrize( "skip_parameter_validation, prefer_skip_nested_validation, expected_skipped", [ (True, True, True), (True, False, True), (False, True, True), (False, False, False), ], ) def test_skip_nested_validation_and_config_context( skip_parameter_validation, prefer_skip_nested_validation, expected_skipped ): """Check interaction between global skip and local skip.""" @validate_params( {"a": [int]}, prefer_skip_nested_validation=prefer_skip_nested_validation ) def g(a): return get_config()["skip_parameter_validation"] with config_context(skip_parameter_validation=skip_parameter_validation): actual_skipped = g(1) assert actual_skipped == expected_skipped imbalanced-learn-0.12.2/imblearn/utils/tests/test_show_versions.py000066400000000000000000000034321460233407600253750ustar00rootroot00000000000000"""Test for the show_versions helper. Based on the sklearn tests.""" # Author: Alexander L. 
Hayes # License: MIT from imblearn.utils._show_versions import _get_deps_info, show_versions def test_get_deps_info(): _deps_info = _get_deps_info() assert "pip" in _deps_info assert "setuptools" in _deps_info assert "imbalanced-learn" in _deps_info assert "scikit-learn" in _deps_info assert "numpy" in _deps_info assert "scipy" in _deps_info assert "Cython" in _deps_info assert "pandas" in _deps_info assert "joblib" in _deps_info def test_show_versions_default(capsys): show_versions() out, err = capsys.readouterr() assert "python" in out assert "executable" in out assert "machine" in out assert "pip" in out assert "setuptools" in out assert "imbalanced-learn" in out assert "scikit-learn" in out assert "numpy" in out assert "scipy" in out assert "Cython" in out assert "pandas" in out assert "keras" in out assert "tensorflow" in out assert "joblib" in out def test_show_versions_github(capsys): show_versions(github=True) out, err = capsys.readouterr() assert "
System, Dependency Information" in out assert "**System Information**" in out assert "* python" in out assert "* executable" in out assert "* machine" in out assert "**Python Dependencies**" in out assert "* pip" in out assert "* setuptools" in out assert "* imbalanced-learn" in out assert "* scikit-learn" in out assert "* numpy" in out assert "* scipy" in out assert "* Cython" in out assert "* pandas" in out assert "* keras" in out assert "* tensorflow" in out assert "* joblib" in out assert "
" in out imbalanced-learn-0.12.2/imblearn/utils/tests/test_testing.py000066400000000000000000000032341460233407600241420ustar00rootroot00000000000000"""Test for the testing module""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT import numpy as np import pytest from sklearn.neighbors._base import KNeighborsMixin from imblearn.base import SamplerMixin from imblearn.utils.testing import _CustomNearestNeighbors, all_estimators def test_all_estimators(): # check if the filtering is working with a list or a single string type_filter = "sampler" all_estimators(type_filter=type_filter) type_filter = ["sampler"] estimators = all_estimators(type_filter=type_filter) for estimator in estimators: # check that all estimators are sampler assert issubclass(estimator[1], SamplerMixin) # check that an error is raised when the type is unknown type_filter = "rnd" with pytest.raises(ValueError, match="Parameter type_filter must be 'sampler'"): all_estimators(type_filter=type_filter) def test_custom_nearest_neighbors(): """Check that our custom nearest neighbors can be used for our internal duck-typing.""" neareat_neighbors = _CustomNearestNeighbors(n_neighbors=3) assert not isinstance(neareat_neighbors, KNeighborsMixin) assert hasattr(neareat_neighbors, "kneighbors") assert hasattr(neareat_neighbors, "kneighbors_graph") rng = np.random.RandomState(42) X = rng.randn(150, 3) y = rng.randint(0, 2, 150) neareat_neighbors.fit(X, y) distances, indices = neareat_neighbors.kneighbors(X) assert distances.shape == (150, 3) assert indices.shape == (150, 3) np.testing.assert_allclose(distances[:, 0], 0.0) np.testing.assert_allclose(indices[:, 0], np.arange(150)) imbalanced-learn-0.12.2/imblearn/utils/tests/test_validation.py000066400000000000000000000324751460233407600246300ustar00rootroot00000000000000"""Test for the validation helper""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT from collections import Counter, OrderedDict import numpy as np import pytest 
from sklearn.cluster import KMeans from sklearn.neighbors import NearestNeighbors from sklearn.neighbors._base import KNeighborsMixin from sklearn.utils._testing import assert_array_equal from imblearn.utils import ( check_neighbors_object, check_sampling_strategy, check_target_type, ) from imblearn.utils._validation import ( ArraysTransformer, _deprecate_positional_args, _is_neighbors_object, ) from imblearn.utils.testing import _CustomNearestNeighbors multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25) binary_target = np.array([1] * 25 + [0] * 100) def test_check_neighbors_object(): name = "n_neighbors" n_neighbors = 1 estimator = check_neighbors_object(name, n_neighbors) assert issubclass(type(estimator), KNeighborsMixin) assert estimator.n_neighbors == 1 estimator = check_neighbors_object(name, n_neighbors, 1) assert issubclass(type(estimator), KNeighborsMixin) assert estimator.n_neighbors == 2 estimator = NearestNeighbors(n_neighbors=n_neighbors) estimator_cloned = check_neighbors_object(name, estimator) assert estimator.n_neighbors == estimator_cloned.n_neighbors estimator = _CustomNearestNeighbors() estimator_cloned = check_neighbors_object(name, estimator) assert isinstance(estimator_cloned, _CustomNearestNeighbors) @pytest.mark.parametrize( "target, output_target", [ (np.array([0, 1, 1]), np.array([0, 1, 1])), (np.array([0, 1, 2]), np.array([0, 1, 2])), (np.array([[0, 1], [1, 0]]), np.array([1, 0])), ], ) def test_check_target_type(target, output_target): converted_target = check_target_type(target.astype(int)) assert_array_equal(converted_target, output_target.astype(int)) @pytest.mark.parametrize( "target, output_target, is_ova", [ (np.array([0, 1, 1]), np.array([0, 1, 1]), False), (np.array([0, 1, 2]), np.array([0, 1, 2]), False), (np.array([[0, 1], [1, 0]]), np.array([1, 0]), True), ], ) def test_check_target_type_ova(target, output_target, is_ova): converted_target, binarize_target = check_target_type( target.astype(int), 
indicate_one_vs_all=True ) assert_array_equal(converted_target, output_target.astype(int)) assert binarize_target == is_ova def test_check_sampling_strategy_warning(): msg = "dict for cleaning methods is not supported" with pytest.raises(ValueError, match=msg): check_sampling_strategy({1: 0, 2: 0, 3: 0}, multiclass_target, "clean-sampling") @pytest.mark.parametrize( "ratio, y, type, err_msg", [ ( 0.5, binary_target, "clean-sampling", "'clean-sampling' methods do let the user specify the sampling ratio", # noqa ), ( 0.1, np.array([0] * 10 + [1] * 20), "over-sampling", "remove samples from the minority class while trying to generate new", # noqa ), ( 0.1, np.array([0] * 10 + [1] * 20), "under-sampling", "generate new sample in the majority class while trying to remove", ), ], ) def test_check_sampling_strategy_float_error(ratio, y, type, err_msg): with pytest.raises(ValueError, match=err_msg): check_sampling_strategy(ratio, y, type) def test_check_sampling_strategy_error(): with pytest.raises(ValueError, match="'sampling_type' should be one of"): check_sampling_strategy("auto", np.array([1, 2, 3]), "rnd") error_regex = "The target 'y' needs to have more than 1 class." 
with pytest.raises(ValueError, match=error_regex): check_sampling_strategy("auto", np.ones((10,)), "over-sampling") error_regex = "When 'sampling_strategy' is a string, it needs to be one of" with pytest.raises(ValueError, match=error_regex): check_sampling_strategy("rnd", np.array([1, 2, 3]), "over-sampling") @pytest.mark.parametrize( "sampling_strategy, sampling_type, err_msg", [ ("majority", "over-sampling", "over-sampler"), ("minority", "under-sampling", "under-sampler"), ], ) def test_check_sampling_strategy_error_wrong_string( sampling_strategy, sampling_type, err_msg ): with pytest.raises( ValueError, match=("'{}' cannot be used with {}".format(sampling_strategy, err_msg)), ): check_sampling_strategy(sampling_strategy, np.array([1, 2, 3]), sampling_type) @pytest.mark.parametrize( "sampling_strategy, sampling_method", [ ({10: 10}, "under-sampling"), ({10: 10}, "over-sampling"), ([10], "clean-sampling"), ], ) def test_sampling_strategy_class_target_unknown(sampling_strategy, sampling_method): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="are not present in the data."): check_sampling_strategy(sampling_strategy, y, sampling_method) def test_sampling_strategy_dict_error(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy = {1: -100, 2: 50, 3: 25} with pytest.raises(ValueError, match="in a class cannot be negative."): check_sampling_strategy(sampling_strategy, y, "under-sampling") sampling_strategy = {1: 45, 2: 100, 3: 70} error_regex = ( "With over-sampling methods, the number of samples in a" " class should be greater or equal to the original number" " of samples. Originally, there is 50 samples and 45" " samples are asked." ) with pytest.raises(ValueError, match=error_regex): check_sampling_strategy(sampling_strategy, y, "over-sampling") error_regex = ( "With under-sampling methods, the number of samples in a" " class should be less or equal to the original number of" " samples. 
Originally, there is 25 samples and 70 samples" " are asked." ) with pytest.raises(ValueError, match=error_regex): check_sampling_strategy(sampling_strategy, y, "under-sampling") @pytest.mark.parametrize("sampling_strategy", [-10, 10]) def test_sampling_strategy_float_error_not_in_range(sampling_strategy): y = np.array([1] * 50 + [2] * 100) with pytest.raises(ValueError, match="it should be in the range"): check_sampling_strategy(sampling_strategy, y, "under-sampling") def test_sampling_strategy_float_error_not_binary(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="the type of target is binary"): sampling_strategy = 0.5 check_sampling_strategy(sampling_strategy, y, "under-sampling") @pytest.mark.parametrize("sampling_method", ["over-sampling", "under-sampling"]) def test_sampling_strategy_list_error_not_clean_sampling(sampling_method): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="cannot be a list for samplers"): sampling_strategy = [1, 2, 3] check_sampling_strategy(sampling_strategy, y, sampling_method) def _sampling_strategy_func(y): # this function could create an equal number of samples target_stats = Counter(y) n_samples = max(target_stats.values()) return {key: int(n_samples) for key in target_stats.keys()} @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_sampling_strategy, target", [ ("auto", "under-sampling", {1: 25, 2: 25}, multiclass_target), ("auto", "clean-sampling", {1: 25, 2: 25}, multiclass_target), ("auto", "over-sampling", {1: 50, 3: 75}, multiclass_target), ("all", "over-sampling", {1: 50, 2: 0, 3: 75}, multiclass_target), ("all", "under-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target), ("all", "clean-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target), ("majority", "under-sampling", {2: 25}, multiclass_target), ("majority", "clean-sampling", {2: 25}, multiclass_target), ("minority", "over-sampling", {3: 75}, multiclass_target), ("not minority", 
"over-sampling", {1: 50, 2: 0}, multiclass_target), ("not minority", "under-sampling", {1: 25, 2: 25}, multiclass_target), ("not minority", "clean-sampling", {1: 25, 2: 25}, multiclass_target), ("not majority", "over-sampling", {1: 50, 3: 75}, multiclass_target), ("not majority", "under-sampling", {1: 25, 3: 25}, multiclass_target), ("not majority", "clean-sampling", {1: 25, 3: 25}, multiclass_target), ( {1: 70, 2: 100, 3: 70}, "over-sampling", {1: 20, 2: 0, 3: 45}, multiclass_target, ), ( {1: 30, 2: 45, 3: 25}, "under-sampling", {1: 30, 2: 45, 3: 25}, multiclass_target, ), ([1], "clean-sampling", {1: 25}, multiclass_target), ( _sampling_strategy_func, "over-sampling", {1: 50, 2: 0, 3: 75}, multiclass_target, ), (0.5, "over-sampling", {1: 25}, binary_target), (0.5, "under-sampling", {0: 50}, binary_target), ], ) def test_check_sampling_strategy( sampling_strategy, sampling_type, expected_sampling_strategy, target ): sampling_strategy_ = check_sampling_strategy( sampling_strategy, target, sampling_type ) assert sampling_strategy_ == expected_sampling_strategy def test_sampling_strategy_callable_args(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) multiplier = {1: 1.5, 2: 1, 3: 3} def sampling_strategy_func(y, multiplier): """samples such that each class will be affected by the multiplier.""" target_stats = Counter(y) return { key: int(values * multiplier[key]) for key, values in target_stats.items() } sampling_strategy_ = check_sampling_strategy( sampling_strategy_func, y, "over-sampling", multiplier=multiplier ) assert sampling_strategy_ == {1: 25, 2: 0, 3: 50} @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_result", [ ( {3: 25, 1: 25, 2: 25}, "under-sampling", OrderedDict({1: 25, 2: 25, 3: 25}), ), ( {3: 100, 1: 100, 2: 100}, "over-sampling", OrderedDict({1: 50, 2: 0, 3: 75}), ), ], ) def test_sampling_strategy_check_order( sampling_strategy, sampling_type, expected_result ): # We pass on purpose a non sorted dictionary and check that the 
resulting # dictionary is sorted. Refer to issue #428. y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy_ = check_sampling_strategy(sampling_strategy, y, sampling_type) assert sampling_strategy_ == expected_result def test_arrays_transformer_plain_list(): X = np.array([[0, 0], [1, 1]]) y = np.array([[0, 0], [1, 1]]) arrays_transformer = ArraysTransformer(X.tolist(), y.tolist()) X_res, y_res = arrays_transformer.transform(X, y) assert isinstance(X_res, list) assert isinstance(y_res, list) def test_arrays_transformer_numpy(): X = np.array([[0, 0], [1, 1]]) y = np.array([[0, 0], [1, 1]]) arrays_transformer = ArraysTransformer(X, y) X_res, y_res = arrays_transformer.transform(X, y) assert isinstance(X_res, np.ndarray) assert isinstance(y_res, np.ndarray) def test_arrays_transformer_pandas(): pd = pytest.importorskip("pandas") X = np.array([[0, 0], [1, 1]]) y = np.array([0, 1]) X_df = pd.DataFrame(X, columns=["a", "b"]) X_df = X_df.astype(int) y_df = pd.DataFrame(y, columns=["target"]) y_df = y_df.astype(int) y_s = pd.Series(y, name="target", dtype=int) # DataFrame and DataFrame case arrays_transformer = ArraysTransformer(X_df, y_df) X_res, y_res = arrays_transformer.transform(X, y) assert isinstance(X_res, pd.DataFrame) assert_array_equal(X_res.columns, X_df.columns) assert_array_equal(X_res.dtypes, X_df.dtypes) assert isinstance(y_res, pd.DataFrame) assert_array_equal(y_res.columns, y_df.columns) assert_array_equal(y_res.dtypes, y_df.dtypes) # DataFrames and Series case arrays_transformer = ArraysTransformer(X_df, y_s) _, y_res = arrays_transformer.transform(X, y) assert isinstance(y_res, pd.Series) assert_array_equal(y_res.name, y_s.name) assert_array_equal(y_res.dtype, y_s.dtype) def test_deprecate_positional_args_warns_for_function(): @_deprecate_positional_args def f1(a, b, *, c=1, d=1): pass with pytest.warns(FutureWarning, match=r"Pass c=3 as keyword args"): f1(1, 2, 3) with pytest.warns(FutureWarning, match=r"Pass c=3, d=4 as keyword args"): f1(1, 
2, 3, 4) @_deprecate_positional_args def f2(a=1, *, b=1, c=1, d=1): pass with pytest.warns(FutureWarning, match=r"Pass b=2 as keyword args"): f2(1, 2) # The * is place before a keyword only argument without a default value @_deprecate_positional_args def f3(a, *, b, c=1, d=1): pass with pytest.warns(FutureWarning, match=r"Pass b=2 as keyword args"): f3(1, 2) @pytest.mark.parametrize( "estimator, is_neighbor_estimator", [(NearestNeighbors(), True), (KMeans(), False)] ) def test_is_neighbors_object(estimator, is_neighbor_estimator): assert _is_neighbors_object(estimator) == is_neighbor_estimator imbalanced-learn-0.12.2/maint_tools/000077500000000000000000000000001460233407600173075ustar00rootroot00000000000000imbalanced-learn-0.12.2/maint_tools/test_docstring.py000066400000000000000000000213351460233407600227200ustar00rootroot00000000000000import importlib import inspect import pkgutil import re from inspect import signature from typing import Optional import pytest import imblearn from imblearn.utils.testing import all_estimators numpydoc_validation = pytest.importorskip("numpydoc.validate") # List of whitelisted modules and methods; regexp are supported. 
# These docstrings will fail because they are inheriting from scikit-learn DOCSTRING_WHITELIST = [ "ADASYN$", "ADASYN.", "AllKNN$", "AllKNN.", "BalancedBaggingClassifier$", "BalancedBaggingClassifier.", "BalancedRandomForestClassifier$", "BalancedRandomForestClassifier.", "ClusterCentroids$", "ClusterCentroids.", "CondensedNearestNeighbour$", "CondensedNearestNeighbour.", "EasyEnsembleClassifier$", "EasyEnsembleClassifier.", "EditedNearestNeighbours$", "EditedNearestNeighbours.", "FunctionSampler$", "FunctionSampler.", "InstanceHardnessThreshold$", "InstanceHardnessThreshold.", "SMOTE$", "SMOTE.", "NearMiss$", "NearMiss.", "NeighbourhoodCleaningRule$", "NeighbourhoodCleaningRule.", "OneSidedSelection$", "OneSidedSelection.", "Pipeline$", "Pipeline.", "RUSBoostClassifier$", "RUSBoostClassifier.", "RandomOverSampler$", "RandomOverSampler.", "RandomUnderSampler$", "RandomUnderSampler.", "TomekLinks$", "TomekLinks", "ValueDifferenceMetric$", "ValueDifferenceMetric.", ] FUNCTION_DOCSTRING_IGNORE_LIST = [ "imblearn.tensorflow._generator.balanced_batch_generator", ] FUNCTION_DOCSTRING_IGNORE_LIST = set(FUNCTION_DOCSTRING_IGNORE_LIST) def get_all_methods(): estimators = all_estimators() for name, Estimator in estimators: if name.startswith("_"): # skip private classes continue methods = [] for name in dir(Estimator): if name.startswith("_"): continue method_obj = getattr(Estimator, name) if hasattr(method_obj, "__call__") or isinstance(method_obj, property): methods.append(name) methods.append(None) for method in sorted(methods, key=lambda x: str(x)): yield Estimator, method def _is_checked_function(item): if not inspect.isfunction(item): return False if item.__name__.startswith("_"): return False mod = item.__module__ if not mod.startswith("imblearn.") or mod.endswith("estimator_checks"): return False return True def get_all_functions_names(): """Get all public functions define in the imblearn module""" modules_to_ignore = { "tests", "estimator_checks", } 
all_functions_names = set() for module_finder, module_name, ispkg in pkgutil.walk_packages( path=imblearn.__path__, prefix="imblearn." ): module_parts = module_name.split(".") if ( any(part in modules_to_ignore for part in module_parts) or "._" in module_name ): continue module = importlib.import_module(module_name) functions = inspect.getmembers(module, _is_checked_function) for name, func in functions: full_name = f"{func.__module__}.{func.__name__}" all_functions_names.add(full_name) return sorted(all_functions_names) def filter_errors(errors, method, Estimator=None): """ Ignore some errors based on the method type. These rules are specific for scikit-learn.""" for code, message in errors: # We ignore following error code, # - RT02: The first line of the Returns section # should contain only the type, .. # (as we may need refer to the name of the returned # object) # - GL01: Docstring text (summary) should start in the line # immediately after the opening quotes (not in the same line, # or leaving a blank line in between) # - GL02: If there's a blank line, it should be before the # first line of the Returns section, not after (it allows to have # short docstrings for properties). if code in ["RT02", "GL01", "GL02"]: continue # Ignore PR02: Unknown parameters for properties. We sometimes use # properties for ducktyping, i.e. 
SGDClassifier.predict_proba if code == "PR02" and Estimator is not None and method is not None: method_obj = getattr(Estimator, method) if isinstance(method_obj, property): continue # Following codes are only taken into account for the # top level class docstrings: # - ES01: No extended summary found # - SA01: See Also section not found # - EX01: No examples section found if method is not None and code in ["EX01", "SA01", "ES01"]: continue yield code, message def repr_errors(res, estimator=None, method: Optional[str] = None) -> str: """Pretty print original docstring and the obtained errors Parameters ---------- res : dict result of numpydoc.validate.validate estimator : {estimator, None} estimator object or None method : str if estimator is not None, either the method name or None. Returns ------- str String representation of the error. """ if method is None: if hasattr(estimator, "__init__"): method = "__init__" elif estimator is None: raise ValueError("At least one of estimator, method should be provided") else: raise NotImplementedError if estimator is not None: obj = getattr(estimator, method) try: obj_signature = signature(obj) except TypeError: # In particular we can't parse the signature of properties obj_signature = ( "\nParsing of the method signature failed, " "possibly because this is a property." ) obj_name = estimator.__name__ + "." 
+ method else: obj_signature = "" obj_name = method msg = "\n\n" + "\n\n".join( [ str(res["file"]), obj_name + str(obj_signature), res["docstring"], "# Errors", "\n".join( " - {}: {}".format(code, message) for code, message in res["errors"] ), ] ) return msg @pytest.mark.parametrize("function_name", get_all_functions_names()) def test_function_docstring(function_name, request): """Check function docstrings using numpydoc.""" if function_name in FUNCTION_DOCSTRING_IGNORE_LIST: request.applymarker( pytest.mark.xfail(run=False, reason="TODO pass numpydoc validation") ) res = numpydoc_validation.validate(function_name) res["errors"] = list(filter_errors(res["errors"], method="function")) if res["errors"]: msg = repr_errors(res, method=f"Tested function: {function_name}") raise ValueError(msg) @pytest.mark.parametrize("Estimator, method", get_all_methods()) def test_docstring(Estimator, method, request): base_import_path = Estimator.__module__ import_path = [base_import_path, Estimator.__name__] if method is not None: import_path.append(method) import_path = ".".join(import_path) if not any(re.search(regex, import_path) for regex in DOCSTRING_WHITELIST): request.applymarker( pytest.mark.xfail(run=False, reason="TODO pass numpydoc validation") ) res = numpydoc_validation.validate(import_path) res["errors"] = list(filter_errors(res["errors"], method)) if res["errors"]: msg = repr_errors(res, Estimator, method) raise ValueError(msg) if __name__ == "__main__": import argparse import sys parser = argparse.ArgumentParser(description="Validate docstring with numpydoc.") parser.add_argument("import_path", help="Import path to validate") args = parser.parse_args() res = numpydoc_validation.validate(args.import_path) import_path_sections = args.import_path.split(".") # When applied to classes, detect class method. For functions # method = None. # TODO: this detection can be improved. 
Currently we assume that we have # class # methods if the second path element before last is in camel case. if len(import_path_sections) >= 2 and re.match( r"(?:[A-Z][a-z]*)+", import_path_sections[-2] ): method = import_path_sections[-1] else: method = None res["errors"] = list(filter_errors(res["errors"], method)) if res["errors"]: msg = repr_errors(res, method=args.import_path) print(msg) sys.exit(1) else: print("All docstring checks passed for {}!".format(args.import_path)) imbalanced-learn-0.12.2/pyproject.toml000066400000000000000000000010211460233407600176650ustar00rootroot00000000000000[tool.black] target-version = ['py38'] include = '\.pyi?$' [tool.isort] profile = "black" [tool.ruff] # all rules can be found here: https://beta.ruff.rs/docs/rules/ select = ["E", "F", "W", "I"] # max line length for black line-length = 88 target-version = "py38" ignore=[ # space before : (needed for how black formats slicing) "E203", # do not assign a lambda expression, use a def "E731", # do not use variables named 'l', 'O', or 'I' "E741", # Import not on the top of the file "E402", ] imbalanced-learn-0.12.2/references.bib000066400000000000000000000155461460233407600175710ustar00rootroot00000000000000 @InProceedings{ batista2003, title = {Balancing training data for automated annotation of keywords: A case study}, author = {Batista, Gustavo E. A. P. A. and Bazzan, Ana L. C. and Monard, Maria Carolina}, booktitle = {Proceedings of the 2nd Brazilian Workshop on Bioinformatics}, pages = {10--18}, year = {2003}, month = {Dec.}, address = {Rio de Janeiro, Brazil} } @Article{ batista2004, title = {A study of the behavior of several methods for balancing machine learning training data}, author = {Batista, Gustavo E. A. P. A. and Prati, Ronaldo C. 
and Monard, Maria Carolina}, journal = {ACM Sigkdd Explorations Newsletter}, volume = {6}, number = {1}, pages = {20--29}, year = {2004}, publisher = {ACM} } @Article{ chawla2002, title = {SMOTE: Synthetic minority over-sampling technique}, author = {Chawla, Nitesh V. and Bowyer, Kevin W. and Hall, Lawrence O. and Kegelmeyer, W. Philip}, journal = {Journal of Artificial Intelligence Research}, volume = {16}, pages = {321--357}, year = {2002} } @InProceedings{ han2005, title = {Borderline-SMOTE: A new over-sampling method in imbalanced data sets learning}, author = {Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan}, journal = {Advances in intelligent computing}, pages = {878--887}, year = {2005}, booktitle = {Proceedings of the 1st International Conference on Intelligent Computing}, month = {Aug.}, address = {Hefei, China} } @Article{ hart1968, title = {The condensed nearest neighbor rule}, author = {Hart, Peter E.}, journal = {IEEE Transactions on Information Theory}, volume = {14}, number = {3}, pages = {515--516}, year = {1968}, publisher = {IEEE} } @InProceedings{ he2008, title = {ADASYN: Adaptive synthetic sampling approach for imbalanced learning}, author = {He, Haibo and Bai, Yang and Garcia, Edwardo A. 
and Li, Shutao}, booktitle = {Proceedings of the 5th IEEE International Joint Conference on Neural Networks}, pages = {1322--1328}, year = {2008}, organization = {IEEE}, month = {Jun.}, address = {Hong Kong, China} } @InProceedings{ kubat1997, title = {Addressing the curse of imbalanced training sets: One-sided selection}, author = {Kubat, Miroslav and Matwin, Stan}, booktitle = {Proceedings of the 14th International Conference on Machine Learning}, volume = {97}, pages = {179--186}, year = {1997}, address = {Nashville, Tennessee, USA}, month = {July} } @InProceedings{ laurikkala2001, title = {Improving identification of difficult small classes by balancing class distribution}, author = {Laurikkala, Jorma}, journal = {Proceedings of the 8th Conference on Artificial Intelligence in Medicine in Europe}, pages = {63--66}, address = {Cascais, Portugal}, month = {Jul.}, year = {2001}, publisher = {Springer} } @Article{ liu2009, title = {Exploratory undersampling for class-imbalance learning}, author = {Liu, Xu-Ying and Wu, Jianxin and Zhou, Zhi-Hua}, journal = {IEEE Transactions on Systems, Man, and Cybernetics}, volume = {39}, number = {2}, pages = {539--550}, year = {2009}, publisher = {IEEE} } @InProceedings{ mani2003, title = {kNN approach to unbalanced data distributions: A case study involving information extraction}, author = {Mani, Inderjeet and Zhang, Jianping}, booktitle = {Proceedings of the Workshop on Learning from Imbalanced Data Sets}, volume = {126}, year = {2003}, month = {Aug.}, pages = {1--7}, address = {Washington, DC, USA} } @InProceedings{ nguyen2009, title = {Borderline over-sampling for imbalanced data classification}, author = {Nguyen, Hien M. and Cooper, Eric W. and Kamei, Katsuari}, journal = {Proceedings of the 5th International Workshop on computational Intelligence and Applications}, pages = {24--29}, year = {2009} } @Article{ smith2014, title = {An instance level analysis of data complexity}, author = {Smith, Michael R. 
and Martinez, Tony and Giraud-Carrier, Christophe}, journal = {Machine learning}, volume = {95}, number = {2}, pages = {225--256}, year = {2014}, publisher = {Springer} } @Article{ tomek1976a, title = {Two modifications of CNN}, author = {Tomek, Ivan}, journal = {IEEE Trans. Systems, Man and Cybernetics}, volume = {6}, issue = {6}, pages = {769--772}, year = {1976} } @Article{ tomek1976b, title = {An experiment with the edited nearest-neighbor rule}, author = {Tomek, Ivan}, journal = {IEEE Transactions on Systems, Man, and Cybernetics}, number = {6}, issue = {6}, pages = {448--452}, year = {1976} } @Article{ wilson1972, title = {Asymptotic properties of nearest neighbor rules using edited data}, author = {Wilson, Dennis L.}, journal = {IEEE Transactions on Systems, Man, and Cybernetics}, volume = {2}, number = {3}, pages = {408--421}, year = {1972}, publisher = {IEEE} } @article{chen2004using, title={Using random forest to learn imbalanced data}, author={Chen, Chao and Liaw, Andy and Breiman, Leo}, journal={University of California, Berkeley}, volume={110}, pages={1--12}, year={2004} } @article{torelli2014rose, author = {Menardi, Giovanna and Torelli, Nicola}, title={Training and assessing classification rules with imbalanced data}, journal={Data Mining and Knowledge Discovery}, volume={28}, pages={92-122}, year={2014}, publisher={Springer}, issue = {1}, issn = {1573-756X}, url = {https://doi.org/10.1007/s10618-012-0295-5}, doi = {10.1007/s10618-012-0295-5} } @article{stanfill1986toward, title={Toward memory-based reasoning}, author={Stanfill, Craig and Waltz, David}, journal={Communications of the ACM}, volume={29}, number={12}, pages={1213--1228}, year={1986}, publisher={ACM New York, NY, USA} } @article{wilson1997improved, title={Improved heterogeneous distance functions}, author={Wilson, D Randall and Martinez, Tony R}, journal={Journal of artificial intelligence research}, volume={6}, pages={1--34}, year={1997} } @inproceedings{wang2009diversity, 
title={Diversity analysis on imbalanced data sets by using ensemble models}, author={Wang, Shuo and Yao, Xin}, booktitle={2009 IEEE symposium on computational intelligence and data mining}, pages={324--331}, year={2009}, organization={IEEE} } @article{hido2009roughly, title={Roughly balanced bagging for imbalanced data}, author={Hido, Shohei and Kashima, Hisashi and Takahashi, Yutaka}, journal={Statistical Analysis and Data Mining: The ASA Data Science Journal}, volume={2}, number={5-6}, pages={412--426}, year={2009}, publisher={Wiley Online Library} } @article{maclin1997empirical, title={An empirical evaluation of bagging and boosting}, author={Maclin, Richard and Opitz, David}, journal={AAAI/IAAI}, volume={1997}, pages={546--551}, year={1997} } imbalanced-learn-0.12.2/setup.cfg000066400000000000000000000016371460233407600166070ustar00rootroot00000000000000[bumpversion] current_version = 0.12.2 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? serialize = {major}.{minor}.{patch}.{release}{dev} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = gamma values = dev gamma [bumpversion:part:dev] [bumpversion:file:imblearn/_version.py] [aliases] test = pytest [tool:pytest] doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS testpaths = imblearn addopts = --doctest-modules --color=yes -rs filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning [flake8] max-line-length = 88 target-version = ['py38'] ignore = E24, E121, E123, E126, E203, E226, E704, E731, E741, W503, W504 exclude = .git, __pycache__, dist, doc/_build, doc/auto_examples, build, per-file-ignores = examples/*: E402 doc/conf.py: E402 [mypy] ignore_missing_imports = True allow_redefinition = True imbalanced-learn-0.12.2/setup.py000077500000000000000000000051271460233407600165010ustar00rootroot00000000000000#! 
/usr/bin/env python """Toolbox for imbalanced dataset in machine learning.""" import codecs import os from setuptools import find_packages, setup try: import builtins except ImportError: # Python 2 compat: just to be able to declare that Python >=3.7 is needed. import __builtin__ as builtins # This is a bit (!) hackish: we are setting a global variable so that the # main imblearn __init__ can detect if it is being loaded by the setup # routine, to avoid attempting to load components that aren't built yet: # the numpy distutils extensions that are used by imbalanced-learn to # recursively build the compiled extensions in sub-packages is based on the # Python import machinery. builtins.__IMBLEARN_SETUP__ = True import imblearn._min_dependencies as min_deps # noqa # get __version__ from _version.py ver_file = os.path.join("imblearn", "_version.py") with open(ver_file) as f: exec(f.read()) DISTNAME = "imbalanced-learn" DESCRIPTION = "Toolbox for imbalanced dataset in machine learning." with codecs.open("README.rst", encoding="utf-8-sig") as f: LONG_DESCRIPTION = f.read() MAINTAINER = "G. Lemaitre, C. 
Aridas" MAINTAINER_EMAIL = "g.lemaitre58@gmail.com, ichkoar@gmail.com" URL = "https://github.com/scikit-learn-contrib/imbalanced-learn" LICENSE = "MIT" DOWNLOAD_URL = "https://github.com/scikit-learn-contrib/imbalanced-learn" VERSION = __version__ # noqa CLASSIFIERS = [ "Intended Audience :: Science/Research", "Intended Audience :: Developers", "License :: OSI Approved", "Programming Language :: C", "Programming Language :: Python", "Topic :: Software Development", "Topic :: Scientific/Engineering", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX", "Operating System :: Unix", "Operating System :: MacOS", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] PYTHON_REQUIRES = ">=3.8" INSTALL_REQUIRES = (min_deps.tag_to_packages["install"],) EXTRAS_REQUIRE = { key: value for key, value in min_deps.tag_to_packages.items() if key != "install" } setup( name=DISTNAME, maintainer=MAINTAINER, maintainer_email=MAINTAINER_EMAIL, description=DESCRIPTION, license=LICENSE, url=URL, version=VERSION, download_url=DOWNLOAD_URL, long_description=LONG_DESCRIPTION, zip_safe=False, # the package can run out of an .egg file classifiers=CLASSIFIERS, packages=find_packages(), install_requires=INSTALL_REQUIRES, extras_require=EXTRAS_REQUIRE, )